flaubert commited on
Commit
811127f
·
verified ·
1 Parent(s): 359a52d

Upload folder using huggingface_hub

Browse files
config.json ADDED
@@ -0,0 +1,350 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "",
3
+ "activation_dropout": 0.0,
4
+ "add_cross_attention": false,
5
+ "architectures": [
6
+ "Data2Vec2MultiModel"
7
+ ],
8
+ "attention_dropout": 0.1,
9
+ "auto_map": {
10
+ "AutoConfig": "configuration_data2vec2.Data2Vec2MultiConfig",
11
+ "AutoModel": "modeling_data2vec2.Data2Vec2MultiModel"
12
+ },
13
+ "bad_words_ids": null,
14
+ "begin_suppress_tokens": null,
15
+ "bos_token_id": null,
16
+ "chunk_size_feed_forward": 0,
17
+ "clone_batch": 8,
18
+ "cross_attention_hidden_size": null,
19
+ "decoder_start_token_id": null,
20
+ "depth": 12,
21
+ "diversity_penalty": 0.0,
22
+ "do_sample": false,
23
+ "dropout_input": 0.0,
24
+ "dtype": "float32",
25
+ "early_stopping": false,
26
+ "embed_dim": 768,
27
+ "encoder_dropout": 0.1,
28
+ "encoder_no_repeat_ngram_size": 0,
29
+ "end_drop_path_rate": 0.0,
30
+ "end_of_block_targets": false,
31
+ "eos_token_id": null,
32
+ "exponential_decay_length_penalty": null,
33
+ "finetuning_task": null,
34
+ "forced_bos_token_id": null,
35
+ "forced_eos_token_id": null,
36
+ "hidden_size": 768,
37
+ "id2label": {
38
+ "0": "LABEL_0",
39
+ "1": "LABEL_1"
40
+ },
41
+ "is_decoder": false,
42
+ "is_encoder_decoder": false,
43
+ "label2id": {
44
+ "LABEL_0": 0,
45
+ "LABEL_1": 1
46
+ },
47
+ "layer_norm_first": false,
48
+ "layerdrop": 0.0,
49
+ "length_penalty": 1.0,
50
+ "log_norms": true,
51
+ "max_length": 20,
52
+ "min_length": 0,
53
+ "mlp_ratio": 4.0,
54
+ "modalities": {
55
+ "_name_or_path": "",
56
+ "add_cross_attention": false,
57
+ "architectures": null,
58
+ "audio": {
59
+ "_name_or_path": "",
60
+ "add_cross_attention": false,
61
+ "add_masks": false,
62
+ "alibi_max_pos": null,
63
+ "alibi_scale": 1.0,
64
+ "architectures": null,
65
+ "bad_words_ids": null,
66
+ "begin_suppress_tokens": null,
67
+ "bos_token_id": null,
68
+ "chunk_size_feed_forward": 0,
69
+ "conv_pos_depth": 5,
70
+ "conv_pos_groups": 16,
71
+ "conv_pos_pre_ln": false,
72
+ "conv_pos_width": 95,
73
+ "cross_attention_hidden_size": null,
74
+ "decoder_start_token_id": null,
75
+ "diversity_penalty": 0.0,
76
+ "do_sample": false,
77
+ "dtype": null,
78
+ "early_stopping": false,
79
+ "encoder_no_repeat_ngram_size": 0,
80
+ "encoder_zero_mask": true,
81
+ "end_drop_path_rate": 0.0,
82
+ "eos_token_id": null,
83
+ "exponential_decay_length_penalty": null,
84
+ "extractor_mode": "layer_norm",
85
+ "feature_encoder_spec": "[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]",
86
+ "finetuning_task": null,
87
+ "forced_bos_token_id": null,
88
+ "forced_eos_token_id": null,
89
+ "id2label": {
90
+ "0": "LABEL_0",
91
+ "1": "LABEL_1"
92
+ },
93
+ "init_extra_token_zero": true,
94
+ "inverse_mask": false,
95
+ "is_decoder": false,
96
+ "is_encoder_decoder": false,
97
+ "keep_masked_pct": 0.0,
98
+ "label2id": {
99
+ "LABEL_0": 0,
100
+ "LABEL_1": 1
101
+ },
102
+ "learned_alibi": false,
103
+ "learned_alibi_scale": false,
104
+ "learned_alibi_scale_per_head": false,
105
+ "learned_alibi_scale_per_layer": false,
106
+ "length_penalty": 1.0,
107
+ "local_grad_mult": 1.0,
108
+ "mask_channel_length": 64,
109
+ "mask_channel_prob": 0.0,
110
+ "mask_dropout": 0.0,
111
+ "mask_length": 5,
112
+ "mask_noise_std": 0.01,
113
+ "mask_prob": 0.7,
114
+ "mask_prob_adjust": 0.0,
115
+ "mask_prob_min": null,
116
+ "max_length": 20,
117
+ "min_length": 0,
118
+ "model_depth": 12,
119
+ "model_type": "",
120
+ "no_repeat_ngram_size": 0,
121
+ "num_alibi_heads": 12,
122
+ "num_beam_groups": 1,
123
+ "num_beams": 1,
124
+ "num_extra_tokens": 0,
125
+ "num_return_sequences": 1,
126
+ "output_attentions": false,
127
+ "output_hidden_states": false,
128
+ "output_scores": false,
129
+ "pad_token_id": null,
130
+ "prefix": null,
131
+ "prenet_depth": 4,
132
+ "prenet_dropout": 0.0,
133
+ "prenet_layerdrop": 0.0,
134
+ "problem_type": null,
135
+ "pruned_heads": {},
136
+ "remove_invalid_values": false,
137
+ "remove_masks": false,
138
+ "repetition_penalty": 1.0,
139
+ "return_dict": true,
140
+ "return_dict_in_generate": false,
141
+ "sep_token_id": null,
142
+ "start_drop_path_rate": 0.0,
143
+ "suppress_tokens": null,
144
+ "task_specific_params": null,
145
+ "temperature": 1.0,
146
+ "tie_encoder_decoder": false,
147
+ "tie_word_embeddings": true,
148
+ "tokenizer_class": null,
149
+ "top_k": 50,
150
+ "top_p": 1.0,
151
+ "torchscript": false,
152
+ "type": "AUDIO",
153
+ "typical_p": 1.0,
154
+ "use_alibi_encoder": false
155
+ },
156
+ "bad_words_ids": null,
157
+ "begin_suppress_tokens": null,
158
+ "bos_token_id": null,
159
+ "chunk_size_feed_forward": 0,
160
+ "cross_attention_hidden_size": null,
161
+ "decoder_start_token_id": null,
162
+ "diversity_penalty": 0.0,
163
+ "do_sample": false,
164
+ "dtype": null,
165
+ "early_stopping": false,
166
+ "encoder_no_repeat_ngram_size": 0,
167
+ "eos_token_id": null,
168
+ "exponential_decay_length_penalty": null,
169
+ "finetuning_task": null,
170
+ "forced_bos_token_id": null,
171
+ "forced_eos_token_id": null,
172
+ "id2label": {
173
+ "0": "LABEL_0",
174
+ "1": "LABEL_1"
175
+ },
176
+ "is_decoder": false,
177
+ "is_encoder_decoder": false,
178
+ "label2id": {
179
+ "LABEL_0": 0,
180
+ "LABEL_1": 1
181
+ },
182
+ "length_penalty": 1.0,
183
+ "max_length": 20,
184
+ "min_length": 0,
185
+ "model_type": "",
186
+ "no_repeat_ngram_size": 0,
187
+ "num_beam_groups": 1,
188
+ "num_beams": 1,
189
+ "num_return_sequences": 1,
190
+ "output_attentions": false,
191
+ "output_hidden_states": false,
192
+ "output_scores": false,
193
+ "pad_token_id": null,
194
+ "prefix": null,
195
+ "problem_type": null,
196
+ "pruned_heads": {},
197
+ "remove_invalid_values": false,
198
+ "repetition_penalty": 1.0,
199
+ "return_dict": true,
200
+ "return_dict_in_generate": false,
201
+ "sep_token_id": null,
202
+ "suppress_tokens": null,
203
+ "task_specific_params": null,
204
+ "temperature": 1.0,
205
+ "text": {
206
+ "_name_or_path": "",
207
+ "add_cross_attention": false,
208
+ "add_masks": false,
209
+ "alibi_max_pos": null,
210
+ "alibi_scale": 1.0,
211
+ "architectures": null,
212
+ "bad_words_ids": null,
213
+ "begin_suppress_tokens": null,
214
+ "bos_token_id": 0,
215
+ "chunk_size_feed_forward": 0,
216
+ "cross_attention_hidden_size": null,
217
+ "decoder_start_token_id": null,
218
+ "diversity_penalty": 0.0,
219
+ "do_sample": false,
220
+ "dropout": 0.1,
221
+ "dtype": null,
222
+ "early_stopping": false,
223
+ "encoder_no_repeat_ngram_size": 0,
224
+ "encoder_zero_mask": true,
225
+ "end_drop_path_rate": 0.0,
226
+ "eos_token_id": 2,
227
+ "exponential_decay_length_penalty": null,
228
+ "finetuning_task": null,
229
+ "forced_bos_token_id": null,
230
+ "forced_eos_token_id": null,
231
+ "id2label": {
232
+ "0": "LABEL_0",
233
+ "1": "LABEL_1"
234
+ },
235
+ "init_extra_token_zero": true,
236
+ "inverse_mask": false,
237
+ "is_decoder": false,
238
+ "is_encoder_decoder": false,
239
+ "keep_masked_pct": 0.0,
240
+ "label2id": {
241
+ "LABEL_0": 0,
242
+ "LABEL_1": 1
243
+ },
244
+ "layernorm_embedding": true,
245
+ "learned_alibi": false,
246
+ "learned_alibi_scale": false,
247
+ "learned_alibi_scale_per_head": false,
248
+ "learned_alibi_scale_per_layer": false,
249
+ "learned_pos": true,
250
+ "length_penalty": 1.0,
251
+ "local_grad_mult": 1.0,
252
+ "mask_channel_length": 64,
253
+ "mask_channel_prob": 0.0,
254
+ "mask_dropout": 0.0,
255
+ "mask_length": 3,
256
+ "mask_noise_std": 0.01,
257
+ "mask_prob": 0.6,
258
+ "mask_prob_adjust": 0.0,
259
+ "mask_prob_min": null,
260
+ "max_length": 20,
261
+ "max_source_positions": 512,
262
+ "min_length": 0,
263
+ "model_depth": 12,
264
+ "model_type": "",
265
+ "no_repeat_ngram_size": 0,
266
+ "no_scale_embedding": true,
267
+ "no_token_positional_embeddings": false,
268
+ "num_alibi_heads": 12,
269
+ "num_beam_groups": 1,
270
+ "num_beams": 1,
271
+ "num_extra_tokens": 0,
272
+ "num_return_sequences": 1,
273
+ "output_attentions": false,
274
+ "output_hidden_states": false,
275
+ "output_scores": false,
276
+ "pad_token_id": 1,
277
+ "prefix": null,
278
+ "prenet_depth": 0,
279
+ "prenet_dropout": 0.0,
280
+ "prenet_layerdrop": 0.0,
281
+ "problem_type": null,
282
+ "pruned_heads": {},
283
+ "remove_invalid_values": false,
284
+ "remove_masks": false,
285
+ "repetition_penalty": 1.0,
286
+ "return_dict": true,
287
+ "return_dict_in_generate": false,
288
+ "sep_token_id": null,
289
+ "start_drop_path_rate": 0.0,
290
+ "suppress_tokens": null,
291
+ "task_specific_params": null,
292
+ "temperature": 1.0,
293
+ "tie_encoder_decoder": false,
294
+ "tie_word_embeddings": true,
295
+ "tokenizer_class": null,
296
+ "top_k": 50,
297
+ "top_p": 1.0,
298
+ "torchscript": false,
299
+ "type": "TEXT",
300
+ "typical_p": 1.0,
301
+ "unk_token_id": 3,
302
+ "use_alibi_encoder": false,
303
+ "vocab_size": 50368
304
+ },
305
+ "tie_encoder_decoder": false,
306
+ "tie_word_embeddings": true,
307
+ "tokenizer_class": null,
308
+ "top_k": 50,
309
+ "top_p": 1.0,
310
+ "torchscript": false,
311
+ "typical_p": 1.0
312
+ },
313
+ "model_type": "data2vec2",
314
+ "n_layers": 12,
315
+ "no_repeat_ngram_size": 0,
316
+ "norm_affine": true,
317
+ "norm_eps": 1e-05,
318
+ "num_beam_groups": 1,
319
+ "num_beams": 1,
320
+ "num_heads": 12,
321
+ "num_hidden_layers": 12,
322
+ "num_layers": 12,
323
+ "num_return_sequences": 1,
324
+ "output_attentions": false,
325
+ "output_hidden_states": false,
326
+ "output_scores": false,
327
+ "pad_token_id": null,
328
+ "post_mlp_drop": 0.1,
329
+ "prefix": null,
330
+ "problem_type": null,
331
+ "pruned_heads": {},
332
+ "remove_invalid_values": false,
333
+ "repetition_penalty": 1.0,
334
+ "return_dict": true,
335
+ "return_dict_in_generate": false,
336
+ "sep_token_id": null,
337
+ "start_drop_path_rate": 0.0,
338
+ "supported_modality": "TEXT",
339
+ "suppress_tokens": null,
340
+ "task_specific_params": null,
341
+ "temperature": 1.0,
342
+ "tie_encoder_decoder": false,
343
+ "tie_word_embeddings": true,
344
+ "tokenizer_class": null,
345
+ "top_k": 50,
346
+ "top_p": 1.0,
347
+ "torchscript": false,
348
+ "transformers_version": "4.57.0.dev0",
349
+ "typical_p": 1.0
350
+ }
configuration_data2vec2.py ADDED
@@ -0,0 +1,415 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ #
3
+ # Copyright (c) Facebook, Inc. and its affiliates.
4
+ #
5
+ # This source code is licensed under the MIT license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+ #
8
+ #
9
+ # Copyright 2022 The HuggingFace Inc. team. All rights reserved.
10
+ #
11
+ # Licensed under the Apache License, Version 2.0 (the "License");
12
+ # you may not use this file except in compliance with the License.
13
+ # You may obtain a copy of the License at
14
+ #
15
+ # http://www.apache.org/licenses/LICENSE-2.0
16
+ #
17
+ # Unless required by applicable law or agreed to in writing, software
18
+ # distributed under the License is distributed on an "AS IS" BASIS,
19
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20
+ # See the License for the specific language governing permissions and
21
+ # limitations under the License.
22
+
23
+
24
+ """ Data2Vec2 multi configuration"""
25
+
26
+ import os
27
+ from typing import Union, Dict, Any, Optional
28
+ from transformers.dynamic_module_utils import custom_object_save
29
+ from transformers.utils import logging
30
+ from transformers.configuration_utils import PretrainedConfig, CONFIG_NAME
31
+
32
+
33
+ logger = logging.get_logger(__name__)
34
+
35
+
36
class MyPretrainedConfig(PretrainedConfig):
    """`PretrainedConfig` variant that round-trips nested config objects.

    The stock `PretrainedConfig` serializes with ``use_diff=True`` and
    flat-assigns values when loading from a dict, which loses nested
    ``MyPretrainedConfig`` attributes. This subclass always serializes the
    full configuration and updates nested sub-configs recursively.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def to_json_string(self, use_diff: bool = False) -> str:
        # Default flipped from True to False so nested configs serialize fully.
        return super().to_json_string(use_diff)

    def update(self, config_dict):
        """Recursively copy values from ``config_dict`` onto this config.

        Keys that do not already exist on the instance are silently ignored;
        nested ``MyPretrainedConfig`` attributes are updated in place rather
        than overwritten by plain dicts.
        """
        for name, new_value in config_dict.items():
            if not hasattr(self, name):
                continue
            current = getattr(self, name)
            if isinstance(current, MyPretrainedConfig):
                current.update(config_dict[name])
            else:
                setattr(self, name, new_value)

    # Copied from the parent class, only changed use_diff from True to False to correctly save nested config class
    def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs):
        """
        Save a configuration object to the directory `save_directory`, so that it can be re-loaded using the
        [`~PretrainedConfig.from_pretrained`] class method.

        Args:
            save_directory (`str` or `os.PathLike`):
                Directory where the configuration JSON file will be saved (will be created if it does not exist).
            push_to_hub (`bool`, *optional*, defaults to `False`):
                Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
                repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
                namespace).
            kwargs (`Dict[str, Any]`, *optional*):
                Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
        """
        self._set_token_in_kwargs(kwargs)

        if os.path.isfile(save_directory):
            raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")

        # Warn (as upstream transformers does) when generation parameters are
        # stored on the model config instead of a GenerationConfig.
        non_default_generation_parameters = {}
        for parameter_name, default_value in self._get_global_generation_defaults().items():
            if hasattr(self, parameter_name) and getattr(self, parameter_name) != default_value:
                non_default_generation_parameters[parameter_name] = getattr(self, parameter_name)
        if non_default_generation_parameters:
            logger.warning(
                "Some non-default generation parameters are set in the model config. These should go into a "
                "GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) "
                "instead. This warning will be raised to an exception in v4.41.\n"
                f"Non-default generation parameters: {str(non_default_generation_parameters)}"
            )

        os.makedirs(save_directory, exist_ok=True)

        if push_to_hub:
            commit_message = kwargs.pop("commit_message", None)
            repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
            repo_id = self._create_repo(repo_id, **kwargs)
            files_timestamps = self._get_files_timestamps(save_directory)

        # A custom config class must ship the module defining it alongside the
        # weights so it can be re-imported from the Hub.
        if self._auto_class is not None:
            custom_object_save(self, save_directory, config=self)

        # Saving under the predefined name allows loading via `from_pretrained`.
        output_config_file = os.path.join(save_directory, CONFIG_NAME)
        self.to_json_file(output_config_file, use_diff=False)
        logger.info(f"Configuration saved in {output_config_file}")

        if push_to_hub:
            self._upload_modified_files(
                save_directory,
                repo_id,
                files_timestamps,
                commit_message=commit_message,
                token=kwargs.get("token"),
            )

    # Copied from the parent class, change the instantiation and updating of class from config_dict to correctly load nested config
    @classmethod
    def from_dict(cls, config_dict: Dict[str, Any], **kwargs) -> "MyPretrainedConfig":
        """
        Instantiates a [`PretrainedConfig`] from a Python dictionary of parameters.

        Args:
            config_dict (`Dict[str, Any]`):
                Dictionary that will be used to instantiate the configuration object. Such a dictionary can be
                retrieved from a pretrained checkpoint by leveraging the [`~PretrainedConfig.get_config_dict`] method.
            kwargs (`Dict[str, Any]`):
                Additional parameters from which to initialize the configuration object.

        Returns:
            [`PretrainedConfig`]: The configuration object instantiated from those parameters.
        """
        return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
        # Internal telemetry arguments must not leak into `return_unused_kwargs`.
        kwargs.pop("_from_auto", None)
        kwargs.pop("_from_pipeline", None)
        # A commit hash already present in `config_dict` wins over the kwarg.
        if "_commit_hash" in kwargs and "_commit_hash" in config_dict:
            kwargs["_commit_hash"] = config_dict["_commit_hash"]

        # Moved out of kwargs so it does not appear in `return_unused_kwargs`.
        config_dict["attn_implementation"] = kwargs.pop("attn_implementation", None)

        # Instead of `cls(**config_dict)`, build a default instance and copy
        # values over so nested MyPretrainedConfig attributes update in place.
        config = cls()
        for name, new_value in config_dict.items():
            if not hasattr(config, name):
                continue
            current = getattr(config, name)
            if isinstance(current, MyPretrainedConfig):
                current.update(config_dict[name])
            else:
                setattr(config, name, new_value)

        if hasattr(config, "pruned_heads"):
            config.pruned_heads = {int(head): layers for head, layers in config.pruned_heads.items()}

        # Update config with kwargs if needed
        if "num_labels" in kwargs and "id2label" in kwargs:
            num_labels = kwargs["num_labels"]
            id2label = kwargs["id2label"] if kwargs["id2label"] is not None else []
            if len(id2label) != num_labels:
                raise ValueError(
                    f"You passed along `num_labels={num_labels }` with an incompatible id to label map: "
                    f"{kwargs['id2label']}. Since those arguments are inconsistent with each other, you should remove "
                    "one of them."
                )

        consumed = []
        for name, new_value in kwargs.items():
            if hasattr(config, name):
                current_attr = getattr(config, name)
                # To authorize passing a custom subconfig as kwarg in models that have nested configs.
                if isinstance(current_attr, PretrainedConfig) and isinstance(new_value, dict):
                    new_value = current_attr.__class__(**new_value)
                setattr(config, name, new_value)
                if name != "torch_dtype":
                    consumed.append(name)
        for name in consumed:
            kwargs.pop(name, None)

        logger.info(f"Model config {config}")
        return (config, kwargs) if return_unused_kwargs else config
184
+
185
+
186
class D2v2ModalityConfig(MyPretrainedConfig):
    """Configuration options common to every data2vec 2.0 modality encoder.

    Groups the prenet settings, the stochastic drop-path schedule, the
    masking options used during pre-training, and the ALiBi attention-bias
    options shared by the audio and text sub-configs.
    """

    def __init__(
        self,
        type="AUDIO",
        prenet_depth=4,
        prenet_layerdrop=0,
        prenet_dropout=0.0,
        start_drop_path_rate=0.0,
        end_drop_path_rate=0.0,
        num_extra_tokens=0,
        init_extra_token_zero=True,
        mask_noise_std=0.01,
        mask_prob_min=None,
        mask_prob=0.7,
        inverse_mask=False,
        mask_prob_adjust=0.0,
        keep_masked_pct=0.0,
        mask_length=5,
        add_masks=False,
        remove_masks=False,
        mask_dropout=0.0,
        encoder_zero_mask=True,
        mask_channel_prob=0.0,
        mask_channel_length=64,
        local_grad_mult=1.0,
        use_alibi_encoder=False,
        alibi_scale=1.0,
        learned_alibi=False,
        alibi_max_pos=None,
        learned_alibi_scale=False,
        learned_alibi_scale_per_head=False,
        learned_alibi_scale_per_layer=False,
        num_alibi_heads=12,
        model_depth=12,
        ema_local_encoder=False,
        decoder=None,
        **kwargs,
    ):
        # NOTE(review): `ema_local_encoder` and `decoder` are accepted but
        # never stored on the instance — presumably so fairseq-style config
        # dicts can be passed through without error; confirm intentional.
        super().__init__(**kwargs)
        self.type = type

        # Prenet (modality-specific front end) and drop-path schedule.
        self.prenet_depth = prenet_depth
        self.prenet_layerdrop = prenet_layerdrop
        self.prenet_dropout = prenet_dropout
        self.start_drop_path_rate = start_drop_path_rate
        self.end_drop_path_rate = end_drop_path_rate
        self.num_extra_tokens = num_extra_tokens
        self.init_extra_token_zero = init_extra_token_zero

        # Masking options.
        self.mask_noise_std = mask_noise_std
        self.mask_prob_min = mask_prob_min
        self.mask_prob = mask_prob
        self.inverse_mask = inverse_mask
        self.mask_prob_adjust = mask_prob_adjust
        self.keep_masked_pct = keep_masked_pct
        self.mask_length = mask_length
        self.add_masks = add_masks
        self.remove_masks = remove_masks
        self.mask_dropout = mask_dropout
        self.encoder_zero_mask = encoder_zero_mask
        self.mask_channel_prob = mask_channel_prob
        self.mask_channel_length = mask_channel_length

        self.local_grad_mult = local_grad_mult

        # ALiBi attention-bias options.
        self.use_alibi_encoder = use_alibi_encoder
        self.alibi_scale = alibi_scale
        self.learned_alibi = learned_alibi
        self.alibi_max_pos = alibi_max_pos
        self.learned_alibi_scale = learned_alibi_scale
        self.learned_alibi_scale_per_head = learned_alibi_scale_per_head
        self.learned_alibi_scale_per_layer = learned_alibi_scale_per_layer
        self.num_alibi_heads = num_alibi_heads

        self.model_depth = model_depth
256
+
257
+
258
class D2v2AudioConfig(D2v2ModalityConfig):
    """
    Configuration including common args and args specific to audio-only
    pre-training: the convolutional feature extractor and the convolutional
    positional-embedding options.
    """

    def __init__(
        self,
        extractor_mode="layer_norm",
        feature_encoder_spec="[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]",
        conv_pos_width=95,
        conv_pos_groups=16,
        conv_pos_depth=5,
        conv_pos_pre_ln=False,
        **kwargs,
    ):
        # The modality type is fixed to AUDIO; everything else flows through.
        super().__init__(type="AUDIO", **kwargs)
        self.extractor_mode = extractor_mode
        # Spec string for the conv feature encoder — presumably a list of
        # (dim, kernel, stride) tuples evaluated by the model; confirm there.
        self.feature_encoder_spec = feature_encoder_spec
        self.conv_pos_width = conv_pos_width
        self.conv_pos_groups = conv_pos_groups
        self.conv_pos_depth = conv_pos_depth
        self.conv_pos_pre_ln = conv_pos_pre_ln
279
+
280
+
281
class D2v2TextConfig(D2v2ModalityConfig):
    """
    Configuration including common args and args specific to text-only
    pre-training: vocabulary, special-token ids, and embedding options.
    """

    def __init__(
        self,
        vocab_size=50000,
        unk_token_id=3,
        bos_token_id=0,
        eos_token_id=2,
        pad_token_id=1,
        max_source_positions=512,
        learned_pos=True,
        dropout=0.1,
        no_scale_embedding=True,
        layernorm_embedding=True,
        no_token_positional_embeddings=False,
        **kwargs,
    ):
        # The modality type is fixed to TEXT; everything else flows through.
        super().__init__(type="TEXT", **kwargs)
        # Vocabulary and special-token ids.
        self.vocab_size = vocab_size
        self.unk_token_id = unk_token_id
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.pad_token_id = pad_token_id
        # Positional / embedding options.
        self.max_source_positions = max_source_positions
        self.learned_pos = learned_pos
        self.dropout = dropout
        self.no_scale_embedding = no_scale_embedding
        self.layernorm_embedding = layernorm_embedding
        self.no_token_positional_embeddings = no_token_positional_embeddings
312
+
313
+
314
class D2v2ModalitiesConfig(MyPretrainedConfig):
    """Container grouping the per-modality sub-configurations.

    Args:
        audio_config (`D2v2AudioConfig`, *optional*):
            Audio modality configuration; a fresh default is created when
            not given.
        text_config (`D2v2TextConfig`, *optional*):
            Text modality configuration; a fresh default is created when
            not given.
    """

    def __init__(
        self,
        audio_config=None,
        text_config=None,
        **kwargs
    ):
        super().__init__(**kwargs)
        # Fix: the previous defaults (`audio_config=D2v2AudioConfig()`,
        # `text_config=D2v2TextConfig()`) were mutable default arguments
        # evaluated once at class-definition time, so every instance created
        # without explicit values shared — and could mutate — the same
        # sub-config objects. Build fresh ones per instance instead.
        self.audio = audio_config if audio_config is not None else D2v2AudioConfig()
        self.text = text_config if text_config is not None else D2v2TextConfig()
324
+
325
+
326
class Data2Vec2MultiConfig(MyPretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Data2Vec2MultiModel`]. It is used to instantiate
    an Data2Vec2MultiModel model according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        depth (`int`, *optional*, defaults to 12):
            Number of Transformer layers in the encoder.
        modalities (`D2v2ModalitiesConfig`, *optional*):
            Per-modality sub-configurations; a fresh default is created when not given.
        supported_modality (`str`, *optional*, defaults to `"AUDIO"`):
            Which modality this checkpoint supports.

    Example:

    ```python
    >>> from transformers import Data2Vec2MultiConfig, Data2Vec2MultiModel

    >>> # Initializing a Data2Vec2MultiConfig for audio
    >>> configuration = Data2Vec2MultiConfig()

    >>> # Initializing a model (with random weights) with the configuration
    >>> model = Data2Vec2MultiModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "data2vec2"

    def __init__(
        self,
        depth=12,
        start_drop_path_rate=0.0,
        end_drop_path_rate=0.0,
        num_heads=12,
        norm_eps=1e-5,
        norm_affine=True,
        encoder_dropout=0.1,
        post_mlp_drop=0.1,
        attention_dropout=0.1,
        activation_dropout=0.0,
        dropout_input=0.0,
        layerdrop=0.0,
        embed_dim=768,
        mlp_ratio=4.0,
        layer_norm_first=False,
        end_of_block_targets=False,
        clone_batch=1,
        log_norms=True,
        modalities=None,
        supported_modality="AUDIO",
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Encoder depth and stochastic drop-path schedule.
        self.depth = depth
        self.start_drop_path_rate = start_drop_path_rate
        self.end_drop_path_rate = end_drop_path_rate

        # Transformer block hyper-parameters.
        self.num_heads = num_heads
        self.norm_eps = norm_eps
        self.norm_affine = norm_affine
        self.post_mlp_drop = post_mlp_drop
        self.encoder_dropout = encoder_dropout
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.dropout_input = dropout_input
        self.layerdrop = layerdrop
        self.embed_dim = embed_dim
        self.mlp_ratio = mlp_ratio

        self.layer_norm_first = layer_norm_first
        self.end_of_block_targets = end_of_block_targets
        self.clone_batch = clone_batch
        self.log_norms = log_norms

        # Fix: the previous default `modalities=D2v2ModalitiesConfig()` was a
        # mutable default argument evaluated once at class-definition time, so
        # every config created without an explicit value shared (and could
        # mutate) the same nested instance. Build a fresh one per config.
        self.modalities = modalities if modalities is not None else D2v2ModalitiesConfig()
        self.supported_modality = supported_modality

        # Attributes for hopsparser (aliases of depth/embed_dim).
        self.hidden_size = embed_dim
        self.num_layers = depth
        self.n_layers = depth
        self.num_hidden_layers = depth

        self.auto_map = {
            'AutoConfig': 'configuration_data2vec2.Data2Vec2MultiConfig',
            'AutoModel': 'modeling_data2vec2.Data2Vec2MultiModel',
        }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d1f3a5b7e501b52e0d9d4ad11a9396cb2956e01e2972e4062c2dbb844c1419b7
3
+ size 496547472
modeling_data2vec2.py ADDED
@@ -0,0 +1,1466 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ #
3
+ # Copyright (c) Facebook, Inc. and its affiliates.
4
+ #
5
+ # This source code is licensed under the MIT license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+ #
8
+ # Copyright 2022 the HuggingFace Inc. team. All rights reserved.
9
+ #
10
+ # Licensed under the Apache License, Version 2.0 (the "License");
11
+ # you may not use this file except in compliance with the License.
12
+ # You may obtain a copy of the License at
13
+ #
14
+ # http://www.apache.org/licenses/LICENSE-2.0
15
+ #
16
+ # Unless required by applicable law or agreed to in writing, software
17
+ # distributed under the License is distributed on an "AS IS" BASIS,
18
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19
+ # See the License for the specific language governing permissions and
20
+ # limitations under the License.
21
+
22
+ # Copyright from Fairseq
23
+
24
+ """ PyTorch Data2Vec2 Multi model."""
25
+ import math
26
+ import warnings
27
+ from typing import Optional, Tuple, Dict, List, Callable, Any
28
+ from functools import partial
29
+
30
+ import numpy as np
31
+
32
+ import torch
33
+ import torch.nn.functional as F
34
+ from torch import nn
35
+ from torch import Tensor
36
+
37
+ from transformers import PreTrainedModel
38
+ from transformers.modeling_outputs import (
39
+ Wav2Vec2BaseModelOutput,
40
+ )
41
+ from .configuration_data2vec2 import (
42
+ Data2Vec2MultiConfig,
43
+ D2v2ModalityConfig,
44
+ D2v2AudioConfig,
45
+ D2v2TextConfig,
46
+ )
47
+
48
+ from .utils_data2vec2 import (
49
+ _learned_alibi_bias,
50
+ gather_unmasked,
51
+ gather_unmasked_mask,
52
+ masked_alibi,
53
+ random_masking,
54
+ get_alibi_bias,
55
+ compute_mask_indices,
56
+ index_put,
57
+ MaskInfo, MaskSeed,
58
+ make_positions,
59
+ )
60
+
61
+
62
+ #################################################
63
+ ### modeling_data2vec2_base.py
64
+ # copied from fairseq.modules.grad_multiply
65
class GradMultiply(torch.autograd.Function):
    """Identity in the forward pass; scales the incoming gradient by a
    constant factor in the backward pass. Used to damp (or boost) the
    gradient flowing into a sub-network without changing activations.
    """

    @staticmethod
    def forward(ctx, x, scale):
        # Remember the factor for the backward pass and hand back a copy
        # of the input (x.new(x) allocates a new tensor from x's data).
        ctx.scale = scale
        return x.new(x)

    @staticmethod
    def backward(ctx, grad):
        # No gradient with respect to the `scale` argument itself.
        return grad * ctx.scale, None
75
+
76
+
77
+ # Copied from fairseq.modules.transpose_last.py
78
class TransposeLast(nn.Module):
    """Swap the last dimension with another one (default: second-to-last),
    optionally indexing into the input first via ``deconstruct_idx``.

    The parameter name ``tranpose_dim`` (sic) is kept as-is: it is part of
    the public constructor signature.
    """

    def __init__(self, deconstruct_idx=None, tranpose_dim=-2):
        super().__init__()
        self.deconstruct_idx = deconstruct_idx
        self.tranpose_dim = tranpose_dim

    def forward(self, x):
        tensor = x if self.deconstruct_idx is None else x[self.deconstruct_idx]
        return tensor.transpose(self.tranpose_dim, -1)
88
+
89
+
90
+ # Copied from fairseq.modules.layer_norm.py
91
class Fp32LayerNorm(nn.LayerNorm):
    """LayerNorm that always computes in float32 for numerical stability,
    then casts the result back to the input's dtype.
    """

    def forward(self, input):
        weight = self.weight.float() if self.weight is not None else None
        bias = self.bias.float() if self.bias is not None else None
        normalized = F.layer_norm(
            input.float(), self.normalized_shape, weight, bias, self.eps
        )
        return normalized.type_as(input)
104
+
105
+
106
def LayerNorm(normalized_shape, eps=1e-5, elementwise_affine=True):
    """Factory returning a plain ``torch.nn.LayerNorm`` (fairseq-style helper)."""
    return torch.nn.LayerNorm(
        normalized_shape, eps=eps, elementwise_affine=elementwise_affine
    )
108
+
109
+
110
+ # Copied from fairseq.modules.fp32_group_norm.py
111
class Fp32GroupNorm(nn.GroupNorm):
    """GroupNorm that always computes in float32 for numerical stability,
    then casts the result back to the input's dtype.
    """

    def forward(self, input):
        weight = self.weight.float() if self.weight is not None else None
        bias = self.bias.float() if self.bias is not None else None
        normalized = F.group_norm(
            input.float(), self.num_groups, weight, bias, self.eps
        )
        return normalized.type_as(input)
124
+
125
+
126
+ # Copied from fairseq.modules.same_pad.py
127
class SamePad(nn.Module):
    """Trim trailing frames introduced by symmetric convolution padding so the
    output length matches the input ("same" padding for even kernels).
    In causal mode the full ``kernel_size - 1`` lookahead is removed.
    """

    def __init__(self, kernel_size, causal=False):
        super().__init__()
        if causal:
            self.remove = kernel_size - 1
        else:
            # Even kernels overshoot by exactly one frame; odd kernels don't.
            self.remove = 1 if kernel_size % 2 == 0 else 0

    def forward(self, x):
        return x if self.remove <= 0 else x[:, :, : -self.remove]
139
+
140
+
141
+ # Copied from fairseq.models.wav2vec.wav2vec2.py
142
class ConvFeatureExtractionModel(nn.Module):
    """Stack of strided 1-D convolutions that turns a raw waveform of shape
    (B, T) into a feature map of shape (B, C, T').

    Args:
        conv_layers: list of ``(dim, kernel_size, stride)`` tuples, one per layer.
        dropout: dropout applied after every convolution.
        mode: ``"default"`` (fp32 group norm on the first layer only) or
            ``"layer_norm"`` (fp32 layer norm after every layer).
        conv_bias: whether the convolutions use a bias term.
    """

    def __init__(
        self,
        conv_layers: List[Tuple[int, int, int]],
        dropout: float = 0.0,
        mode: str = "default",
        conv_bias: bool = False,
    ):
        super().__init__()

        assert mode in {"default", "layer_norm"}

        def block(
            n_in,
            n_out,
            k,
            stride,
            is_layer_norm=False,
            is_group_norm=False,
            conv_bias=False,
        ):
            # One layer: conv -> dropout -> (optional norm) -> GELU.
            def make_conv():
                conv = nn.Conv1d(n_in, n_out, k, stride=stride, bias=conv_bias)
                nn.init.kaiming_normal_(conv.weight)
                return conv

            assert not (
                is_layer_norm and is_group_norm
            ), "layer norm and group norm are exclusive"

            if is_layer_norm:
                # Fix: normalize over this layer's `n_out` channels instead of
                # relying on the enclosing loop's `dim` variable through a
                # closure. The values are identical at call time, but the
                # closure form only worked because `block` happened to be
                # called after `dim` was assigned.
                return nn.Sequential(
                    make_conv(),
                    nn.Dropout(p=dropout),
                    nn.Sequential(
                        TransposeLast(),
                        Fp32LayerNorm(n_out, elementwise_affine=True),
                        TransposeLast(),
                    ),
                    nn.GELU(),
                )
            elif is_group_norm:
                # Same fix as above: use `n_out`, not the loop's `dim`.
                return nn.Sequential(
                    make_conv(),
                    nn.Dropout(p=dropout),
                    Fp32GroupNorm(n_out, n_out, affine=True),
                    nn.GELU(),
                )
            else:
                return nn.Sequential(make_conv(), nn.Dropout(p=dropout), nn.GELU())

        in_d = 1
        self.conv_layers = nn.ModuleList()
        for i, cl in enumerate(conv_layers):
            assert len(cl) == 3, "invalid conv definition: " + str(cl)
            (dim, k, stride) = cl

            self.conv_layers.append(
                block(
                    in_d,
                    dim,
                    k,
                    stride,
                    is_layer_norm=mode == "layer_norm",
                    # Group norm is applied to the first layer only.
                    is_group_norm=mode == "default" and i == 0,
                    conv_bias=conv_bias,
                )
            )
            in_d = dim

    def forward(self, x):
        """Map raw audio (B, T) to convolutional features (B, C, T')."""
        # BxT -> BxCxT
        x = x.unsqueeze(1)

        for conv in self.conv_layers:
            x = conv(x)

        return x
221
+
222
+
223
+ # copied from fairseq.examples.data2vec.models.modalities.modules
224
+ class AltAttention(nn.Module):
225
+ def __init__(
226
+ self,
227
+ dim,
228
+ num_heads=8,
229
+ qkv_bias=False,
230
+ qk_scale=None,
231
+ attn_drop=0.0,
232
+ proj_drop=0.0,
233
+ cosine_attention=False,
234
+ ):
235
+ super().__init__()
236
+ self.num_heads = num_heads
237
+ head_dim = dim // num_heads
238
+ self.scale = qk_scale or head_dim ** -0.5
239
+
240
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
241
+ # self.attn_drop = nn.Dropout(attn_drop)
242
+ self.attn_drop = attn_drop
243
+ self.proj = nn.Linear(dim, dim)
244
+ # self.proj_drop = nn.Dropout(proj_drop)
245
+ self.proj_drop = proj_drop
246
+
247
+ self.cosine_attention = cosine_attention
248
+
249
+ if cosine_attention:
250
+ self.logit_scale = nn.Parameter(
251
+ torch.log(10 * torch.ones((num_heads, 1, 1))), requires_grad=True
252
+ )
253
+
254
+ def forward(self, x, padding_mask=None, alibi_bias=None, fast=True):
255
+ B, N, C = x.shape
256
+ qkv = (
257
+ self.qkv(x)
258
+ .reshape(B, N, 3, self.num_heads, C // self.num_heads)
259
+ .permute(2, 0, 3, 1, 4) # qkv x B x H x L x D
260
+ )
261
+ q, k, v = (
262
+ qkv[0],
263
+ qkv[1],
264
+ qkv[2],
265
+ ) # make torchscript happy (cannot use tensor as tuple)
266
+
267
+ dtype = q.dtype
268
+
269
+ if not fast:
270
+ if self.cosine_attention:
271
+ # cosine attention
272
+ attn = F.normalize(q, dim=-1) @ F.normalize(k, dim=-1).transpose(-2, -1)
273
+ logit_scale = torch.clamp(
274
+ self.logit_scale, max=torch.log(torch.tensor(1.0 / 0.01))
275
+ ).exp()
276
+ attn = attn * logit_scale
277
+ else:
278
+ q = q * self.scale
279
+ attn = q @ k.transpose(-2, -1) # B x C//H x L x L
280
+
281
+ if alibi_bias is not None:
282
+ attn = attn.type_as(alibi_bias)
283
+ attn[:, : alibi_bias.size(1)] += alibi_bias
284
+
285
+ if padding_mask is not None and padding_mask.any():
286
+ attn = attn.masked_fill(
287
+ padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool),
288
+ float("-inf"),
289
+ )
290
+
291
+ attn = attn.softmax(dim=-1, dtype=torch.float32).to(dtype=dtype)
292
+ # attn = self.attn_drop(attn)
293
+ attn = F.dropout(attn, p=self.attn_drop)
294
+ x = (attn @ v).transpose(1, 2)
295
+ else:
296
+ # Using pytorch 2's sdpa
297
+ assert not self.cosine_attention, "Not support cosine attention yet"
298
+ # Integrate padding_mask and alibi_bias
299
+ if padding_mask is not None and padding_mask.any():
300
+ if alibi_bias is not None:
301
+ padding_mask = alibi_bias.masked_fill(
302
+ padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool),
303
+ float("-inf"),
304
+ ).to(dtype=dtype)
305
+ else:
306
+ padding_mask = padding_mask.unsqueeze(1).unsqueeze(2).to(
307
+ torch.bool).to(dtype=dtype)
308
+ else:
309
+ if alibi_bias is not None:
310
+ padding_mask = alibi_bias.to(dtype=dtype)
311
+ else:
312
+ padding_mask = None
313
+
314
+ x = F.scaled_dot_product_attention(q, k, v,
315
+ attn_mask=padding_mask,
316
+ dropout_p=self.attn_drop if self.training else 0.0,
317
+ scale=self.scale).transpose(1, 2)
318
+
319
+ x = x.reshape(B, N, C)
320
+ x = self.proj(x)
321
+ x = F.dropout(x, p=self.proj_drop if self.training else 0.0)
322
+ return x
323
+
324
+
325
+ # copied from fairseq.examples.data2vec.models.modalities.modules.py
326
class AltBlock(nn.Module):
    """Transformer block (self-attention + MLP) used by data2vec 2.0.

    Forward returns a pair ``(x, t)``: ``x`` is the block output passed to
    the next layer, ``t`` is the feature used as a regression target — the
    raw MLP output when ``ffn_targets`` is True, otherwise the final block
    output.
    """

    def __init__(
        self,
        dim,
        num_heads,
        mlp_ratio=4.0,
        qkv_bias=False,
        qk_scale=None,
        drop=0.0,
        attn_drop=0.0,
        mlp_drop=0.0,
        post_mlp_drop=0.0,
        drop_path=0.0,
        act_layer=nn.GELU,
        norm_layer=nn.LayerNorm,
        layer_norm_first=True,
        ffn_targets=False,
        cosine_attention=False,
    ):
        super().__init__()

        self.layer_norm_first = layer_norm_first
        self.ffn_targets = ffn_targets

        # Deferred third-party import: timm is only required once a block is
        # actually instantiated.
        from timm.models.vision_transformer import DropPath, Mlp

        self.norm1 = norm_layer(dim)
        self.attn = AltAttention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=drop,
            cosine_attention=cosine_attention,
        )

        # Stochastic depth on the residual branches; identity when disabled.
        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(
            in_features=dim,
            hidden_features=mlp_hidden_dim,
            act_layer=act_layer,
            drop=mlp_drop,
        )
        self.post_mlp_dropout = nn.Dropout(post_mlp_drop, inplace=False)

    def forward(self, x, padding_mask=None, alibi_bias=None):
        if self.layer_norm_first:
            # Pre-norm: attention sub-layer with residual connection.
            x = x + self.drop_path(self.attn(self.norm1(x), padding_mask, alibi_bias))
            # NOTE(review): `r` is bound to the MLP *output* here, so the line
            # below adds the MLP output to (a dropped-out copy of) itself and
            # the pre-MLP activations are not carried through a residual. This
            # transcribes the upstream fairseq code verbatim and pretrained
            # checkpoints depend on the exact arithmetic — do not "fix" it
            # without confirming against upstream fairseq.
            r = x = self.mlp(self.norm2(x))
            t = x
            x = r + self.drop_path(self.post_mlp_dropout(x))
            if not self.ffn_targets:
                t = x
        else:
            # Post-norm variant: normalize after each residual sum.
            x = x + self.drop_path(self.attn(x, padding_mask, alibi_bias))
            r = x = self.norm1(x)
            x = self.mlp(x)
            t = x
            x = self.norm2(r + self.drop_path(self.post_mlp_dropout(x)))
            if not self.ffn_targets:
                t = x

        return x, t
392
+
393
+
394
+ # copied from fairseq.data2vec.models.modalities.modules
395
class BlockEncoder(nn.Module):
    """Runs a stack of ``AltBlock``-style blocks with optional pre/post layer
    norm, input dropout, per-layer alibi scaling, and fairseq-style LayerDrop.
    """

    def __init__(self, blocks, norm_layer, layer_norm_first, layerdrop, dropout):
        super().__init__()
        self.blocks = blocks
        self.norm = norm_layer
        self.layer_norm_first = layer_norm_first
        self.layerdrop = layerdrop
        self.dropout = nn.Dropout(dropout, inplace=True)

    def _keep_layer(self):
        # LayerDrop: always keep a layer at eval time or when disabled;
        # otherwise keep it with probability (1 - layerdrop). The RNG is only
        # consulted when layerdrop is active, preserving the random stream.
        return (
            not self.training
            or self.layerdrop == 0
            or (np.random.random() > self.layerdrop)
        )

    def forward(self, x, padding_mask, alibi_bias, alibi_scale):
        # Post-norm configurations normalize before entering the stack.
        if self.norm is not None and not self.layer_norm_first:
            x = self.norm(x)

        x = self.dropout(x)

        for layer_idx, layer in enumerate(self.blocks):
            if not self._keep_layer():
                continue
            bias = alibi_bias
            if bias is not None and alibi_scale is not None:
                # One learned scale per layer, or a single shared scale.
                if alibi_scale.size(0) > 1:
                    scale = alibi_scale[layer_idx]
                else:
                    scale = alibi_scale.squeeze(0)
                bias = bias * scale.type_as(bias)
            x, _ = layer(x, padding_mask, bias)

        # Pre-norm configurations normalize the final output.
        if self.norm is not None and self.layer_norm_first:
            x = self.norm(x)

        return x
430
+
431
+
432
class ModalitySpecificEncoder(nn.Module):
    """Base class for per-modality front-ends in data2vec 2.0.

    Pipeline: ``local_encoder`` (modality-specific feature extraction) ->
    ``project_features`` -> optional positional encodings -> optional masking
    (with per-sample clone batching) -> ``context_encoder`` (prenet blocks).
    Subclasses (audio/text) supply the concrete sub-modules and override
    ``convert_padding_mask``.

    Note: the ``decoder`` constructor argument is accepted for interface
    compatibility but ignored — ``self.decoder`` is always set to ``None``
    in this port.
    """

    def __init__(
        self,
        modality_cfg: D2v2ModalityConfig,
        embed_dim: int,
        local_encoder: nn.Module,
        project_features: nn.Module,
        fixed_positional_encoder: Optional[nn.Module],
        relative_positional_encoder: Optional[nn.Module],
        context_encoder: nn.Module,
        decoder: nn.Module,
        get_alibi_bias: Optional[Callable[[int, int, str, str], torch.Tensor]],
    ):
        super().__init__()

        self.modality_cfg = modality_cfg
        self.local_encoder = local_encoder
        self.project_features = project_features
        self.fixed_positional_encoder = fixed_positional_encoder
        self.relative_positional_encoder = relative_positional_encoder
        self.context_encoder = context_encoder

        # Decoder intentionally dropped (inference-oriented port).
        self.decoder = None
        self.get_alibi_bias = get_alibi_bias if modality_cfg.use_alibi_encoder else None

        # Gradient multiplier for the local feature extractor (0 freezes it).
        self.local_grad_mult = self.modality_cfg.local_grad_mult

        # Optional learned prefix tokens (e.g. CLS-style) prepended to the
        # sequence in contextualized_features().
        self.extra_tokens = None
        if modality_cfg.num_extra_tokens > 0:
            self.extra_tokens = nn.Parameter(
                torch.zeros(1, modality_cfg.num_extra_tokens, embed_dim)
            )
            if not modality_cfg.init_extra_token_zero:
                nn.init.normal_(self.extra_tokens)
            elif self.extra_tokens.size(1) > 1:
                # First token stays zero; the rest are randomly initialized.
                nn.init.normal_(self.extra_tokens[:, 1:])

        # Learned (or fixed) scaling of the alibi bias, optionally per layer
        # and/or per head; shape (layers, 1, heads, 1, 1) with singleton dims
        # where the scale is shared.
        self.alibi_scale = None
        if self.get_alibi_bias is not None:
            self.alibi_scale = nn.Parameter(
                torch.full(
                    (
                        (modality_cfg.prenet_depth + modality_cfg.model_depth)
                        if modality_cfg.learned_alibi_scale_per_layer
                        else 1,
                        1,
                        self.modality_cfg.num_alibi_heads
                        if modality_cfg.learned_alibi_scale_per_head
                        else 1,
                        1,
                        1,
                    ),
                    modality_cfg.alibi_scale,
                    dtype=torch.float,
                ),
                requires_grad=modality_cfg.learned_alibi_scale,
            )

        if modality_cfg.learned_alibi and self.get_alibi_bias is not None:
            # Fully learned alibi: materialize a bias up to alibi_max_pos as a
            # parameter and serve (cropped) slices of it from then on.
            assert modality_cfg.alibi_max_pos is not None
            alibi_bias = self.get_alibi_bias(
                batch_size=1,
                time_steps=modality_cfg.alibi_max_pos,
                heads=modality_cfg.num_alibi_heads,
                scale=1.0,
                dtype=torch.float,
                device="cpu",
            )
            self.alibi_bias = nn.Parameter(alibi_bias)
            self.get_alibi_bias = partial(
                _learned_alibi_bias, alibi_bias=self.alibi_bias
            )

    # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureEncoder._freeze_parameters
    def _freeze_parameters(self):
        # Disable gradients for every parameter of this encoder.
        for param in self.parameters():
            param.requires_grad = False
        self._requires_grad = False

    def convert_padding_mask(self, x, padding_mask):
        # Default: pass-through; subclasses (e.g. audio) downsample the mask
        # to match the local encoder's output resolution.
        return padding_mask

    def local_features(self, features):
        """Run the local encoder (with optional gradient scaling) and project."""
        if self.local_grad_mult > 0:
            if self.local_grad_mult == 1.0:
                x = self.local_encoder(features)
            else:
                # Scale gradients flowing into the local encoder.
                x = GradMultiply.apply(
                    self.local_encoder(features), self.local_grad_mult
                )
        else:
            # Frozen feature extractor.
            with torch.no_grad():
                x = self.local_encoder(features)

        x = self.project_features(x)
        return x

    def contextualized_features(
        self,
        x,
        padding_mask,
        mask,
        remove_masked,
        clone_batch: int = 1,
        mask_seeds: Optional[torch.Tensor] = None,
        precomputed_mask=None,
    ):
        """Apply positions, (optional) masking/clone-batching, alibi bias and
        the prenet context encoder; returns a dict of intermediate results.
        """

        if padding_mask is not None:
            padding_mask = self.convert_padding_mask(x, padding_mask)

        local_features = x
        if mask and clone_batch == 1:
            # Keep an unmodified copy of the pre-mask features.
            local_features = local_features.clone()

        orig_B, orig_T, _ = x.shape
        pre_mask_B = orig_B
        mask_info = None

        x_pos = None
        if self.fixed_positional_encoder is not None:
            x = x + self.fixed_positional_encoder(x, padding_mask)

        if mask:
            if clone_batch > 1:
                # Repeat each sample `clone_batch` times so several different
                # masks of the same sample are processed in one batch.
                x = x.repeat_interleave(clone_batch, 0)
                if mask_seeds is not None:
                    # Derive a distinct deterministic seed offset per clone.
                    clone_hash = [
                        int(hash((mask_seeds.seed, ind)) % 1e10)
                        for ind in range(clone_batch - 1)
                    ]
                    clone_hash = torch.tensor([0] + clone_hash).long().view(1, -1)

                    id = mask_seeds.ids
                    id = id.repeat_interleave(clone_batch, 0)
                    id = id.view(-1, clone_batch) + clone_hash.to(id)
                    id = id.view(-1)
                    mask_seeds = MaskSeed(
                        seed=mask_seeds.seed, update=mask_seeds.update, ids=id
                    )
                if padding_mask is not None:
                    padding_mask = padding_mask.repeat_interleave(clone_batch, 0)

            x, mask_info = self.compute_mask(
                x,
                padding_mask,
                mask_seed=mask_seeds,
                # Masked positions must be filled in-place (rather than
                # dropped) when relative positions are computed on the full
                # sequence or when masked timesteps are kept.
                apply=self.relative_positional_encoder is not None or not remove_masked,
                precomputed_mask=precomputed_mask,
            )

        if self.relative_positional_encoder is not None:
            x_pos = self.relative_positional_encoder(x)

        masked_padding_mask = padding_mask
        if mask and remove_masked:
            # Keep only unmasked timesteps (MAE-style).
            x = mask_info.x_unmasked
            if x_pos is not None:
                x = x + gather_unmasked(x_pos, mask_info)

            if padding_mask is not None and padding_mask.any():
                masked_padding_mask = gather_unmasked_mask(padding_mask, mask_info)
                if not masked_padding_mask.any():
                    masked_padding_mask = None
            else:
                masked_padding_mask = None

        elif x_pos is not None:
            x = x + x_pos

        alibi_bias = None
        alibi_scale = self.alibi_scale

        if self.get_alibi_bias is not None:
            alibi_bias = self.get_alibi_bias(
                batch_size=pre_mask_B,
                time_steps=orig_T,
                heads=self.modality_cfg.num_alibi_heads,
                dtype=torch.float32,
                device=x.device,
            )

            if alibi_scale is not None:
                alibi_scale = alibi_scale.clamp_min(0)
                if alibi_scale.size(0) == 1:
                    # Single shared scale: fold it into the bias now.
                    alibi_bias = alibi_bias * alibi_scale.squeeze(0).type_as(alibi_bias)
                    alibi_scale = None

            if clone_batch > 1:
                alibi_bias = alibi_bias.repeat_interleave(clone_batch, 0)

            if mask_info is not None and remove_masked:
                # Crop the bias to the kept (unmasked) timesteps.
                alibi_bias = masked_alibi(alibi_bias, mask_info)

        if self.extra_tokens is not None:
            num = self.extra_tokens.size(1)
            x = torch.cat([self.extra_tokens.expand(x.size(0), -1, -1), x], dim=1)
            if masked_padding_mask is not None:
                # B x T
                masked_padding_mask = F.pad(masked_padding_mask, (num, 0))
            if alibi_bias is not None:
                # B x H x T x T
                alibi_bias = F.pad(alibi_bias, (num, 0, num, 0))

        x = self.context_encoder(
            x,
            masked_padding_mask,
            alibi_bias,
            # Prenet layers consume the first prenet_depth per-layer scales.
            alibi_scale[: self.modality_cfg.prenet_depth]
            if alibi_scale is not None
            else None,
        )

        return {
            "x": x,
            "local_features": local_features,
            "padding_mask": masked_padding_mask,
            "alibi_bias": alibi_bias,
            # Remaining per-layer scales are handed to the shared trunk.
            "alibi_scale": alibi_scale[self.modality_cfg.prenet_depth :]
            if alibi_scale is not None and alibi_scale.size(0) > 1
            else alibi_scale,
            "encoder_mask": mask_info,
        }

    def forward(
        self,
        features,
        padding_mask,
        mask: bool,
        remove_masked: bool,
        clone_batch: int = 1,
        mask_seeds: Optional[torch.Tensor] = None,
        precomputed_mask=None,
    ):
        """Extract local features, then contextualize them (see above)."""
        x = self.local_features(features)
        return self.contextualized_features(
            x,
            padding_mask,
            mask,
            remove_masked,
            clone_batch,
            mask_seeds,
            precomputed_mask,
        )

    def compute_mask(
        self,
        x,
        padding_mask,
        mask_seed: Optional[MaskSeed],
        apply,
        precomputed_mask,
    ):
        """Build (or adopt) a time mask and optionally apply it to ``x``.

        Returns ``(x, mask_info)`` where ``mask_info`` is None when no
        masking was performed (mask_prob == 0 and no precomputed mask).
        """
        if precomputed_mask is not None:
            mask = precomputed_mask
            mask_info = self.make_maskinfo(x, mask)
        else:
            B, T, C = x.shape
            cfg = self.modality_cfg

            mask_prob = cfg.mask_prob

            # Optionally sample the mask probability uniformly from
            # [mask_prob_min, mask_prob) per call.
            if (
                cfg.mask_prob_min is not None
                and cfg.mask_prob_min >= 0
                and cfg.mask_prob_min < mask_prob
            ):
                mask_prob = np.random.uniform(cfg.mask_prob_min, mask_prob)

            if mask_prob > 0:
                if cfg.mask_length == 1:
                    mask_info = random_masking(x, mask_prob, mask_seed)
                else:
                    # Span masking; inverse_mask masks the complement.
                    if self.modality_cfg.inverse_mask:
                        mask_prob = 1 - mask_prob

                    mask = compute_mask_indices(
                        (B, T),
                        padding_mask,
                        mask_prob,
                        cfg.mask_length,
                        min_masks=1,
                        require_same_masks=True,
                        mask_dropout=cfg.mask_dropout,
                        add_masks=cfg.add_masks,
                        seed=mask_seed.seed if mask_seed is not None else None,
                        epoch=mask_seed.update if mask_seed is not None else None,
                        indices=mask_seed.ids if mask_seed is not None else None,
                    )

                    mask = torch.from_numpy(mask).to(device=x.device)
                    if self.modality_cfg.inverse_mask:
                        mask = 1 - mask
                    mask_info = self.make_maskinfo(x, mask)
            else:
                mask_info = None

        if apply:
            x = self.apply_mask(x, mask_info)

        return x, mask_info

    def make_maskinfo(self, x, mask, shape=None):
        """Turn a boolean/0-1 time mask into a MaskInfo with gather indices.

        Assumes every sample has the same number of masked timesteps
        (``require_same_masks=True``): ``len_keep`` is derived from row 0.
        """
        if shape is None:
            B, T, D = x.shape
        else:
            B, T, D = shape

        mask = mask.to(torch.uint8)
        # Stable argsort puts unmasked (0) positions first.
        ids_shuffle = mask.argsort(dim=1)
        ids_restore = ids_shuffle.argsort(dim=1).unsqueeze(-1).expand(-1, -1, D)

        len_keep = T - mask[0].sum()
        if self.modality_cfg.keep_masked_pct > 0:
            # Optionally retain a fraction of the masked timesteps as well.
            len_keep += round((T - int(len_keep)) * self.modality_cfg.keep_masked_pct)

        ids_keep = ids_shuffle[:, :len_keep]

        if shape is not None:
            x_unmasked = None
        else:
            ids_keep = ids_keep.unsqueeze(-1).expand(-1, -1, D)
            x_unmasked = torch.gather(x, dim=1, index=ids_keep)

        mask_info = MaskInfo(
            x_unmasked=x_unmasked,
            mask=mask,
            ids_restore=ids_restore,
            ids_keep=ids_keep,
        )
        return mask_info

    def apply_mask(self, x, mask_info):
        """Overwrite masked timesteps in-place-style (zeros or noise) and
        optionally apply channel masking."""
        cfg = self.modality_cfg
        B, T, C = x.shape

        if mask_info is not None:
            mask = mask_info.mask
            if cfg.encoder_zero_mask:
                # Zero out masked timesteps.
                x = x * (1 - mask.type_as(x).unsqueeze(-1))
            else:
                # Replace masked timesteps with Gaussian noise.
                num_masks = mask.sum().item()
                masks = x.new_empty(num_masks, x.size(-1)).normal_(
                    0, cfg.mask_noise_std
                )
                x = index_put(x, mask, masks)
        if cfg.mask_channel_prob > 0:
            # SpecAugment-style channel masking, shared across timesteps.
            mask_channel = compute_mask_indices(
                (B, C),
                None,
                cfg.mask_channel_prob,
                cfg.mask_channel_length,
            )
            mask_channel = (
                torch.from_numpy(mask_channel)
                .to(x.device)
                .unsqueeze(1)
                .expand(-1, T, -1)
            )
            x = index_put(x, mask_channel, 0)
        return x
793
+
794
+
795
class AudioEncoder(ModalitySpecificEncoder):
    """Audio modality front-end: waveform conv feature extractor, convolutional
    relative positional encoder, and prenet transformer blocks."""

    modality_cfg: D2v2AudioConfig

    def __init__(
        self,
        modality_cfg: D2v2AudioConfig,
        embed_dim: int,
        make_block: Callable[[float], nn.ModuleList],
        norm_layer: Callable[[int], nn.LayerNorm],
        layer_norm_first: bool,
        alibi_biases: Dict,
    ):

        # NOTE(review): eval() executes the config string to obtain the list
        # of (dim, kernel, stride) tuples. This mirrors upstream fairseq and
        # is only safe for trusted config files — ast.literal_eval would be
        # the hardened alternative.
        self.feature_enc_layers = eval(modality_cfg.feature_encoder_spec)
        feature_embed_dim = self.feature_enc_layers[-1][0]

        local_encoder = ConvFeatureExtractionModel(
            conv_layers=self.feature_enc_layers,
            dropout=0.0,
            mode=modality_cfg.extractor_mode,
            conv_bias=False,
        )

        # Project conv features (B x C x T) to the model dimension (B x T x E).
        project_features = nn.Sequential(
            TransposeLast(),
            nn.LayerNorm(feature_embed_dim),
            nn.Linear(feature_embed_dim, embed_dim),
        )

        num_pos_layers = modality_cfg.conv_pos_depth
        k = max(3, modality_cfg.conv_pos_width // num_pos_layers)

        # Stack of grouped convolutions acting as a relative positional
        # encoder (wav2vec2-style, split into num_pos_layers layers).
        positional_encoder = nn.Sequential(
            TransposeLast(),
            *[
                nn.Sequential(
                    nn.Conv1d(
                        embed_dim,
                        embed_dim,
                        kernel_size=k,
                        padding=k // 2,
                        groups=modality_cfg.conv_pos_groups,
                    ),
                    SamePad(k),
                    TransposeLast(),
                    LayerNorm(embed_dim, elementwise_affine=False),
                    TransposeLast(),
                    nn.GELU(),
                )
                for _ in range(num_pos_layers)
            ],
            TransposeLast(),
        )

        if modality_cfg.conv_pos_pre_ln:
            positional_encoder = nn.Sequential(LayerNorm(embed_dim), positional_encoder)

        # Per-layer drop-path rates linearly interpolated across the prenet.
        dpr = np.linspace(
            modality_cfg.start_drop_path_rate,
            modality_cfg.end_drop_path_rate,
            modality_cfg.prenet_depth,
        )
        context_encoder = BlockEncoder(
            nn.ModuleList(make_block(dpr[i]) for i in range(modality_cfg.prenet_depth)),
            norm_layer(embed_dim) if not layer_norm_first else None,
            layer_norm_first,
            modality_cfg.prenet_layerdrop,
            modality_cfg.prenet_dropout,
        )

        decoder = None

        alibi_bias_fn = partial(get_alibi_bias, alibi_biases=alibi_biases)

        super().__init__(
            modality_cfg=modality_cfg,
            embed_dim=embed_dim,
            local_encoder=local_encoder,
            project_features=project_features,
            fixed_positional_encoder=None,
            relative_positional_encoder=positional_encoder,
            context_encoder=context_encoder,
            decoder=decoder,
            get_alibi_bias=alibi_bias_fn,
        )

    def convert_padding_mask(self, x, padding_mask):
        """Downsample a sample-level padding mask to the conv output rate.

        ``padding_mask`` uses True for padded positions; the result is a
        boolean mask at the feature frame rate.
        """

        def get_feat_extract_output_lengths(input_lengths: torch.LongTensor):
            """
            Computes the output length of the convolutional layers
            """

            def _conv_out_length(input_length, kernel_size, stride):
                # Standard conv output-length formula (no padding/dilation).
                return torch.floor((input_length - kernel_size) / stride + 1)

            for i in range(len(self.feature_enc_layers)):
                input_lengths = _conv_out_length(
                    input_lengths,
                    self.feature_enc_layers[i][1],
                    self.feature_enc_layers[i][2],
                )

            return input_lengths.to(torch.long)

        if padding_mask is not None:
            # Number of non-padded samples per batch element.
            input_lengths = (1 - padding_mask.long()).sum(-1)
            # apply conv formula to get real output_lengths
            output_lengths = get_feat_extract_output_lengths(input_lengths)

            if padding_mask.any():
                padding_mask = torch.zeros(x.shape[:2], dtype=x.dtype, device=x.device)

                # these two operations makes sure that all values
                # before the output lengths indices are attended to
                padding_mask[
                    (
                        torch.arange(padding_mask.shape[0], device=padding_mask.device),
                        output_lengths - 1,
                    )
                ] = 1
                padding_mask = (
                    1 - padding_mask.flip([-1]).cumsum(-1).flip([-1])
                ).bool()
            else:
                # Nothing is padded: all frames are valid.
                padding_mask = torch.zeros(
                    x.shape[:2], dtype=torch.bool, device=x.device
                )

        return padding_mask
925
+
926
+
927
class LearnedPositionalEmbedding(nn.Embedding):
    """
    This module learns positional embeddings up to a fixed maximum size.
    Padding ids are ignored by either offsetting based on padding_idx
    or by setting padding_idx to None and ensuring that the appropriate
    position ids are passed to the forward function.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int):
        super().__init__(num_embeddings, embedding_dim, padding_idx)
        self.onnx_trace = False
        # Positions start after padding_idx, so that many usable positions
        # are available when a padding index reserves the first slots.
        if self.padding_idx is not None:
            self.max_positions = self.num_embeddings - self.padding_idx - 1
        else:
            self.max_positions = self.num_embeddings

    def forward(
        self,
        input: Tensor,
        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
        positions: Optional[Tensor] = None,
    ):
        """Input is expected to be of size [bsz x seqlen]."""
        assert (positions is None) or (
            self.padding_idx is None
        ), "If positions is pre-computed then padding_idx should not be set."

        if positions is None:
            if incremental_state is not None:
                # positions is the same for every token when decoding a single step
                # Without the int() cast, it doesn't work in some cases when exporting to ONNX
                positions = torch.zeros(
                    (1, 1), device=input.device, dtype=input.dtype
                ).fill_(int(self.padding_idx + input.size(1)))
            else:
                # Derive positions from non-padding tokens (offset by
                # padding_idx, matching fairseq's make_positions).
                positions = make_positions(
                    input, self.padding_idx, onnx_trace=self.onnx_trace
                )
        # Look up the learned table directly with F.embedding so precomputed
        # positions can be used unchanged.
        return F.embedding(
            positions,
            self.weight,
            self.padding_idx,
            self.max_norm,
            self.norm_type,
            self.scale_grad_by_freq,
            self.sparse,
        )
974
+
975
+
976
class SinusoidalPositionalEmbedding(nn.Module):
    """This module produces sinusoidal positional embeddings of any length.

    Padding symbols are ignored.
    """

    def __init__(self, embedding_dim, padding_idx, init_size=1024):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.padding_idx = padding_idx if padding_idx is not None else 0
        # Non-persistent buffer: the table is deterministic and recomputed /
        # grown on demand in forward(), so it is never saved in checkpoints.
        self.register_buffer("weights", SinusoidalPositionalEmbedding.get_embedding(
            init_size, embedding_dim, padding_idx
        ), persistent=False)
        self.max_positions = int(1e5)
        self.onnx_trace = False

    def prepare_for_onnx_export_(self):
        self.onnx_trace = True

    def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs):
        # Ignore some deprecated keys that were used in older versions
        # (older checkpoints persisted the table; drop it so loading does not
        # fail or override the freshly computed buffer).
        deprecated_keys = ["weights", "_float_tensor"]
        for key in deprecated_keys:
            if prefix + key in state_dict:
                del state_dict[prefix + key]
        super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)

    @staticmethod
    def get_embedding(
        num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None
    ):
        """Build sinusoidal embeddings.

        This matches the implementation in tensor2tensor, but differs slightly
        from the description in Section 3.5 of "Attention Is All You Need".
        """
        half_dim = embedding_dim // 2
        emb = math.log(10000) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, dtype=torch.float) * -emb)
        emb = torch.arange(num_embeddings, dtype=torch.float).unsqueeze(
            1
        ) * emb.unsqueeze(0)
        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(
            num_embeddings, -1
        )
        if embedding_dim % 2 == 1:
            # zero pad
            emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
        if padding_idx is not None:
            # The padding position gets an all-zero embedding.
            emb[padding_idx, :] = 0
        return emb

    def forward(
        self,
        input,
        incremental_state: Optional[Any] = None,
        timestep: Optional[Tensor] = None,
        positions: Optional[Any] = None,
    ):
        """Input is expected to be of size [bsz x seqlen]."""
        bspair = torch.onnx.operators.shape_as_tensor(input)
        bsz, seq_len = bspair[0], bspair[1]
        max_pos = self.padding_idx + 1 + seq_len
        if max_pos > self.weights.size(0):
            # expand embeddings if needed (rebinding the buffer attribute
            # replaces the registered non-persistent buffer).
            self.weights = SinusoidalPositionalEmbedding.get_embedding(
                max_pos, self.embedding_dim, self.padding_idx
            ).to(self.weights)

        if incremental_state is not None:
            # positions is the same for every token when decoding a single step
            pos = timestep.view(-1)[0] + 1 if timestep is not None else seq_len
            if self.onnx_trace:
                return (
                    self.weights.index_select(index=self.padding_idx + pos, dim=0)
                    .unsqueeze(1)
                    .repeat(bsz, 1, 1)
                )
            return self.weights[self.padding_idx + pos, :].expand(bsz, 1, -1)

        # Full-sequence path: positions derived from non-padding tokens.
        positions = make_positions(
            input, self.padding_idx, onnx_trace=self.onnx_trace
        )
        if self.onnx_trace:
            # ONNX export: reshape via tensor ops so shapes stay dynamic.
            flat_embeddings = self.weights.detach().index_select(0, positions.view(-1))
            embedding_shape = torch.cat(
                (bsz.view(1), seq_len.view(1), torch.tensor([-1], dtype=torch.long))
            )
            embeddings = torch.onnx.operators.reshape_from_tensor_shape(
                flat_embeddings, embedding_shape
            )
            return embeddings
        # detach(): sinusoidal embeddings are fixed, never trained.
        return (
            self.weights.index_select(0, positions.view(-1))
            .view(bsz, seq_len, -1)
            .detach()
        )
1073
+
1074
def PositionalEmbedding(
    num_embeddings: int,
    embedding_dim: int,
    padding_idx: int,
    learned: bool = False,
):
    """Factory for positional embeddings.

    Args:
        num_embeddings: number of usable positions.
        embedding_dim: embedding dimension.
        padding_idx: padding token index, or None when no padding is used.
        learned: True for a trained table, False for fixed sinusoidal values.

    Returns:
        A ``LearnedPositionalEmbedding`` or ``SinusoidalPositionalEmbedding``.
    """
    if learned:
        # if padding_idx is specified then offset the embedding ids by
        # this index and adjust num_embeddings appropriately
        # TODO: The right place for this offset would be inside
        # LearnedPositionalEmbedding. Move this there for a cleaner implementation.
        if padding_idx is not None:
            num_embeddings = num_embeddings + padding_idx + 1
        m = LearnedPositionalEmbedding(num_embeddings, embedding_dim, padding_idx)
        nn.init.normal_(m.weight, mean=0, std=embedding_dim**-0.5)
        if padding_idx is not None:
            # Padding position contributes a zero vector.
            nn.init.constant_(m.weight[padding_idx], 0)
    else:
        # Fix: guard against padding_idx=None — the previous unconditional
        # `num_embeddings + padding_idx + 1` raised TypeError in that case.
        offset = padding_idx + 1 if padding_idx is not None else 0
        m = SinusoidalPositionalEmbedding(
            embedding_dim,
            padding_idx,
            init_size=num_embeddings + offset,
        )
    return m
1098
+
1099
+
1100
class TextLocalEncoder(nn.Module):
    """Token embedding front-end for the text modality: scaled token
    embeddings, optional positional embeddings, optional embedding
    LayerNorm, then dropout.

    Args:
        vocab_size: embedding table size.
        embed_dim: embedding dimension.
        max_source_positions: maximum sequence length for positions.
        pad_idx: padding token id.
        no_scale_embedding: if True, skip the sqrt(embed_dim) scaling.
        layernorm_embedding: if True, apply LayerNorm to the embeddings.
        dropout: dropout probability applied to the final output.
        no_token_positional_embeddings: if True, no positional embeddings.
        learned_pos: learned vs. sinusoidal positional embeddings.
    """

    def __init__(
        self,
        vocab_size,
        embed_dim,
        max_source_positions,
        pad_idx,
        no_scale_embedding,
        layernorm_embedding,
        dropout,
        no_token_positional_embeddings,
        learned_pos,
    ):
        super().__init__()
        self.pad_idx = pad_idx
        self.dropout_module = nn.Dropout(dropout)

        self.embed_tokens = nn.Embedding(vocab_size, embed_dim, pad_idx)
        # Fix: this value was assigned twice with the identical expression;
        # keep a single assignment.
        self.embed_scale = 1.0 if no_scale_embedding else math.sqrt(embed_dim)
        self.embed_positions = (
            PositionalEmbedding(
                max_source_positions,
                embed_dim,
                pad_idx,
                learned=learned_pos,
            )
            if not no_token_positional_embeddings
            else None
        )

        self.layernorm_embedding = None
        if layernorm_embedding:
            self.layernorm_embedding = LayerNorm(embed_dim)

    def forward(self, src_tokens):
        """Embed a (B, T) tensor of token ids into (B, T, embed_dim)."""
        x = self.embed_scale * self.embed_tokens(src_tokens)
        if self.embed_positions is not None:
            x = x + self.embed_positions(src_tokens)

        if self.layernorm_embedding is not None:
            x = self.layernorm_embedding(x)
        x = self.dropout_module(x)
        return x
1144
+
1145
+
1146
class TextEncoder(ModalitySpecificEncoder):
    """Text-modality encoder: a ``TextLocalEncoder`` (token + positional
    embeddings) followed by a prenet of transformer blocks wrapped in a
    ``BlockEncoder``, with alibi attention biases."""

    # Declared for type checkers; the attribute itself is set by the
    # ModalitySpecificEncoder base class.
    modality_cfg: D2v2TextConfig

    def __init__(
        self,
        modality_cfg: D2v2TextConfig,
        embed_dim: int,
        make_block: Callable[[float], nn.ModuleList],
        norm_layer: Callable[[int], nn.LayerNorm],
        layer_norm_first: bool,
        alibi_biases: Dict,
    ):
        # NOTE(review): these plain (non-module) attributes are set *before*
        # super().__init__(); nn.Module tolerates this only for non-tensor
        # values, so do not move module assignments above the super() call.
        self.pad_idx = modality_cfg.pad_token_id
        self.vocab_size = modality_cfg.vocab_size

        local_encoder = TextLocalEncoder(
            vocab_size=self.vocab_size,
            embed_dim=embed_dim,
            max_source_positions=modality_cfg.max_source_positions,
            pad_idx=self.pad_idx,
            no_scale_embedding=modality_cfg.no_scale_embedding,
            layernorm_embedding=modality_cfg.layernorm_embedding,
            dropout=modality_cfg.dropout,
            no_token_positional_embeddings=modality_cfg.no_token_positional_embeddings,
            learned_pos=modality_cfg.learned_pos,
        )
        # Linearly spaced stochastic-depth (drop-path) rates across the prenet.
        dpr = np.linspace(
            modality_cfg.start_drop_path_rate,
            modality_cfg.end_drop_path_rate,
            modality_cfg.prenet_depth,
        )
        context_encoder = BlockEncoder(
            nn.ModuleList(make_block(dpr[i]) for i in range(modality_cfg.prenet_depth)),
            # Final norm only when the blocks are post-norm (not layer_norm_first)
            # and the prenet is non-empty.
            norm_layer(embed_dim)
            if not layer_norm_first and modality_cfg.prenet_depth > 0
            else None,
            layer_norm_first,
            modality_cfg.prenet_layerdrop,
            modality_cfg.prenet_dropout if modality_cfg.prenet_depth > 0 else 0.0,
        )
        # No decoder for the text modality in this model.
        decoder = None

        # Shared cache of alibi biases across modalities (mutated by the fn).
        alibi_bias_fn = partial(get_alibi_bias, alibi_biases=alibi_biases)

        super().__init__(
            modality_cfg=modality_cfg,
            embed_dim=embed_dim,
            local_encoder=local_encoder,
            project_features=nn.Identity(),
            fixed_positional_encoder=None,
            relative_positional_encoder=None,
            context_encoder=context_encoder,
            decoder=decoder,
            get_alibi_bias=alibi_bias_fn,
        )

    def convert_padding_mask(self, x, padding_mask):
        """Downsample ``padding_mask`` along time so it matches ``x``.

        A downsampled step is padding only when *all* of its
        ``self.downsample`` source steps were padding. Pads the mask with
        True (padding) on the right when the length is not divisible, and
        truncates if the result is longer than ``x``.
        ``self.downsample`` is presumably provided by the base class —
        TODO confirm.
        """
        if padding_mask is None or padding_mask.size(1) == x.size(1):
            return padding_mask

        diff = self.downsample - padding_mask.size(1) % self.downsample
        if 0 < diff < self.downsample:
            padding_mask = F.pad(padding_mask, (0, diff), value=True)

        padding_mask = padding_mask.view(padding_mask.size(0), -1, self.downsample)
        padding_mask = padding_mask.all(-1)
        if padding_mask.size(1) > x.size(1):
            padding_mask = padding_mask[:, : x.size(1)]

        assert x.size(1) == padding_mask.size(
            1
        ), f"{x.size(1), padding_mask.size(1), diff, self.downsample}"

        return padding_mask
1221
+ #################################################
1222
+
1223
+
1224
class Data2Vec2MultiPreTrainedModel(PreTrainedModel):
    """Base class hooking Hugging Face weight initialization up to the
    fairseq ``init_bert_params``-style scheme used by data2vec 2.0."""

    # use init_bert_params from fairseq
    # copied from fairseq.modules.transformer_sentence_encoder.py
    def _init_weights(self, module):
        """Initialize the weights.

        The ``isinstance`` checks below are deliberately non-exclusive
        (plain ``if``, not ``elif``): e.g. an AltBlock gets its attention
        projection re-initialized in addition to the generic Linear init.
        """

        def normal_(data):
            # with FSDP, module params will be on CUDA, so we cast them back to CPU
            # so that the RNG is consistent with and without FSDP
            data.copy_(data.cpu().normal_(mean=0.0, std=0.02).to(data.device))

        def _init(module):
            if isinstance(module, nn.Linear):
                normal_(module.weight.data)
                if module.bias is not None:
                    module.bias.data.zero_()
            if isinstance(module, nn.Embedding):
                normal_(module.weight.data)
                if module.padding_idx is not None:
                    module.weight.data[module.padding_idx].zero_()
            if isinstance(module, AltBlock):
                normal_(module.attn.proj.weight.data)
            # init strategy for audio encoder
            if isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
                if module.bias is not None:
                    module.bias.data.zero_()
                if module.weight is not None:
                    module.weight.data.fill_(1.0)
            if isinstance(module, nn.Conv1d):
                # Kaiming init for conv weights; uniform bias bounded by the
                # usual fan-in-derived limit.
                nn.init.kaiming_normal_(module.weight)
                if module.bias is not None:
                    k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
                    nn.init.uniform_(module.bias, a=-k, b=k)

        # ModuleLists are flattened one level so each child is initialized.
        if isinstance(module, nn.ModuleList):
            for _, mod in enumerate(module):
                _init(mod)
        else:
            _init(module)
1278
+
1279
+
1280
class Data2Vec2MultiModel(Data2Vec2MultiPreTrainedModel):
    """Hugging Face wrapper around the data2vec 2.0 multi-modal encoder.

    Holds one modality-specific feature extractor per supported modality
    plus a shared stack of transformer blocks (``AltBlock``).
    """

    config_class = Data2Vec2MultiConfig
    base_model_prefix = "data2vec2"

    def __init__(self, config: Data2Vec2MultiConfig):
        super().__init__(config)
        self.config = config
        modalities_cfg = config.modalities
        # Only the single configured modality is instantiated here.
        self.modalities = [config.supported_modality]

        make_layer_norm = partial(
            nn.LayerNorm, eps=config.norm_eps, elementwise_affine=config.norm_affine
        )

        def make_block(drop_path, dim=None, heads=None):
            # Factory for one shared transformer block; dim/heads default to
            # the model-wide configuration.
            return AltBlock(
                config.embed_dim if dim is None else dim,
                config.num_heads if heads is None else heads,
                config.mlp_ratio,
                qkv_bias=True,
                drop=config.encoder_dropout,
                attn_drop=config.attention_dropout,
                mlp_drop=config.activation_dropout,
                post_mlp_drop=config.post_mlp_drop,
                drop_path=drop_path,
                norm_layer=make_layer_norm,
                layer_norm_first=config.layer_norm_first,
                ffn_targets=not config.end_of_block_targets,
            )

        # Alibi bias cache shared across modality encoders.
        self.alibi_biases = {}
        self.modality_encoders = nn.ModuleDict()
        for mod in self.modalities:
            mod_cfg = getattr(modalities_cfg, mod.lower())
            enc = self.make_modality_encoder(
                mod_cfg,
                config.embed_dim,
                make_block,
                make_layer_norm,
                config.layer_norm_first,
                self.alibi_biases,
            )
            self.modality_encoders[mod] = enc

        self.dropout_input = nn.Dropout(config.dropout_input)

        # Linearly spaced drop-path rates across the shared block stack.
        dpr = np.linspace(config.start_drop_path_rate, config.end_drop_path_rate, config.depth)

        self.blocks = nn.ModuleList([make_block(dpr[i]) for i in range(config.depth)])

        # Final norm only in pre-norm ("layer_norm_first") configurations.
        self.norm = None
        if config.layer_norm_first:
            self.norm = make_layer_norm(config.embed_dim)

        self.num_updates = 0

        # Initialize weights and apply final processing
        self.post_init()

    def freeze_feature_extractor(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
        not be updated during training.
        """
        warnings.warn(
            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
            "Please use the equivalent `freeze_feature_encoder` method instead.",
            FutureWarning,
        )
        self.freeze_feature_encoder()

    def freeze_feature_encoder(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        """
        for mod in self.modalities:
            self.modality_encoders[mod]._freeze_parameters()

    def make_modality_encoder(
        self,
        cfg: D2v2ModalityConfig,
        embed_dim: int,
        make_block: Callable[[float], nn.ModuleList],
        norm_layer: Callable[[int], nn.LayerNorm],
        layer_norm_first: bool,
        alibi_biases,
    ) -> ModalitySpecificEncoder:
        """Instantiate the encoder class matching ``cfg.type``.

        Raises:
            Exception: if ``cfg.type`` is neither "AUDIO" nor "TEXT".
        """
        if cfg.type == "AUDIO":
            enc_cls = AudioEncoder
        elif cfg.type == "TEXT":
            enc_cls = TextEncoder
        else:
            raise Exception(f"unsupported modality {cfg.type}")

        return enc_cls(
            cfg,
            embed_dim,
            make_block,
            norm_layer,
            layer_norm_first,
            alibi_biases,
        )

    def forward(
        self,
        input_values=None,  # audio input
        input_ids=None,  # text input
        attention_mask=None,
        padding_mask=None,
        mask=False,
        mode=None,
        output_hidden_states=True,
        return_dict=True,
        ):
        """Run the modality-specific extractor followed by the shared blocks.

        Args:
            input_values: audio waveform input (used when ``input_ids`` is None).
            input_ids: text token ids; if given, text mode is inferred.
            attention_mask: accepted for HF API compatibility; not used here.
            padding_mask: boolean padding mask passed to the extractor.
            mask: whether the extractor should apply feature masking.
            mode: "TEXT" or "AUDIO"; inferred from the inputs when None.
            output_hidden_states: include per-layer outputs in the result.
            return_dict: return a ``Wav2Vec2BaseModelOutput`` instead of a tuple.
        """
        if mode is None:
            mode = "TEXT" if input_ids is not None else "AUDIO"
        feature_extractor = self.modality_encoders[mode]
        # Inference-style extraction: no batch cloning, no mask seeding.
        extractor_out = feature_extractor(
            input_ids if input_ids is not None else input_values,
            padding_mask,
            mask,
            remove_masked=False,
            clone_batch=1,
            mask_seeds=None,
            precomputed_mask=None,
        )
        x = extractor_out["x"]
        extract_features = x

        # encoder_mask = extractor_out["encoder_mask"]
        masked_padding_mask = extractor_out["padding_mask"]
        masked_alibi_bias = extractor_out.get("alibi_bias", None)
        alibi_scale = extractor_out.get("alibi_scale", None)

        if self.dropout_input is not None:
            x = self.dropout_input(x)

        layer_results = []
        for i, blk in enumerate(self.blocks):
            # LayerDrop: each block may be skipped stochastically at train time.
            if (
                not self.training
                or self.config.layerdrop == 0
                or (np.random.random() > self.config.layerdrop)
            ):
                ab = masked_alibi_bias
                if ab is not None and alibi_scale is not None:
                    # Per-layer scale when one scale per block is provided,
                    # otherwise a single shared scale.
                    scale = (
                        alibi_scale[i]
                        if alibi_scale.size(0) > 1
                        else alibi_scale.squeeze(0)
                    )
                    ab = ab * scale.type_as(ab)

                x, lr = blk(
                    x,
                    padding_mask=masked_padding_mask,
                    alibi_bias=ab,
                )
                layer_results.append(lr)

        if self.norm is not None:
            x = self.norm(x)

        # Strip modality-specific extra tokens (e.g. CLS-style prefixes)
        # from both the features and the padding mask.
        x = x[:, feature_extractor.modality_cfg.num_extra_tokens :]
        if masked_padding_mask is not None:
            masked_padding_mask = masked_padding_mask[
                :, feature_extractor.modality_cfg.num_extra_tokens :
            ]

        if not return_dict:
            return tuple(
                v
                for v in [
                    x,
                    extract_features,
                    layer_results,
                ]
                if v is not None
            )

        return Wav2Vec2BaseModelOutput(
            last_hidden_state=x,
            extract_features=extract_features,
            hidden_states=layer_results if output_hidden_states else None,
            attentions=None,  # switch to manual implementation with fast=False in forward pass of AltAttention as pytorch's dspa does not output attention weights
        )
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "cls_token": {
10
+ "content": "<s>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "eos_token": {
17
+ "content": "</s>",
18
+ "lstrip": false,
19
+ "normalized": true,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "mask_token": {
24
+ "content": "<mask>",
25
+ "lstrip": false,
26
+ "normalized": true,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "pad_token": {
31
+ "content": "<pad>",
32
+ "lstrip": false,
33
+ "normalized": true,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ },
37
+ "sep_token": {
38
+ "content": "</s>",
39
+ "lstrip": false,
40
+ "normalized": true,
41
+ "rstrip": false,
42
+ "single_word": false
43
+ },
44
+ "unk_token": {
45
+ "content": "<unk>",
46
+ "lstrip": false,
47
+ "normalized": true,
48
+ "rstrip": false,
49
+ "single_word": false
50
+ }
51
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,855 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<s>",
5
+ "lstrip": false,
6
+ "normalized": true,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<pad>",
13
+ "lstrip": false,
14
+ "normalized": true,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": true,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": true,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "<mask>",
37
+ "lstrip": false,
38
+ "normalized": true,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "5": {
44
+ "content": "<extra_id_0>",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ },
51
+ "6": {
52
+ "content": "<extra_id_1>",
53
+ "lstrip": false,
54
+ "normalized": false,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": true
58
+ },
59
+ "7": {
60
+ "content": "<extra_id_2>",
61
+ "lstrip": false,
62
+ "normalized": false,
63
+ "rstrip": false,
64
+ "single_word": false,
65
+ "special": true
66
+ },
67
+ "8": {
68
+ "content": "<extra_id_3>",
69
+ "lstrip": false,
70
+ "normalized": false,
71
+ "rstrip": false,
72
+ "single_word": false,
73
+ "special": true
74
+ },
75
+ "9": {
76
+ "content": "<extra_id_4>",
77
+ "lstrip": false,
78
+ "normalized": false,
79
+ "rstrip": false,
80
+ "single_word": false,
81
+ "special": true
82
+ },
83
+ "10": {
84
+ "content": "<extra_id_5>",
85
+ "lstrip": false,
86
+ "normalized": false,
87
+ "rstrip": false,
88
+ "single_word": false,
89
+ "special": true
90
+ },
91
+ "11": {
92
+ "content": "<extra_id_6>",
93
+ "lstrip": false,
94
+ "normalized": false,
95
+ "rstrip": false,
96
+ "single_word": false,
97
+ "special": true
98
+ },
99
+ "12": {
100
+ "content": "<extra_id_7>",
101
+ "lstrip": false,
102
+ "normalized": false,
103
+ "rstrip": false,
104
+ "single_word": false,
105
+ "special": true
106
+ },
107
+ "13": {
108
+ "content": "<extra_id_8>",
109
+ "lstrip": false,
110
+ "normalized": false,
111
+ "rstrip": false,
112
+ "single_word": false,
113
+ "special": true
114
+ },
115
+ "14": {
116
+ "content": "<extra_id_9>",
117
+ "lstrip": false,
118
+ "normalized": false,
119
+ "rstrip": false,
120
+ "single_word": false,
121
+ "special": true
122
+ },
123
+ "15": {
124
+ "content": "<extra_id_10>",
125
+ "lstrip": false,
126
+ "normalized": false,
127
+ "rstrip": false,
128
+ "single_word": false,
129
+ "special": true
130
+ },
131
+ "16": {
132
+ "content": "<extra_id_11>",
133
+ "lstrip": false,
134
+ "normalized": false,
135
+ "rstrip": false,
136
+ "single_word": false,
137
+ "special": true
138
+ },
139
+ "17": {
140
+ "content": "<extra_id_12>",
141
+ "lstrip": false,
142
+ "normalized": false,
143
+ "rstrip": false,
144
+ "single_word": false,
145
+ "special": true
146
+ },
147
+ "18": {
148
+ "content": "<extra_id_13>",
149
+ "lstrip": false,
150
+ "normalized": false,
151
+ "rstrip": false,
152
+ "single_word": false,
153
+ "special": true
154
+ },
155
+ "19": {
156
+ "content": "<extra_id_14>",
157
+ "lstrip": false,
158
+ "normalized": false,
159
+ "rstrip": false,
160
+ "single_word": false,
161
+ "special": true
162
+ },
163
+ "20": {
164
+ "content": "<extra_id_15>",
165
+ "lstrip": false,
166
+ "normalized": false,
167
+ "rstrip": false,
168
+ "single_word": false,
169
+ "special": true
170
+ },
171
+ "21": {
172
+ "content": "<extra_id_16>",
173
+ "lstrip": false,
174
+ "normalized": false,
175
+ "rstrip": false,
176
+ "single_word": false,
177
+ "special": true
178
+ },
179
+ "22": {
180
+ "content": "<extra_id_17>",
181
+ "lstrip": false,
182
+ "normalized": false,
183
+ "rstrip": false,
184
+ "single_word": false,
185
+ "special": true
186
+ },
187
+ "23": {
188
+ "content": "<extra_id_18>",
189
+ "lstrip": false,
190
+ "normalized": false,
191
+ "rstrip": false,
192
+ "single_word": false,
193
+ "special": true
194
+ },
195
+ "24": {
196
+ "content": "<extra_id_19>",
197
+ "lstrip": false,
198
+ "normalized": false,
199
+ "rstrip": false,
200
+ "single_word": false,
201
+ "special": true
202
+ },
203
+ "25": {
204
+ "content": "<extra_id_20>",
205
+ "lstrip": false,
206
+ "normalized": false,
207
+ "rstrip": false,
208
+ "single_word": false,
209
+ "special": true
210
+ },
211
+ "26": {
212
+ "content": "<extra_id_21>",
213
+ "lstrip": false,
214
+ "normalized": false,
215
+ "rstrip": false,
216
+ "single_word": false,
217
+ "special": true
218
+ },
219
+ "27": {
220
+ "content": "<extra_id_22>",
221
+ "lstrip": false,
222
+ "normalized": false,
223
+ "rstrip": false,
224
+ "single_word": false,
225
+ "special": true
226
+ },
227
+ "28": {
228
+ "content": "<extra_id_23>",
229
+ "lstrip": false,
230
+ "normalized": false,
231
+ "rstrip": false,
232
+ "single_word": false,
233
+ "special": true
234
+ },
235
+ "29": {
236
+ "content": "<extra_id_24>",
237
+ "lstrip": false,
238
+ "normalized": false,
239
+ "rstrip": false,
240
+ "single_word": false,
241
+ "special": true
242
+ },
243
+ "30": {
244
+ "content": "<extra_id_25>",
245
+ "lstrip": false,
246
+ "normalized": false,
247
+ "rstrip": false,
248
+ "single_word": false,
249
+ "special": true
250
+ },
251
+ "31": {
252
+ "content": "<extra_id_26>",
253
+ "lstrip": false,
254
+ "normalized": false,
255
+ "rstrip": false,
256
+ "single_word": false,
257
+ "special": true
258
+ },
259
+ "32": {
260
+ "content": "<extra_id_27>",
261
+ "lstrip": false,
262
+ "normalized": false,
263
+ "rstrip": false,
264
+ "single_word": false,
265
+ "special": true
266
+ },
267
+ "33": {
268
+ "content": "<extra_id_28>",
269
+ "lstrip": false,
270
+ "normalized": false,
271
+ "rstrip": false,
272
+ "single_word": false,
273
+ "special": true
274
+ },
275
+ "34": {
276
+ "content": "<extra_id_29>",
277
+ "lstrip": false,
278
+ "normalized": false,
279
+ "rstrip": false,
280
+ "single_word": false,
281
+ "special": true
282
+ },
283
+ "35": {
284
+ "content": "<extra_id_30>",
285
+ "lstrip": false,
286
+ "normalized": false,
287
+ "rstrip": false,
288
+ "single_word": false,
289
+ "special": true
290
+ },
291
+ "36": {
292
+ "content": "<extra_id_31>",
293
+ "lstrip": false,
294
+ "normalized": false,
295
+ "rstrip": false,
296
+ "single_word": false,
297
+ "special": true
298
+ },
299
+ "37": {
300
+ "content": "<extra_id_32>",
301
+ "lstrip": false,
302
+ "normalized": false,
303
+ "rstrip": false,
304
+ "single_word": false,
305
+ "special": true
306
+ },
307
+ "38": {
308
+ "content": "<extra_id_33>",
309
+ "lstrip": false,
310
+ "normalized": false,
311
+ "rstrip": false,
312
+ "single_word": false,
313
+ "special": true
314
+ },
315
+ "39": {
316
+ "content": "<extra_id_34>",
317
+ "lstrip": false,
318
+ "normalized": false,
319
+ "rstrip": false,
320
+ "single_word": false,
321
+ "special": true
322
+ },
323
+ "40": {
324
+ "content": "<extra_id_35>",
325
+ "lstrip": false,
326
+ "normalized": false,
327
+ "rstrip": false,
328
+ "single_word": false,
329
+ "special": true
330
+ },
331
+ "41": {
332
+ "content": "<extra_id_36>",
333
+ "lstrip": false,
334
+ "normalized": false,
335
+ "rstrip": false,
336
+ "single_word": false,
337
+ "special": true
338
+ },
339
+ "42": {
340
+ "content": "<extra_id_37>",
341
+ "lstrip": false,
342
+ "normalized": false,
343
+ "rstrip": false,
344
+ "single_word": false,
345
+ "special": true
346
+ },
347
+ "43": {
348
+ "content": "<extra_id_38>",
349
+ "lstrip": false,
350
+ "normalized": false,
351
+ "rstrip": false,
352
+ "single_word": false,
353
+ "special": true
354
+ },
355
+ "44": {
356
+ "content": "<extra_id_39>",
357
+ "lstrip": false,
358
+ "normalized": false,
359
+ "rstrip": false,
360
+ "single_word": false,
361
+ "special": true
362
+ },
363
+ "45": {
364
+ "content": "<extra_id_40>",
365
+ "lstrip": false,
366
+ "normalized": false,
367
+ "rstrip": false,
368
+ "single_word": false,
369
+ "special": true
370
+ },
371
+ "46": {
372
+ "content": "<extra_id_41>",
373
+ "lstrip": false,
374
+ "normalized": false,
375
+ "rstrip": false,
376
+ "single_word": false,
377
+ "special": true
378
+ },
379
+ "47": {
380
+ "content": "<extra_id_42>",
381
+ "lstrip": false,
382
+ "normalized": false,
383
+ "rstrip": false,
384
+ "single_word": false,
385
+ "special": true
386
+ },
387
+ "48": {
388
+ "content": "<extra_id_43>",
389
+ "lstrip": false,
390
+ "normalized": false,
391
+ "rstrip": false,
392
+ "single_word": false,
393
+ "special": true
394
+ },
395
+ "49": {
396
+ "content": "<extra_id_44>",
397
+ "lstrip": false,
398
+ "normalized": false,
399
+ "rstrip": false,
400
+ "single_word": false,
401
+ "special": true
402
+ },
403
+ "50": {
404
+ "content": "<extra_id_45>",
405
+ "lstrip": false,
406
+ "normalized": false,
407
+ "rstrip": false,
408
+ "single_word": false,
409
+ "special": true
410
+ },
411
+ "51": {
412
+ "content": "<extra_id_46>",
413
+ "lstrip": false,
414
+ "normalized": false,
415
+ "rstrip": false,
416
+ "single_word": false,
417
+ "special": true
418
+ },
419
+ "52": {
420
+ "content": "<extra_id_47>",
421
+ "lstrip": false,
422
+ "normalized": false,
423
+ "rstrip": false,
424
+ "single_word": false,
425
+ "special": true
426
+ },
427
+ "53": {
428
+ "content": "<extra_id_48>",
429
+ "lstrip": false,
430
+ "normalized": false,
431
+ "rstrip": false,
432
+ "single_word": false,
433
+ "special": true
434
+ },
435
+ "54": {
436
+ "content": "<extra_id_49>",
437
+ "lstrip": false,
438
+ "normalized": false,
439
+ "rstrip": false,
440
+ "single_word": false,
441
+ "special": true
442
+ },
443
+ "55": {
444
+ "content": "<extra_id_50>",
445
+ "lstrip": false,
446
+ "normalized": false,
447
+ "rstrip": false,
448
+ "single_word": false,
449
+ "special": true
450
+ },
451
+ "56": {
452
+ "content": "<extra_id_51>",
453
+ "lstrip": false,
454
+ "normalized": false,
455
+ "rstrip": false,
456
+ "single_word": false,
457
+ "special": true
458
+ },
459
+ "57": {
460
+ "content": "<extra_id_52>",
461
+ "lstrip": false,
462
+ "normalized": false,
463
+ "rstrip": false,
464
+ "single_word": false,
465
+ "special": true
466
+ },
467
+ "58": {
468
+ "content": "<extra_id_53>",
469
+ "lstrip": false,
470
+ "normalized": false,
471
+ "rstrip": false,
472
+ "single_word": false,
473
+ "special": true
474
+ },
475
+ "59": {
476
+ "content": "<extra_id_54>",
477
+ "lstrip": false,
478
+ "normalized": false,
479
+ "rstrip": false,
480
+ "single_word": false,
481
+ "special": true
482
+ },
483
+ "60": {
484
+ "content": "<extra_id_55>",
485
+ "lstrip": false,
486
+ "normalized": false,
487
+ "rstrip": false,
488
+ "single_word": false,
489
+ "special": true
490
+ },
491
+ "61": {
492
+ "content": "<extra_id_56>",
493
+ "lstrip": false,
494
+ "normalized": false,
495
+ "rstrip": false,
496
+ "single_word": false,
497
+ "special": true
498
+ },
499
+ "62": {
500
+ "content": "<extra_id_57>",
501
+ "lstrip": false,
502
+ "normalized": false,
503
+ "rstrip": false,
504
+ "single_word": false,
505
+ "special": true
506
+ },
507
+ "63": {
508
+ "content": "<extra_id_58>",
509
+ "lstrip": false,
510
+ "normalized": false,
511
+ "rstrip": false,
512
+ "single_word": false,
513
+ "special": true
514
+ },
515
+ "64": {
516
+ "content": "<extra_id_59>",
517
+ "lstrip": false,
518
+ "normalized": false,
519
+ "rstrip": false,
520
+ "single_word": false,
521
+ "special": true
522
+ },
523
+ "65": {
524
+ "content": "<extra_id_60>",
525
+ "lstrip": false,
526
+ "normalized": false,
527
+ "rstrip": false,
528
+ "single_word": false,
529
+ "special": true
530
+ },
531
+ "66": {
532
+ "content": "<extra_id_61>",
533
+ "lstrip": false,
534
+ "normalized": false,
535
+ "rstrip": false,
536
+ "single_word": false,
537
+ "special": true
538
+ },
539
+ "67": {
540
+ "content": "<extra_id_62>",
541
+ "lstrip": false,
542
+ "normalized": false,
543
+ "rstrip": false,
544
+ "single_word": false,
545
+ "special": true
546
+ },
547
+ "68": {
548
+ "content": "<extra_id_63>",
549
+ "lstrip": false,
550
+ "normalized": false,
551
+ "rstrip": false,
552
+ "single_word": false,
553
+ "special": true
554
+ },
555
+ "69": {
556
+ "content": "<extra_id_64>",
557
+ "lstrip": false,
558
+ "normalized": false,
559
+ "rstrip": false,
560
+ "single_word": false,
561
+ "special": true
562
+ },
563
+ "70": {
564
+ "content": "<extra_id_65>",
565
+ "lstrip": false,
566
+ "normalized": false,
567
+ "rstrip": false,
568
+ "single_word": false,
569
+ "special": true
570
+ },
571
+ "71": {
572
+ "content": "<extra_id_66>",
573
+ "lstrip": false,
574
+ "normalized": false,
575
+ "rstrip": false,
576
+ "single_word": false,
577
+ "special": true
578
+ },
579
+ "72": {
580
+ "content": "<extra_id_67>",
581
+ "lstrip": false,
582
+ "normalized": false,
583
+ "rstrip": false,
584
+ "single_word": false,
585
+ "special": true
586
+ },
587
+ "73": {
588
+ "content": "<extra_id_68>",
589
+ "lstrip": false,
590
+ "normalized": false,
591
+ "rstrip": false,
592
+ "single_word": false,
593
+ "special": true
594
+ },
595
+ "74": {
596
+ "content": "<extra_id_69>",
597
+ "lstrip": false,
598
+ "normalized": false,
599
+ "rstrip": false,
600
+ "single_word": false,
601
+ "special": true
602
+ },
603
+ "75": {
604
+ "content": "<extra_id_70>",
605
+ "lstrip": false,
606
+ "normalized": false,
607
+ "rstrip": false,
608
+ "single_word": false,
609
+ "special": true
610
+ },
611
+ "76": {
612
+ "content": "<extra_id_71>",
613
+ "lstrip": false,
614
+ "normalized": false,
615
+ "rstrip": false,
616
+ "single_word": false,
617
+ "special": true
618
+ },
619
+ "77": {
620
+ "content": "<extra_id_72>",
621
+ "lstrip": false,
622
+ "normalized": false,
623
+ "rstrip": false,
624
+ "single_word": false,
625
+ "special": true
626
+ },
627
+ "78": {
628
+ "content": "<extra_id_73>",
629
+ "lstrip": false,
630
+ "normalized": false,
631
+ "rstrip": false,
632
+ "single_word": false,
633
+ "special": true
634
+ },
635
+ "79": {
636
+ "content": "<extra_id_74>",
637
+ "lstrip": false,
638
+ "normalized": false,
639
+ "rstrip": false,
640
+ "single_word": false,
641
+ "special": true
642
+ },
643
+ "80": {
644
+ "content": "<extra_id_75>",
645
+ "lstrip": false,
646
+ "normalized": false,
647
+ "rstrip": false,
648
+ "single_word": false,
649
+ "special": true
650
+ },
651
+ "81": {
652
+ "content": "<extra_id_76>",
653
+ "lstrip": false,
654
+ "normalized": false,
655
+ "rstrip": false,
656
+ "single_word": false,
657
+ "special": true
658
+ },
659
+ "82": {
660
+ "content": "<extra_id_77>",
661
+ "lstrip": false,
662
+ "normalized": false,
663
+ "rstrip": false,
664
+ "single_word": false,
665
+ "special": true
666
+ },
667
+ "83": {
668
+ "content": "<extra_id_78>",
669
+ "lstrip": false,
670
+ "normalized": false,
671
+ "rstrip": false,
672
+ "single_word": false,
673
+ "special": true
674
+ },
675
+ "84": {
676
+ "content": "<extra_id_79>",
677
+ "lstrip": false,
678
+ "normalized": false,
679
+ "rstrip": false,
680
+ "single_word": false,
681
+ "special": true
682
+ },
683
+ "85": {
684
+ "content": "<extra_id_80>",
685
+ "lstrip": false,
686
+ "normalized": false,
687
+ "rstrip": false,
688
+ "single_word": false,
689
+ "special": true
690
+ },
691
+ "86": {
692
+ "content": "<extra_id_81>",
693
+ "lstrip": false,
694
+ "normalized": false,
695
+ "rstrip": false,
696
+ "single_word": false,
697
+ "special": true
698
+ },
699
+ "87": {
700
+ "content": "<extra_id_82>",
701
+ "lstrip": false,
702
+ "normalized": false,
703
+ "rstrip": false,
704
+ "single_word": false,
705
+ "special": true
706
+ },
707
+ "88": {
708
+ "content": "<extra_id_83>",
709
+ "lstrip": false,
710
+ "normalized": false,
711
+ "rstrip": false,
712
+ "single_word": false,
713
+ "special": true
714
+ },
715
+ "89": {
716
+ "content": "<extra_id_84>",
717
+ "lstrip": false,
718
+ "normalized": false,
719
+ "rstrip": false,
720
+ "single_word": false,
721
+ "special": true
722
+ },
723
+ "90": {
724
+ "content": "<extra_id_85>",
725
+ "lstrip": false,
726
+ "normalized": false,
727
+ "rstrip": false,
728
+ "single_word": false,
729
+ "special": true
730
+ },
731
+ "91": {
732
+ "content": "<extra_id_86>",
733
+ "lstrip": false,
734
+ "normalized": false,
735
+ "rstrip": false,
736
+ "single_word": false,
737
+ "special": true
738
+ },
739
+ "92": {
740
+ "content": "<extra_id_87>",
741
+ "lstrip": false,
742
+ "normalized": false,
743
+ "rstrip": false,
744
+ "single_word": false,
745
+ "special": true
746
+ },
747
+ "93": {
748
+ "content": "<extra_id_88>",
749
+ "lstrip": false,
750
+ "normalized": false,
751
+ "rstrip": false,
752
+ "single_word": false,
753
+ "special": true
754
+ },
755
+ "94": {
756
+ "content": "<extra_id_89>",
757
+ "lstrip": false,
758
+ "normalized": false,
759
+ "rstrip": false,
760
+ "single_word": false,
761
+ "special": true
762
+ },
763
+ "95": {
764
+ "content": "<extra_id_90>",
765
+ "lstrip": false,
766
+ "normalized": false,
767
+ "rstrip": false,
768
+ "single_word": false,
769
+ "special": true
770
+ },
771
+ "96": {
772
+ "content": "<extra_id_91>",
773
+ "lstrip": false,
774
+ "normalized": false,
775
+ "rstrip": false,
776
+ "single_word": false,
777
+ "special": true
778
+ },
779
+ "97": {
780
+ "content": "<extra_id_92>",
781
+ "lstrip": false,
782
+ "normalized": false,
783
+ "rstrip": false,
784
+ "single_word": false,
785
+ "special": true
786
+ },
787
+ "98": {
788
+ "content": "<extra_id_93>",
789
+ "lstrip": false,
790
+ "normalized": false,
791
+ "rstrip": false,
792
+ "single_word": false,
793
+ "special": true
794
+ },
795
+ "99": {
796
+ "content": "<extra_id_94>",
797
+ "lstrip": false,
798
+ "normalized": false,
799
+ "rstrip": false,
800
+ "single_word": false,
801
+ "special": true
802
+ },
803
+ "100": {
804
+ "content": "<extra_id_95>",
805
+ "lstrip": false,
806
+ "normalized": false,
807
+ "rstrip": false,
808
+ "single_word": false,
809
+ "special": true
810
+ },
811
+ "101": {
812
+ "content": "<extra_id_96>",
813
+ "lstrip": false,
814
+ "normalized": false,
815
+ "rstrip": false,
816
+ "single_word": false,
817
+ "special": true
818
+ },
819
+ "102": {
820
+ "content": "<extra_id_97>",
821
+ "lstrip": false,
822
+ "normalized": false,
823
+ "rstrip": false,
824
+ "single_word": false,
825
+ "special": true
826
+ },
827
+ "103": {
828
+ "content": "<extra_id_98>",
829
+ "lstrip": false,
830
+ "normalized": false,
831
+ "rstrip": false,
832
+ "single_word": false,
833
+ "special": true
834
+ },
835
+ "104": {
836
+ "content": "<extra_id_99>",
837
+ "lstrip": false,
838
+ "normalized": false,
839
+ "rstrip": false,
840
+ "single_word": false,
841
+ "special": true
842
+ }
843
+ },
844
+ "bos_token": "<s>",
845
+ "clean_up_tokenization_spaces": false,
846
+ "cls_token": "<s>",
847
+ "eos_token": "</s>",
848
+ "extra_special_tokens": {},
849
+ "mask_token": "<mask>",
850
+ "model_max_length": 1000000000000000019884624838656,
851
+ "pad_token": "<pad>",
852
+ "sep_token": "</s>",
853
+ "tokenizer_class": "PreTrainedTokenizerFast",
854
+ "unk_token": "<unk>"
855
+ }
utils_data2vec2.py ADDED
@@ -0,0 +1,439 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ #
3
+ # Copyright (c) Facebook, Inc. and its affiliates.
4
+ #
5
+ # This source code is licensed under the MIT license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+ #
8
+
9
+ import math
10
+ import numpy as np
11
+ from collections import namedtuple
12
+ from typing import Optional, Tuple
13
+
14
+ import torch
15
+ import torch.nn.functional as F
16
+
17
+
18
+ MaskSeed = namedtuple("MaskSeed", ["seed", "update", "ids"])
19
+ MaskInfo = namedtuple("MaskInfo", ["x_unmasked", "mask", "ids_restore", "ids_keep"])
20
+
21
+
22
def gather_unmasked(x: torch.Tensor, mask_info: MaskInfo) -> torch.Tensor:
    """Select only the unmasked (kept) timesteps of ``x`` along dim 1.

    ``mask_info.ids_keep`` is already expanded over the feature dimension,
    so a single gather suffices.
    """
    kept_index = mask_info.ids_keep
    return torch.gather(x, 1, kept_index)
28
+
29
+
30
def gather_unmasked_mask(x: torch.Tensor, mask_info: MaskInfo) -> torch.Tensor:
    """Gather the kept positions of a per-timestep tensor with no feature dim.

    ``ids_keep`` carries an expanded feature dimension; slice it off before
    gathering so the index matches a 2-D ``x``.
    """
    time_index = mask_info.ids_keep[..., 0]
    return torch.gather(x, 1, time_index)
36
+
37
+
38
def masked_alibi(alibi_bias, mask_info):
    """Restrict a (B, H, T, T) alibi bias to the kept (unmasked) timesteps.

    The same kept indices are applied to both the query (row) and key
    (column) axes, yielding a (B, H, K, K) tensor where K is the number of
    kept positions.
    """
    num_heads = alibi_bias.size(1)

    # (B, 1, K, 1): kept timestep indices without the feature dimension.
    keep = mask_info.ids_keep.unsqueeze(1)[..., 0].unsqueeze(-1)

    # First restrict the query (row) axis ...
    rows = torch.gather(
        alibi_bias,
        dim=-2,
        index=keep.expand(-1, num_heads, -1, mask_info.ids_restore.size(1)),
    )
    # ... then the key (column) axis.
    restricted = torch.gather(
        rows,
        dim=-1,
        index=keep.transpose(-1, -2).expand(-1, num_heads, rows.size(-2), -1),
    )

    return restricted
56
+
57
+
58
def random_masking(x, mask_ratio, mask_seed: Optional[MaskSeed]):
    """MAE-style random masking: keep a random subset of timesteps of ``x``.

    Returns a ``MaskInfo`` holding the kept features, the binary mask
    (0 = kept, 1 = removed) in original order, and the shuffle/restore
    index tensors (both expanded over the feature dimension).
    """
    batch, seq_len, dim = x.shape
    num_keep = int(seq_len * (1 - mask_ratio))

    gen = None
    if mask_seed is not None:
        # Derive a reproducible seed from the update step and sample ids so
        # clones of the same batch mask identically.
        derived = int(
            hash((mask_seed.seed, mask_seed.update, mask_seed.ids.sum().item())) % 1e6
        )
        gen = torch.Generator(device=x.device)
        gen.manual_seed(derived)

    # One uniform score per position; the smallest scores are kept.
    scores = torch.rand(batch, seq_len, generator=gen, device=x.device)
    shuffle = scores.argsort(dim=1)  # ascending: head of the order is kept
    restore = shuffle.argsort(dim=1)

    keep = shuffle[:, :num_keep].unsqueeze(-1).expand(-1, -1, dim)
    x_unmasked = torch.gather(x, dim=1, index=keep)

    # Build the binary mask in shuffled order, then unshuffle it.
    mask = torch.ones([batch, seq_len], dtype=x.dtype, device=x.device)
    mask[:, :num_keep] = 0
    mask = torch.gather(mask, dim=1, index=restore)

    restore = restore.unsqueeze(-1).expand(-1, -1, dim)

    return MaskInfo(
        x_unmasked=x_unmasked, mask=mask, ids_restore=restore, ids_keep=keep
    )
92
+
93
+
94
def get_alibi(
    max_positions: int,
    attention_heads: int,
    dims: int = 1,
    distance: str = "manhattan",
):
    """Build an ALiBi (Attention with Linear Biases) position-bias tensor.

    Args:
        max_positions: number of positions T; the result is (heads, T, T).
            For ``dims == 2`` this must be a perfect square (a flattened grid).
        attention_heads: number of attention heads (one slope per head).
        dims: 1 for sequences, 2 for square 2-D grids.
        distance: grid distance for ``dims == 2``: "manhattan" or "euclidean".

    Returns:
        Tensor of shape (attention_heads, max_positions, max_positions) with
        0 on the diagonal and increasingly negative values with distance.

    Raises:
        ValueError: for an unsupported ``dims`` or ``distance`` value.
            (Previously an unknown ``distance`` crashed later with a
            NameError on the unbound ``df``.)
    """

    def get_slopes(n):
        # Geometric sequence of per-head slopes from the ALiBi paper.
        def get_slopes_power_of_2(n):
            start = 2 ** (-(2 ** -(math.log2(n) - 3)))
            ratio = start
            return [start * ratio**i for i in range(n)]

        # The closed form only has the desired properties when n is a power
        # of 2; otherwise interleave slopes from the two nearest powers of 2.
        if math.log2(n).is_integer():
            return get_slopes_power_of_2(n)
        closest_power_of_2 = 2 ** math.floor(math.log2(n))
        return (
            get_slopes_power_of_2(closest_power_of_2)
            + get_slopes(2 * closest_power_of_2)[0::2][: n - closest_power_of_2]
        )

    slopes = torch.tensor(get_slopes(attention_heads), dtype=torch.float)

    if dims == 1:
        # Non-autoregressive use: symmetric bias with 0 on the diagonal and
        # linearly decreasing values away from it.
        positions = torch.arange(max_positions)
        pos_bias = -torch.abs(positions.unsqueeze(0) - positions.unsqueeze(1))
    elif dims == 2:
        if distance == "manhattan":
            df = lambda x1, y1, x2, y2: abs(x1 - x2) + abs(y1 - y2)
        elif distance == "euclidean":
            df = lambda x1, y1, x2, y2: math.sqrt((x1 - x2) ** 2 + (y1 - y2) ** 2)
        else:
            raise ValueError(f"unsupported distance metric: {distance}")

        side = math.sqrt(max_positions)
        assert side.is_integer(), side
        side = int(side)

        pos_bias = torch.zeros((max_positions, max_positions))

        # Flattened grid: cell (i, j) maps to index i * side + j.
        for i in range(side):
            for j in range(side):
                for k in range(side):
                    for m in range(side):
                        pos_bias[i * side + j, k * side + m] = -df(i, j, k, m)
    else:
        raise ValueError(f"unsupported number of alibi dims: {dims}")

    alibi_bias = slopes.unsqueeze(1).unsqueeze(1) * pos_bias.unsqueeze(0).expand(
        attention_heads, -1, -1
    )

    return alibi_bias
161
+
162
+
163
def get_alibi_bias(
    alibi_biases,
    batch_size,
    time_steps,
    heads,
    dtype,
    device,
    dims=1,
    distance="manhattan",
):
    """Fetch a cached ALiBi bias, growing the cache lazily as needed.

    ``alibi_biases`` is a dict used as a cache keyed by (dims, heads,
    distance). The cached buffer is regenerated whenever it is missing, too
    small for this request, or on the wrong dtype/device; it only ever
    grows. Returns a (batch_size, heads, time_steps, time_steps) view.
    """
    key = f"{dims}_{heads}_{distance}"
    cached = alibi_biases.get(key, None)

    needed_rows = heads * batch_size
    stale = (
        cached is None
        or cached.size(0) < needed_rows
        or cached.size(1) < time_steps
        or cached.dtype != dtype
        or cached.device != device
    )

    if stale:
        # Grow to the max of the requested and previously cached extents.
        prev_rows = 0 if cached is None else cached.size(0)
        prev_t = 0 if cached is None else cached.size(1)
        new_t = max(time_steps, prev_t)
        new_batch = max(needed_rows, prev_rows) // heads

        cached = (
            get_alibi(new_t, heads, dims=dims, distance=distance)
            .to(dtype=dtype, device=device)
            .repeat(new_batch, 1, 1)
        )
        alibi_biases[key] = cached

    view = cached[:needed_rows, :time_steps, :time_steps]
    return view.view(batch_size, heads, time_steps, time_steps)
199
+
200
+
201
def is_xla_tensor(tensor):
    """Return True iff ``tensor`` is a torch tensor on an XLA device."""
    if not torch.is_tensor(tensor):
        return False
    return tensor.device.type == "xla"
203
+
204
+
205
def index_put(tensor, indices, value):
    """Write ``value`` into ``tensor`` at positions selected by ``indices``.

    On XLA devices boolean advanced-index assignment is avoided: the mask is
    broadcast up to the tensor's shape and the update is expressed as an
    arithmetic blend. Elsewhere a plain in-place indexed assignment is used.
    """
    on_xla = torch.is_tensor(tensor) and tensor.device.type == "xla"
    if on_xla:
        # Broadcast the boolean mask up to tensor's rank and width.
        while indices.dim() < tensor.dim():
            indices = indices.unsqueeze(-1)
        if indices.size(-1) < tensor.size(-1):
            indices = indices.expand_as(tensor)
        tensor = torch.mul(tensor, ~indices) + torch.mul(value, indices)
    else:
        tensor[indices] = value
    return tensor
215
+
216
+
217
def compute_mask_indices(
    shape: Tuple[int, int],
    padding_mask: Optional[torch.Tensor],
    mask_prob: float,
    mask_length: int,
    mask_type: str = "static",
    mask_other: float = 0.0,
    min_masks: int = 0,
    no_overlap: bool = False,
    min_space: int = 0,
    require_same_masks: bool = True,
    mask_dropout: float = 0.0,
    add_masks: bool = False,
    seed: Optional[int] = None,
    epoch: Optional[int] = None,
    indices: Optional[torch.Tensor] = None,
    idc_select_ver: int = 1,  # 2 to reproduce mask_tokens_dataset
    num_mask_ver: int = 2,  # 2 to reproduce mask_tokens_dataset
) -> np.ndarray:
    """
    Computes random mask spans for a given shape

    Args:
        shape: the shape for which to compute masks.
            should be of size 2 where first element is batch size and 2nd is timesteps
        padding_mask: optional padding mask of the same size as shape, which will prevent masking padded elements
        mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by
            number of timesteps divided by length of mask span to mask approximately this percentage of all elements.
            however due to overlaps, the actual number will be smaller (unless no_overlap is True)
        mask_type: how to compute mask lengths
            static = fixed size
            uniform = sample from uniform distribution [mask_other, mask_length*2]
            normal = sample from normal distribution with mean mask_length and stdev mask_other. mask is min 1 element
            poisson = sample from possion distribution with lambda = mask length
        min_masks: minimum number of masked spans
        no_overlap: if false, will switch to an alternative recursive algorithm that prevents spans from overlapping
        min_space: only used if no_overlap is True, this is how many elements to keep unmasked between spans
        require_same_masks: if true, will randomly drop out masks until same amount of masks remains in each sample
        mask_dropout: randomly dropout this percentage of masks in each example

    Returns:
        boolean ndarray of shape ``shape``; True marks a masked position.
    """

    bsz, all_sz = shape
    mask = np.full((bsz, all_sz), False)

    if num_mask_ver == 1:
        all_num_mask = int(
            # add a random number for probabilistic rounding
            mask_prob * all_sz / float(mask_length)
            + np.random.rand()
        )
        all_num_mask = max(min_masks, all_num_mask)

    mask_idcs = []
    for i in range(bsz):
        if seed is not None and epoch is not None and indices is not None:
            # ints hash to themselves, so this per-sample seed is stable
            # across runs despite PYTHONHASHSEED randomization
            seed_i = int(hash((seed, epoch, indices[i].item())) % 1e6)
        else:
            seed_i = None

        rng = np.random.default_rng(seed_i)

        if padding_mask is not None:
            sz = all_sz - padding_mask[i].long().sum().item()
            assert sz >= 0, sz
        else:
            sz = all_sz

        if num_mask_ver == 1:
            if padding_mask is not None:
                num_mask = int(
                    # add a random number for probabilistic rounding
                    mask_prob * sz / float(mask_length)
                    + np.random.rand()
                )
                num_mask = max(min_masks, num_mask)
            else:
                num_mask = all_num_mask
        elif num_mask_ver == 2:
            num_mask = int(
                # add a random number for probabilistic rounding
                mask_prob * sz / float(mask_length)
                + rng.random()
            )
            num_mask = max(min_masks, num_mask)
        else:
            raise ValueError()

        if mask_type == "static":
            lengths = np.full(num_mask, mask_length)
        elif mask_type == "uniform":
            # fix: np.random.Generator has no `randint`; `integers` is the
            # equivalent (high is exclusive, hence the +1)
            lengths = rng.integers(int(mask_other), mask_length * 2 + 1, size=num_mask)
        elif mask_type == "normal":
            lengths = rng.normal(mask_length, mask_other, size=num_mask)
            lengths = [max(1, int(round(x))) for x in lengths]
        elif mask_type == "poisson":
            lengths = rng.poisson(mask_length, size=num_mask)
            lengths = [int(round(x)) for x in lengths]
        else:
            raise Exception("unknown mask selection " + mask_type)

        if sum(lengths) == 0:
            if mask_type == "static":
                raise ValueError(f"this should never happens")
            else:
                lengths = [min(mask_length, sz - 1)]

        if no_overlap:
            mask_idc = []

            def arrange(s, e, length, keep_length):
                # fix: Generator.integers replaces the removed `randint`
                span_start = rng.integers(s, e - length)
                mask_idc.extend(span_start + i for i in range(length))

                new_parts = []
                if span_start - s - min_space >= keep_length:
                    new_parts.append((s, span_start - min_space + 1))
                if e - span_start - length - min_space > keep_length:
                    new_parts.append((span_start + length + min_space, e))
                return new_parts

            parts = [(0, sz)]
            min_length = min(lengths)
            for length in sorted(lengths, reverse=True):
                lens = np.fromiter(
                    (e - s if e - s >= length + min_space else 0 for s, e in parts),
                    int,  # fix: `np.int` was removed in NumPy 1.24
                )
                l_sum = np.sum(lens)
                if l_sum == 0:
                    break
                probs = lens / np.sum(lens)
                c = rng.choice(len(parts), p=probs)
                s, e = parts.pop(c)
                parts.extend(arrange(s, e, length, min_length))
            mask_idc = np.asarray(mask_idc)
        else:
            if idc_select_ver == 1:
                min_len = min(lengths)
                if sz - min_len <= num_mask:
                    min_len = sz - num_mask - 1
                mask_idc = rng.choice(sz - min_len, num_mask, replace=False)
            elif idc_select_ver == 2:
                mask_idc = rng.choice(sz, num_mask, replace=False)
            else:
                raise ValueError()

            # expand each chosen start index into a full span
            mask_idc = np.asarray(
                [
                    mask_idc[j] + offset
                    for j in range(len(mask_idc))
                    for offset in range(lengths[j])
                ]
            )

        mask_idc = np.unique(mask_idc[mask_idc < sz])
        if len(mask_idc) >= sz:
            raise ValueError(
                (
                    # fix: message previously printed the literal text
                    # "mask_idc[mask_idc]" instead of the value
                    f"the entire sequence is masked. "
                    f"sz={sz}; mask_idc={mask_idc}; "
                    f"index={indices[i] if indices is not None else None}"
                )
            )
        mask_idcs.append(mask_idc)

    target_len = None
    if require_same_masks:
        if add_masks:
            target_len = max([len(m) for m in mask_idcs])
        else:
            target_len = min([len(m) for m in mask_idcs])

    # NOTE(review): `rng` below is the generator left over from the last
    # sample of the loop above; preserved as-is for parity with fairseq.
    for i, mask_idc in enumerate(mask_idcs):
        if target_len is not None and len(mask_idc) > target_len:
            mask_idc = rng.choice(mask_idc, target_len, replace=False)

        mask[i, mask_idc] = True

        if target_len is not None and len(mask_idc) < target_len:
            unmasked = np.flatnonzero(~mask[i])
            to_mask = rng.choice(unmasked, target_len - len(mask_idc), replace=False)
            mask[i, to_mask] = True

        if mask_dropout > 0:
            masked = np.flatnonzero(mask[i])
            num_holes = np.rint(len(masked) * mask_dropout).astype(int)
            to_drop = rng.choice(masked, num_holes, replace=False)
            mask[i, to_drop] = False

    return mask
407
+
408
+
409
def _learned_alibi_bias(
    alibi_bias,
    batch_size,
    time_steps,
    heads,
    scale,
    dtype,
    device,
):
    """Expand a learned (1, H, T0, T0) alibi bias to (B, H, T, T), scaled.

    If the stored bias is narrower than ``time_steps`` it is symmetrically
    replicate-padded first, then cropped to exactly ``time_steps``.
    """
    assert alibi_bias.size(1) == heads, alibi_bias.shape
    assert alibi_bias.dtype == dtype, alibi_bias.dtype
    assert alibi_bias.device == device, alibi_bias.device

    deficit = time_steps - alibi_bias.size(-1)
    if deficit > 0:
        # Pad both sides of the last two dims by half the deficit (rounded up).
        pad = math.ceil(deficit / 2)
        alibi_bias = F.pad(alibi_bias, (pad, pad, pad, pad), mode="replicate")

    scaled = alibi_bias.expand(batch_size, -1, -1, -1) * scale
    return scaled[..., :time_steps, :time_steps]
428
+
429
def make_positions(tensor, padding_idx: int, onnx_trace: bool = False):
    """Replace non-padding symbols with their position numbers.

    Position numbers begin at ``padding_idx + 1``; padding symbols keep the
    value ``padding_idx``. (``onnx_trace`` is unused in the body but kept
    for interface compatibility with callers.)
    """
    # The int cast + cumsum + type_as sequence keeps this exportable to ONNX
    # and friendly to XLA: XLA prefers ints, cumsum defaults to long output,
    # and ONNX cannot handle the dtype kwarg of cumsum.
    not_pad = tensor.ne(padding_idx).int()
    positions = torch.cumsum(not_pad, dim=1).type_as(not_pad) * not_pad
    return positions.long() + padding_idx