Upload folder using huggingface_hub

Browse files

Files changed (7) hide show

.DS_Store +0 -0
config.json +52 -5
configuration_pantagruel_uni.py +488 -0
modeling_pantagruel_uni.py +0 -0
preprocessor_config.json +9 -0
utils_pantagruel_uni.py +439 -0
vocab.json +82 -0

.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

config.json CHANGED Viewed

@@ -3,17 +3,25 @@
   "activation_dropout": 0.0,
   "add_cross_attention": false,
   "architectures": [
-    "Data2Vec2MultiModel"
   ],
   "attention_dropout": 0.1,
   "auto_map": {
-    "AutoConfig": "configuration_data2vec2.Data2Vec2MultiConfig",
-    "AutoModel": "modeling_data2vec2.Data2Vec2MultiModel"
   },
   "bad_words_ids": null,
   "begin_suppress_tokens": null,
   "bos_token_id": null,
   "chunk_size_feed_forward": 0,
   "clone_batch": 12,
   "cross_attention_hidden_size": null,
   "decoder_start_token_id": null,
@@ -30,6 +38,7 @@
   "end_of_block_targets": false,
   "eos_token_id": null,
   "exponential_decay_length_penalty": null,
   "finetuning_task": null,
   "forced_bos_token_id": null,
   "forced_eos_token_id": null,
@@ -57,6 +66,9 @@
     "architectures": null,
     "audio": {
       "_name_or_path": "",
       "add_cross_attention": false,
       "add_masks": false,
       "alibi_max_pos": null,
@@ -66,11 +78,14 @@
       "begin_suppress_tokens": null,
       "bos_token_id": null,
       "chunk_size_feed_forward": 0,
       "conv_pos_depth": 5,
       "conv_pos_groups": 16,
       "conv_pos_pre_ln": false,
       "conv_pos_width": 95,
       "cross_attention_hidden_size": null,
       "decoder_start_token_id": null,
       "diversity_penalty": 0.0,
       "do_sample": false,
@@ -108,22 +123,30 @@
       "mask_channel_length": 64,
       "mask_channel_prob": 0.0,
       "mask_dropout": 0.0,
       "mask_length": 5,
       "mask_noise_std": 0.01,
       "mask_prob": 0.55,
       "mask_prob_adjust": 0.1,
       "mask_prob_min": null,
       "max_length": 20,
       "min_length": 0,
       "model_depth": 16,
       "model_type": "",
       "no_repeat_ngram_size": 0,
       "num_alibi_heads": 16,
       "num_beam_groups": 1,
       "num_beams": 1,
       "num_extra_tokens": 0,
       "num_return_sequences": 1,
       "output_attentions": false,
       "output_hidden_states": false,
       "output_scores": false,
       "pad_token_id": null,
@@ -142,6 +165,27 @@
       "start_drop_path_rate": 0.0,
       "suppress_tokens": null,
       "task_specific_params": null,
       "temperature": 1.0,
       "tie_encoder_decoder": false,
       "tie_word_embeddings": true,
@@ -151,7 +195,10 @@
       "torchscript": false,
       "type": "AUDIO",
       "typical_p": 1.0,
-      "use_alibi_encoder": true
     },
     "bad_words_ids": null,
     "begin_suppress_tokens": null,
@@ -310,7 +357,7 @@
     "torchscript": false,
     "typical_p": 1.0
   },
-  "model_type": "data2vec2",
   "n_layers": 12,
   "no_repeat_ngram_size": 0,
   "norm_affine": true,

   "activation_dropout": 0.0,
   "add_cross_attention": false,
   "architectures": [
+    "PantagruelUniModel"
   ],
   "attention_dropout": 0.1,
   "auto_map": {
+    "AutoConfig": "configuration_pantagruel_uni.PantagruelUniConfig",
+    "AutoModel": "modeling_pantagruel_uni.PantagruelUniModel",
+    "AutoModelForAudioFrameClassification": "modeling_pantagruel_uni.PantagruelUniForAudioFrameClassification",
+    "AutoModelForCTC": "modeling_pantagruel_uni.PantagruelUniForCTC",
+    "AutoModelForMaskedLM": "modeling_pantagruel_uni.PantagruelUniForMaskedLM",
+    "AutoModelForMultipleChoice": "modeling_pantagruel_uni.PantagruelUniForMultipleChoice",
+    "AutoModelForQuestionAnswering": "modeling_pantagruel_uni.PantagruelUniForQuestionAnswering",
+    "AutoModelForSequenceClassification": "modeling_pantagruel_uni.PantagruelUniForSequenceClassification",
+    "AutoModelForTokenClassification": "modeling_pantagruel_uni.PantagruelUniForTokenClassification"
   },
   "bad_words_ids": null,
   "begin_suppress_tokens": null,
   "bos_token_id": null,
   "chunk_size_feed_forward": 0,
+  "classifier_dropout": null,
   "clone_batch": 12,
   "cross_attention_hidden_size": null,
   "decoder_start_token_id": null,
   "end_of_block_targets": false,
   "eos_token_id": null,
   "exponential_decay_length_penalty": null,
+  "final_dropout": 0.1,
   "finetuning_task": null,
   "forced_bos_token_id": null,
   "forced_eos_token_id": null,
     "architectures": null,
     "audio": {
       "_name_or_path": "",
+      "adapter_kernel_size": 3,
+      "adapter_stride": 2,
+      "add_adapter": false,
       "add_cross_attention": false,
       "add_masks": false,
       "alibi_max_pos": null,
       "begin_suppress_tokens": null,
       "bos_token_id": null,
       "chunk_size_feed_forward": 0,
+      "classifier_proj_size": 256,
       "conv_pos_depth": 5,
       "conv_pos_groups": 16,
       "conv_pos_pre_ln": false,
       "conv_pos_width": 95,
       "cross_attention_hidden_size": null,
+      "ctc_loss_reduction": "sum",
+      "ctc_zero_infinity": false,
       "decoder_start_token_id": null,
       "diversity_penalty": 0.0,
       "do_sample": false,
       "mask_channel_length": 64,
       "mask_channel_prob": 0.0,
       "mask_dropout": 0.0,
+      "mask_feature_length": 10,
+      "mask_feature_min_masks": 0,
+      "mask_feature_prob": 0.0,
       "mask_length": 5,
       "mask_noise_std": 0.01,
       "mask_prob": 0.55,
       "mask_prob_adjust": 0.1,
       "mask_prob_min": null,
+      "mask_time_length": 10,
+      "mask_time_min_masks": 2,
+      "mask_time_prob": 0.05,
       "max_length": 20,
       "min_length": 0,
       "model_depth": 16,
       "model_type": "",
       "no_repeat_ngram_size": 0,
+      "num_adapter_layers": 3,
       "num_alibi_heads": 16,
       "num_beam_groups": 1,
       "num_beams": 1,
       "num_extra_tokens": 0,
       "num_return_sequences": 1,
       "output_attentions": false,
+      "output_hidden_size": null,
       "output_hidden_states": false,
       "output_scores": false,
       "pad_token_id": null,
       "start_drop_path_rate": 0.0,
       "suppress_tokens": null,
       "task_specific_params": null,
+      "tdnn_dilation": [
+        1,
+        2,
+        3,
+        1,
+        1
+      ],
+      "tdnn_dim": [
+        512,
+        512,
+        512,
+        512,
+        1500
+      ],
+      "tdnn_kernel": [
+        5,
+        3,
+        3,
+        1,
+        1
+      ],
       "temperature": 1.0,
       "tie_encoder_decoder": false,
       "tie_word_embeddings": true,
       "torchscript": false,
       "type": "AUDIO",
       "typical_p": 1.0,
+      "use_alibi_encoder": true,
+      "use_weighted_layer_sum": false,
+      "vocab_size": 80,
+      "xvector_output_dim": 512
     },
     "bad_words_ids": null,
     "begin_suppress_tokens": null,
     "torchscript": false,
     "typical_p": 1.0
   },
+  "model_type": "pantagruel_uni",
   "n_layers": 12,
   "no_repeat_ngram_size": 0,
   "norm_affine": true,

configuration_pantagruel_uni.py ADDED Viewed

	@@ -0,0 +1,488 @@

+# coding=utf-8
+#
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+#
+#
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Pantagruel unimodal configuration"""
+import os
+from typing import Union, Dict, Any, Optional
+from transformers.dynamic_module_utils import custom_object_save
+from transformers.utils import logging
+from transformers.configuration_utils import PretrainedConfig, CONFIG_NAME
+logger = logging.get_logger(__name__)
+class MyPretrainedConfig(PretrainedConfig):
+    """
+    `PretrainedConfig` subclass that saves and reloads nested configuration objects.
+    Serialization is done with `use_diff=False`, and `update` / `from_dict` only assign keys that
+    already exist as attributes on the instance, recursing into attributes that are themselves
+    `MyPretrainedConfig` instances instead of replacing them.
+    """
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+    def to_json_string(self, use_diff: bool = False) -> str:
+        """Return the parent's JSON serialization; `use_diff` defaults to `False` in this class."""
+        return super().to_json_string(use_diff)
+    def update(self, config_dict):
+        """
+        Copy values from `config_dict` onto attributes that already exist on this config.
+        Keys without a matching attribute are skipped. When the current attribute is a
+        `MyPretrainedConfig`, the value is forwarded to that sub-config's `update` so the nested
+        object is kept and updated in place.
+        Args:
+            config_dict (`Dict[str, Any]`): mapping of attribute names to new values.
+        """
+        for key, value in config_dict.items():
+            if not hasattr(self, key):
+                continue
+            current = getattr(self, key)
+            if isinstance(current, MyPretrainedConfig):
+                current.update(value)
+            else:
+                setattr(self, key, value)
+    # Copied from the parent class, only changed use_diff from True to False to correctly save nested config class
+    def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs):
+        """
+        Save a configuration object to the directory `save_directory`, so that it can be re-loaded using the
+        [`~PretrainedConfig.from_pretrained`] class method.
+        Args:
+            save_directory (`str` or `os.PathLike`):
+                Directory where the configuration JSON file will be saved (will be created if it does not exist).
+            push_to_hub (`bool`, *optional*, defaults to `False`):
+                Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
+                repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
+                namespace).
+            kwargs (`Dict[str, Any]`, *optional*):
+                Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
+        Raises:
+            AssertionError: if `save_directory` points to an existing file.
+        """
+        self._set_token_in_kwargs(kwargs)
+        if os.path.isfile(save_directory):
+            raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")
+        non_default_generation_parameters = {}
+        for parameter_name, default_value in self._get_global_generation_defaults().items():
+            if hasattr(self, parameter_name) and getattr(self, parameter_name) != default_value:
+                non_default_generation_parameters[parameter_name] = getattr(self, parameter_name)
+        if len(non_default_generation_parameters) > 0:
+            logger.warning(
+                "Some non-default generation parameters are set in the model config. These should go into a "
+                "GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) "
+                "instead. This warning will be raised to an exception in v4.41.\n"
+                f"Non-default generation parameters: {str(non_default_generation_parameters)}"
+            )
+        os.makedirs(save_directory, exist_ok=True)
+        if push_to_hub:
+            commit_message = kwargs.pop("commit_message", None)
+            # os.fspath keeps `str` inputs unchanged and makes `os.PathLike` inputs splittable.
+            repo_id = kwargs.pop("repo_id", os.fspath(save_directory).split(os.path.sep)[-1])
+            repo_id = self._create_repo(repo_id, **kwargs)
+            files_timestamps = self._get_files_timestamps(save_directory)
+        # If we have a custom config, we copy the file defining it in the folder and set the attributes so it can be
+        # loaded from the Hub.
+        if self._auto_class is not None:
+            custom_object_save(self, save_directory, config=self)
+        # If we save using the predefined names, we can load using `from_pretrained`
+        output_config_file = os.path.join(save_directory, CONFIG_NAME)
+        self.to_json_file(output_config_file, use_diff=False)
+        logger.info(f"Configuration saved in {output_config_file}")
+        if push_to_hub:
+            self._upload_modified_files(
+                save_directory,
+                repo_id,
+                files_timestamps,
+                commit_message=commit_message,
+                token=kwargs.get("token"),
+            )
+    # Copied from the parent class, change the instantiation and updating of class from config_dict to correctly load nested config
+    @classmethod
+    def from_dict(cls, config_dict: Dict[str, Any], **kwargs) -> "MyPretrainedConfig":
+        """
+        Instantiates a [`PretrainedConfig`] from a Python dictionary of parameters.
+        The config is first built with `cls()` and then filled through `update`, so only keys that
+        exist on a default instance are applied and nested configs are updated in place.
+        Args:
+            config_dict (`Dict[str, Any]`):
+                Dictionary that will be used to instantiate the configuration object. Such a dictionary can be
+                retrieved from a pretrained checkpoint by leveraging the [`~PretrainedConfig.get_config_dict`] method.
+                Its `"attn_implementation"` entry is overwritten with the value popped from `kwargs`.
+            kwargs (`Dict[str, Any]`):
+                Additional parameters from which to initialize the configuration object.
+        Returns:
+            [`PretrainedConfig`]: The configuration object instantiated from those parameters, or a
+            `(config, unused_kwargs)` tuple when `return_unused_kwargs=True` is passed.
+        Raises:
+            ValueError: if `num_labels` and `id2label` are both passed and disagree in size.
+        """
+        return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
+        # Those arguments may be passed along for our internal telemetry.
+        # We remove them so they don't appear in `return_unused_kwargs`.
+        kwargs.pop("_from_auto", None)
+        kwargs.pop("_from_pipeline", None)
+        # The commit hash might have been updated in the `config_dict`, we don't want the kwargs to erase that update.
+        if "_commit_hash" in kwargs and "_commit_hash" in config_dict:
+            kwargs["_commit_hash"] = config_dict["_commit_hash"]
+        # We remove it from kwargs so that it does not appear in `return_unused_kwargs`.
+        config_dict["attn_implementation"] = kwargs.pop("attn_implementation", None)
+        # Build a default instance, then apply the dictionary with the same nested-aware rules as `update`.
+        config = cls()
+        config.update(config_dict)
+        if hasattr(config, "pruned_heads"):
+            config.pruned_heads = {int(key): value for key, value in config.pruned_heads.items()}
+        # Update config with kwargs if needed
+        if "num_labels" in kwargs and "id2label" in kwargs:
+            num_labels = kwargs["num_labels"]
+            id2label = kwargs["id2label"] if kwargs["id2label"] is not None else []
+            if len(id2label) != num_labels:
+                raise ValueError(
+                    f"You passed along `num_labels={num_labels }` with an incompatible id to label map: "
+                    f"{kwargs['id2label']}. Since those arguments are inconsistent with each other, you should remove "
+                    "one of them."
+                )
+        to_remove = []
+        for key, value in kwargs.items():
+            if hasattr(config, key):
+                current_attr = getattr(config, key)
+                # To authorize passing a custom subconfig as kwarg in models that have nested configs.
+                if isinstance(current_attr, PretrainedConfig) and isinstance(value, dict):
+                    value = current_attr.__class__(**value)
+                setattr(config, key, value)
+                if key != "torch_dtype":
+                    to_remove.append(key)
+        for key in to_remove:
+            kwargs.pop(key, None)
+        logger.info(f"Model config {config}")
+        if return_unused_kwargs:
+            return config, kwargs
+        else:
+            return config
+class PantagruelModalityConfig(MyPretrainedConfig):
+    """
+    Options shared by the audio and the text modality configurations.
+    Every keyword listed in `__init__` is stored on the instance under the same name, except
+    `ema_local_encoder` and `decoder`, which are accepted but not stored.
+    """
+    def __init__(
+        self,
+        type="AUDIO",
+        prenet_depth=4,
+        prenet_layerdrop=0,
+        prenet_dropout=0.0,
+        start_drop_path_rate=0.0,
+        end_drop_path_rate=0.0,
+        num_extra_tokens=0,
+        init_extra_token_zero=True,
+        mask_noise_std=0.01,
+        mask_prob_min=None,
+        mask_prob=0.7,
+        inverse_mask=False,
+        mask_prob_adjust=0.0,
+        keep_masked_pct=0.0,
+        mask_length=5,
+        add_masks=False,
+        remove_masks=False,
+        mask_dropout=0.0,
+        encoder_zero_mask=True,
+        mask_channel_prob=0.0,
+        mask_channel_length=64,
+        local_grad_mult=1.0,
+        use_alibi_encoder=False,
+        alibi_scale=1.0,
+        learned_alibi=False,
+        alibi_max_pos=None,
+        learned_alibi_scale=False,
+        learned_alibi_scale_per_head=False,
+        learned_alibi_scale_per_layer=False,
+        num_alibi_heads=12,
+        model_depth=12,
+        ema_local_encoder=False,
+        decoder=None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        # Attribute name -> value, listed in the order the attributes are assigned.
+        stored_options = {
+            "type": type,
+            "prenet_depth": prenet_depth,
+            "prenet_layerdrop": prenet_layerdrop,
+            "prenet_dropout": prenet_dropout,
+            "start_drop_path_rate": start_drop_path_rate,
+            "end_drop_path_rate": end_drop_path_rate,
+            "num_extra_tokens": num_extra_tokens,
+            "init_extra_token_zero": init_extra_token_zero,
+            "mask_noise_std": mask_noise_std,
+            "mask_prob_min": mask_prob_min,
+            "mask_prob": mask_prob,
+            "inverse_mask": inverse_mask,
+            "mask_prob_adjust": mask_prob_adjust,
+            "keep_masked_pct": keep_masked_pct,
+            "mask_length": mask_length,
+            "add_masks": add_masks,
+            "remove_masks": remove_masks,
+            "mask_dropout": mask_dropout,
+            "encoder_zero_mask": encoder_zero_mask,
+            "mask_channel_prob": mask_channel_prob,
+            "mask_channel_length": mask_channel_length,
+            "local_grad_mult": local_grad_mult,
+            "use_alibi_encoder": use_alibi_encoder,
+            "alibi_scale": alibi_scale,
+            "learned_alibi": learned_alibi,
+            "alibi_max_pos": alibi_max_pos,
+            "learned_alibi_scale": learned_alibi_scale,
+            "learned_alibi_scale_per_head": learned_alibi_scale_per_head,
+            "learned_alibi_scale_per_layer": learned_alibi_scale_per_layer,
+            "num_alibi_heads": num_alibi_heads,
+            "model_depth": model_depth,
+        }
+        for option_name, option_value in stored_options.items():
+            setattr(self, option_name, option_value)
+class PantagruelAudioConfig(PantagruelModalityConfig):
+    """
+    Configuration including args specific to audio-only tasks.
+    The modality `type` is always `"AUDIO"`; a `type` entry in `kwargs` (for example when the config is
+    rebuilt from its own serialized dictionary) is discarded instead of clashing with it.
+    `pad_token_id`, `bos_token_id` and `eos_token_id` are stored on the instance, as the text
+    configuration does for its own token ids.
+    """
+    def __init__(
+        self,
+        vocab_size=80,
+        extractor_mode="layer_norm",
+        feature_encoder_spec="[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]",
+        conv_pos_width=95,
+        conv_pos_groups=16,
+        conv_pos_depth=5,
+        conv_pos_pre_ln=False,
+        mask_time_prob=0.05,
+        mask_time_length=10,
+        mask_time_min_masks=2,
+        mask_feature_prob=0.0,
+        mask_feature_length=10,
+        mask_feature_min_masks=0,
+        ctc_loss_reduction="sum",
+        ctc_zero_infinity=False,
+        use_weighted_layer_sum=False,
+        classifier_proj_size=256,
+        tdnn_dim=(512, 512, 512, 512, 1500),
+        tdnn_kernel=(5, 3, 3, 1, 1),
+        tdnn_dilation=(1, 2, 3, 1, 1),
+        xvector_output_dim=512,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        add_adapter=False,
+        adapter_kernel_size=3,
+        adapter_stride=2,
+        num_adapter_layers=3,
+        output_hidden_size=None,
+        **kwargs,
+    ):
+        # Drop any incoming "type": passing it alongside type="AUDIO" would raise a duplicate-keyword TypeError.
+        kwargs.pop("type", None)
+        super().__init__(type="AUDIO", **kwargs)
+        self.extractor_mode = extractor_mode
+        self.feature_encoder_spec = feature_encoder_spec
+        self.conv_pos_width = conv_pos_width
+        self.conv_pos_groups = conv_pos_groups
+        self.conv_pos_depth = conv_pos_depth
+        self.conv_pos_pre_ln = conv_pos_pre_ln
+        self.vocab_size = vocab_size
+        self.use_weighted_layer_sum = use_weighted_layer_sum
+        # special token ids; previously accepted by the signature but never stored
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        # fine-tuning config parameters for SpecAugment: https://huggingface.co/papers/1904.08779
+        self.mask_time_prob = mask_time_prob
+        self.mask_time_length = mask_time_length
+        self.mask_time_min_masks = mask_time_min_masks
+        self.mask_feature_prob = mask_feature_prob
+        self.mask_feature_length = mask_feature_length
+        self.mask_feature_min_masks = mask_feature_min_masks
+        # ctc loss
+        self.ctc_loss_reduction = ctc_loss_reduction
+        self.ctc_zero_infinity = ctc_zero_infinity
+        # adapter
+        self.add_adapter = add_adapter
+        self.adapter_kernel_size = adapter_kernel_size
+        self.adapter_stride = adapter_stride
+        self.num_adapter_layers = num_adapter_layers
+        self.output_hidden_size = output_hidden_size
+        # SequenceClassification-specific parameter. Feel free to ignore for other classes.
+        self.classifier_proj_size = classifier_proj_size
+        # XVector-specific parameters. Feel free to ignore for other classes.
+        # Stored as lists so tuple and list inputs end up with the same type.
+        self.tdnn_dim = list(tdnn_dim)
+        self.tdnn_kernel = list(tdnn_kernel)
+        self.tdnn_dilation = list(tdnn_dilation)
+        self.xvector_output_dim = xvector_output_dim
+class PantagruelTextConfig(PantagruelModalityConfig):
+    """
+    Configuration including args specific to text-only tasks.
+    The modality `type` is always `"TEXT"`; a `type` entry in `kwargs` (for example when the config is
+    rebuilt from its own serialized dictionary) is discarded instead of clashing with it.
+    """
+    def __init__(
+        self,
+        vocab_size=50000,
+        unk_token_id=3,
+        bos_token_id=0,
+        eos_token_id=2,
+        pad_token_id=1,
+        max_source_positions=512,
+        learned_pos=True,
+        dropout=0.1,
+        no_scale_embedding=True,
+        layernorm_embedding=True,
+        no_token_positional_embeddings=False,
+        **kwargs,
+    ):
+        # Drop any incoming "type": passing it alongside type="TEXT" would raise a duplicate-keyword TypeError.
+        kwargs.pop("type", None)
+        super().__init__(type="TEXT", **kwargs)
+        self.vocab_size = vocab_size
+        # special token ids
+        self.unk_token_id = unk_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        self.pad_token_id = pad_token_id
+        # embedding / positional options
+        self.max_source_positions = max_source_positions
+        self.learned_pos = learned_pos
+        self.dropout = dropout
+        self.no_scale_embedding = no_scale_embedding
+        self.layernorm_embedding = layernorm_embedding
+        self.no_token_positional_embeddings = no_token_positional_embeddings
+class PantagruelModalitiesConfig(MyPretrainedConfig):
+    """
+    Container class for both audio and text modality configurations.
+    Args:
+        audio_config (`PantagruelAudioConfig`, *optional*):
+            Stored as `self.audio`. A fresh `PantagruelAudioConfig()` is created when omitted.
+        text_config (`PantagruelTextConfig`, *optional*):
+            Stored as `self.text`. A fresh `PantagruelTextConfig()` is created when omitted.
+    """
+    def __init__(
+        self,
+        audio_config=None,
+        text_config=None,
+        **kwargs
+    ):
+        super().__init__(**kwargs)
+        # Build the defaults per instance: a config object used as a default argument is created once
+        # and shared, so in-place `update` calls on one container would leak into every other one.
+        self.audio = PantagruelAudioConfig() if audio_config is None else audio_config
+        self.text = PantagruelTextConfig() if text_config is None else text_config
+class PantagruelUniConfig(MyPretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`PantagruelUniModel`].
+    It is used to instantiate a PantagruelUniModel model according to the specified arguments,
+    defining the model architecture.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to
+    control the model outputs. Read the documentation from [`PretrainedConfig`] for more information.
+    Args:
+        depth (`int`, *optional*, defaults to 12):
+            Number of Transformer layers in the encoder. Also exposed as `num_layers`, `n_layers`
+            and `num_hidden_layers`.
+        embed_dim (`int`, *optional*, defaults to 768):
+            Stored as `embed_dim` and also exposed as `hidden_size`.
+        modalities (`PantagruelModalitiesConfig`, *optional*):
+            Container of the per-modality configurations. A fresh `PantagruelModalitiesConfig()` is
+            created when omitted.
+        supported_modality (`str`, *optional*, defaults to `"AUDIO"`):
+            Stored as given.
+        The remaining keyword arguments are stored on the instance under the same name.
+    Example:
+    ```python
+    >>> from configuration_pantagruel_uni import PantagruelUniConfig
+    >>> from modeling_pantagruel_uni import PantagruelUniModel
+    >>> # Initializing a PantagruelUniConfig for audio
+    >>> configuration = PantagruelUniConfig()
+    >>> # Initializing a model (with random weights) with the configuration
+    >>> model = PantagruelUniModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```
+    """
+    model_type = "pantagruel_uni"
+    def __init__(
+        self,
+        depth=12,
+        start_drop_path_rate=0.0,
+        end_drop_path_rate=0.0,
+        num_heads=12,
+        norm_eps=1e-5,
+        norm_affine=True,
+        encoder_dropout=0.1,
+        post_mlp_drop=0.1,
+        attention_dropout=0.1,
+        activation_dropout=0.0,
+        dropout_input=0.0,
+        final_dropout=0.1,
+        layerdrop=0.0,
+        embed_dim=768,
+        mlp_ratio=4.0,
+        layer_norm_first=False,
+        end_of_block_targets=False,
+        clone_batch=1,
+        log_norms=True,
+        modalities=None,
+        supported_modality="AUDIO",
+        classifier_dropout=None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.depth = depth
+        self.start_drop_path_rate = start_drop_path_rate
+        self.end_drop_path_rate = end_drop_path_rate
+        self.num_heads = num_heads
+        self.norm_eps = norm_eps
+        self.norm_affine = norm_affine
+        self.post_mlp_drop = post_mlp_drop
+        self.encoder_dropout = encoder_dropout
+        self.attention_dropout = attention_dropout
+        self.activation_dropout = activation_dropout
+        self.dropout_input = dropout_input
+        self.final_dropout = final_dropout
+        self.layerdrop = layerdrop
+        self.embed_dim = embed_dim
+        self.mlp_ratio = mlp_ratio
+        self.layer_norm_first = layer_norm_first
+        self.end_of_block_targets = end_of_block_targets
+        self.clone_batch = clone_batch
+        self.log_norms = log_norms
+        # Build the default per instance: a config object used as a default argument is created once
+        # and shared, so loading one checkpoint would overwrite the modalities of every other config.
+        self.modalities = PantagruelModalitiesConfig() if modalities is None else modalities
+        self.supported_modality = supported_modality
+        # Attributes for hopsparser
+        self.hidden_size = embed_dim
+        self.num_layers = depth
+        self.n_layers = depth
+        self.num_hidden_layers = depth
+        self.classifier_dropout = classifier_dropout
+        # Maps the Auto* entry points to the classes shipped in this repository's Python files.
+        self.auto_map = {
+            'AutoConfig': 'configuration_pantagruel_uni.PantagruelUniConfig',
+            'AutoModel': 'modeling_pantagruel_uni.PantagruelUniModel',
+            'AutoModelForMaskedLM': 'modeling_pantagruel_uni.PantagruelUniForMaskedLM',
+            'AutoModelForSequenceClassification': 'modeling_pantagruel_uni.PantagruelUniForSequenceClassification',
+            'AutoModelForMultipleChoice': 'modeling_pantagruel_uni.PantagruelUniForMultipleChoice',
+            'AutoModelForTokenClassification': 'modeling_pantagruel_uni.PantagruelUniForTokenClassification',
+            'AutoModelForQuestionAnswering': 'modeling_pantagruel_uni.PantagruelUniForQuestionAnswering',
+            'AutoModelForAudioFrameClassification': 'modeling_pantagruel_uni.PantagruelUniForAudioFrameClassification',
+            'AutoModelForCTC': 'modeling_pantagruel_uni.PantagruelUniForCTC',
+        }

modeling_pantagruel_uni.py ADDED Viewed

The diff for this file is too large to render. See raw diff

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "do_normalize": true,
+  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
+  "feature_size": 1,
+  "padding_side": "right",
+  "padding_value": 0,
+  "return_attention_mask": true,
+  "sampling_rate": 16000
+}

utils_pantagruel_uni.py ADDED Viewed

	@@ -0,0 +1,439 @@

+# coding=utf-8
+#
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+#
+import math
+import numpy as np
+from collections import namedtuple
+from typing import Optional, Tuple
+import torch
+import torch.nn.functional as F
+MaskSeed = namedtuple("MaskSeed", ["seed", "update", "ids"])
+MaskInfo = namedtuple("MaskInfo", ["x_unmasked", "mask", "ids_restore", "ids_keep"])
+def gather_unmasked(x: torch.Tensor, mask_info: MaskInfo) -> torch.Tensor:
+    """Gather from `x` along dim 1 using `mask_info.ids_keep` as the index tensor."""
+    keep_index = mask_info.ids_keep
+    return x.gather(1, keep_index)
+def gather_unmasked_mask(x: torch.Tensor, mask_info: MaskInfo) -> torch.Tensor:
+    """Gather from `x` along dim 1 using the first slice of the last dim of `mask_info.ids_keep`."""
+    # ids_keep carries a trailing feature dimension; index 0 of it is used as the per-position index.
+    keep_index = mask_info.ids_keep[..., 0]
+    return x.gather(1, keep_index)
+def masked_alibi(alibi_bias, mask_info):
+    """
+    Select the rows and columns of `alibi_bias` (its last two dims) listed in `mask_info.ids_keep`.
+    The same kept indices are applied first along dim -2 and then along dim -1.
+    Assumes `alibi_bias` is 4-D with heads on dim 1 and `ids_keep` is 3-D with a trailing
+    feature dim - TODO confirm against the callers.
+    """
+    num_heads = alibi_bias.size(1)
+    full_length = mask_info.ids_restore.size(1)
+    kept = mask_info.ids_keep[..., 0]
+    row_index = kept[:, None, :, None]
+    col_index = kept[:, None, None, :]
+    # First keep the selected rows across the full width ...
+    kept_rows = alibi_bias.gather(-2, row_index.expand(-1, num_heads, -1, full_length))
+    # ... then keep the same positions among the columns.
+    return kept_rows.gather(-1, col_index.expand(-1, num_heads, kept_rows.size(-2), -1))
+def random_masking(x, mask_ratio, mask_seed: Optional[MaskSeed]):
+    """
+    Randomly keep `int(L * (1 - mask_ratio))` positions per sample of `x` (shape `(N, L, D)`).
+    Positions are ranked by uniform noise; the lowest-ranked ones are kept. When `mask_seed` is
+    given, the noise generator is seeded from its `seed`, `update` and the sum of its `ids`.
+    Returns:
+        `MaskInfo` with the gathered kept entries (`x_unmasked`), a `(N, L)` mask where 0 is keep and
+        1 is remove, and the `ids_restore` / `ids_keep` index tensors expanded over the feature dim.
+    """
+    batch, length, feat = x.shape
+    num_keep = int(length * (1 - mask_ratio))
+    rng = None
+    if mask_seed is not None:
+        seed_material = (mask_seed.seed, mask_seed.update, mask_seed.ids.sum().item())
+        rng = torch.Generator(device=x.device)
+        rng.manual_seed(int(hash(seed_material) % 1e6))
+    # noise in [0, 1]; ascending order means small noise is kept, large noise is removed
+    scores = torch.rand(batch, length, generator=rng, device=x.device)
+    order = scores.argsort(dim=1)
+    inverse_order = order.argsort(dim=1)
+    ids_keep = order[:, :num_keep].unsqueeze(-1).expand(-1, -1, feat)
+    kept = torch.gather(x, dim=1, index=ids_keep)
+    # binary mask in shuffled order (0 is keep, 1 is remove), then mapped back to the original order
+    shuffled_mask = torch.ones([batch, length], dtype=x.dtype, device=x.device)
+    shuffled_mask[:, :num_keep] = 0
+    mask = torch.gather(shuffled_mask, dim=1, index=inverse_order)
+    ids_restore = inverse_order.unsqueeze(-1).expand(-1, -1, feat)
+    return MaskInfo(
+        x_unmasked=kept, mask=mask, ids_restore=ids_restore, ids_keep=ids_keep
+    )
+def get_alibi(
+    max_positions: int,
+    attention_heads: int,
+    dims: int = 1,
+    distance: str = "manhattan",
+):
+    """
+    Build an ALiBi bias tensor of shape `(attention_heads, max_positions, max_positions)`.
+    Each head gets the negated pairwise position distance scaled by that head's slope.
+    Args:
+        max_positions: number of positions; for `dims=2` it must be a perfect square (a square grid).
+        attention_heads: number of heads, i.e. number of slopes.
+        dims: `1` for sequence positions (distance `|i - j|`) or `2` for positions on a square grid.
+        distance: `"manhattan"` or `"euclidean"`; only used when `dims=2`.
+    Raises:
+        ValueError: if `dims` is neither 1 nor 2, or `distance` is unsupported for `dims=2`.
+        AssertionError: if `dims=2` and `max_positions` is not a perfect square.
+    """
+    def get_slopes(n):
+        def get_slopes_power_of_2(n):
+            start = 2 ** (-(2 ** -(math.log2(n) - 3)))
+            ratio = start
+            return [start * ratio**i for i in range(n)]
+        # In the paper, we only train models that have 2^a heads for some
+        # a. This function has some good properties that only occur when
+        # the input is a power of 2. To maintain that even when the number
+        # of heads is not a power of 2, we use this workaround.
+        if math.log2(n).is_integer():
+            return get_slopes_power_of_2(n)
+        closest_power_of_2 = 2 ** math.floor(math.log2(n))
+        return (
+            get_slopes_power_of_2(closest_power_of_2)
+            + get_slopes(2 * closest_power_of_2)[0::2][: n - closest_power_of_2]
+        )
+    slopes = torch.Tensor(get_slopes(attention_heads))
+    if dims == 1:
+        # prepare alibi position linear bias. Note that wav2vec2 is a non
+        # autoregressive model so we want a symmetric mask with 0 on the
+        # diagonal and otherwise linearly decreasing values
+        positions = torch.arange(max_positions)
+        pos_bias = torch.abs(positions.unsqueeze(0) - positions.unsqueeze(1)) * -1
+    elif dims == 2:
+        if distance not in ("manhattan", "euclidean"):
+            # Fail up front instead of hitting an unbound distance function later.
+            raise ValueError(f"unsupported alibi distance: {distance}")
+        side = math.sqrt(max_positions)
+        if not side.is_integer():
+            # Explicit raise (not `assert`) so the check also runs under `python -O`.
+            raise AssertionError(side)
+        side = int(side)
+        # Flat position p sits at grid cell (p // side, p % side).
+        grid_rows = torch.arange(side).repeat_interleave(side)
+        grid_cols = torch.arange(side).repeat(side)
+        row_delta = grid_rows.unsqueeze(1) - grid_rows.unsqueeze(0)
+        col_delta = grid_cols.unsqueeze(1) - grid_cols.unsqueeze(0)
+        if distance == "manhattan":
+            grid_distance = row_delta.abs() + col_delta.abs()
+        else:
+            # Square root in float64, matching the precision of a per-pair `math.sqrt`.
+            grid_distance = torch.sqrt((row_delta**2 + col_delta**2).to(torch.float64))
+        pos_bias = (-grid_distance).to(torch.get_default_dtype())
+    else:
+        raise ValueError(f"unsupported number of alibi dims: {dims}")
+    alibi_bias = slopes.unsqueeze(1).unsqueeze(1) * pos_bias.unsqueeze(0).expand(
+        attention_heads, -1, -1
+    )
+    return alibi_bias
+def get_alibi_bias(
+    alibi_biases,
+    batch_size,
+    time_steps,
+    heads,
+    dtype,
+    device,
+    dims=1,
+    distance="manhattan",
+):
+    """Return cached ALiBi biases shaped (batch_size, heads, time_steps, time_steps).
+
+    ``alibi_biases`` is used as a mutable cache keyed by
+    ``"{dims}_{heads}_{distance}"``. The cached tensor is rebuilt through
+    ``get_alibi`` whenever it is missing, has fewer than
+    ``heads * batch_size`` rows, covers fewer than ``time_steps`` positions,
+    or has a different dtype or device than requested. A rebuilt tensor is
+    never smaller than the entry it replaces.
+    """
+    key = f"{dims}_{heads}_{distance}"
+    cached = alibi_biases.get(key, None)
+    rows_needed = heads * batch_size
+    needs_rebuild = (
+        cached is None
+        or cached.size(0) < rows_needed
+        or cached.size(1) < time_steps
+        or cached.dtype != dtype
+        or cached.device != device
+    )
+    if needs_rebuild:
+        # Grow monotonically: keep at least the extent of the previous entry.
+        if cached is None:
+            prev_rows, prev_steps = 0, 0
+        else:
+            prev_rows, prev_steps = cached.size(0), cached.size(1)
+        steps = max(time_steps, prev_steps)
+        repeats = max(rows_needed, prev_rows) // heads
+        per_head = get_alibi(steps, heads, dims=dims, distance=distance)
+        cached = per_head.to(dtype=dtype, device=device).repeat(repeats, 1, 1)
+        alibi_biases[key] = cached
+    window = cached[:rows_needed, :time_steps, :time_steps]
+    return window.view(batch_size, heads, time_steps, time_steps)
+def is_xla_tensor(tensor):
+    """Return True only for torch tensors whose device type is ``"xla"``."""
+    if not torch.is_tensor(tensor):
+        return False
+    return tensor.device.type == "xla"
+def index_put(tensor, indices, value):
+    """Write ``value`` into ``tensor`` at the positions selected by ``indices``.
+
+    For non-XLA tensors this is an in-place ``tensor[indices] = value``.
+    For XLA tensors the same result is built arithmetically: ``indices`` is
+    given trailing singleton dimensions up to the rank of ``tensor``,
+    expanded to its shape when its last dimension is smaller, and then used
+    to blend ``tensor`` and ``value``. The resulting tensor is returned in
+    both cases.
+    """
+    if not is_xla_tensor(tensor):
+        tensor[indices] = value
+        return tensor
+    # Append trailing singleton dims until the mask has the tensor's rank.
+    missing_dims = tensor.dim() - indices.dim()
+    for _ in range(missing_dims):
+        indices = indices.unsqueeze(-1)
+    if indices.size(-1) < tensor.size(-1):
+        indices = indices.expand_as(tensor)
+    kept = torch.mul(tensor, ~indices)
+    written = torch.mul(value, indices)
+    tensor = kept + written
+    return tensor
+def compute_mask_indices(
+    shape: Tuple[int, int],
+    padding_mask: Optional[torch.Tensor],
+    mask_prob: float,
+    mask_length: int,
+    mask_type: str = "static",
+    mask_other: float = 0.0,
+    min_masks: int = 0,
+    no_overlap: bool = False,
+    min_space: int = 0,
+    require_same_masks: bool = True,
+    mask_dropout: float = 0.0,
+    add_masks: bool = False,
+    seed: Optional[int] = None,
+    epoch: Optional[int] = None,
+    indices: Optional[torch.Tensor] = None,
+    idc_select_ver: int = 1,  # 2 to reproduce mask_tokens_dataset
+    num_mask_ver: int = 2,  # 2 to reproduce mask_tokens_dataset
+) -> np.ndarray:
+    """
+    Computes random mask spans for a given shape.
+
+    Args:
+        shape: the shape for which to compute masks.
+            should be of size 2 where first element is batch size and 2nd is timesteps
+        padding_mask: optional padding mask of the same size as shape. The number of
+            padded elements in each row is subtracted from the timesteps and only the
+            first ``sz`` remaining positions of that row are eligible for span masking
+        mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by
+            number of timesteps divided by length of mask span to mask approximately this percentage of all elements.
+            however due to overlaps, the actual number will be smaller (unless no_overlap is True)
+        mask_length: span length (exact for "static", a distribution parameter otherwise)
+        mask_type: how to compute mask lengths
+            static = fixed size
+            uniform = sample integers from uniform distribution [mask_other, mask_length*2]
+            normal = sample from normal distribution with mean mask_length and stdev mask_other. mask is min 1 element
+            poisson = sample from poisson distribution with lambda = mask length
+        mask_other: secondary parameter of the "uniform" and "normal" mask types (see mask_type)
+        min_masks: minimum number of masked spans
+        no_overlap: if true, will switch to an alternative recursive algorithm that prevents spans from overlapping
+        min_space: only used if no_overlap is True, this is how many elements to keep unmasked between spans
+        require_same_masks: if true, will randomly drop out masks until same amount of masks remains in each sample
+            (or, when add_masks is true, randomly add masks up to the largest amount instead)
+        mask_dropout: randomly dropout this percentage of masks in each example
+        add_masks: with require_same_masks, equalize to the largest mask count instead of the smallest
+        seed, epoch, indices: when all three are given, the generator of sample ``i`` is seeded from
+            ``hash((seed, epoch, indices[i].item()))``; otherwise the generators are unseeded
+        idc_select_ver: 1 samples span starts from ``[0, sz - min_len)``, 2 samples them from ``[0, sz)``
+        num_mask_ver: 1 rounds the number of spans with the global ``np.random`` state,
+            2 rounds it with the per-sample generator
+
+    Returns:
+        boolean array of shape (batch size, timesteps) where True marks a masked element
+
+    Raises:
+        ValueError: for an unsupported ``num_mask_ver`` / ``idc_select_ver``, when a static
+            configuration yields no masked element, or when a whole sample would be masked
+        Exception: for an unknown ``mask_type``
+    """
+    bsz, all_sz = shape
+    mask = np.full((bsz, all_sz), False)
+    if num_mask_ver == 1:
+        all_num_mask = int(
+            # add a random number for probabilistic rounding
+            mask_prob * all_sz / float(mask_length)
+            + np.random.rand()
+        )
+        all_num_mask = max(min_masks, all_num_mask)
+    mask_idcs = []
+    for i in range(bsz):
+        if seed is not None and epoch is not None and indices is not None:
+            seed_i = int(hash((seed, epoch, indices[i].item())) % 1e6)
+        else:
+            seed_i = None
+        rng = np.random.default_rng(seed_i)
+        if padding_mask is not None:
+            sz = all_sz - padding_mask[i].long().sum().item()
+            assert sz >= 0, sz
+        else:
+            sz = all_sz
+        if num_mask_ver == 1:
+            if padding_mask is not None:
+                num_mask = int(
+                    # add a random number for probabilistic rounding
+                    mask_prob * sz / float(mask_length)
+                    + np.random.rand()
+                )
+                num_mask = max(min_masks, num_mask)
+            else:
+                num_mask = all_num_mask
+        elif num_mask_ver == 2:
+            num_mask = int(
+                # add a random number for probabilistic rounding
+                mask_prob * sz / float(mask_length)
+                + rng.random()
+            )
+            num_mask = max(min_masks, num_mask)
+        else:
+            raise ValueError(f"unsupported num_mask_ver: {num_mask_ver}")
+        if mask_type == "static":
+            lengths = np.full(num_mask, mask_length)
+        elif mask_type == "uniform":
+            # np.random.Generator has no randint(); integers() is its
+            # equivalent and also excludes the upper bound.
+            lengths = rng.integers(
+                int(mask_other), mask_length * 2 + 1, size=num_mask
+            )
+        elif mask_type == "normal":
+            lengths = rng.normal(mask_length, mask_other, size=num_mask)
+            lengths = [max(1, int(round(x))) for x in lengths]
+        elif mask_type == "poisson":
+            lengths = rng.poisson(mask_length, size=num_mask)
+            lengths = [int(round(x)) for x in lengths]
+        else:
+            raise Exception("unknown mask selection " + mask_type)
+        if sum(lengths) == 0:
+            if mask_type == "static":
+                raise ValueError("this should never happens")
+            else:
+                lengths = [min(mask_length, sz - 1)]
+        if no_overlap:
+            mask_idc = []
+
+            def arrange(s, e, length, keep_length):
+                # Place one span inside [s, e) and return the leftover parts
+                # that are still large enough to host another span.
+                span_start = rng.integers(s, e - length)
+                mask_idc.extend(span_start + offset for offset in range(length))
+                new_parts = []
+                if span_start - s - min_space >= keep_length:
+                    new_parts.append((s, span_start - min_space + 1))
+                if e - span_start - length - min_space > keep_length:
+                    new_parts.append((span_start + length + min_space, e))
+                return new_parts
+
+            parts = [(0, sz)]
+            min_length = min(lengths)
+            # arrange() samples the start from [s, e - length), so a part
+            # needs at least one spare element for that range to be non-empty.
+            required_slack = max(min_space, 1)
+            for length in sorted(lengths, reverse=True):
+                lens = np.fromiter(
+                    (
+                        e - s if e - s >= length + required_slack else 0
+                        for s, e in parts
+                    ),
+                    int,
+                )
+                l_sum = np.sum(lens)
+                if l_sum == 0:
+                    break
+                probs = lens / l_sum
+                c = rng.choice(len(parts), p=probs)
+                s, e = parts.pop(c)
+                parts.extend(arrange(s, e, length, min_length))
+            # Explicit integer dtype keeps an empty result usable as an index.
+            mask_idc = np.asarray(mask_idc, dtype=int)
+        else:
+            if idc_select_ver == 1:
+                min_len = min(lengths)
+                if sz - min_len <= num_mask:
+                    min_len = sz - num_mask - 1
+                mask_idc = rng.choice(sz - min_len, num_mask, replace=False)
+            elif idc_select_ver == 2:
+                mask_idc = rng.choice(sz, num_mask, replace=False)
+            else:
+                raise ValueError(f"unsupported idc_select_ver: {idc_select_ver}")
+            # Expand every span start into the indices it covers.
+            mask_idc = np.asarray(
+                [
+                    start + offset
+                    for j, start in enumerate(mask_idc)
+                    for offset in range(lengths[j])
+                ],
+                dtype=int,
+            )
+        mask_idc = np.unique(mask_idc[mask_idc < sz])
+        if len(mask_idc) >= sz:
+            raise ValueError(
+                (
+                    f"the entire sequence is masked. "
+                    f"sz={sz}; mask_idc={mask_idc}; "
+                    f"index={indices[i] if indices is not None else None}"
+                )
+            )
+        mask_idcs.append(mask_idc)
+    target_len = None
+    if require_same_masks:
+        if add_masks:
+            target_len = max([len(m) for m in mask_idcs])
+        else:
+            target_len = min([len(m) for m in mask_idcs])
+    # NOTE: the equalization below reuses the generator of the last sample.
+    for i, mask_idc in enumerate(mask_idcs):
+        if target_len is not None and len(mask_idc) > target_len:
+            mask_idc = rng.choice(mask_idc, target_len, replace=False)
+        mask[i, mask_idc] = True
+        if target_len is not None and len(mask_idc) < target_len:
+            unmasked = np.flatnonzero(~mask[i])
+            to_mask = rng.choice(unmasked, target_len - len(mask_idc), replace=False)
+            mask[i, to_mask] = True
+        if mask_dropout > 0:
+            masked = np.flatnonzero(mask[i])
+            num_holes = np.rint(len(masked) * mask_dropout).astype(int)
+            to_drop = rng.choice(masked, num_holes, replace=False)
+            mask[i, to_drop] = False
+    return mask
+def _learned_alibi_bias(
+    alibi_bias,
+    batch_size,
+    time_steps,
+    heads,
+    scale,
+    dtype,
+    device,
+):
+    """Expand, scale and crop a learned ALiBi bias for one batch.
+
+    ``alibi_bias`` must already have ``heads`` entries along dim 1 and match
+    the requested ``dtype`` and ``device`` (checked with assertions). When
+    its last dimension is shorter than ``time_steps`` it is replicate-padded
+    on all four sides of its last two dimensions, then broadcast to
+    ``batch_size``, multiplied by ``scale`` and cropped to
+    ``time_steps x time_steps``.
+    """
+    assert alibi_bias.size(1) == heads, alibi_bias.shape
+    assert alibi_bias.dtype == dtype, alibi_bias.dtype
+    assert alibi_bias.device == device, alibi_bias.device
+    current_steps = alibi_bias.size(-1)
+    if current_steps < time_steps:
+        # Half of the shortfall (rounded up) goes on each side.
+        pad = math.ceil((time_steps - current_steps) / 2)
+        alibi_bias = F.pad(alibi_bias, (pad, pad, pad, pad), mode="replicate")
+    scaled = alibi_bias.expand(batch_size, -1, -1, -1) * scale
+    return scaled[..., :time_steps, :time_steps]
+def make_positions(tensor, padding_idx: int, onnx_trace: bool = False):
+    """Map every non-padding symbol to its position number.
+
+    Positions are counted along dim 1 and start at ``padding_idx + 1``;
+    entries equal to ``padding_idx`` keep the value ``padding_idx``.
+    ``onnx_trace`` is accepted but not used by this implementation.
+    """
+    # The cast sequence is deliberate: XLA prefers ints, cumsum yields longs
+    # by default, and ONNX export cannot handle cumsum's dtype kwarg, so the
+    # running count is cast back with type_as instead.
+    non_pad = tensor.ne(padding_idx).int()
+    running_count = torch.cumsum(non_pad, dim=1).type_as(non_pad)
+    positions = (running_count * non_pad).long()
+    return positions + padding_idx

vocab.json ADDED Viewed

	@@ -0,0 +1,82 @@

+{
+    "<s>": 0,
+    "<pad>": 1,
+    "</s>": 2,
+    "<unk>": 3,
+    "|": 4,
+    "E": 5,
+    "S": 6,
+    "A": 7,
+    "T": 8,
+    "I": 9,
+    "N": 10,
+    "R": 11,
+    "L": 12,
+    "U": 13,
+    "O": 14,
+    "D": 15,
+    "C": 16,
+    "M": 17,
+    "P": 18,
+    "É": 19,
+    "V": 20,
+    "G": 21,
+    "'": 22,
+    "F": 23,
+    "B": 24,
+    "H": 25,
+    "Q": 26,
+    "È": 27,
+    "À": 28,
+    "X": 29,
+    "J": 30,
+    "Y": 31,
+    "K": 32,
+    "Z": 33,
+    "Ê": 34,
+    "W": 35,
+    "Ç": 36,
+    "Â": 37,
+    "Ô": 38,
+    "Î": 39,
+    "Ï": 40,
+    "Û": 41,
+    "Ù": 42,
+    "Á": 43,
+    "Ë": 44,
+    "Í": 45,
+    "Ü": 46,
+    "Ö": 47,
+    "Ó": 48,
+    "Ä": 49,
+    "Ñ": 50,
+    "Ú": 51,
+    "Ø": 52,
+    "Ã": 53,
+    "Æ": 54,
+    "Å": 55,
+    "Ý": 56,
+    "Ò": 57,
+    "Ð": 58,
+    "Ì": 59,
+    "Õ": 60,
+    "Þ": 61,
+    "Г": 62,
+    "А": 63,
+    "Е": 64,
+    "І": 65,
+    "Ј": 66,
+    "З": 67,
+    "И": 68,
+    "К": 69,
+    "М": 70,
+    "Н": 71,
+    "П": 72,
+    "Р": 73,
+    "Э": 74,
+    "Ҫ": 75,
+    "madeupword0000": 76,
+    "madeupword0001": 77,
+    "madeupword0002": 78,
+    "madeupword0003": 79
+}