flaubert committed on
Commit
529cde1
·
verified ·
1 Parent(s): 9a4a7a8

Upload folder using huggingface_hub

Browse files
.DS_Store ADDED
Binary file (6.15 kB). View file
 
config.json CHANGED
@@ -3,17 +3,25 @@
3
  "activation_dropout": 0.0,
4
  "add_cross_attention": false,
5
  "architectures": [
6
- "Data2Vec2MultiModel"
7
  ],
8
  "attention_dropout": 0.1,
9
  "auto_map": {
10
- "AutoConfig": "configuration_data2vec2.Data2Vec2MultiConfig",
11
- "AutoModel": "modeling_data2vec2.Data2Vec2MultiModel"
 
 
 
 
 
 
 
12
  },
13
  "bad_words_ids": null,
14
  "begin_suppress_tokens": null,
15
  "bos_token_id": null,
16
  "chunk_size_feed_forward": 0,
 
17
  "clone_batch": 12,
18
  "cross_attention_hidden_size": null,
19
  "decoder_start_token_id": null,
@@ -30,6 +38,7 @@
30
  "end_of_block_targets": false,
31
  "eos_token_id": null,
32
  "exponential_decay_length_penalty": null,
 
33
  "finetuning_task": null,
34
  "forced_bos_token_id": null,
35
  "forced_eos_token_id": null,
@@ -57,6 +66,9 @@
57
  "architectures": null,
58
  "audio": {
59
  "_name_or_path": "",
 
 
 
60
  "add_cross_attention": false,
61
  "add_masks": false,
62
  "alibi_max_pos": null,
@@ -66,11 +78,14 @@
66
  "begin_suppress_tokens": null,
67
  "bos_token_id": null,
68
  "chunk_size_feed_forward": 0,
 
69
  "conv_pos_depth": 5,
70
  "conv_pos_groups": 16,
71
  "conv_pos_pre_ln": false,
72
  "conv_pos_width": 95,
73
  "cross_attention_hidden_size": null,
 
 
74
  "decoder_start_token_id": null,
75
  "diversity_penalty": 0.0,
76
  "do_sample": false,
@@ -108,22 +123,30 @@
108
  "mask_channel_length": 64,
109
  "mask_channel_prob": 0.0,
110
  "mask_dropout": 0.0,
 
 
 
111
  "mask_length": 5,
112
  "mask_noise_std": 0.01,
113
  "mask_prob": 0.55,
114
  "mask_prob_adjust": 0.1,
115
  "mask_prob_min": null,
 
 
 
116
  "max_length": 20,
117
  "min_length": 0,
118
  "model_depth": 16,
119
  "model_type": "",
120
  "no_repeat_ngram_size": 0,
 
121
  "num_alibi_heads": 16,
122
  "num_beam_groups": 1,
123
  "num_beams": 1,
124
  "num_extra_tokens": 0,
125
  "num_return_sequences": 1,
126
  "output_attentions": false,
 
127
  "output_hidden_states": false,
128
  "output_scores": false,
129
  "pad_token_id": null,
@@ -142,6 +165,27 @@
142
  "start_drop_path_rate": 0.0,
143
  "suppress_tokens": null,
144
  "task_specific_params": null,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
  "temperature": 1.0,
146
  "tie_encoder_decoder": false,
147
  "tie_word_embeddings": true,
@@ -151,7 +195,10 @@
151
  "torchscript": false,
152
  "type": "AUDIO",
153
  "typical_p": 1.0,
154
- "use_alibi_encoder": true
 
 
 
155
  },
156
  "bad_words_ids": null,
157
  "begin_suppress_tokens": null,
@@ -310,7 +357,7 @@
310
  "torchscript": false,
311
  "typical_p": 1.0
312
  },
313
- "model_type": "data2vec2",
314
  "n_layers": 12,
315
  "no_repeat_ngram_size": 0,
316
  "norm_affine": true,
 
3
  "activation_dropout": 0.0,
4
  "add_cross_attention": false,
5
  "architectures": [
6
+ "PantagruelUniModel"
7
  ],
8
  "attention_dropout": 0.1,
9
  "auto_map": {
10
+ "AutoConfig": "configuration_pantagruel_uni.PantagruelUniConfig",
11
+ "AutoModel": "modeling_pantagruel_uni.PantagruelUniModel",
12
+ "AutoModelForAudioFrameClassification": "modeling_pantagruel_uni.PantagruelUniForAudioFrameClassification",
13
+ "AutoModelForCTC": "modeling_pantagruel_uni.PantagruelUniForCTC",
14
+ "AutoModelForMaskedLM": "modeling_pantagruel_uni.PantagruelUniForMaskedLM",
15
+ "AutoModelForMultipleChoice": "modeling_pantagruel_uni.PantagruelUniForMultipleChoice",
16
+ "AutoModelForQuestionAnswering": "modeling_pantagruel_uni.PantagruelUniForQuestionAnswering",
17
+ "AutoModelForSequenceClassification": "modeling_pantagruel_uni.PantagruelUniForSequenceClassification",
18
+ "AutoModelForTokenClassification": "modeling_pantagruel_uni.PantagruelUniForTokenClassification"
19
  },
20
  "bad_words_ids": null,
21
  "begin_suppress_tokens": null,
22
  "bos_token_id": null,
23
  "chunk_size_feed_forward": 0,
24
+ "classifier_dropout": null,
25
  "clone_batch": 12,
26
  "cross_attention_hidden_size": null,
27
  "decoder_start_token_id": null,
 
38
  "end_of_block_targets": false,
39
  "eos_token_id": null,
40
  "exponential_decay_length_penalty": null,
41
+ "final_dropout": 0.1,
42
  "finetuning_task": null,
43
  "forced_bos_token_id": null,
44
  "forced_eos_token_id": null,
 
66
  "architectures": null,
67
  "audio": {
68
  "_name_or_path": "",
69
+ "adapter_kernel_size": 3,
70
+ "adapter_stride": 2,
71
+ "add_adapter": false,
72
  "add_cross_attention": false,
73
  "add_masks": false,
74
  "alibi_max_pos": null,
 
78
  "begin_suppress_tokens": null,
79
  "bos_token_id": null,
80
  "chunk_size_feed_forward": 0,
81
+ "classifier_proj_size": 256,
82
  "conv_pos_depth": 5,
83
  "conv_pos_groups": 16,
84
  "conv_pos_pre_ln": false,
85
  "conv_pos_width": 95,
86
  "cross_attention_hidden_size": null,
87
+ "ctc_loss_reduction": "sum",
88
+ "ctc_zero_infinity": false,
89
  "decoder_start_token_id": null,
90
  "diversity_penalty": 0.0,
91
  "do_sample": false,
 
123
  "mask_channel_length": 64,
124
  "mask_channel_prob": 0.0,
125
  "mask_dropout": 0.0,
126
+ "mask_feature_length": 10,
127
+ "mask_feature_min_masks": 0,
128
+ "mask_feature_prob": 0.0,
129
  "mask_length": 5,
130
  "mask_noise_std": 0.01,
131
  "mask_prob": 0.55,
132
  "mask_prob_adjust": 0.1,
133
  "mask_prob_min": null,
134
+ "mask_time_length": 10,
135
+ "mask_time_min_masks": 2,
136
+ "mask_time_prob": 0.05,
137
  "max_length": 20,
138
  "min_length": 0,
139
  "model_depth": 16,
140
  "model_type": "",
141
  "no_repeat_ngram_size": 0,
142
+ "num_adapter_layers": 3,
143
  "num_alibi_heads": 16,
144
  "num_beam_groups": 1,
145
  "num_beams": 1,
146
  "num_extra_tokens": 0,
147
  "num_return_sequences": 1,
148
  "output_attentions": false,
149
+ "output_hidden_size": null,
150
  "output_hidden_states": false,
151
  "output_scores": false,
152
  "pad_token_id": null,
 
165
  "start_drop_path_rate": 0.0,
166
  "suppress_tokens": null,
167
  "task_specific_params": null,
168
+ "tdnn_dilation": [
169
+ 1,
170
+ 2,
171
+ 3,
172
+ 1,
173
+ 1
174
+ ],
175
+ "tdnn_dim": [
176
+ 512,
177
+ 512,
178
+ 512,
179
+ 512,
180
+ 1500
181
+ ],
182
+ "tdnn_kernel": [
183
+ 5,
184
+ 3,
185
+ 3,
186
+ 1,
187
+ 1
188
+ ],
189
  "temperature": 1.0,
190
  "tie_encoder_decoder": false,
191
  "tie_word_embeddings": true,
 
195
  "torchscript": false,
196
  "type": "AUDIO",
197
  "typical_p": 1.0,
198
+ "use_alibi_encoder": true,
199
+ "use_weighted_layer_sum": false,
200
+ "vocab_size": 80,
201
+ "xvector_output_dim": 512
202
  },
203
  "bad_words_ids": null,
204
  "begin_suppress_tokens": null,
 
357
  "torchscript": false,
358
  "typical_p": 1.0
359
  },
360
+ "model_type": "pantagruel_uni",
361
  "n_layers": 12,
362
  "no_repeat_ngram_size": 0,
363
  "norm_affine": true,
configuration_pantagruel_uni.py ADDED
@@ -0,0 +1,488 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ #
3
+ # Copyright (c) Facebook, Inc. and its affiliates.
4
+ #
5
+ # This source code is licensed under the MIT license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+ #
8
+ #
9
+ # Copyright 2022 The HuggingFace Inc. team. All rights reserved.
10
+ #
11
+ # Licensed under the Apache License, Version 2.0 (the "License");
12
+ # you may not use this file except in compliance with the License.
13
+ # You may obtain a copy of the License at
14
+ #
15
+ # http://www.apache.org/licenses/LICENSE-2.0
16
+ #
17
+ # Unless required by applicable law or agreed to in writing, software
18
+ # distributed under the License is distributed on an "AS IS" BASIS,
19
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20
+ # See the License for the specific language governing permissions and
21
+ # limitations under the License.
22
+
23
+
24
+ """ Pantagruel unimodal configuration"""
25
+
26
+ import os
27
+ from typing import Union, Dict, Any, Optional
28
+ from transformers.dynamic_module_utils import custom_object_save
29
+ from transformers.utils import logging
30
+ from transformers.configuration_utils import PretrainedConfig, CONFIG_NAME
31
+
32
+
33
+ logger = logging.get_logger(__name__)
34
+
35
+
36
class MyPretrainedConfig(PretrainedConfig):
    """`PretrainedConfig` variant that saves and loads nested config objects in full.

    Differences from the parent class that are visible in this file:

    * JSON serialization defaults to ``use_diff=False`` so nested configs are
      written out completely.
    * ``update`` and ``from_dict`` recurse into attributes that are themselves
      ``MyPretrainedConfig`` instances instead of replacing them with raw dicts.
    * Keys that are not already attributes of the config are silently ignored.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def to_json_string(self, use_diff: bool = False) -> str:
        """Serialize to JSON; unlike the parent, the full config is emitted by default."""
        return super().to_json_string(use_diff)

    def update(self, config_dict):
        """Copy known keys of `config_dict` onto this config, recursing into nested configs.

        Args:
            config_dict (`Dict[str, Any]`):
                Values to apply. Keys that are not existing attributes are skipped.
                When the current attribute is a `MyPretrainedConfig` and the new
                value is a dict, the nested config is updated in place; any other
                value replaces the attribute.
        """
        for key, value in config_dict.items():
            if not hasattr(self, key):
                continue
            current = getattr(self, key)
            # Only a dict can be merged into a nested config; anything else
            # (e.g. an already-built config object) replaces the attribute.
            if isinstance(current, MyPretrainedConfig) and isinstance(value, dict):
                current.update(value)
            else:
                setattr(self, key, value)

    # Copied from the parent class, only changed use_diff from True to False to correctly save nested config class
    def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs):
        """
        Save a configuration object to the directory `save_directory`, so that it can be re-loaded using the
        [`~PretrainedConfig.from_pretrained`] class method.

        Args:
            save_directory (`str` or `os.PathLike`):
                Directory where the configuration JSON file will be saved (will be created if it does not exist).
            push_to_hub (`bool`, *optional*, defaults to `False`):
                Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
                repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
                namespace).
            kwargs (`Dict[str, Any]`, *optional*):
                Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.

        Raises:
            AssertionError: If `save_directory` points to an existing file.
        """
        self._set_token_in_kwargs(kwargs)

        if os.path.isfile(save_directory):
            raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")

        non_default_generation_parameters = {}
        for parameter_name, default_value in self._get_global_generation_defaults().items():
            if hasattr(self, parameter_name) and getattr(self, parameter_name) != default_value:
                non_default_generation_parameters[parameter_name] = getattr(self, parameter_name)
        if len(non_default_generation_parameters) > 0:
            logger.warning(
                "Some non-default generation parameters are set in the model config. These should go into a "
                "GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) "
                "instead. This warning will be raised to an exception in v4.41.\n"
                f"Non-default generation parameters: {str(non_default_generation_parameters)}"
            )

        os.makedirs(save_directory, exist_ok=True)

        if push_to_hub:
            commit_message = kwargs.pop("commit_message", None)
            # os.fspath makes the default repo name work for pathlib.Path inputs too.
            default_repo_id = os.fspath(save_directory).split(os.path.sep)[-1]
            repo_id = kwargs.pop("repo_id", default_repo_id)
            repo_id = self._create_repo(repo_id, **kwargs)
            files_timestamps = self._get_files_timestamps(save_directory)

        # If we have a custom config, we copy the file defining it in the folder and set the attributes so it can be
        # loaded from the Hub.
        if self._auto_class is not None:
            custom_object_save(self, save_directory, config=self)

        # If we save using the predefined names, we can load using `from_pretrained`
        output_config_file = os.path.join(save_directory, CONFIG_NAME)

        self.to_json_file(output_config_file, use_diff=False)
        logger.info(f"Configuration saved in {output_config_file}")

        if push_to_hub:
            self._upload_modified_files(
                save_directory,
                repo_id,
                files_timestamps,
                commit_message=commit_message,
                token=kwargs.get("token"),
            )

    # Copied from the parent class, change the instantiation and updating of class from config_dict to correctly load nested config
    @classmethod
    def from_dict(cls, config_dict: Dict[str, Any], **kwargs) -> "MyPretrainedConfig":
        """
        Instantiates a [`PretrainedConfig`] from a Python dictionary of parameters.

        The config is first built with class defaults and then updated key by key
        (see `update`), so nested configs keep their type.

        Args:
            config_dict (`Dict[str, Any]`):
                Dictionary that will be used to instantiate the configuration object. Such a dictionary can be
                retrieved from a pretrained checkpoint by leveraging the [`~PretrainedConfig.get_config_dict`] method.
                It is not modified.
            kwargs (`Dict[str, Any]`):
                Additional parameters from which to initialize the configuration object.

        Returns:
            [`PretrainedConfig`]: The configuration object instantiated from those parameters. When
            `return_unused_kwargs=True` is passed, a `(config, unused_kwargs)` tuple is returned instead.

        Raises:
            ValueError: If `num_labels` and `id2label` are both passed and disagree in size.
        """
        return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
        # Those arguments may be passed along for our internal telemetry.
        # We remove them so they don't appear in `return_unused_kwargs`.
        kwargs.pop("_from_auto", None)
        kwargs.pop("_from_pipeline", None)
        # The commit hash might have been updated in the `config_dict`, we don't want the kwargs to erase that update.
        if "_commit_hash" in kwargs and "_commit_hash" in config_dict:
            kwargs["_commit_hash"] = config_dict["_commit_hash"]

        # Work on a shallow copy so the caller's dictionary is left untouched.
        config_dict = dict(config_dict)
        # We remove it from kwargs so that it does not appear in `return_unused_kwargs`.
        config_dict["attn_implementation"] = kwargs.pop("attn_implementation", None)

        # Build with defaults, then apply the stored values (recursing into nested configs).
        config = cls()
        config.update(config_dict)

        if hasattr(config, "pruned_heads"):
            config.pruned_heads = {int(key): value for key, value in config.pruned_heads.items()}

        # Update config with kwargs if needed
        if "num_labels" in kwargs and "id2label" in kwargs:
            num_labels = kwargs["num_labels"]
            id2label = kwargs["id2label"] if kwargs["id2label"] is not None else []
            if len(id2label) != num_labels:
                raise ValueError(
                    f"You passed along `num_labels={num_labels }` with an incompatible id to label map: "
                    f"{kwargs['id2label']}. Since those arguments are inconsistent with each other, you should remove "
                    "one of them."
                )
        to_remove = []
        for key, value in kwargs.items():
            if hasattr(config, key):
                current_attr = getattr(config, key)
                # To authorize passing a custom subconfig as kwarg in models that have nested configs.
                if isinstance(current_attr, PretrainedConfig) and isinstance(value, dict):
                    value = current_attr.__class__(**value)
                setattr(config, key, value)
                if key != "torch_dtype":
                    to_remove.append(key)
        for key in to_remove:
            kwargs.pop(key, None)

        logger.info(f"Model config {config}")
        if return_unused_kwargs:
            return config, kwargs
        return config
184
+
185
+
186
class PantagruelModalityConfig(MyPretrainedConfig):
    """
    Configuration including common args to both speech and text modality.

    Every named argument except `ema_local_encoder` and `decoder` is stored
    unchanged as an attribute of the same name; remaining keyword arguments are
    forwarded to the parent config.

    NOTE(review): `ema_local_encoder` and `decoder` are accepted but never
    stored, so they are dropped on save/load. This looks like it may be
    intentional (inference-only checkpoints) -- confirm against the model code
    before changing it.
    """

    def __init__(
        self,
        type="AUDIO",  # shadows the builtin; kept because it is part of the public signature
        prenet_depth=4,
        prenet_layerdrop=0,
        prenet_dropout=0.0,
        start_drop_path_rate=0.0,
        end_drop_path_rate=0.0,
        num_extra_tokens=0,
        init_extra_token_zero=True,
        mask_noise_std=0.01,
        mask_prob_min=None,
        mask_prob=0.7,
        inverse_mask=False,
        mask_prob_adjust=0.0,
        keep_masked_pct=0.0,
        mask_length=5,
        add_masks=False,
        remove_masks=False,
        mask_dropout=0.0,
        encoder_zero_mask=True,
        mask_channel_prob=0.0,
        mask_channel_length=64,
        local_grad_mult=1.0,
        use_alibi_encoder=False,
        alibi_scale=1.0,
        learned_alibi=False,
        alibi_max_pos=None,
        learned_alibi_scale=False,
        learned_alibi_scale_per_head=False,
        learned_alibi_scale_per_layer=False,
        num_alibi_heads=12,
        model_depth=12,
        ema_local_encoder=False,
        decoder=None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.type = type

        # Pre-net and stochastic-depth settings.
        self.prenet_depth = prenet_depth
        self.prenet_layerdrop = prenet_layerdrop
        self.prenet_dropout = prenet_dropout
        self.start_drop_path_rate = start_drop_path_rate
        self.end_drop_path_rate = end_drop_path_rate
        self.num_extra_tokens = num_extra_tokens
        self.init_extra_token_zero = init_extra_token_zero

        # Masking settings.
        self.mask_noise_std = mask_noise_std
        self.mask_prob_min = mask_prob_min
        self.mask_prob = mask_prob
        self.inverse_mask = inverse_mask
        self.mask_prob_adjust = mask_prob_adjust
        self.keep_masked_pct = keep_masked_pct
        self.mask_length = mask_length
        self.add_masks = add_masks
        self.remove_masks = remove_masks
        self.mask_dropout = mask_dropout
        self.encoder_zero_mask = encoder_zero_mask
        self.mask_channel_prob = mask_channel_prob
        self.mask_channel_length = mask_channel_length
        self.local_grad_mult = local_grad_mult

        # ALiBi settings.
        self.use_alibi_encoder = use_alibi_encoder
        self.alibi_scale = alibi_scale
        self.learned_alibi = learned_alibi
        self.alibi_max_pos = alibi_max_pos
        self.learned_alibi_scale = learned_alibi_scale
        self.learned_alibi_scale_per_head = learned_alibi_scale_per_head
        self.learned_alibi_scale_per_layer = learned_alibi_scale_per_layer
        self.num_alibi_heads = num_alibi_heads
        self.model_depth = model_depth
259
+
260
+
261
class PantagruelAudioConfig(PantagruelModalityConfig):
    """
    Configuration including args specific to audio-only tasks.

    The modality `type` is always forced to ``"AUDIO"``; a `type` entry in
    `kwargs` (as produced by `to_dict()`) is discarded instead of colliding with
    the forced value.

    NOTE(review): `pad_token_id`, `bos_token_id` and `eos_token_id` are accepted
    here but are neither stored nor forwarded to the parent, so the values
    passed in are ignored. Left unchanged because the consuming model code is
    not visible -- confirm whether they should be forwarded.
    """

    def __init__(
        self,
        vocab_size=80,
        extractor_mode="layer_norm",
        feature_encoder_spec="[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]",
        conv_pos_width=95,
        conv_pos_groups=16,
        conv_pos_depth=5,
        conv_pos_pre_ln=False,
        mask_time_prob=0.05,
        mask_time_length=10,
        mask_time_min_masks=2,
        mask_feature_prob=0.0,
        mask_feature_length=10,
        mask_feature_min_masks=0,
        ctc_loss_reduction="sum",
        ctc_zero_infinity=False,
        use_weighted_layer_sum=False,
        classifier_proj_size=256,
        tdnn_dim=(512, 512, 512, 512, 1500),
        tdnn_kernel=(5, 3, 3, 1, 1),
        tdnn_dilation=(1, 2, 3, 1, 1),
        xvector_output_dim=512,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        add_adapter=False,
        adapter_kernel_size=3,
        adapter_stride=2,
        num_adapter_layers=3,
        output_hidden_size=None,
        **kwargs,
    ):
        # A serialized config carries its own "type" key; passing it through
        # together with the explicit type="AUDIO" would raise
        # "got multiple values for keyword argument 'type'".
        kwargs.pop("type", None)
        super().__init__(type="AUDIO", **kwargs)

        # Feature extractor / convolutional positional encoder.
        self.extractor_mode = extractor_mode
        self.feature_encoder_spec = feature_encoder_spec
        self.conv_pos_width = conv_pos_width
        self.conv_pos_groups = conv_pos_groups
        self.conv_pos_depth = conv_pos_depth
        self.conv_pos_pre_ln = conv_pos_pre_ln

        self.vocab_size = vocab_size
        self.use_weighted_layer_sum = use_weighted_layer_sum

        # fine-tuning config parameters for SpecAugment: https://huggingface.co/papers/1904.08779
        self.mask_time_prob = mask_time_prob
        self.mask_time_length = mask_time_length
        self.mask_time_min_masks = mask_time_min_masks
        self.mask_feature_prob = mask_feature_prob
        self.mask_feature_length = mask_feature_length
        self.mask_feature_min_masks = mask_feature_min_masks

        # ctc loss
        self.ctc_loss_reduction = ctc_loss_reduction
        self.ctc_zero_infinity = ctc_zero_infinity

        # adapter
        self.add_adapter = add_adapter
        self.adapter_kernel_size = adapter_kernel_size
        self.adapter_stride = adapter_stride
        self.num_adapter_layers = num_adapter_layers
        self.output_hidden_size = output_hidden_size

        # SequenceClassification-specific parameter. Feel free to ignore for other classes.
        self.classifier_proj_size = classifier_proj_size

        # XVector-specific parameters. Feel free to ignore for other classes.
        # Stored as lists (copies), whatever sequence type was passed in.
        self.tdnn_dim = list(tdnn_dim)
        self.tdnn_kernel = list(tdnn_kernel)
        self.tdnn_dilation = list(tdnn_dilation)
        self.xvector_output_dim = xvector_output_dim
336
+
337
+
338
class PantagruelTextConfig(PantagruelModalityConfig):
    """
    Configuration including args specific to text-only tasks.

    The modality `type` is always forced to ``"TEXT"``; a `type` entry in
    `kwargs` (as produced by `to_dict()`) is discarded instead of colliding with
    the forced value. All named arguments are stored as attributes of the same
    name after the parent constructor has run.
    """

    def __init__(
        self,
        vocab_size=50000,
        unk_token_id=3,
        bos_token_id=0,
        eos_token_id=2,
        pad_token_id=1,
        max_source_positions=512,
        learned_pos=True,
        dropout=0.1,
        no_scale_embedding=True,
        layernorm_embedding=True,
        no_token_positional_embeddings=False,
        **kwargs,
    ):
        # A serialized config carries its own "type" key; passing it through
        # together with the explicit type="TEXT" would raise
        # "got multiple values for keyword argument 'type'".
        kwargs.pop("type", None)
        super().__init__(type="TEXT", **kwargs)

        # Vocabulary and special token ids (set after the parent constructor,
        # so these values win over the parent's defaults).
        self.vocab_size = vocab_size
        self.unk_token_id = unk_token_id
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.pad_token_id = pad_token_id

        # Embedding / positional settings.
        self.max_source_positions = max_source_positions
        self.learned_pos = learned_pos
        self.dropout = dropout
        self.no_scale_embedding = no_scale_embedding
        self.layernorm_embedding = layernorm_embedding
        self.no_token_positional_embeddings = no_token_positional_embeddings
369
+
370
+
371
class PantagruelModalitiesConfig(MyPretrainedConfig):
    """
    Container class for both audio and text modality configurations.

    Args:
        audio_config (`PantagruelAudioConfig`, *optional*):
            Audio modality config, exposed as `self.audio`. A fresh default
            instance is created when omitted.
        text_config (`PantagruelTextConfig`, *optional*):
            Text modality config, exposed as `self.text`. A fresh default
            instance is created when omitted.
    """

    def __init__(
        self,
        audio_config=None,
        text_config=None,
        **kwargs
    ):
        super().__init__(**kwargs)
        # Defaults are built per instance. Building them in the signature would
        # create a single object shared by every container, and the in-place
        # nested `update()` used when loading would then leak one checkpoint's
        # settings into all other configs in the process.
        self.audio = PantagruelAudioConfig() if audio_config is None else audio_config
        self.text = PantagruelTextConfig() if text_config is None else text_config
384
+
385
+
386
class PantagruelUniConfig(MyPretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`PantagruelUniModel`].
    It is used to instantiate a PantagruelUniModel model according to the specified arguments,
    defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to
    control the model outputs. Read the documentation from [`PretrainedConfig`] for more information.

    Args:
        depth (`int`, *optional*, defaults to 12):
            Number of Transformer layers in the encoder. Also exposed as
            `num_layers`, `n_layers` and `num_hidden_layers`.
        embed_dim (`int`, *optional*, defaults to 768):
            Stored as `embed_dim` and mirrored as `hidden_size`.
        modalities (`PantagruelModalitiesConfig`, *optional*):
            Per-modality configuration. A fresh default instance is created when omitted.
        supported_modality (`str`, *optional*, defaults to `"AUDIO"`):
            Stored unchanged as `supported_modality`.
        classifier_dropout (`float`, *optional*):
            Stored unchanged as `classifier_dropout`.

    All other named arguments are stored unchanged as attributes of the same name.

    Example:

    ```python
    >>> from configuration_pantagruel_uni import PantagruelUniConfig
    >>> from modeling_pantagruel_uni import PantagruelUniModel

    >>> # Initializing a PantagruelUniConfig for audio
    >>> configuration = PantagruelUniConfig()

    >>> # Initializing a model (with random weights) with the configuration
    >>> model = PantagruelUniModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """

    model_type = "pantagruel_uni"

    def __init__(
        self,
        depth=12,
        start_drop_path_rate=0.0,
        end_drop_path_rate=0.0,
        num_heads=12,
        norm_eps=1e-5,
        norm_affine=True,
        encoder_dropout=0.1,
        post_mlp_drop=0.1,
        attention_dropout=0.1,
        activation_dropout=0.0,
        dropout_input=0.0,
        final_dropout=0.1,
        layerdrop=0.0,
        embed_dim=768,
        mlp_ratio=4.0,
        layer_norm_first=False,
        end_of_block_targets=False,
        clone_batch=1,
        log_norms=True,
        modalities=None,
        supported_modality="AUDIO",
        classifier_dropout=None,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.depth = depth
        self.start_drop_path_rate = start_drop_path_rate
        self.end_drop_path_rate = end_drop_path_rate

        self.num_heads = num_heads
        self.norm_eps = norm_eps
        self.norm_affine = norm_affine
        self.post_mlp_drop = post_mlp_drop
        self.encoder_dropout = encoder_dropout
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.dropout_input = dropout_input
        self.final_dropout = final_dropout
        self.layerdrop = layerdrop
        self.embed_dim = embed_dim
        self.mlp_ratio = mlp_ratio

        self.layer_norm_first = layer_norm_first
        self.end_of_block_targets = end_of_block_targets
        self.clone_batch = clone_batch
        self.log_norms = log_norms

        # Built per instance: a default created in the signature would be one
        # object shared by every config, and loading a checkpoint updates the
        # nested config in place.
        self.modalities = PantagruelModalitiesConfig() if modalities is None else modalities
        self.supported_modality = supported_modality

        # Attributes for hopsparser
        self.hidden_size = embed_dim
        self.num_layers = depth
        self.n_layers = depth
        self.num_hidden_layers = depth

        self.classifier_dropout = classifier_dropout

        # Maps the Auto* classes to the custom code files shipped with the checkpoint.
        self.auto_map = {
            'AutoConfig': 'configuration_pantagruel_uni.PantagruelUniConfig',
            'AutoModel': 'modeling_pantagruel_uni.PantagruelUniModel',
            'AutoModelForMaskedLM': 'modeling_pantagruel_uni.PantagruelUniForMaskedLM',
            'AutoModelForSequenceClassification': 'modeling_pantagruel_uni.PantagruelUniForSequenceClassification',
            'AutoModelForMultipleChoice': 'modeling_pantagruel_uni.PantagruelUniForMultipleChoice',
            'AutoModelForTokenClassification': 'modeling_pantagruel_uni.PantagruelUniForTokenClassification',
            'AutoModelForQuestionAnswering': 'modeling_pantagruel_uni.PantagruelUniForQuestionAnswering',
            'AutoModelForAudioFrameClassification': 'modeling_pantagruel_uni.PantagruelUniForAudioFrameClassification',
            'AutoModelForCTC': 'modeling_pantagruel_uni.PantagruelUniForCTC',
        }
modeling_pantagruel_uni.py ADDED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0,
7
+ "return_attention_mask": true,
8
+ "sampling_rate": 16000
9
+ }
utils_pantagruel_uni.py ADDED
@@ -0,0 +1,439 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ #
3
+ # Copyright (c) Facebook, Inc. and its affiliates.
4
+ #
5
+ # This source code is licensed under the MIT license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+ #
8
+
9
+ import math
10
+ import numpy as np
11
+ from collections import namedtuple
12
+ from typing import Optional, Tuple
13
+
14
+ import torch
15
+ import torch.nn.functional as F
16
+
17
+
18
# Inputs used by `random_masking` to derive a reproducible RNG seed:
# `seed` and `update` are hashed together with `ids.sum()`.
MaskSeed = namedtuple("MaskSeed", ["seed", "update", "ids"])
# Result of `random_masking`:
#   x_unmasked  - the kept positions gathered from the input
#   mask        - per-position binary mask (0 = keep, 1 = remove)
#   ids_restore - indices that undo the shuffle, expanded over the feature dim
#   ids_keep    - indices of the kept positions, expanded over the feature dim
MaskInfo = namedtuple("MaskInfo", ["x_unmasked", "mask", "ids_restore", "ids_keep"])
20
+
21
+
22
def gather_unmasked(x: torch.Tensor, mask_info: MaskInfo) -> torch.Tensor:
    """Select the kept positions of `x` along dim 1.

    Args:
        x: Tensor indexed by `mask_info.ids_keep` along dim 1; per
            `torch.gather`, it must have the same number of dimensions as the
            index.
        mask_info: Masking result whose `ids_keep` holds the indices to keep.

    Returns:
        A tensor with the shape of `mask_info.ids_keep`.
    """
    return torch.gather(x, dim=1, index=mask_info.ids_keep)
28
+
29
+
30
def gather_unmasked_mask(x: torch.Tensor, mask_info: MaskInfo) -> torch.Tensor:
    """Select the kept positions of a mask-like tensor without a feature dimension.

    Args:
        x: Tensor indexed along dim 1 by the kept-position indices.
        mask_info: Masking result; only the first slice of the last dimension
            of `ids_keep` is used as the index.

    Returns:
        A tensor with the shape of `mask_info.ids_keep[..., 0]`.
    """
    keep_index = mask_info.ids_keep[..., 0]  # ignore the feature dimension
    return torch.gather(x, dim=1, index=keep_index)
36
+
37
+
38
def masked_alibi(alibi_bias, mask_info):
    """Restrict an ALiBi bias to the kept (unmasked) positions.

    The rows and then the columns of the last two dimensions are gathered with
    the kept indices, so entry ``[b, h, i, j]`` of the result is
    ``alibi_bias[b, h, keep[b, i], keep[b, j]]``.

    Args:
        alibi_bias: 4-D tensor; dim 1 is read as the head count and the last two
            dims are indexed by position.
        mask_info: Masking result. `ids_keep[..., 0]` supplies the kept indices
            and `ids_restore.size(1)` the original number of positions.

    Returns:
        The bias reduced to kept rows and columns.
    """
    num_heads = alibi_bias.size(1)
    full_len = mask_info.ids_restore.size(1)

    # (B, K, D) -> (B, 1, K, 1): drop the feature dim, add head and column dims.
    keep = mask_info.ids_keep.unsqueeze(1)[..., 0].unsqueeze(-1)

    # First pick the kept rows ...
    rows_kept = torch.gather(
        alibi_bias,
        dim=-2,
        index=keep.expand(-1, num_heads, -1, full_len),
    )
    # ... then the kept columns of those rows.
    return torch.gather(
        rows_kept,
        dim=-1,
        index=keep.transpose(-1, -2).expand(-1, num_heads, rows_kept.size(-2), -1),
    )
56
+
57
+
58
def random_masking(x, mask_ratio, mask_seed: Optional[MaskSeed]):
    """Randomly keep a fraction of positions per sample by sorting uniform noise.

    Args:
        x: Tensor of shape (batch, length, dim).
        mask_ratio: Fraction of positions to remove; ``int(L * (1 - mask_ratio))``
            positions are kept.
        mask_seed: When given, a generator is seeded from
            ``(seed, update, ids.sum())`` so the noise is reproducible;
            when ``None`` the default RNG is used.

    Returns:
        MaskInfo with the kept tokens (`x_unmasked`), the binary `mask`
        (0 = keep, 1 = remove) in original order, and `ids_restore` / `ids_keep`
        expanded over the feature dimension.
    """
    N, L, D = x.shape  # batch, length, dim
    len_keep = int(L * (1 - mask_ratio))

    generator = None
    if mask_seed is not None:
        seed = int(
            hash((mask_seed.seed, mask_seed.update, mask_seed.ids.sum().item())) % 1e6
        )
        generator = torch.Generator(device=x.device)
        generator.manual_seed(seed)

    noise = torch.rand(N, L, generator=generator, device=x.device)  # noise in [0, 1]

    # sort noise for each sample
    ids_shuffle = noise.argsort(dim=1)  # ascend: small is keep, large is remove
    ids_restore = ids_shuffle.argsort(dim=1)

    # keep the first subset
    ids_keep = ids_shuffle[:, :len_keep]
    ids_keep = ids_keep.unsqueeze(-1).expand(-1, -1, D)
    x_unmasked = torch.gather(x, dim=1, index=ids_keep)

    # generate the binary mask: 0 is keep, 1 is remove
    mask = torch.ones([N, L], dtype=x.dtype, device=x.device)
    mask[:, :len_keep] = 0
    # unshuffle to get the binary mask
    mask = torch.gather(mask, dim=1, index=ids_restore)

    ids_restore = ids_restore.unsqueeze(-1).expand(-1, -1, D)

    return MaskInfo(
        x_unmasked=x_unmasked, mask=mask, ids_restore=ids_restore, ids_keep=ids_keep
    )
92
+
93
+
94
def get_alibi(
    max_positions: int,
    attention_heads: int,
    dims: int = 1,
    distance: str = "manhattan",
):
    """Build symmetric ALiBi biases of shape (attention_heads, max_positions, max_positions).

    Each head's bias is ``-slope * distance(i, j)``, with zeros on the diagonal.

    Args:
        max_positions: Number of positions. For ``dims == 2`` it must be a
            perfect square; positions are laid out row-major on a square grid.
        attention_heads: Number of heads; one slope is produced per head.
        dims: 1 for sequence distance ``|i - j|``, 2 for distance on the grid.
        distance: Grid metric for ``dims == 2``: "manhattan" or "euclidean".
            Ignored when ``dims == 1``.

    Returns:
        A float tensor of per-head position biases.

    Raises:
        ValueError: If `dims` is not 1 or 2, or `distance` is not a supported
            metric when ``dims == 2``.
        AssertionError: If ``dims == 2`` and `max_positions` is not a perfect square.
    """

    def get_slopes(n):
        def get_slopes_power_of_2(n):
            start = 2 ** (-(2 ** -(math.log2(n) - 3)))
            ratio = start
            return [start * ratio**i for i in range(n)]

        # In the paper, we only train models that have 2^a heads for some
        # a. This function has some good properties that only occur when
        # the input is a power of 2. To maintain that even when the number
        # of heads is not a power of 2, we use this workaround.
        if math.log2(n).is_integer():
            return get_slopes_power_of_2(n)
        closest_power_of_2 = 2 ** math.floor(math.log2(n))
        return (
            get_slopes_power_of_2(closest_power_of_2)
            + get_slopes(2 * closest_power_of_2)[0::2][: n - closest_power_of_2]
        )

    slopes = torch.Tensor(get_slopes(attention_heads))

    if dims == 1:
        # Prepare the ALiBi position linear bias. Note that wav2vec2 is a
        # non-autoregressive model, so we want a symmetric mask with 0 on the
        # diagonal and otherwise linearly decreasing values.
        positions = torch.arange(max_positions)
        pos_bias = torch.abs(positions.unsqueeze(0) - positions.unsqueeze(1)) * -1
    elif dims == 2:
        # Fail early with a clear message instead of hitting an unbound
        # distance function further down.
        if distance not in ("manhattan", "euclidean"):
            raise ValueError(f"unsupported alibi distance: {distance}")

        n = math.sqrt(max_positions)
        assert n.is_integer(), n
        n = int(n)

        # Flat position p sits at grid cell (p // n, p % n).
        axis = torch.arange(n)
        rows = axis.repeat_interleave(n)
        cols = axis.repeat(n)
        d_row = rows.unsqueeze(1) - rows.unsqueeze(0)
        d_col = cols.unsqueeze(1) - cols.unsqueeze(0)

        if distance == "manhattan":
            dist = (d_row.abs() + d_col.abs()).to(torch.float32)
        else:
            # sqrt in float64, then narrow to float32.
            dist = torch.sqrt((d_row**2 + d_col**2).to(torch.float64)).to(torch.float32)
        pos_bias = -dist
    else:
        raise ValueError(f"unsupported number of alibi dims: {dims}")

    alibi_bias = slopes.unsqueeze(1).unsqueeze(1) * pos_bias.unsqueeze(0).expand(
        attention_heads, -1, -1
    )

    return alibi_bias
161
+
162
+
163
def get_alibi_bias(
    alibi_biases,
    batch_size,
    time_steps,
    heads,
    dtype,
    device,
    dims=1,
    distance="manhattan",
):
    """Return a cached ALiBi bias shaped (batch_size, heads, T, T).

    ``alibi_biases`` is a dict used as a cache, keyed by
    ``"{dims}_{heads}_{distance}"``. The cached buffer is rebuilt through
    ``get_alibi`` whenever it is missing, too small along the first or second
    dimension, or has a different dtype/device than requested; the rebuilt
    buffer never shrinks relative to the previous one. The result is a view
    of the leading ``heads * batch_size`` rows and the first ``time_steps``
    positions of that buffer.
    """
    key = f"{dims}_{heads}_{distance}"
    cached = alibi_biases.get(key, None)
    rows_needed = heads * batch_size

    if cached is None:
        prev_rows, prev_steps = 0, 0
        stale = True
    else:
        prev_rows, prev_steps = cached.size(0), cached.size(1)
        stale = (
            prev_rows < rows_needed
            or prev_steps < time_steps
            or cached.dtype != dtype
            or cached.device != device
        )

    if stale:
        # Grow to cover both the new request and whatever was cached before.
        steps = max(time_steps, prev_steps)
        copies = max(rows_needed, prev_rows) // heads

        per_head = get_alibi(steps, heads, dims=dims, distance=distance)
        cached = per_head.to(dtype=dtype, device=device).repeat(copies, 1, 1)
        alibi_biases[key] = cached

    window = cached[:rows_needed, :time_steps, :time_steps]
    return window.view(batch_size, heads, time_steps, time_steps)
199
+
200
+
201
def is_xla_tensor(tensor):
    """Return True only if ``tensor`` is a torch tensor on an ``xla`` device."""
    if not torch.is_tensor(tensor):
        return False
    return tensor.device.type == "xla"
203
+
204
+
205
def index_put(tensor, indices, value):
    """Write ``value`` into ``tensor`` at the positions selected by ``indices``.

    For non-XLA tensors this is a plain in-place ``tensor[indices] = value``.
    For XLA tensors the indexed assignment is replaced by an arithmetic blend
    ``tensor * ~indices + value * indices`` after broadcasting ``indices`` to
    the shape of ``tensor`` (presumably ``indices`` is a boolean mask there --
    verify against callers). The resulting tensor is returned in both cases.
    """
    if not is_xla_tensor(tensor):
        tensor[indices] = value
        return tensor

    # Append trailing singleton dims until the mask has the tensor's rank.
    extra_dims = max(tensor.dim() - indices.dim(), 0)
    if extra_dims:
        indices = indices[(Ellipsis,) + (None,) * extra_dims]

    if indices.size(-1) < tensor.size(-1):
        indices = indices.expand_as(tensor)

    kept = torch.mul(tensor, ~indices)
    written = torch.mul(value, indices)
    return kept + written
215
+
216
+
217
def compute_mask_indices(
    shape: Tuple[int, int],
    padding_mask: Optional[torch.Tensor],
    mask_prob: float,
    mask_length: int,
    mask_type: str = "static",
    mask_other: float = 0.0,
    min_masks: int = 0,
    no_overlap: bool = False,
    min_space: int = 0,
    require_same_masks: bool = True,
    mask_dropout: float = 0.0,
    add_masks: bool = False,
    seed: Optional[int] = None,
    epoch: Optional[int] = None,
    indices: Optional[torch.Tensor] = None,
    idc_select_ver: int = 1,  # 2 to reproduce mask_tokens_dataset
    num_mask_ver: int = 2,  # 2 to reproduce mask_tokens_dataset
) -> np.ndarray:
    """
    Computes random mask spans for a given shape

    Args:
        shape: the shape for which to compute masks.
            should be of size 2 where first element is batch size and 2nd is timesteps
        padding_mask: optional padding mask of the same size as shape, which will prevent masking padded elements
        mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by
            number of timesteps divided by length of mask span to mask approximately this percentage of all elements.
            however due to overlaps, the actual number will be smaller (unless no_overlap is True)
        mask_length: span length for "static" masks; parameter of the length distribution otherwise
        mask_type: how to compute mask lengths
            static = fixed size
            uniform = sample from uniform distribution [mask_other, mask_length*2]
            normal = sample from normal distribution with mean mask_length and stdev mask_other. mask is min 1 element
            poisson = sample from poisson distribution with lambda = mask length
        mask_other: secondary parameter of the length distribution (see mask_type)
        min_masks: minimum number of masked spans
        no_overlap: if true, will switch to an alternative recursive algorithm that prevents spans from overlapping
        min_space: only used if no_overlap is True, this is how many elements to keep unmasked between spans
        require_same_masks: if true, will randomly drop out masks until same amount of masks remains in each sample
        mask_dropout: randomly dropout this percentage of masks in each example
        add_masks: with require_same_masks, equalize to the largest per-sample count by masking extra random
            elements instead of dropping masks down to the smallest count
        seed, epoch, indices: when all three are given, sample i uses a generator seeded from
            (seed, epoch, indices[i]); otherwise each sample uses a freshly (OS-)seeded generator
        idc_select_ver: 1 draws span starts from [0, sz - min_len), 2 draws them from [0, sz)
        num_mask_ver: 1 uses the global numpy RNG for the probabilistic rounding of the span count
            (shared across the batch when there is no padding mask), 2 uses the per-sample generator

    Returns:
        boolean array of shape (batch, timesteps), True where the element is masked

    Raises:
        ValueError: on an unsupported idc_select_ver / num_mask_ver, when a "static" mask ends up with no
            spans, or when a whole sequence would be masked
        Exception: on an unknown mask_type
    """

    bsz, all_sz = shape
    mask = np.full((bsz, all_sz), False)

    if num_mask_ver == 1:
        all_num_mask = int(
            # add a random number for probabilistic rounding
            mask_prob * all_sz / float(mask_length)
            + np.random.rand()
        )
        all_num_mask = max(min_masks, all_num_mask)

    mask_idcs = []
    for i in range(bsz):
        if seed is not None and epoch is not None and indices is not None:
            seed_i = int(hash((seed, epoch, indices[i].item())) % 1e6)
        else:
            seed_i = None

        rng = np.random.default_rng(seed_i)

        if padding_mask is not None:
            sz = all_sz - padding_mask[i].long().sum().item()
            assert sz >= 0, sz
        else:
            sz = all_sz

        if num_mask_ver == 1:
            if padding_mask is not None:
                num_mask = int(
                    # add a random number for probabilistic rounding
                    mask_prob * sz / float(mask_length)
                    + np.random.rand()
                )
                num_mask = max(min_masks, num_mask)
            else:
                num_mask = all_num_mask
        elif num_mask_ver == 2:
            num_mask = int(
                # add a random number for probabilistic rounding
                mask_prob * sz / float(mask_length)
                + rng.random()
            )
            num_mask = max(min_masks, num_mask)
        else:
            raise ValueError(f"unsupported num_mask_ver: {num_mask_ver}")

        if mask_type == "static":
            lengths = np.full(num_mask, mask_length)
        elif mask_type == "uniform":
            # numpy Generators expose `integers`, not the legacy `randint`.
            lengths = rng.integers(int(mask_other), mask_length * 2 + 1, size=num_mask)
        elif mask_type == "normal":
            lengths = rng.normal(mask_length, mask_other, size=num_mask)
            lengths = [max(1, int(round(x))) for x in lengths]
        elif mask_type == "poisson":
            lengths = rng.poisson(mask_length, size=num_mask)
            lengths = [int(round(x)) for x in lengths]
        else:
            raise Exception("unknown mask selection " + mask_type)

        if sum(lengths) == 0:
            if mask_type == "static":
                raise ValueError("this should never happens")
            else:
                lengths = [min(mask_length, sz - 1)]

        if no_overlap:
            mask_idc = []

            def arrange(s, e, length, keep_length):
                # Place one span of `length` inside [s, e) and return the
                # leftover sub-ranges that can still host another span.
                if e - length > s:
                    span_start = int(rng.integers(s, e - length))
                else:
                    # The part is exactly as long as the span: only one
                    # placement exists (the sampling range would be empty).
                    span_start = s
                mask_idc.extend(span_start + i for i in range(length))

                new_parts = []
                if span_start - s - min_space >= keep_length:
                    new_parts.append((s, span_start - min_space + 1))
                if e - span_start - length - min_space > keep_length:
                    new_parts.append((span_start + length + min_space, e))
                return new_parts

            parts = [(0, sz)]
            min_length = min(lengths)
            for length in sorted(lengths, reverse=True):
                # Weight each part by its size; parts too small for this span
                # get weight 0. (np.int was removed from NumPy; use int64.)
                lens = np.fromiter(
                    (e - s if e - s >= length + min_space else 0 for s, e in parts),
                    np.int64,
                )
                l_sum = np.sum(lens)
                if l_sum == 0:
                    break
                probs = lens / l_sum
                c = rng.choice(len(parts), p=probs)
                s, e = parts.pop(c)
                parts.extend(arrange(s, e, length, min_length))
            # Explicit integer dtype so an empty result is still a valid index.
            mask_idc = np.asarray(mask_idc, dtype=np.int64)
        else:
            if idc_select_ver == 1:
                min_len = min(lengths)
                if sz - min_len <= num_mask:
                    min_len = sz - num_mask - 1
                mask_idc = rng.choice(sz - min_len, num_mask, replace=False)
            elif idc_select_ver == 2:
                mask_idc = rng.choice(sz, num_mask, replace=False)
            else:
                raise ValueError(f"unsupported idc_select_ver: {idc_select_ver}")

            # Expand every span start into the indices it covers. Explicit
            # integer dtype so an empty result is still a valid index.
            mask_idc = np.asarray(
                [
                    start + offset
                    for start, span_len in zip(mask_idc, lengths)
                    for offset in range(span_len)
                ],
                dtype=np.int64,
            )

        mask_idc = np.unique(mask_idc[mask_idc < sz])
        if len(mask_idc) >= sz:
            raise ValueError(
                (
                    f"the entire sequence is masked. "
                    f"sz={sz}; num_masked={len(mask_idc)}; "
                    f"index={indices[i] if indices is not None else None}"
                )
            )
        mask_idcs.append(mask_idc)

    target_len = None
    if require_same_masks:
        if add_masks:
            target_len = max([len(m) for m in mask_idcs])
        else:
            target_len = min([len(m) for m in mask_idcs])

    # NOTE: `rng` below is the generator of the last sample in the batch.
    for i, mask_idc in enumerate(mask_idcs):
        if target_len is not None and len(mask_idc) > target_len:
            mask_idc = rng.choice(mask_idc, target_len, replace=False)

        mask[i, mask_idc] = True

        if target_len is not None and len(mask_idc) < target_len:
            unmasked = np.flatnonzero(~mask[i])
            to_mask = rng.choice(unmasked, target_len - len(mask_idc), replace=False)
            mask[i, to_mask] = True

        if mask_dropout > 0:
            masked = np.flatnonzero(mask[i])
            num_holes = np.rint(len(masked) * mask_dropout).astype(int)
            to_drop = rng.choice(masked, num_holes, replace=False)
            mask[i, to_drop] = False

    return mask
407
+
408
+
409
+ def _learned_alibi_bias(
410
+ alibi_bias,
411
+ batch_size,
412
+ time_steps,
413
+ heads,
414
+ scale,
415
+ dtype,
416
+ device,
417
+ ):
418
+ assert alibi_bias.size(1) == heads, alibi_bias.shape
419
+ assert alibi_bias.dtype == dtype, alibi_bias.dtype
420
+ assert alibi_bias.device == device, alibi_bias.device
421
+
422
+ if alibi_bias.size(-1) < time_steps:
423
+ psz = math.ceil((time_steps - alibi_bias.size(-1)) / 2)
424
+ alibi_bias = F.pad(alibi_bias, (psz, psz, psz, psz), mode="replicate")
425
+
426
+ alibi_bias = alibi_bias.expand(batch_size, -1, -1, -1) * scale
427
+ return alibi_bias[..., :time_steps, :time_steps]
428
+
429
def make_positions(tensor, padding_idx: int, onnx_trace: bool = False):
    """Map every non-padding symbol to its position number.

    Positions are counted along dim 1 and start at ``padding_idx + 1``;
    padding symbols keep the value ``padding_idx``. ``onnx_trace`` is
    accepted but not used by this implementation.
    """
    # The series of casts and type-conversions here are carefully
    # balanced to both work with ONNX export and XLA. In particular XLA
    # prefers ints, cumsum defaults to output longs, and ONNX doesn't know
    # how to handle the dtype kwarg in cumsum.
    not_pad = tensor.ne(padding_idx).int()
    running_count = torch.cumsum(not_pad, dim=1).type_as(not_pad)
    positions = (running_count * not_pad).long()
    return positions + padding_idx
vocab.json ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "<s>": 0,
3
+ "<pad>": 1,
4
+ "</s>": 2,
5
+ "<unk>": 3,
6
+ "|": 4,
7
+ "E": 5,
8
+ "S": 6,
9
+ "A": 7,
10
+ "T": 8,
11
+ "I": 9,
12
+ "N": 10,
13
+ "R": 11,
14
+ "L": 12,
15
+ "U": 13,
16
+ "O": 14,
17
+ "D": 15,
18
+ "C": 16,
19
+ "M": 17,
20
+ "P": 18,
21
+ "É": 19,
22
+ "V": 20,
23
+ "G": 21,
24
+ "'": 22,
25
+ "F": 23,
26
+ "B": 24,
27
+ "H": 25,
28
+ "Q": 26,
29
+ "È": 27,
30
+ "À": 28,
31
+ "X": 29,
32
+ "J": 30,
33
+ "Y": 31,
34
+ "K": 32,
35
+ "Z": 33,
36
+ "Ê": 34,
37
+ "W": 35,
38
+ "Ç": 36,
39
+ "Â": 37,
40
+ "Ô": 38,
41
+ "Î": 39,
42
+ "Ï": 40,
43
+ "Û": 41,
44
+ "Ù": 42,
45
+ "Á": 43,
46
+ "Ë": 44,
47
+ "Í": 45,
48
+ "Ü": 46,
49
+ "Ö": 47,
50
+ "Ó": 48,
51
+ "Ä": 49,
52
+ "Ñ": 50,
53
+ "Ú": 51,
54
+ "Ø": 52,
55
+ "Ã": 53,
56
+ "Æ": 54,
57
+ "Å": 55,
58
+ "Ý": 56,
59
+ "Ò": 57,
60
+ "Ð": 58,
61
+ "Ì": 59,
62
+ "Õ": 60,
63
+ "Þ": 61,
64
+ "Г": 62,
65
+ "А": 63,
66
+ "Е": 64,
67
+ "І": 65,
68
+ "Ј": 66,
69
+ "З": 67,
70
+ "И": 68,
71
+ "К": 69,
72
+ "М": 70,
73
+ "Н": 71,
74
+ "П": 72,
75
+ "Р": 73,
76
+ "Э": 74,
77
+ "Ҫ": 75,
78
+ "madeupword0000": 76,
79
+ "madeupword0001": 77,
80
+ "madeupword0002": 78,
81
+ "madeupword0003": 79
82
+ }