Commit
·
9168399
1
Parent(s):
7113da7
add model
Browse files
- preprocessor_config.json +13 -0
- processor_wav2vec2_new.py +106 -0
- special_tokens_map.json +1 -0
- tokenizer_config.json +1 -0
- vocab.json +1 -0
preprocessor_config.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"auto_map": {
|
| 3 |
+
"AutoProcessor": "processor_wav2vec2_new.Wav2Vec2ProcessorNew"
|
| 4 |
+
},
|
| 5 |
+
"do_normalize": true,
|
| 6 |
+
"feature_extractor_type": "Wav2Vec2FeatureExtractor",
|
| 7 |
+
"feature_size": 1,
|
| 8 |
+
"padding_side": "right",
|
| 9 |
+
"padding_value": 0.0,
|
| 10 |
+
"processor_class": "Wav2Vec2ProcessorNew",
|
| 11 |
+
"return_attention_mask": false,
|
| 12 |
+
"sampling_rate": 16000
|
| 13 |
+
}
|
processor_wav2vec2_new.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# coding=utf-8
|
| 2 |
+
# Copyright 2021 The HuggingFace Inc. team.
|
| 3 |
+
#
|
| 4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 5 |
+
# you may not use this file except in compliance with the License.
|
| 6 |
+
# You may obtain a copy of the License at
|
| 7 |
+
#
|
| 8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 9 |
+
#
|
| 10 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 13 |
+
# See the License for the specific language governing permissions and
|
| 14 |
+
# limitations under the License.
|
| 15 |
+
"""
|
| 16 |
+
Speech processor class for Wav2Vec2
|
| 17 |
+
"""
|
| 18 |
+
import warnings
|
| 19 |
+
from contextlib import contextmanager
|
| 20 |
+
|
| 21 |
+
from transformers import ProcessorMixin
|
| 22 |
+
from transformers import Wav2Vec2FeatureExtractor
|
| 23 |
+
from transformers import Wav2Vec2CTCTokenizer
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class Wav2Vec2ProcessorNew(ProcessorMixin):
    r"""
    Constructs a Wav2Vec2 processor which wraps a Wav2Vec2 feature extractor and a Wav2Vec2 CTC tokenizer into a single
    processor.

    [`Wav2Vec2ProcessorNew`] forwards calls to either the wrapped feature extractor or the wrapped tokenizer. See the
    docstring of [`~Wav2Vec2ProcessorNew.__call__`] and [`~Wav2Vec2ProcessorNew.decode`] for more information.

    Args:
        feature_extractor (`Wav2Vec2FeatureExtractor`):
            An instance of [`Wav2Vec2FeatureExtractor`]. The feature extractor is a required input.
        tokenizer ([`PreTrainedTokenizer`]):
            An instance of [`PreTrainedTokenizer`]. The tokenizer is a required input.
    """

    # Class names (as strings) of the two wrapped components; presumably consumed by `ProcessorMixin` when
    # loading/saving -- confirm against the installed transformers version.
    feature_extractor_class = "Wav2Vec2FeatureExtractor"
    tokenizer_class = "AutoTokenizer"

    def __init__(self, feature_extractor, tokenizer):
        """
        Store both components via the parent constructor and start in "normal mode".

        Args:
            feature_extractor: The feature extractor to wrap.
            tokenizer: The tokenizer to wrap.
        """
        super().__init__(feature_extractor, tokenizer)
        # `__call__` and `pad` dispatch to whatever `current_processor` references. It is the feature extractor
        # by default and is only swapped for the tokenizer inside `as_target_processor`.
        self.current_processor = self.feature_extractor

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        """
        Instantiate the processor from `pretrained_model_name_or_path`.

        The parent implementation is tried first. If it raises `OSError`, a `FutureWarning` is emitted and the two
        components are loaded explicitly with `Wav2Vec2FeatureExtractor` and `Wav2Vec2CTCTokenizer`.

        Args:
            pretrained_model_name_or_path: Forwarded unchanged to the underlying `from_pretrained` calls.
            **kwargs: Forwarded unchanged to the underlying `from_pretrained` calls.

        Returns:
            An instance of `cls` wrapping the loaded feature extractor and tokenizer.
        """
        try:
            return super().from_pretrained(pretrained_model_name_or_path, **kwargs)
        except OSError:
            warnings.warn(
                f"Loading a tokenizer inside {cls.__name__} from a config that does not"
                " include a `tokenizer_class` attribute is deprecated and will be "
                "removed in v5. Please add `'tokenizer_class': 'Wav2Vec2CTCTokenizer'`"
                " attribute to either your `config.json` or `tokenizer_config.json` "
                "file to suppress this warning: ",
                FutureWarning,
            )

            # Fallback path: bypass the parent loader and build each component with its concrete class.
            feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
            tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)

            return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)

    def __call__(self, *args, **kwargs):
        """
        When used in normal mode, this method forwards all its arguments to Wav2Vec2FeatureExtractor's
        [`~Wav2Vec2FeatureExtractor.__call__`] and returns its output. If used in the context
        [`~Wav2Vec2ProcessorNew.as_target_processor`] this method forwards all its arguments to PreTrainedTokenizer's
        [`~PreTrainedTokenizer.__call__`]. Please refer to the docstring of the above two methods for more information.
        """
        return self.current_processor(*args, **kwargs)

    def pad(self, *args, **kwargs):
        """
        When used in normal mode, this method forwards all its arguments to Wav2Vec2FeatureExtractor's
        [`~Wav2Vec2FeatureExtractor.pad`] and returns its output. If used in the context
        [`~Wav2Vec2ProcessorNew.as_target_processor`] this method forwards all its arguments to PreTrainedTokenizer's
        [`~PreTrainedTokenizer.pad`]. Please refer to the docstring of the above two methods for more information.
        """
        return self.current_processor.pad(*args, **kwargs)

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer
        to the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    @contextmanager
    def as_target_processor(self):
        """
        Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning
        Wav2Vec2.

        The feature extractor is restored as the current processor when the `with` block exits, including when the
        block raises an exception.
        """
        self.current_processor = self.tokenizer
        try:
            yield
        finally:
            # An exception raised inside the `with` body is re-raised at the `yield`; without `finally` the
            # processor would stay stuck on the tokenizer and later `__call__`/`pad` calls would be misrouted.
            self.current_processor = self.feature_extractor
|
special_tokens_map.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}
|
tokenizer_config.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "<pad>", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": null, "name_or_path": "hf-internal-testing/processor_with_lm", "auto_map": {"AutoProcessor": "processor_wav2vec2_new.Wav2Vec2ProcessorNew"}, "tokenizer_class": "Wav2Vec2CTCTokenizer", "processor_class": "Wav2Vec2ProcessorNew"}
|
vocab.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"|": 0, "<pad>": 1, "<unk>": 2, "<s>": 3, "</s>": 4, "a": 5, "b": 6, "c": 7, "d": 8, "e": 9, "f": 10, "g": 11, "h": 12, "i": 13, "j": 14, "k": 15}
|