patrickvonplaten committed on
Commit
9168399
·
1 Parent(s): 7113da7
preprocessor_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processor_wav2vec2_new.Wav2Vec2ProcessorNew"
4
+ },
5
+ "do_normalize": true,
6
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
7
+ "feature_size": 1,
8
+ "padding_side": "right",
9
+ "padding_value": 0.0,
10
+ "processor_class": "Wav2Vec2ProcessorNew",
11
+ "return_attention_mask": false,
12
+ "sampling_rate": 16000
13
+ }
processor_wav2vec2_new.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2021 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """
16
+ Speech processor class for Wav2Vec2
17
+ """
18
+ import warnings
19
+ from contextlib import contextmanager
20
+
21
+ from transformers import ProcessorMixin
22
+ from transformers import Wav2Vec2FeatureExtractor
23
+ from transformers import Wav2Vec2CTCTokenizer
24
+
25
+
26
class Wav2Vec2ProcessorNew(ProcessorMixin):
    r"""
    Constructs a Wav2Vec2 processor which wraps a Wav2Vec2 feature extractor and a Wav2Vec2 CTC tokenizer into a single
    processor.

    [`Wav2Vec2ProcessorNew`] forwards calls either to the wrapped feature extractor or to the wrapped tokenizer,
    depending on which one is currently active (see [`~Wav2Vec2ProcessorNew.__call__`],
    [`~Wav2Vec2ProcessorNew.pad`] and [`~Wav2Vec2ProcessorNew.as_target_processor`]). Decoding is always delegated to
    the tokenizer (see [`~Wav2Vec2ProcessorNew.decode`] and [`~Wav2Vec2ProcessorNew.batch_decode`]).

    Args:
        feature_extractor (`Wav2Vec2FeatureExtractor`):
            An instance of [`Wav2Vec2FeatureExtractor`]. The feature extractor is a required input.
        tokenizer ([`PreTrainedTokenizer`]):
            An instance of [`PreTrainedTokenizer`]. The tokenizer is a required input.
    """

    # Class names consumed by `ProcessorMixin`; presumably used to resolve the
    # component classes when loading -- confirm against the installed transformers version.
    feature_extractor_class = "Wav2Vec2FeatureExtractor"
    tokenizer_class = "AutoTokenizer"

    def __init__(self, feature_extractor, tokenizer):
        """
        Store both components via `ProcessorMixin` and make the feature extractor the active processor.
        """
        super().__init__(feature_extractor, tokenizer)
        # The feature extractor is the default target of `__call__` and `pad`.
        self.current_processor = self.feature_extractor

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        """
        Instantiate the processor from `pretrained_model_name_or_path`.

        The parent implementation is tried first. If it raises `OSError`, a `FutureWarning` is emitted and the
        feature extractor and tokenizer are loaded explicitly as [`Wav2Vec2FeatureExtractor`] and
        [`Wav2Vec2CTCTokenizer`], forwarding the same `kwargs` to both.

        Returns:
            An instance of `cls` wrapping the loaded feature extractor and tokenizer.
        """
        try:
            return super().from_pretrained(pretrained_model_name_or_path, **kwargs)
        except OSError:
            warnings.warn(
                f"Loading a tokenizer inside {cls.__name__} from a config that does not"
                " include a `tokenizer_class` attribute is deprecated and will be "
                "removed in v5. Please add `'tokenizer_class': 'Wav2Vec2CTCTokenizer'`"
                " attribute to either your `config.json` or `tokenizer_config.json` "
                "file to suppress this warning: ",
                FutureWarning,
            )

            # Fallback path: load each component with its concrete class.
            feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
            tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)

            return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)

    def __call__(self, *args, **kwargs):
        """
        Forward all arguments to the currently active processor and return its output.

        In normal mode the active processor is the feature extractor; inside
        [`~Wav2Vec2ProcessorNew.as_target_processor`] it is the tokenizer.
        """
        return self.current_processor(*args, **kwargs)

    def pad(self, *args, **kwargs):
        """
        Forward all arguments to the `pad` method of the currently active processor and return its output.

        In normal mode the active processor is the feature extractor; inside
        [`~Wav2Vec2ProcessorNew.as_target_processor`] it is the tokenizer.
        """
        return self.current_processor.pad(*args, **kwargs)

    def batch_decode(self, *args, **kwargs):
        """
        Forward all arguments to the tokenizer's `batch_decode` method and return its output.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        Forward all arguments to the tokenizer's `decode` method and return its output.
        """
        return self.tokenizer.decode(*args, **kwargs)

    @contextmanager
    def as_target_processor(self):
        """
        Temporarily make the tokenizer the active processor. Useful for encoding the labels when fine-tuning
        Wav2Vec2.

        The feature extractor is restored as the active processor when the context exits, including when the body
        of the `with` statement raises.
        """
        self.current_processor = self.tokenizer
        try:
            yield
        finally:
            # Always restore, so an exception inside the context cannot leave
            # the processor stuck routing `__call__`/`pad` to the tokenizer.
            self.current_processor = self.feature_extractor
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "<pad>", "do_lower_case": false, "word_delimiter_token": "|", "special_tokens_map_file": null, "name_or_path": "hf-internal-testing/processor_with_lm", "auto_map": {"AutoProcessor": "processor_wav2vec2_new.Wav2Vec2ProcessorNew"}, "tokenizer_class": "Wav2Vec2CTCTokenizer", "processor_class": "Wav2Vec2ProcessorNew"}
vocab.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"|": 0, "<pad>": 1, "<unk>": 2, "<s>": 3, "</s>": 4, "a": 5, "b": 6, "c": 7, "d": 8, "e": 9, "f": 10, "g": 11, "h": 12, "i": 13, "j": 14, "k": 15}