IlyasMoutawwakil HF Staff commited on
Commit
65384f5
·
verified ·
1 Parent(s): 9f15cd0

Mirror from asapp/sew-d-tiny-100k-ft-ls100h

Browse files
.gitattributes CHANGED
@@ -1,35 +1,28 @@
1
  *.7z filter=lfs diff=lfs merge=lfs -text
2
  *.arrow filter=lfs diff=lfs merge=lfs -text
3
  *.bin filter=lfs diff=lfs merge=lfs -text
 
4
  *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
  *.ftz filter=lfs diff=lfs merge=lfs -text
7
  *.gz filter=lfs diff=lfs merge=lfs -text
8
  *.h5 filter=lfs diff=lfs merge=lfs -text
9
  *.joblib filter=lfs diff=lfs merge=lfs -text
10
  *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
  *.model filter=lfs diff=lfs merge=lfs -text
13
  *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
  *.onnx filter=lfs diff=lfs merge=lfs -text
17
  *.ot filter=lfs diff=lfs merge=lfs -text
18
  *.parquet filter=lfs diff=lfs merge=lfs -text
19
  *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
  *.pt filter=lfs diff=lfs merge=lfs -text
23
  *.pth filter=lfs diff=lfs merge=lfs -text
24
  *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
  *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
  *.tflite filter=lfs diff=lfs merge=lfs -text
30
  *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
  *.xz filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
1
  *.7z filter=lfs diff=lfs merge=lfs -text
2
  *.arrow filter=lfs diff=lfs merge=lfs -text
3
  *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
5
  *.bz2 filter=lfs diff=lfs merge=lfs -text
 
6
  *.ftz filter=lfs diff=lfs merge=lfs -text
7
  *.gz filter=lfs diff=lfs merge=lfs -text
8
  *.h5 filter=lfs diff=lfs merge=lfs -text
9
  *.joblib filter=lfs diff=lfs merge=lfs -text
10
  *.lfs.* filter=lfs diff=lfs merge=lfs -text
 
11
  *.model filter=lfs diff=lfs merge=lfs -text
12
  *.msgpack filter=lfs diff=lfs merge=lfs -text
 
 
13
  *.onnx filter=lfs diff=lfs merge=lfs -text
14
  *.ot filter=lfs diff=lfs merge=lfs -text
15
  *.parquet filter=lfs diff=lfs merge=lfs -text
16
  *.pb filter=lfs diff=lfs merge=lfs -text
 
 
17
  *.pt filter=lfs diff=lfs merge=lfs -text
18
  *.pth filter=lfs diff=lfs merge=lfs -text
19
  *.rar filter=lfs diff=lfs merge=lfs -text
20
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 
21
  *.tar.* filter=lfs diff=lfs merge=lfs -text
 
22
  *.tflite filter=lfs diff=lfs merge=lfs -text
23
  *.tgz filter=lfs diff=lfs merge=lfs -text
 
24
  *.xz filter=lfs diff=lfs merge=lfs -text
25
  *.zip filter=lfs diff=lfs merge=lfs -text
26
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
27
  *tfevents* filter=lfs diff=lfs merge=lfs -text
28
+ model.safetensors filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: en
3
+ datasets:
4
+ - librispeech_asr
5
+ tags:
6
+ - audio
7
+ - speech
8
+ - automatic-speech-recognition
9
+ - hf-asr-leaderboard
10
+ license: apache-2.0
11
+ widget:
12
+ - example_title: Librispeech sample 1
13
+ src: https://cdn-media.huggingface.co/speech_samples/sample1.flac
14
+ - example_title: Librispeech sample 2
15
+ src: https://cdn-media.huggingface.co/speech_samples/sample2.flac
16
+ model-index:
17
+ - name: sew-d-tiny-100k-ft-ls100h
18
+ results:
19
+ - task:
20
+ name: Automatic Speech Recognition
21
+ type: automatic-speech-recognition
22
+ dataset:
23
+ name: LibriSpeech (clean)
24
+ type: librispeech_asr
25
+ config: clean
26
+ split: test
27
+ args:
28
+ language: en
29
+ metrics:
30
+ - name: Test WER
31
+ type: wer
32
+ value: 10.47
33
+ - task:
34
+ name: Automatic Speech Recognition
35
+ type: automatic-speech-recognition
36
+ dataset:
37
+ name: LibriSpeech (other)
38
+ type: librispeech_asr
39
+ config: other
40
+ split: test
41
+ args:
42
+ language: en
43
+ metrics:
44
+ - name: Test WER
45
+ type: wer
46
+ value: 22.73
47
+ ---
48
+
49
+ # SEW-D-tiny
50
+
51
+ [SEW-D by ASAPP Research](https://github.com/asappresearch/sew)
52
+
53
+ The base model was pretrained on 16kHz sampled speech audio. When using the model, make sure that your speech input is also sampled at 16kHz. Note that this model should be fine-tuned on a downstream task, like Automatic Speech Recognition, Speaker Identification, Intent Classification, Emotion Recognition, etc.
54
+
55
+ Paper: [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870)
56
+
57
+ Authors: Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi
58
+
59
+ **Abstract**
60
+ This paper is a study of performance-efficiency trade-offs in pre-trained models for automatic speech recognition (ASR). We focus on wav2vec 2.0, and formalize several architecture designs that influence both the model performance and its efficiency. Putting together all our observations, we introduce SEW (Squeezed and Efficient Wav2vec), a pre-trained model architecture with significant improvements along both performance and efficiency dimensions across a variety of training setups. For example, under the 100h-960h semi-supervised setup on LibriSpeech, SEW achieves a 1.9x inference speedup compared to wav2vec 2.0, with a 13.5% relative reduction in word error rate. With a similar inference time, SEW reduces word error rate by 25-50% across different model sizes.
61
+
62
+ The original model can be found under https://github.com/asappresearch/sew#model-checkpoints .
63
+
64
+ # Usage
65
+ To transcribe audio files, the model can be used as a standalone acoustic model as follows:
66
+ ```python
67
+ from transformers import Wav2Vec2Processor, SEWDForCTC
68
+ from datasets import load_dataset
69
+ import soundfile as sf
70
+ import torch
71
+
72
+ # load the model and preprocessor
73
+ processor = Wav2Vec2Processor.from_pretrained("asapp/sew-d-tiny-100k-ft-ls100h")
74
+ model = SEWDForCTC.from_pretrained("asapp/sew-d-tiny-100k-ft-ls100h")
75
+
76
+ # load the dummy dataset with speech samples
77
+ ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
78
+
79
+ # preprocess
80
+ input_values = processor(ds[0]["audio"]["array"], return_tensors="pt").input_values # Batch size 1
81
+
82
+ # retrieve logits
83
+ logits = model(input_values).logits
84
+
85
+ # take argmax and decode
86
+ predicted_ids = torch.argmax(logits, dim=-1)
87
+ transcription = processor.batch_decode(predicted_ids)
88
+ ```
89
+
90
+ ## Evaluation
91
+
92
+ This code snippet shows how to evaluate **asapp/sew-d-tiny-100k-ft-ls100h** on LibriSpeech's "clean" and "other" test data.
93
+
94
+ ```python
95
+ from datasets import load_dataset
96
+ from transformers import SEWDForCTC, Wav2Vec2Processor
97
+ import torch
98
+ from jiwer import wer
99
+
100
+ librispeech_eval = load_dataset("librispeech_asr", "clean", split="test")
101
+
102
+ model = SEWDForCTC.from_pretrained("asapp/sew-d-tiny-100k-ft-ls100h").to("cuda")
103
+ processor = Wav2Vec2Processor.from_pretrained("asapp/sew-d-tiny-100k-ft-ls100h")
104
+
105
+ def map_to_pred(batch):
106
+ input_values = processor(batch["audio"][0]["array"], sampling_rate=16000,
107
+ return_tensors="pt", padding="longest").input_values
108
+ with torch.no_grad():
109
+ logits = model(input_values.to("cuda")).logits
110
+
111
+ predicted_ids = torch.argmax(logits, dim=-1)
112
+ transcription = processor.batch_decode(predicted_ids)
113
+ batch["transcription"] = transcription
114
+ return batch
115
+
116
+ result = librispeech_eval.map(map_to_pred, batched=True, batch_size=1, remove_columns=["audio"])
117
+
118
+ print("WER:", wer(result["text"], result["transcription"]))
119
+ ```
120
+
121
+ *Result (WER)*:
122
+
123
+ | "clean" | "other" |
124
+ | --- | --- |
125
+ | 10.47 | 22.73 |
config.json ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.1,
3
+ "apply_spec_augment": true,
4
+ "architectures": [
5
+ "SEWDForCTC"
6
+ ],
7
+ "attention_dropout": 0.0,
8
+ "bos_token_id": 1,
9
+ "classifier_proj_size": 256,
10
+ "conv_bias": false,
11
+ "conv_dim": [
12
+ 64,
13
+ 128,
14
+ 128,
15
+ 128,
16
+ 128,
17
+ 256,
18
+ 256,
19
+ 256,
20
+ 256,
21
+ 512,
22
+ 512,
23
+ 512,
24
+ 512
25
+ ],
26
+ "conv_kernel": [
27
+ 10,
28
+ 3,
29
+ 1,
30
+ 3,
31
+ 1,
32
+ 3,
33
+ 1,
34
+ 3,
35
+ 1,
36
+ 2,
37
+ 1,
38
+ 2,
39
+ 1
40
+ ],
41
+ "conv_stride": [
42
+ 5,
43
+ 2,
44
+ 1,
45
+ 2,
46
+ 1,
47
+ 2,
48
+ 1,
49
+ 2,
50
+ 1,
51
+ 2,
52
+ 1,
53
+ 2,
54
+ 1
55
+ ],
56
+ "ctc_loss_reduction": "mean",
57
+ "ctc_zero_infinity": false,
58
+ "eos_token_id": 2,
59
+ "feat_extract_activation": "gelu",
60
+ "feat_extract_norm": "group",
61
+ "feat_proj_dropout": 0.0,
62
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
63
+ "final_dropout": 0.0,
64
+ "hidden_act": "gelu_python",
65
+ "hidden_dropout": 0.0,
66
+ "hidden_size": 384,
67
+ "initializer_range": 0.02,
68
+ "intermediate_size": 1536,
69
+ "layer_norm_eps": 1e-07,
70
+ "layerdrop": 0.1,
71
+ "mask_feature_length": 64,
72
+ "mask_feature_prob": 0.5,
73
+ "mask_time_length": 10,
74
+ "mask_time_prob": 0.65,
75
+ "max_position_embeddings": 512,
76
+ "model_type": "sew-d",
77
+ "norm_rel_ebd": "layer_norm",
78
+ "num_attention_heads": 6,
79
+ "num_conv_pos_embedding_groups": 16,
80
+ "num_conv_pos_embeddings": 31,
81
+ "num_feat_extract_layers": 13,
82
+ "num_hidden_layers": 12,
83
+ "pad_token_id": 0,
84
+ "pos_att_type": [
85
+ "p2c",
86
+ "c2p"
87
+ ],
88
+ "position_biased_input": false,
89
+ "position_buckets": 256,
90
+ "relative_attention": true,
91
+ "share_att_key": true,
92
+ "squeeze_factor": 2,
93
+ "tokenizer_class": "Wav2Vec2CTCTokenizer",
94
+ "torch_dtype": "float32",
95
+ "transformers_version": "4.12.0.dev0",
96
+ "use_weighted_layer_sum": false,
97
+ "vocab_size": 32,
98
+ "feature_layer_norm_eps": 1e-05
99
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a34b13a854712ea576cbd1dcb6eedcfa976138275570873fad83d0e40e73f166
3
+ size 96537316
preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0,
7
+ "return_attention_mask": false,
8
+ "sampling_rate": 16000
9
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8dd5d9f5e6aff70472ddf580d5ce0e387f9edfa46e643dcada39bcc8df053684
3
+ size 96600453
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}
tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "<pad>", "do_lower_case": false, "word_delimiter_token": "|", "tokenizer_class": "Wav2Vec2CTCTokenizer"}
vocab.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"<s>": 1, "<pad>": 0, "</s>": 2, "<unk>": 3, "|": 4, "E": 5, "T": 6, "A": 7, "O": 8, "N": 9, "I": 10, "H": 11, "S": 12, "R": 13, "D": 14, "L": 15, "U": 16, "M": 17, "W": 18, "C": 19, "F": 20, "G": 21, "Y": 22, "P": 23, "B": 24, "V": 25, "K": 26, "'": 27, "X": 28, "J": 29, "Q": 30, "Z": 31}