Mirror from asapp/sew-d-tiny-100k-ft-ls100h
Browse files- .gitattributes +4 -11
- README.md +125 -0
- config.json +99 -0
- model.safetensors +3 -0
- preprocessor_config.json +9 -0
- pytorch_model.bin +3 -0
- special_tokens_map.json +1 -0
- tokenizer_config.json +1 -0
- vocab.json +1 -0
.gitattributes
CHANGED
|
@@ -1,35 +1,28 @@
|
|
| 1 |
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
*.bin filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 4 |
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 1 |
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bin.* filter=lfs diff=lfs merge=lfs -text
|
| 5 |
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 6 |
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 11 |
*.model filter=lfs diff=lfs merge=lfs -text
|
| 12 |
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
| 13 |
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 14 |
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 15 |
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 16 |
*.pb filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
| 17 |
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 18 |
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 19 |
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 21 |
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 22 |
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 23 |
*.tgz filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 24 |
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 25 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
*.zstandard filter=lfs diff=lfs merge=lfs -text
|
| 27 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
model.safetensors filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
language: en
|
| 3 |
+
datasets:
|
| 4 |
+
- librispeech_asr
|
| 5 |
+
tags:
|
| 6 |
+
- audio
|
| 7 |
+
- speech
|
| 8 |
+
- automatic-speech-recognition
|
| 9 |
+
- hf-asr-leaderboard
|
| 10 |
+
license: apache-2.0
|
| 11 |
+
widget:
|
| 12 |
+
- example_title: Librispeech sample 1
|
| 13 |
+
src: https://cdn-media.huggingface.co/speech_samples/sample1.flac
|
| 14 |
+
- example_title: Librispeech sample 2
|
| 15 |
+
src: https://cdn-media.huggingface.co/speech_samples/sample2.flac
|
| 16 |
+
model-index:
|
| 17 |
+
- name: sew-d-tiny-100k-ft-ls100h
|
| 18 |
+
results:
|
| 19 |
+
- task:
|
| 20 |
+
name: Automatic Speech Recognition
|
| 21 |
+
type: automatic-speech-recognition
|
| 22 |
+
dataset:
|
| 23 |
+
name: LibriSpeech (clean)
|
| 24 |
+
type: librispeech_asr
|
| 25 |
+
config: clean
|
| 26 |
+
split: test
|
| 27 |
+
args:
|
| 28 |
+
language: en
|
| 29 |
+
metrics:
|
| 30 |
+
- name: Test WER
|
| 31 |
+
type: wer
|
| 32 |
+
value: 10.47
|
| 33 |
+
- task:
|
| 34 |
+
name: Automatic Speech Recognition
|
| 35 |
+
type: automatic-speech-recognition
|
| 36 |
+
dataset:
|
| 37 |
+
name: LibriSpeech (other)
|
| 38 |
+
type: librispeech_asr
|
| 39 |
+
config: other
|
| 40 |
+
split: test
|
| 41 |
+
args:
|
| 42 |
+
language: en
|
| 43 |
+
metrics:
|
| 44 |
+
- name: Test WER
|
| 45 |
+
type: wer
|
| 46 |
+
value: 22.73
|
| 47 |
+
---
|
| 48 |
+
|
| 49 |
+
# SEW-D-tiny
|
| 50 |
+
|
| 51 |
+
[SEW-D by ASAPP Research](https://github.com/asappresearch/sew)
|
| 52 |
+
|
| 53 |
+
The base model is pretrained on 16kHz sampled speech audio. When using the model, make sure that your speech input is also sampled at 16kHz. Note that this model should be fine-tuned on a downstream task, like Automatic Speech Recognition, Speaker Identification, Intent Classification, Emotion Recognition, etc.
|
| 54 |
+
|
| 55 |
+
Paper: [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870)
|
| 56 |
+
|
| 57 |
+
Authors: Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi
|
| 58 |
+
|
| 59 |
+
**Abstract**
|
| 60 |
+
This paper is a study of performance-efficiency trade-offs in pre-trained models for automatic speech recognition (ASR). We focus on wav2vec 2.0, and formalize several architecture designs that influence both the model performance and its efficiency. Putting together all our observations, we introduce SEW (Squeezed and Efficient Wav2vec), a pre-trained model architecture with significant improvements along both performance and efficiency dimensions across a variety of training setups. For example, under the 100h-960h semi-supervised setup on LibriSpeech, SEW achieves a 1.9x inference speedup compared to wav2vec 2.0, with a 13.5% relative reduction in word error rate. With a similar inference time, SEW reduces word error rate by 25-50% across different model sizes.
|
| 61 |
+
|
| 62 |
+
The original model can be found under https://github.com/asappresearch/sew#model-checkpoints .
|
| 63 |
+
|
| 64 |
+
# Usage
|
| 65 |
+
To transcribe audio files, the model can be used as a standalone acoustic model as follows:
|
| 66 |
+
```python
|
| 67 |
+
from transformers import Wav2Vec2Processor, SEWDForCTC
|
| 68 |
+
from datasets import load_dataset
|
| 69 |
+
import soundfile as sf
|
| 70 |
+
import torch
|
| 71 |
+
|
| 72 |
+
# load the model and preprocessor
|
| 73 |
+
processor = Wav2Vec2Processor.from_pretrained("asapp/sew-d-tiny-100k-ft-ls100h")
|
| 74 |
+
model = SEWDForCTC.from_pretrained("asapp/sew-d-tiny-100k-ft-ls100h")
|
| 75 |
+
|
| 76 |
+
# load the dummy dataset with speech samples
|
| 77 |
+
ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
|
| 78 |
+
|
| 79 |
+
# preprocess
|
| 80 |
+
input_values = processor(ds[0]["audio"]["array"], return_tensors="pt").input_values # Batch size 1
|
| 81 |
+
|
| 82 |
+
# retrieve logits
|
| 83 |
+
logits = model(input_values).logits
|
| 84 |
+
|
| 85 |
+
# take argmax and decode
|
| 86 |
+
predicted_ids = torch.argmax(logits, dim=-1)
|
| 87 |
+
transcription = processor.batch_decode(predicted_ids)
|
| 88 |
+
```
|
| 89 |
+
|
| 90 |
+
## Evaluation
|
| 91 |
+
|
| 92 |
+
This code snippet shows how to evaluate **asapp/sew-d-tiny-100k-ft-ls100h** on LibriSpeech's "clean" and "other" test data.
|
| 93 |
+
|
| 94 |
+
```python
|
| 95 |
+
from datasets import load_dataset
|
| 96 |
+
from transformers import SEWDForCTC, Wav2Vec2Processor
|
| 97 |
+
import torch
|
| 98 |
+
from jiwer import wer
|
| 99 |
+
|
| 100 |
+
librispeech_eval = load_dataset("librispeech_asr", "clean", split="test")
|
| 101 |
+
|
| 102 |
+
model = SEWDForCTC.from_pretrained("asapp/sew-d-tiny-100k-ft-ls100h").to("cuda")
|
| 103 |
+
processor = Wav2Vec2Processor.from_pretrained("asapp/sew-d-tiny-100k-ft-ls100h")
|
| 104 |
+
|
| 105 |
+
def map_to_pred(batch):
|
| 106 |
+
input_values = processor(batch["audio"][0]["array"], sampling_rate=16000,
|
| 107 |
+
return_tensors="pt", padding="longest").input_values
|
| 108 |
+
with torch.no_grad():
|
| 109 |
+
logits = model(input_values.to("cuda")).logits
|
| 110 |
+
|
| 111 |
+
predicted_ids = torch.argmax(logits, dim=-1)
|
| 112 |
+
transcription = processor.batch_decode(predicted_ids)
|
| 113 |
+
batch["transcription"] = transcription
|
| 114 |
+
return batch
|
| 115 |
+
|
| 116 |
+
result = librispeech_eval.map(map_to_pred, batched=True, batch_size=1, remove_columns=["audio"])
|
| 117 |
+
|
| 118 |
+
print("WER:", wer(result["text"], result["transcription"]))
|
| 119 |
+
```
|
| 120 |
+
|
| 121 |
+
*Result (WER)*:
|
| 122 |
+
|
| 123 |
+
| "clean" | "other" |
|
| 124 |
+
| --- | --- |
|
| 125 |
+
| 10.47 | 22.73 |
|
config.json
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"activation_dropout": 0.1,
|
| 3 |
+
"apply_spec_augment": true,
|
| 4 |
+
"architectures": [
|
| 5 |
+
"SEWDForCTC"
|
| 6 |
+
],
|
| 7 |
+
"attention_dropout": 0.0,
|
| 8 |
+
"bos_token_id": 1,
|
| 9 |
+
"classifier_proj_size": 256,
|
| 10 |
+
"conv_bias": false,
|
| 11 |
+
"conv_dim": [
|
| 12 |
+
64,
|
| 13 |
+
128,
|
| 14 |
+
128,
|
| 15 |
+
128,
|
| 16 |
+
128,
|
| 17 |
+
256,
|
| 18 |
+
256,
|
| 19 |
+
256,
|
| 20 |
+
256,
|
| 21 |
+
512,
|
| 22 |
+
512,
|
| 23 |
+
512,
|
| 24 |
+
512
|
| 25 |
+
],
|
| 26 |
+
"conv_kernel": [
|
| 27 |
+
10,
|
| 28 |
+
3,
|
| 29 |
+
1,
|
| 30 |
+
3,
|
| 31 |
+
1,
|
| 32 |
+
3,
|
| 33 |
+
1,
|
| 34 |
+
3,
|
| 35 |
+
1,
|
| 36 |
+
2,
|
| 37 |
+
1,
|
| 38 |
+
2,
|
| 39 |
+
1
|
| 40 |
+
],
|
| 41 |
+
"conv_stride": [
|
| 42 |
+
5,
|
| 43 |
+
2,
|
| 44 |
+
1,
|
| 45 |
+
2,
|
| 46 |
+
1,
|
| 47 |
+
2,
|
| 48 |
+
1,
|
| 49 |
+
2,
|
| 50 |
+
1,
|
| 51 |
+
2,
|
| 52 |
+
1,
|
| 53 |
+
2,
|
| 54 |
+
1
|
| 55 |
+
],
|
| 56 |
+
"ctc_loss_reduction": "mean",
|
| 57 |
+
"ctc_zero_infinity": false,
|
| 58 |
+
"eos_token_id": 2,
|
| 59 |
+
"feat_extract_activation": "gelu",
|
| 60 |
+
"feat_extract_norm": "group",
|
| 61 |
+
"feat_proj_dropout": 0.0,
|
| 62 |
+
"feature_extractor_type": "Wav2Vec2FeatureExtractor",
|
| 63 |
+
"final_dropout": 0.0,
|
| 64 |
+
"hidden_act": "gelu_python",
|
| 65 |
+
"hidden_dropout": 0.0,
|
| 66 |
+
"hidden_size": 384,
|
| 67 |
+
"initializer_range": 0.02,
|
| 68 |
+
"intermediate_size": 1536,
|
| 69 |
+
"layer_norm_eps": 1e-07,
|
| 70 |
+
"layerdrop": 0.1,
|
| 71 |
+
"mask_feature_length": 64,
|
| 72 |
+
"mask_feature_prob": 0.5,
|
| 73 |
+
"mask_time_length": 10,
|
| 74 |
+
"mask_time_prob": 0.65,
|
| 75 |
+
"max_position_embeddings": 512,
|
| 76 |
+
"model_type": "sew-d",
|
| 77 |
+
"norm_rel_ebd": "layer_norm",
|
| 78 |
+
"num_attention_heads": 6,
|
| 79 |
+
"num_conv_pos_embedding_groups": 16,
|
| 80 |
+
"num_conv_pos_embeddings": 31,
|
| 81 |
+
"num_feat_extract_layers": 13,
|
| 82 |
+
"num_hidden_layers": 12,
|
| 83 |
+
"pad_token_id": 0,
|
| 84 |
+
"pos_att_type": [
|
| 85 |
+
"p2c",
|
| 86 |
+
"c2p"
|
| 87 |
+
],
|
| 88 |
+
"position_biased_input": false,
|
| 89 |
+
"position_buckets": 256,
|
| 90 |
+
"relative_attention": true,
|
| 91 |
+
"share_att_key": true,
|
| 92 |
+
"squeeze_factor": 2,
|
| 93 |
+
"tokenizer_class": "Wav2Vec2CTCTokenizer",
|
| 94 |
+
"torch_dtype": "float32",
|
| 95 |
+
"transformers_version": "4.12.0.dev0",
|
| 96 |
+
"use_weighted_layer_sum": false,
|
| 97 |
+
"vocab_size": 32,
|
| 98 |
+
"feature_layer_norm_eps": 1e-05
|
| 99 |
+
}
|
model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a34b13a854712ea576cbd1dcb6eedcfa976138275570873fad83d0e40e73f166
|
| 3 |
+
size 96537316
|
preprocessor_config.json
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"do_normalize": true,
|
| 3 |
+
"feature_extractor_type": "Wav2Vec2FeatureExtractor",
|
| 4 |
+
"feature_size": 1,
|
| 5 |
+
"padding_side": "right",
|
| 6 |
+
"padding_value": 0,
|
| 7 |
+
"return_attention_mask": false,
|
| 8 |
+
"sampling_rate": 16000
|
| 9 |
+
}
|
pytorch_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8dd5d9f5e6aff70472ddf580d5ce0e387f9edfa46e643dcada39bcc8df053684
|
| 3 |
+
size 96600453
|
special_tokens_map.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}
|
tokenizer_config.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>", "pad_token": "<pad>", "do_lower_case": false, "word_delimiter_token": "|", "tokenizer_class": "Wav2Vec2CTCTokenizer"}
|
vocab.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"<s>": 1, "<pad>": 0, "</s>": 2, "<unk>": 3, "|": 4, "E": 5, "T": 6, "A": 7, "O": 8, "N": 9, "I": 10, "H": 11, "S": 12, "R": 13, "D": 14, "L": 15, "U": 16, "M": 17, "W": 18, "C": 19, "F": 20, "G": 21, "Y": 22, "P": 23, "B": 24, "V": 25, "K": 26, "'": 27, "X": 28, "J": 29, "Q": 30, "Z": 31}
|