Add template files
Browse files- .gitattributes +1 -0
- README.md +10 -0
- asr/README.md +16 -0
- asr/args.yaml +26 -0
- asr/char.dict +28 -0
- asr/config.yaml +84 -0
- asr/model.ckpt +3 -0
- asr/model.py +39 -0
.gitattributes
CHANGED
|
@@ -15,3 +15,4 @@
|
|
| 15 |
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 16 |
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 17 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 15 |
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 16 |
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 17 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
asr/model.ckpt filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Fine-tuned Model Submission Template
|
| 2 |
+
|
| 3 |
+
This is a template repository for the SUPERB benchmark for the _fine-tuned model_ category. In this category, participants are asked to fine-tune a pretrained model on each of SUPERB's downstream tasks and then store the model weights and hyperparameters in this repo.
|
| 4 |
+
|
| 5 |
+
There are four steps involved in making a submission:
|
| 6 |
+
|
| 7 |
+
1. Fine-tune a pretrained model on a downstream task.
|
| 8 |
+
2. Implement the `PreTrainedModel` interface defined in each `model.py` module.
|
| 9 |
+
3. Store the weights and hyperparameters in the task directory
|
| 10 |
+
4. Push all the files to the Hugging Face Hub.
|
asr/README.md
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
library_name: superb
|
| 3 |
+
benchmark: superb
|
| 4 |
+
task: asr
|
| 5 |
+
type: finetuned_upload
|
| 6 |
+
submission_id: {{submission_id}}
|
| 7 |
+
datasets:
|
| 8 |
+
- superb
|
| 9 |
+
tags:
|
| 10 |
+
- automatic-speech-recognition
|
| 11 |
+
widget:
|
| 12 |
+
- label: Librispeech sample 1
|
| 13 |
+
src: https://cdn-media.huggingface.co/speech_samples/sample1.flac
|
| 14 |
+
---
|
| 15 |
+
|
| 16 |
+
# Fine-tuned s3prl model for ASR
|
asr/args.yaml
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
auto_resume: false
|
| 2 |
+
backend: nccl
|
| 3 |
+
cache_dir: null
|
| 4 |
+
config: ./downstream/asr/config.yaml
|
| 5 |
+
device: cuda
|
| 6 |
+
downstream: asr
|
| 7 |
+
downstream_variant: null
|
| 8 |
+
evaluate_split: test
|
| 9 |
+
expdir: result/downstream/asr-push-to-hub
|
| 10 |
+
expname: asr-push-to-hub
|
| 11 |
+
from_hf_hub: true
|
| 12 |
+
hub: huggingface
|
| 13 |
+
init_ckpt: null
|
| 14 |
+
local_rank: null
|
| 15 |
+
mode: train
|
| 16 |
+
override: config.downstream_expert.datarc.libri_root='/data/lewis/superb/LibriSpeech',,config.downstream_expert.datarc.bucket_file='/data/lewis/superb/LibriSpeech/len_for_bucket',,config.runner.total_steps=2000
|
| 17 |
+
past_exp: null
|
| 18 |
+
seed: 1337
|
| 19 |
+
upstream: osanseviero/hubert_base
|
| 20 |
+
upstream_ckpt: null
|
| 21 |
+
upstream_feature_selection: hidden_states
|
| 22 |
+
upstream_model_config: null
|
| 23 |
+
upstream_model_name: model.pt
|
| 24 |
+
upstream_refresh: false
|
| 25 |
+
upstream_trainable: false
|
| 26 |
+
verbose: false
|
asr/char.dict
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
| 1980202
|
| 2 |
+
E 1091870
|
| 3 |
+
T 789572
|
| 4 |
+
A 689048
|
| 5 |
+
O 647720
|
| 6 |
+
N 591778
|
| 7 |
+
I 585614
|
| 8 |
+
H 557204
|
| 9 |
+
S 545238
|
| 10 |
+
R 499568
|
| 11 |
+
D 380912
|
| 12 |
+
L 344952
|
| 13 |
+
U 242014
|
| 14 |
+
M 217730
|
| 15 |
+
C 210734
|
| 16 |
+
W 204598
|
| 17 |
+
F 195086
|
| 18 |
+
G 174098
|
| 19 |
+
Y 168548
|
| 20 |
+
P 146722
|
| 21 |
+
B 129608
|
| 22 |
+
V 81496
|
| 23 |
+
K 65070
|
| 24 |
+
' 19660
|
| 25 |
+
X 12530
|
| 26 |
+
J 12062
|
| 27 |
+
Q 8164
|
| 28 |
+
Z 4916
|
asr/config.yaml
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
downstream_expert:
|
| 2 |
+
datarc:
|
| 3 |
+
batch_size: 32
|
| 4 |
+
bucket_file: /data/lewis/superb/LibriSpeech/len_for_bucket
|
| 5 |
+
decoder_args:
|
| 6 |
+
beam: 5
|
| 7 |
+
beam_threshold: 25
|
| 8 |
+
criterion: ctc
|
| 9 |
+
decoder_type: None
|
| 10 |
+
kenlm_model: /path/to/KenLM
|
| 11 |
+
lexicon: /path/to/4-gram.arpa
|
| 12 |
+
lm_weight: 2
|
| 13 |
+
nbest: 1
|
| 14 |
+
sil_weight: 0
|
| 15 |
+
unk_weight: -math.inf
|
| 16 |
+
word_score: -1
|
| 17 |
+
dev-clean:
|
| 18 |
+
- dev-clean
|
| 19 |
+
dev-other:
|
| 20 |
+
- dev-other
|
| 21 |
+
dict_path: ./downstream/asr/char.dict
|
| 22 |
+
eval_batch_size: 1
|
| 23 |
+
libri_root: /data/lewis/superb/LibriSpeech
|
| 24 |
+
num_workers: 12
|
| 25 |
+
test-clean:
|
| 26 |
+
- test-clean
|
| 27 |
+
test-other:
|
| 28 |
+
- test-other
|
| 29 |
+
train:
|
| 30 |
+
- train-clean-100
|
| 31 |
+
train_batch_size: 32
|
| 32 |
+
zero_infinity: true
|
| 33 |
+
modelrc:
|
| 34 |
+
RNNs:
|
| 35 |
+
bidirection: true
|
| 36 |
+
dim:
|
| 37 |
+
- 1024
|
| 38 |
+
- 1024
|
| 39 |
+
dropout:
|
| 40 |
+
- 0.2
|
| 41 |
+
- 0.2
|
| 42 |
+
layer_norm:
|
| 43 |
+
- false
|
| 44 |
+
- false
|
| 45 |
+
module: LSTM
|
| 46 |
+
proj:
|
| 47 |
+
- false
|
| 48 |
+
- false
|
| 49 |
+
sample_rate:
|
| 50 |
+
- 1
|
| 51 |
+
- 1
|
| 52 |
+
sample_style: concat
|
| 53 |
+
total_rate: -1
|
| 54 |
+
Wav2Letter:
|
| 55 |
+
total_rate: 320
|
| 56 |
+
project_dim: 1024
|
| 57 |
+
select: RNNs
|
| 58 |
+
optimizer:
|
| 59 |
+
lr: 0.0001
|
| 60 |
+
name: TorchOptim
|
| 61 |
+
torch_optim_name: Adam
|
| 62 |
+
runner:
|
| 63 |
+
eval_dataloaders:
|
| 64 |
+
- dev-clean
|
| 65 |
+
eval_step: 2000
|
| 66 |
+
gradient_accumulate_steps: 1
|
| 67 |
+
gradient_clipping: 1
|
| 68 |
+
log_step: 100
|
| 69 |
+
max_keep: 1
|
| 70 |
+
save_step: 500
|
| 71 |
+
total_steps: 2000
|
| 72 |
+
specaug:
|
| 73 |
+
apply_freq_mask: true
|
| 74 |
+
apply_time_mask: true
|
| 75 |
+
apply_time_warp: true
|
| 76 |
+
freq_mask_width_range:
|
| 77 |
+
- 0
|
| 78 |
+
- 50
|
| 79 |
+
num_freq_mask: 4
|
| 80 |
+
num_time_mask: 2
|
| 81 |
+
time_mask_width_range:
|
| 82 |
+
- 0
|
| 83 |
+
- 40
|
| 84 |
+
time_warp_window: 5
|
asr/model.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:91030dbbf4f9cd4ab4412aec64a219e972cf0049adbe93269bc4e0965586b693
|
| 3 |
+
size 513965711
|
asr/model.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from s3prl.downstream.runner import Runner
|
| 2 |
+
from typing import Dict
|
| 3 |
+
import torch
|
| 4 |
+
import os
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class PreTrainedModel(Runner):
    """Inference wrapper around an s3prl downstream ``Runner``.

    Loads a fine-tuned ASR checkpoint (``model.ckpt``) and its character
    dictionary (``char.dict``) from a local directory, then exposes a
    callable interface that maps a raw waveform to transcribed text.
    """

    def __init__(self, path=""):
        """
        Initialize the downstream model from files stored under *path*.

        Args:
            path (str): Directory containing ``model.ckpt`` and ``char.dict``.
        """
        ckp_file = os.path.join(path, "model.ckpt")
        # NOTE(review): torch.load unpickles arbitrary objects — only load
        # checkpoints from trusted sources.
        ckp = torch.load(ckp_file, map_location='cpu')
        # Rewrite the training-time arguments stored in the checkpoint so the
        # Runner restores this exact checkpoint, on CPU, in inference mode.
        ckp["Args"].init_ckpt = ckp_file
        ckp["Args"].mode = "inference"
        ckp["Args"].device = "cpu"
        # Point the dictionary path at the copy shipped alongside the checkpoint
        # (the path recorded at training time is machine-specific).
        ckp["Config"]["downstream_expert"]["datarc"]["dict_path"] = os.path.join(path,'char.dict')

        Runner.__init__(self, ckp["Args"], ckp["Config"])

    def __call__(self, inputs)-> Dict[str, str]:
        """
        Transcribe a single audio waveform.

        Args:
            inputs (:obj:`np.array`):
                The raw waveform of the audio received. By default at 16KHz.
        Return:
            A :obj:`dict` shaped like ``{"text": "XXX"}``, containing the
            text detected in the input audio.
        """
        # Switch every component tracked by the Runner (upstream, featurizer,
        # downstream, ...) to evaluation mode before inference.
        for entry in self.all_entries:
            entry.model.eval()

        # The s3prl models expect a batch (list) of 1-D float waveform tensors;
        # wrap the single input as a batch of one.
        inputs = [torch.FloatTensor(inputs)]

        with torch.no_grad():
            features = self.upstream.model(inputs)
            features = self.featurizer.model(inputs, features)
            # Empty list: no ground-truth labels are supplied at inference time.
            preds = self.downstream.model.inference(features, [])
        # Batch size is 1, so return the single transcription.
        return {"text": preds[0]}
|