DPTNet (code, models, paper)
Browse files- .gitattributes +1 -0
- DPTNet. A Dual-Path Transformer Architecture for Scene Text Detection.pdf +3 -0
- code/DPTNet [ilyakava] +3.zip +3 -0
- code/DPTNet.zip +3 -0
- code/TargetSpeakerEnhance.zip +3 -0
- code/dptnet_mindspore.zip +3 -0
- models/DPTNet_Libri1Mix_enhsingle_16k/.gitattributes +8 -0
- models/DPTNet_Libri1Mix_enhsingle_16k/README.md +86 -0
- models/DPTNet_Libri1Mix_enhsingle_16k/pytorch_model.bin +3 -0
- models/DPTNet_Libri1Mix_enhsingle_16k/source.txt +1 -0
- models/DPTNet_WHAMR_enhsingle_16k/.gitattributes +27 -0
- models/DPTNet_WHAMR_enhsingle_16k/README.md +82 -0
- models/DPTNet_WHAMR_enhsingle_16k/pytorch_model.bin +3 -0
- models/DPTNet_WHAMR_enhsingle_16k/source.txt +1 -0
- models/DPTNet_jaCappella_VES_48k/.gitattributes +34 -0
- models/DPTNet_jaCappella_VES_48k/README.md +80 -0
- models/DPTNet_jaCappella_VES_48k/best_model.pth +3 -0
- models/DPTNet_jaCappella_VES_48k/conf.yml +46 -0
- models/DPTNet_jaCappella_VES_48k/source.txt +1 -0
- models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/.gitattributes +27 -0
- models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/README.md +253 -0
- models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_stats_8k/train/feats_stats.npz +3 -0
- models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_train_enh_dptnet_raw/99epoch.pth +3 -0
- models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_train_enh_dptnet_raw/RESULTS.md +20 -0
- models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_train_enh_dptnet_raw/config.yaml +169 -0
- models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_train_enh_dptnet_raw/images/backward_time.png +0 -0
- models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_train_enh_dptnet_raw/images/forward_time.png +0 -0
- models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_train_enh_dptnet_raw/images/gpu_max_cached_mem_GB.png +0 -0
- models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_train_enh_dptnet_raw/images/iter_time.png +0 -0
- models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_train_enh_dptnet_raw/images/loss.png +0 -0
- models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_train_enh_dptnet_raw/images/optim0_lr0.png +0 -0
- models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_train_enh_dptnet_raw/images/optim_step_time.png +0 -0
- models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_train_enh_dptnet_raw/images/si_snr_loss.png +0 -0
- models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_train_enh_dptnet_raw/images/train_time.png +0 -0
- models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_train_enh_dptnet_raw/valid.loss.best.pth +3 -0
- models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/meta.yaml +8 -0
- models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/source.txt +1 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
DPTNet.[[:space:]]A[[:space:]]Dual-Path[[:space:]]Transformer[[:space:]]Architecture[[:space:]]for[[:space:]]Scene[[:space:]]Text[[:space:]]Detection.pdf filter=lfs diff=lfs merge=lfs -text
|
DPTNet. A Dual-Path Transformer Architecture for Scene Text Detection.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8419807e87010892c99f4def064fad96cb0bcfd25ec692c0d40426900e34921c
|
| 3 |
+
size 1920411
|
code/DPTNet [ilyakava] +3.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8e1a744916e8e5ba210a5768f0c218e031911b044b618ed129ac158dcc52ab0e
|
| 3 |
+
size 74404
|
code/DPTNet.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8d9e69327140cf7f62d3c6f2c41f4636ecf65d35524575c9c249dd3eb9a326ec
|
| 3 |
+
size 51997
|
code/TargetSpeakerEnhance.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d51a6516e72917aca4a9d43a7df92c568b49b26e40bd28a0ef909e3ab8eb8139
|
| 3 |
+
size 2300975
|
code/dptnet_mindspore.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fe820cc93c5b217a14e9f516d4d156ff1334261ac1eb8b56d61fcb4214573b2c
|
| 3 |
+
size 78533
|
models/DPTNet_Libri1Mix_enhsingle_16k/.gitattributes
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.bin.* filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.tar.gz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
models/DPTNet_Libri1Mix_enhsingle_16k/README.md
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
tags:
|
| 3 |
+
- asteroid
|
| 4 |
+
- audio
|
| 5 |
+
- DPTNet
|
| 6 |
+
- audio-to-audio
|
| 7 |
+
datasets:
|
| 8 |
+
- Libri1Mix
|
| 9 |
+
- enh_single
|
| 10 |
+
license: cc-by-sa-4.0
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
## Asteroid model `JorisCos/DPTNet_Libri1Mix_enhsingle_16k`
|
| 14 |
+
|
| 15 |
+
Description:
|
| 16 |
+
|
| 17 |
+
This model was trained by Joris Cosentino using the librimix recipe in [Asteroid](https://github.com/asteroid-team/asteroid).
|
| 18 |
+
It was trained on the `enh_single` task of the Libri1Mix dataset.
|
| 19 |
+
|
| 20 |
+
Training config:
|
| 21 |
+
|
| 22 |
+
```yml
|
| 23 |
+
data:
|
| 24 |
+
n_src: 1
|
| 25 |
+
sample_rate: 16000
|
| 26 |
+
segment: 3
|
| 27 |
+
task: enh_single
|
| 28 |
+
train_dir: data/wav16k/min/train-360
|
| 29 |
+
valid_dir: data/wav16k/min/dev
|
| 30 |
+
filterbank:
|
| 31 |
+
kernel_size: 16
|
| 32 |
+
n_filters: 64
|
| 33 |
+
stride: 8
|
| 34 |
+
masknet:
|
| 35 |
+
bidirectional: true
|
| 36 |
+
chunk_size: 100
|
| 37 |
+
dropout: 0
|
| 38 |
+
ff_activation: relu
|
| 39 |
+
ff_hid: 256
|
| 40 |
+
hop_size: 50
|
| 41 |
+
in_chan: 64
|
| 42 |
+
mask_act: sigmoid
|
| 43 |
+
n_repeats: 2
|
| 44 |
+
n_src: 1
|
| 45 |
+
norm_type: gLN
|
| 46 |
+
out_chan: 64
|
| 47 |
+
optim:
|
| 48 |
+
lr: 0.001
|
| 49 |
+
optimizer: adam
|
| 50 |
+
weight_decay: 1.0e-05
|
| 51 |
+
scheduler:
|
| 52 |
+
d_model: 64
|
| 53 |
+
steps_per_epoch: 10000
|
| 54 |
+
training:
|
| 55 |
+
batch_size: 4
|
| 56 |
+
early_stop: true
|
| 57 |
+
epochs: 200
|
| 58 |
+
gradient_clipping: 5
|
| 59 |
+
half_lr: true
|
| 60 |
+
num_workers: 4
|
| 61 |
+
```
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
Results:
|
| 65 |
+
|
| 66 |
+
On the Libri1Mix min test set:
|
| 67 |
+
```yml
|
| 68 |
+
si_sdr: 14.829670037349064
|
| 69 |
+
si_sdr_imp: 11.379888731489366
|
| 70 |
+
sdr: 15.395712644737149
|
| 71 |
+
sdr_imp: 11.893049845524112
|
| 72 |
+
sir: Infinity
|
| 73 |
+
sir_imp: NaN
|
| 74 |
+
sar: 15.395712644737149
|
| 75 |
+
sar_imp: 11.893049845524112
|
| 76 |
+
stoi: 0.9301948391058859
|
| 77 |
+
stoi_imp: 0.13427501556534832
|
| 78 |
+
```
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
License notice:
|
| 82 |
+
|
| 83 |
+
This work "DPTNet_Libri1Mix_enhsingle_16k" is a derivative of [LibriSpeech ASR corpus](http://www.openslr.org/12) by Vassil Panayotov,
|
| 84 |
+
used under [CC BY 4.0](https://creativecommons.org/licenses/by/4.0/); and of the WSJ0 Hipster Ambient Mixtures
|
| 85 |
+
dataset by [Whisper.ai](http://wham.whisper.ai/), used under [CC BY-NC 4.0](https://creativecommons.org/licenses/by-nc/4.0/) (Research only).
|
| 86 |
+
"DPTNet_Libri1Mix_enhsingle_16k" is licensed under [Attribution-ShareAlike 3.0 Unported](https://creativecommons.org/licenses/by-sa/3.0/) by Joris Cosentino
|
models/DPTNet_Libri1Mix_enhsingle_16k/pytorch_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0f4f38dc0be2bcb479364b4b49fdc0c92d77fc3f1aa6049090cd3ea0db95019f
|
| 3 |
+
size 11437018
|
models/DPTNet_Libri1Mix_enhsingle_16k/source.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
https://huggingface.co/JorisCos/DPTNet_Libri1Mix_enhsingle_16k
|
models/DPTNet_WHAMR_enhsingle_16k/.gitattributes
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bin.* filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
*.zstandard filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
models/DPTNet_WHAMR_enhsingle_16k/README.md
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
tags:
|
| 3 |
+
- asteroid
|
| 4 |
+
- audio
|
| 5 |
+
- DPTNet
|
| 6 |
+
- audio-to-audio
|
| 7 |
+
datasets:
|
| 8 |
+
- Libri1Mix
|
| 9 |
+
- enh_single
|
| 10 |
+
license: cc-by-sa-4.0
|
| 11 |
+
---
|
| 12 |
+
## Asteroid model `cankeles/DPTNet_WHAMR_enhsingle_16k`
|
| 13 |
+
|
| 14 |
+
Description:
|
| 15 |
+
|
| 16 |
+
This model was trained by M. Can Keleş using the librimix recipe in [Asteroid](https://github.com/asteroid-team/asteroid).
|
| 17 |
+
It was trained on the `enh_single` task of the Libri1Mix dataset.
|
| 18 |
+
|
| 19 |
+
Training config:
|
| 20 |
+
|
| 21 |
+
```yml
|
| 22 |
+
data:
|
| 23 |
+
mode: min
|
| 24 |
+
nondefault_nsrc: null
|
| 25 |
+
sample_rate: 16000
|
| 26 |
+
segment: 2.0
|
| 27 |
+
task: enh_single
|
| 28 |
+
train_dir: wav16k/min/tr/
|
| 29 |
+
valid_dir: wav16k/min/cv/
|
| 30 |
+
filterbank:
|
| 31 |
+
kernel_size: 16
|
| 32 |
+
n_filters: 64
|
| 33 |
+
stride: 8
|
| 34 |
+
main_args:
|
| 35 |
+
exp_dir: exp/tmp
|
| 36 |
+
help: null
|
| 37 |
+
masknet:
|
| 38 |
+
bidirectional: true
|
| 39 |
+
chunk_size: 100
|
| 40 |
+
dropout: 0
|
| 41 |
+
ff_activation: relu
|
| 42 |
+
ff_hid: 256
|
| 43 |
+
hop_size: 50
|
| 44 |
+
in_chan: 64
|
| 45 |
+
mask_act: sigmoid
|
| 46 |
+
n_repeats: 2
|
| 47 |
+
n_src: 1
|
| 48 |
+
norm_type: gLN
|
| 49 |
+
out_chan: 64
|
| 50 |
+
optim:
|
| 51 |
+
lr: 0.001
|
| 52 |
+
optimizer: adam
|
| 53 |
+
weight_decay: 1.0e-05
|
| 54 |
+
positional arguments: {}
|
| 55 |
+
scheduler:
|
| 56 |
+
d_model: 64
|
| 57 |
+
steps_per_epoch: 10000
|
| 58 |
+
training:
|
| 59 |
+
batch_size: 4
|
| 60 |
+
early_stop: true
|
| 61 |
+
epochs: 60
|
| 62 |
+
gradient_clipping: 5
|
| 63 |
+
half_lr: true
|
| 64 |
+
num_workers: 4
|
| 65 |
+
```
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
Results:
|
| 69 |
+
|
| 70 |
+
On custom min test set:
|
| 71 |
+
```yml
|
| 72 |
+
'sar': 12.853384266251018,
|
| 73 |
+
'sar_imp': 8.950332361953906,
|
| 74 |
+
'sdr': 12.853384266251018,
|
| 75 |
+
'sdr_imp': 8.950332361953906,
|
| 76 |
+
'si_sdr': 12.247012621312548,
|
| 77 |
+
'si_sdr_imp': 8.429646186633407,
|
| 78 |
+
'sir': inf,
|
| 79 |
+
'sir_imp': nan,
|
| 80 |
+
'stoi': 0.9022338865380519,
|
| 81 |
+
'stoi_imp': 0.09735707619500522
|
| 82 |
+
```
|
models/DPTNet_WHAMR_enhsingle_16k/pytorch_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:950d7ec1c41e498f3300f9322a019c370509cb72b1f36826bfcf40d6af7c4101
|
| 3 |
+
size 11434540
|
models/DPTNet_WHAMR_enhsingle_16k/source.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
https://huggingface.co/cankeles/DPTNet_WHAMR_enhsingle_16k
|
models/DPTNet_jaCappella_VES_48k/.gitattributes
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
models/DPTNet_jaCappella_VES_48k/README.md
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: cc-by-nc-4.0
|
| 3 |
+
language:
|
| 4 |
+
- ja
|
| 5 |
+
tags:
|
| 6 |
+
- music
|
| 7 |
+
- speech
|
| 8 |
+
- audio
|
| 9 |
+
- audio-to-audio
|
| 10 |
+
- a cappella
|
| 11 |
+
- vocal ensemble
|
| 12 |
+
datasets:
|
| 13 |
+
- jaCappella
|
| 14 |
+
metrics:
|
| 15 |
+
- SI-SDR
|
| 16 |
+
---
|
| 17 |
+
|
| 18 |
+
# DPTNet trained with the jaCappella corpus for vocal ensemble separation
|
| 19 |
+
|
| 20 |
+
This model was trained by Tomohiko Nakamura using [the codebase](https://github.com/TomohikoNakamura/asteroid_jaCappella).
|
| 21 |
+
It was trained on the vocal ensemble separation task of [the jaCappella dataset](https://tomohikonakamura.github.io/jaCappella_corpus/).
|
| 22 |
+
[The paper](https://doi.org/10.1109/ICASSP49357.2023.10095569) was published in ICASSP 2023 ([arXiv](https://arxiv.org/abs/2211.16028)).
|
| 23 |
+
|
| 24 |
+
# License
|
| 25 |
+
See [the jaCappella dataset page](https://tomohikonakamura.github.io/jaCappella_corpus/).
|
| 26 |
+
|
| 27 |
+
# Citation
|
| 28 |
+
See [the jaCappella dataset page](https://tomohikonakamura.github.io/jaCappella_corpus/).
|
| 29 |
+
|
| 30 |
+
# Configuration
|
| 31 |
+
```yaml
|
| 32 |
+
data:
|
| 33 |
+
num_workers: 12
|
| 34 |
+
sample_rate: 48000
|
| 35 |
+
samples_per_track: 13
|
| 36 |
+
seed: 42
|
| 37 |
+
seq_dur: 5.046
|
| 38 |
+
source_augmentations:
|
| 39 |
+
- gain
|
| 40 |
+
sources:
|
| 41 |
+
- vocal_percussion
|
| 42 |
+
- bass
|
| 43 |
+
- alto
|
| 44 |
+
- tenor
|
| 45 |
+
- soprano
|
| 46 |
+
- lead_vocal
|
| 47 |
+
filterbank:
|
| 48 |
+
kernel_size: 32
|
| 49 |
+
n_filters: 64
|
| 50 |
+
stride: 16
|
| 51 |
+
masknet:
|
| 52 |
+
bidirectional: true
|
| 53 |
+
chunk_size: 174
|
| 54 |
+
dropout: 0
|
| 55 |
+
ff_activation: relu
|
| 56 |
+
ff_hid: 256
|
| 57 |
+
hop_size: 128
|
| 58 |
+
in_chan: 64
|
| 59 |
+
mask_act: sigmoid
|
| 60 |
+
n_repeats: 8
|
| 61 |
+
n_src: 6
|
| 62 |
+
norm_type: gLN
|
| 63 |
+
out_chan: 64
|
| 64 |
+
optim:
|
| 65 |
+
lr: 0.005
|
| 66 |
+
optimizer: adam
|
| 67 |
+
weight_decay: 1.0e-05
|
| 68 |
+
training:
|
| 69 |
+
batch_size: 1
|
| 70 |
+
early_stop: true
|
| 71 |
+
epochs: 600
|
| 72 |
+
gradient_clipping: 5
|
| 73 |
+
half_lr: true
|
| 74 |
+
loss_func: pit_sisdr
|
| 75 |
+
```
|
| 76 |
+
# Results (SI-SDR [dB]) on vocal ensemble separation
|
| 77 |
+
|
| 78 |
+
| Method | Lead vocal | Soprano | Alto | Tenor | Bass |Vocal percussion|
|
| 79 |
+
|:---------------:|:--------------:|:--------------:|:--------------:|:--------------:|:--------------:|:--------------:|
|
| 80 |
+
| DPTNet | 8.9 | 8.5 | 11.9 | 14.9 | 19.7 | 21.9 |
|
models/DPTNet_jaCappella_VES_48k/best_model.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2d0738ae01145bdf074e8e3a2312f1a21ff2e0c96f2a4f42b1cd0d2c7f4780ac
|
| 3 |
+
size 45639083
|
models/DPTNet_jaCappella_VES_48k/conf.yml
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
data:
|
| 2 |
+
num_workers: 12
|
| 3 |
+
sample_rate: 48000
|
| 4 |
+
samples_per_track: 13
|
| 5 |
+
seed: 42
|
| 6 |
+
seq_dur: 5.046
|
| 7 |
+
source_augmentations:
|
| 8 |
+
- gain
|
| 9 |
+
sources:
|
| 10 |
+
- vocal_percussion
|
| 11 |
+
- bass
|
| 12 |
+
- alto
|
| 13 |
+
- tenor
|
| 14 |
+
- soprano
|
| 15 |
+
- lead_vocal
|
| 16 |
+
filterbank:
|
| 17 |
+
kernel_size: 32
|
| 18 |
+
n_filters: 64
|
| 19 |
+
stride: 16
|
| 20 |
+
main_args:
|
| 21 |
+
help: null
|
| 22 |
+
masknet:
|
| 23 |
+
bidirectional: true
|
| 24 |
+
chunk_size: 174
|
| 25 |
+
dropout: 0
|
| 26 |
+
ff_activation: relu
|
| 27 |
+
ff_hid: 256
|
| 28 |
+
hop_size: 128
|
| 29 |
+
in_chan: 64
|
| 30 |
+
mask_act: sigmoid
|
| 31 |
+
n_repeats: 8
|
| 32 |
+
n_src: 6
|
| 33 |
+
norm_type: gLN
|
| 34 |
+
out_chan: 64
|
| 35 |
+
optim:
|
| 36 |
+
lr: 0.005
|
| 37 |
+
optimizer: adam
|
| 38 |
+
weight_decay: 1.0e-05
|
| 39 |
+
positional arguments: {}
|
| 40 |
+
training:
|
| 41 |
+
batch_size: 1
|
| 42 |
+
early_stop: true
|
| 43 |
+
epochs: 600
|
| 44 |
+
gradient_clipping: 5
|
| 45 |
+
half_lr: true
|
| 46 |
+
loss_func: pit_sisdr
|
models/DPTNet_jaCappella_VES_48k/source.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
https://huggingface.co/jaCappella/DPTNet_jaCappella_VES_48k
|
models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/.gitattributes
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
*.zstandard filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/README.md
ADDED
|
@@ -0,0 +1,253 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
tags:
|
| 3 |
+
- espnet
|
| 4 |
+
- audio
|
| 5 |
+
- audio-to-audio
|
| 6 |
+
language:
|
| 7 |
+
datasets:
|
| 8 |
+
- wsj0-2mix
|
| 9 |
+
license: cc-by-4.0
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
## ESPnet2 ENH model
|
| 13 |
+
|
| 14 |
+
### `espnet/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw`
|
| 15 |
+
|
| 16 |
+
This model was trained by Wangyou Zhang using the wsj0_2mix recipe in [espnet](https://github.com/espnet/espnet/).
|
| 17 |
+
|
| 18 |
+
### Demo: How to use in ESPnet2
|
| 19 |
+
|
| 20 |
+
```bash
|
| 21 |
+
cd espnet
|
| 22 |
+
|
| 23 |
+
pip install -e .
|
| 24 |
+
cd egs2/wsj0_2mix/enh1
|
| 25 |
+
./run.sh --skip_data_prep false --skip_train true --download_model espnet/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw
|
| 26 |
+
```
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
## ENH config
|
| 31 |
+
|
| 32 |
+
<details><summary>expand</summary>
|
| 33 |
+
|
| 34 |
+
```
|
| 35 |
+
config: conf/tuning/train_enh_dptnet.yaml
|
| 36 |
+
print_config: false
|
| 37 |
+
log_level: INFO
|
| 38 |
+
dry_run: false
|
| 39 |
+
iterator_type: chunk
|
| 40 |
+
output_dir: exp/enh_train_enh_dptnet_raw
|
| 41 |
+
ngpu: 1
|
| 42 |
+
seed: 0
|
| 43 |
+
num_workers: 4
|
| 44 |
+
num_att_plot: 3
|
| 45 |
+
dist_backend: nccl
|
| 46 |
+
dist_init_method: env://
|
| 47 |
+
dist_world_size: 4
|
| 48 |
+
dist_rank: 0
|
| 49 |
+
local_rank: 0
|
| 50 |
+
dist_master_addr: localhost
|
| 51 |
+
dist_master_port: 53094
|
| 52 |
+
dist_launcher: null
|
| 53 |
+
multiprocessing_distributed: true
|
| 54 |
+
unused_parameters: true
|
| 55 |
+
sharded_ddp: false
|
| 56 |
+
cudnn_enabled: true
|
| 57 |
+
cudnn_benchmark: false
|
| 58 |
+
cudnn_deterministic: true
|
| 59 |
+
collect_stats: false
|
| 60 |
+
write_collected_feats: false
|
| 61 |
+
validate_train_iter: false
|
| 62 |
+
max_epoch: 150
|
| 63 |
+
patience: 10
|
| 64 |
+
val_scheduler_criterion:
|
| 65 |
+
- valid
|
| 66 |
+
- loss
|
| 67 |
+
early_stopping_criterion:
|
| 68 |
+
- valid
|
| 69 |
+
- loss
|
| 70 |
+
- min
|
| 71 |
+
best_model_criterion:
|
| 72 |
+
- - valid
|
| 73 |
+
- si_snr
|
| 74 |
+
- max
|
| 75 |
+
- - valid
|
| 76 |
+
- loss
|
| 77 |
+
- min
|
| 78 |
+
keep_nbest_models: 1
|
| 79 |
+
nbest_averaging_interval: 0
|
| 80 |
+
grad_clip: 5
|
| 81 |
+
grad_clip_type: 2.0
|
| 82 |
+
grad_noise: false
|
| 83 |
+
accum_grad: 1
|
| 84 |
+
no_forward_run: false
|
| 85 |
+
resume: true
|
| 86 |
+
train_dtype: float32
|
| 87 |
+
use_amp: false
|
| 88 |
+
log_interval: null
|
| 89 |
+
use_matplotlib: true
|
| 90 |
+
use_tensorboard: true
|
| 91 |
+
use_wandb: false
|
| 92 |
+
wandb_project: null
|
| 93 |
+
wandb_id: null
|
| 94 |
+
wandb_entity: null
|
| 95 |
+
wandb_name: null
|
| 96 |
+
wandb_model_log_interval: -1
|
| 97 |
+
detect_anomaly: false
|
| 98 |
+
pretrain_path: null
|
| 99 |
+
init_param: []
|
| 100 |
+
ignore_init_mismatch: false
|
| 101 |
+
freeze_param: []
|
| 102 |
+
num_iters_per_epoch: null
|
| 103 |
+
batch_size: 4
|
| 104 |
+
valid_batch_size: null
|
| 105 |
+
batch_bins: 1000000
|
| 106 |
+
valid_batch_bins: null
|
| 107 |
+
train_shape_file:
|
| 108 |
+
- exp/enh_stats_8k/train/speech_mix_shape
|
| 109 |
+
- exp/enh_stats_8k/train/speech_ref1_shape
|
| 110 |
+
- exp/enh_stats_8k/train/speech_ref2_shape
|
| 111 |
+
valid_shape_file:
|
| 112 |
+
- exp/enh_stats_8k/valid/speech_mix_shape
|
| 113 |
+
- exp/enh_stats_8k/valid/speech_ref1_shape
|
| 114 |
+
- exp/enh_stats_8k/valid/speech_ref2_shape
|
| 115 |
+
batch_type: folded
|
| 116 |
+
valid_batch_type: null
|
| 117 |
+
fold_length:
|
| 118 |
+
- 80000
|
| 119 |
+
- 80000
|
| 120 |
+
- 80000
|
| 121 |
+
sort_in_batch: descending
|
| 122 |
+
sort_batch: descending
|
| 123 |
+
multiple_iterator: false
|
| 124 |
+
chunk_length: 20000
|
| 125 |
+
chunk_shift_ratio: 0.5
|
| 126 |
+
num_cache_chunks: 1024
|
| 127 |
+
train_data_path_and_name_and_type:
|
| 128 |
+
- - dump/raw/tr_min_8k/wav.scp
|
| 129 |
+
- speech_mix
|
| 130 |
+
- sound
|
| 131 |
+
- - dump/raw/tr_min_8k/spk1.scp
|
| 132 |
+
- speech_ref1
|
| 133 |
+
- sound
|
| 134 |
+
- - dump/raw/tr_min_8k/spk2.scp
|
| 135 |
+
- speech_ref2
|
| 136 |
+
- sound
|
| 137 |
+
valid_data_path_and_name_and_type:
|
| 138 |
+
- - dump/raw/cv_min_8k/wav.scp
|
| 139 |
+
- speech_mix
|
| 140 |
+
- sound
|
| 141 |
+
- - dump/raw/cv_min_8k/spk1.scp
|
| 142 |
+
- speech_ref1
|
| 143 |
+
- sound
|
| 144 |
+
- - dump/raw/cv_min_8k/spk2.scp
|
| 145 |
+
- speech_ref2
|
| 146 |
+
- sound
|
| 147 |
+
allow_variable_data_keys: false
|
| 148 |
+
max_cache_size: 0.0
|
| 149 |
+
max_cache_fd: 32
|
| 150 |
+
valid_max_cache_size: null
|
| 151 |
+
optim: adam
|
| 152 |
+
optim_conf:
|
| 153 |
+
lr: 0.0004
|
| 154 |
+
eps: 1.0e-08
|
| 155 |
+
weight_decay: 1.0e-05
|
| 156 |
+
scheduler: warmupsteplr
|
| 157 |
+
scheduler_conf:
|
| 158 |
+
warmup_steps: 4000
|
| 159 |
+
steps_per_epoch: 14273
|
| 160 |
+
step_size: 2
|
| 161 |
+
gamma: 0.98
|
| 162 |
+
init: null
|
| 163 |
+
model_conf:
|
| 164 |
+
stft_consistency: false
|
| 165 |
+
loss_type: mask_mse
|
| 166 |
+
mask_type: null
|
| 167 |
+
criterions:
|
| 168 |
+
- name: si_snr
|
| 169 |
+
conf:
|
| 170 |
+
eps: 1.0e-07
|
| 171 |
+
wrapper: pit
|
| 172 |
+
wrapper_conf:
|
| 173 |
+
weight: 1.0
|
| 174 |
+
independent_perm: true
|
| 175 |
+
use_preprocessor: false
|
| 176 |
+
encoder: conv
|
| 177 |
+
encoder_conf:
|
| 178 |
+
channel: 64
|
| 179 |
+
kernel_size: 2
|
| 180 |
+
stride: 1
|
| 181 |
+
separator: dptnet
|
| 182 |
+
separator_conf:
|
| 183 |
+
num_spk: 2
|
| 184 |
+
post_enc_relu: true
|
| 185 |
+
layer: 6
|
| 186 |
+
rnn_type: lstm
|
| 187 |
+
bidirectional: true
|
| 188 |
+
unit: 128
|
| 189 |
+
att_heads: 4
|
| 190 |
+
dropout: 0.0
|
| 191 |
+
activation: relu
|
| 192 |
+
norm_type: gLN
|
| 193 |
+
segment_size: 250
|
| 194 |
+
nonlinear: relu
|
| 195 |
+
decoder: conv
|
| 196 |
+
decoder_conf:
|
| 197 |
+
channel: 64
|
| 198 |
+
kernel_size: 2
|
| 199 |
+
stride: 1
|
| 200 |
+
required:
|
| 201 |
+
- output_dir
|
| 202 |
+
version: 0.10.7a1
|
| 203 |
+
distributed: true
|
| 204 |
+
```
|
| 205 |
+
|
| 206 |
+
</details>
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
### Citing ESPnet
|
| 211 |
+
|
| 212 |
+
```BibTex
|
| 213 |
+
@inproceedings{watanabe2018espnet,
|
| 214 |
+
author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
|
| 215 |
+
title={{ESPnet}: End-to-End Speech Processing Toolkit},
|
| 216 |
+
year={2018},
|
| 217 |
+
booktitle={Proceedings of Interspeech},
|
| 218 |
+
pages={2207--2211},
|
| 219 |
+
doi={10.21437/Interspeech.2018-1456},
|
| 220 |
+
url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
|
| 221 |
+
}
|
| 222 |
+
|
| 223 |
+
@inproceedings{li2021espnetse,
|
| 224 |
+
title={{ESPnet-SE}: End-to-End Speech Enhancement and Separation Toolkit Designed for {ASR} Integration},
|
| 225 |
+
author={Li, Chenda and Shi, Jing and Zhang, Wangyou and Subramanian, Aswin Shanmugam and Chang, Xuankai and Kamo, Naoyuki and Hira, Moto and Hayashi, Tomoki and Boeddeker, Christoph and Chen, Zhuo and Watanabe, Shinji},
|
| 226 |
+
booktitle={Proc. IEEE Spoken Language Technology Workshop (SLT)},
|
| 227 |
+
pages={785--792},
|
| 228 |
+
year={2021},
|
| 229 |
+
}
|
| 230 |
+
|
| 231 |
+
```
|
| 232 |
+
|
| 233 |
+
or arXiv:
|
| 234 |
+
|
| 235 |
+
```bibtex
|
| 236 |
+
@misc{watanabe2018espnet,
|
| 237 |
+
title={ESPnet: End-to-End Speech Processing Toolkit},
|
| 238 |
+
author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
|
| 239 |
+
year={2018},
|
| 240 |
+
eprint={1804.00015},
|
| 241 |
+
archivePrefix={arXiv},
|
| 242 |
+
primaryClass={cs.CL}
|
| 243 |
+
}
|
| 244 |
+
|
| 245 |
+
@misc{li2021espnetse,
|
| 246 |
+
title={{ESPnet-SE}: End-to-End Speech Enhancement and Separation Toolkit Designed for {ASR} Integration},
|
| 247 |
+
author={Li, Chenda and Shi, Jing and Zhang, Wangyou and Subramanian, Aswin Shanmugam and Chang, Xuankai and Kamo, Naoyuki and Hira, Moto and Hayashi, Tomoki and Boeddeker, Christoph and Chen, Zhuo and Watanabe, Shinji},
|
| 248 |
+
year={2020},
|
| 249 |
+
eprint={2011.03706},
|
| 250 |
+
archivePrefix={arXiv},
|
| 251 |
+
primaryClass={eess.AS}
|
| 252 |
+
}
|
| 253 |
+
```
|
models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_stats_8k/train/feats_stats.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d890c44023968991b362b31f39fcecc453f0d619071befb36205d610e8aabb8b
|
| 3 |
+
size 778
|
models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_train_enh_dptnet_raw/99epoch.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:34bbfa87de88766844af4c3d313e34ef99e15194e6f394df354ca5fb6564bb0c
|
| 3 |
+
size 11274659
|
models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_train_enh_dptnet_raw/RESULTS.md
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!-- Generated by ./scripts/utils/show_enh_score.sh -->
|
| 2 |
+
# RESULTS
|
| 3 |
+
## Environments
|
| 4 |
+
- date: `Tue Jun 21 20:50:00 CST 2022`
|
| 5 |
+
- python version: `3.8.12 (default, Oct 12 2021, 13:49:34) [GCC 7.5.0]`
|
| 6 |
+
- espnet version: `espnet 0.10.7a1`
|
| 7 |
+
- pytorch version: `pytorch 1.10.2+cu102`
|
| 8 |
+
- Git hash: `9c24b3adddbde3402530080cb58ae08a6f4dd642`
|
| 9 |
+
- Commit date: `Wed Feb 23 14:49:15 2022 -0500`
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
## enh_train_enh_dptnet_orig_raw
|
| 13 |
+
|
| 14 |
+
config: conf/tuning/train_enh_dptnet.yaml
|
| 15 |
+
|
| 16 |
+
|dataset|STOI|SAR|SDR|SIR|SI_SNR|
|
| 17 |
+
|---|---|---|---|---|---|
|
| 18 |
+
|enhanced_cv_min_8k|97.43|21.39|20.98|32.17|20.63|
|
| 19 |
+
|enhanced_tt_min_8k|98.18|21.47|21.06|32.48|20.72|
|
| 20 |
+
|
models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_train_enh_dptnet_raw/config.yaml
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
config: conf/tuning/train_enh_dptnet.yaml
|
| 2 |
+
print_config: false
|
| 3 |
+
log_level: INFO
|
| 4 |
+
dry_run: false
|
| 5 |
+
iterator_type: chunk
|
| 6 |
+
output_dir: exp/enh_train_enh_dptnet_raw
|
| 7 |
+
ngpu: 1
|
| 8 |
+
seed: 0
|
| 9 |
+
num_workers: 4
|
| 10 |
+
num_att_plot: 3
|
| 11 |
+
dist_backend: nccl
|
| 12 |
+
dist_init_method: env://
|
| 13 |
+
dist_world_size: 4
|
| 14 |
+
dist_rank: 0
|
| 15 |
+
local_rank: 0
|
| 16 |
+
dist_master_addr: localhost
|
| 17 |
+
dist_master_port: 53094
|
| 18 |
+
dist_launcher: null
|
| 19 |
+
multiprocessing_distributed: true
|
| 20 |
+
unused_parameters: true
|
| 21 |
+
sharded_ddp: false
|
| 22 |
+
cudnn_enabled: true
|
| 23 |
+
cudnn_benchmark: false
|
| 24 |
+
cudnn_deterministic: true
|
| 25 |
+
collect_stats: false
|
| 26 |
+
write_collected_feats: false
|
| 27 |
+
validate_train_iter: false
|
| 28 |
+
max_epoch: 150
|
| 29 |
+
patience: 10
|
| 30 |
+
val_scheduler_criterion:
|
| 31 |
+
- valid
|
| 32 |
+
- loss
|
| 33 |
+
early_stopping_criterion:
|
| 34 |
+
- valid
|
| 35 |
+
- loss
|
| 36 |
+
- min
|
| 37 |
+
best_model_criterion:
|
| 38 |
+
- - valid
|
| 39 |
+
- si_snr
|
| 40 |
+
- max
|
| 41 |
+
- - valid
|
| 42 |
+
- loss
|
| 43 |
+
- min
|
| 44 |
+
keep_nbest_models: 1
|
| 45 |
+
nbest_averaging_interval: 0
|
| 46 |
+
grad_clip: 5
|
| 47 |
+
grad_clip_type: 2.0
|
| 48 |
+
grad_noise: false
|
| 49 |
+
accum_grad: 1
|
| 50 |
+
no_forward_run: false
|
| 51 |
+
resume: true
|
| 52 |
+
train_dtype: float32
|
| 53 |
+
use_amp: false
|
| 54 |
+
log_interval: null
|
| 55 |
+
use_matplotlib: true
|
| 56 |
+
use_tensorboard: true
|
| 57 |
+
use_wandb: false
|
| 58 |
+
wandb_project: null
|
| 59 |
+
wandb_id: null
|
| 60 |
+
wandb_entity: null
|
| 61 |
+
wandb_name: null
|
| 62 |
+
wandb_model_log_interval: -1
|
| 63 |
+
detect_anomaly: false
|
| 64 |
+
pretrain_path: null
|
| 65 |
+
init_param: []
|
| 66 |
+
ignore_init_mismatch: false
|
| 67 |
+
freeze_param: []
|
| 68 |
+
num_iters_per_epoch: null
|
| 69 |
+
batch_size: 4
|
| 70 |
+
valid_batch_size: null
|
| 71 |
+
batch_bins: 1000000
|
| 72 |
+
valid_batch_bins: null
|
| 73 |
+
train_shape_file:
|
| 74 |
+
- exp/enh_stats_8k/train/speech_mix_shape
|
| 75 |
+
- exp/enh_stats_8k/train/speech_ref1_shape
|
| 76 |
+
- exp/enh_stats_8k/train/speech_ref2_shape
|
| 77 |
+
valid_shape_file:
|
| 78 |
+
- exp/enh_stats_8k/valid/speech_mix_shape
|
| 79 |
+
- exp/enh_stats_8k/valid/speech_ref1_shape
|
| 80 |
+
- exp/enh_stats_8k/valid/speech_ref2_shape
|
| 81 |
+
batch_type: folded
|
| 82 |
+
valid_batch_type: null
|
| 83 |
+
fold_length:
|
| 84 |
+
- 80000
|
| 85 |
+
- 80000
|
| 86 |
+
- 80000
|
| 87 |
+
sort_in_batch: descending
|
| 88 |
+
sort_batch: descending
|
| 89 |
+
multiple_iterator: false
|
| 90 |
+
chunk_length: 20000
|
| 91 |
+
chunk_shift_ratio: 0.5
|
| 92 |
+
num_cache_chunks: 1024
|
| 93 |
+
train_data_path_and_name_and_type:
|
| 94 |
+
- - dump/raw/tr_min_8k/wav.scp
|
| 95 |
+
- speech_mix
|
| 96 |
+
- sound
|
| 97 |
+
- - dump/raw/tr_min_8k/spk1.scp
|
| 98 |
+
- speech_ref1
|
| 99 |
+
- sound
|
| 100 |
+
- - dump/raw/tr_min_8k/spk2.scp
|
| 101 |
+
- speech_ref2
|
| 102 |
+
- sound
|
| 103 |
+
valid_data_path_and_name_and_type:
|
| 104 |
+
- - dump/raw/cv_min_8k/wav.scp
|
| 105 |
+
- speech_mix
|
| 106 |
+
- sound
|
| 107 |
+
- - dump/raw/cv_min_8k/spk1.scp
|
| 108 |
+
- speech_ref1
|
| 109 |
+
- sound
|
| 110 |
+
- - dump/raw/cv_min_8k/spk2.scp
|
| 111 |
+
- speech_ref2
|
| 112 |
+
- sound
|
| 113 |
+
allow_variable_data_keys: false
|
| 114 |
+
max_cache_size: 0.0
|
| 115 |
+
max_cache_fd: 32
|
| 116 |
+
valid_max_cache_size: null
|
| 117 |
+
optim: adam
|
| 118 |
+
optim_conf:
|
| 119 |
+
lr: 0.0004
|
| 120 |
+
eps: 1.0e-08
|
| 121 |
+
weight_decay: 1.0e-05
|
| 122 |
+
scheduler: warmupsteplr
|
| 123 |
+
scheduler_conf:
|
| 124 |
+
warmup_steps: 4000
|
| 125 |
+
steps_per_epoch: 14273
|
| 126 |
+
step_size: 2
|
| 127 |
+
gamma: 0.98
|
| 128 |
+
init: null
|
| 129 |
+
model_conf:
|
| 130 |
+
stft_consistency: false
|
| 131 |
+
loss_type: mask_mse
|
| 132 |
+
mask_type: null
|
| 133 |
+
criterions:
|
| 134 |
+
- name: si_snr
|
| 135 |
+
conf:
|
| 136 |
+
eps: 1.0e-07
|
| 137 |
+
wrapper: pit
|
| 138 |
+
wrapper_conf:
|
| 139 |
+
weight: 1.0
|
| 140 |
+
independent_perm: true
|
| 141 |
+
use_preprocessor: false
|
| 142 |
+
encoder: conv
|
| 143 |
+
encoder_conf:
|
| 144 |
+
channel: 64
|
| 145 |
+
kernel_size: 2
|
| 146 |
+
stride: 1
|
| 147 |
+
separator: dptnet
|
| 148 |
+
separator_conf:
|
| 149 |
+
num_spk: 2
|
| 150 |
+
post_enc_relu: true
|
| 151 |
+
layer: 6
|
| 152 |
+
rnn_type: lstm
|
| 153 |
+
bidirectional: true
|
| 154 |
+
unit: 128
|
| 155 |
+
att_heads: 4
|
| 156 |
+
dropout: 0.0
|
| 157 |
+
activation: relu
|
| 158 |
+
norm_type: gLN
|
| 159 |
+
segment_size: 250
|
| 160 |
+
nonlinear: relu
|
| 161 |
+
decoder: conv
|
| 162 |
+
decoder_conf:
|
| 163 |
+
channel: 64
|
| 164 |
+
kernel_size: 2
|
| 165 |
+
stride: 1
|
| 166 |
+
required:
|
| 167 |
+
- output_dir
|
| 168 |
+
version: 0.10.7a1
|
| 169 |
+
distributed: true
|
models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_train_enh_dptnet_raw/images/backward_time.png
ADDED
|
models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_train_enh_dptnet_raw/images/forward_time.png
ADDED
|
models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_train_enh_dptnet_raw/images/gpu_max_cached_mem_GB.png
ADDED
|
models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_train_enh_dptnet_raw/images/iter_time.png
ADDED
|
models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_train_enh_dptnet_raw/images/loss.png
ADDED
|
models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_train_enh_dptnet_raw/images/optim0_lr0.png
ADDED
|
models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_train_enh_dptnet_raw/images/optim_step_time.png
ADDED
|
models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_train_enh_dptnet_raw/images/si_snr_loss.png
ADDED
|
models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_train_enh_dptnet_raw/images/train_time.png
ADDED
|
models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_train_enh_dptnet_raw/valid.loss.best.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a799382546fa95a82961f2e0d7a35ccfc3c984d6c2ecbd294ff8cf198b0357fa
|
| 3 |
+
size 11
|
models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/meta.yaml
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
espnet: 0.10.7a1
|
| 2 |
+
files:
|
| 3 |
+
model_file: exp/enh_train_enh_dptnet_raw/99epoch.pth
|
| 4 |
+
python: "3.8.12 (default, Oct 12 2021, 13:49:34) \n[GCC 7.5.0]"
|
| 5 |
+
timestamp: 1655818843.663898
|
| 6 |
+
torch: 1.10.2+cu102
|
| 7 |
+
yaml_files:
|
| 8 |
+
train_config: exp/enh_train_enh_dptnet_raw/config.yaml
|
models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/source.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
https://huggingface.co/espnet/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw
|