DPRNN (code, models, paper)
Browse files
This view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +1 -0
- Dual-Path Transformer Network. Direct Context-Aware Modeling for End-to-End Monaural Speech Separation.pdf +3 -0
- code/DPTNet [Anyuan96] +1 Dual-Path-Transformer-Network-PyTorch.zip +3 -0
- code/Dual-Path-Transformer-Network-PyTorch [ramincre] +4.zip +3 -0
- code/Dual-Path-Transformer-Network-PyTorch.zip +3 -0
- models/Chenda_Li_wsj0_2mix_enh_dprnn_tasnet/.gitattributes +28 -0
- models/Chenda_Li_wsj0_2mix_enh_dprnn_tasnet/README.md +251 -0
- models/Chenda_Li_wsj0_2mix_enh_dprnn_tasnet/exp/enh_stats_8k/train/feats_stats.npz +3 -0
- models/Chenda_Li_wsj0_2mix_enh_dprnn_tasnet/exp/enh_train_enh_dprnn_tasnet_raw/96epoch.pth +3 -0
- models/Chenda_Li_wsj0_2mix_enh_dprnn_tasnet/exp/enh_train_enh_dprnn_tasnet_raw/RESULTS.md +20 -0
- models/Chenda_Li_wsj0_2mix_enh_dprnn_tasnet/exp/enh_train_enh_dprnn_tasnet_raw/config.yaml +147 -0
- models/Chenda_Li_wsj0_2mix_enh_dprnn_tasnet/exp/enh_train_enh_dprnn_tasnet_raw/images/backward_time.png +0 -0
- models/Chenda_Li_wsj0_2mix_enh_dprnn_tasnet/exp/enh_train_enh_dprnn_tasnet_raw/images/forward_time.png +0 -0
- models/Chenda_Li_wsj0_2mix_enh_dprnn_tasnet/exp/enh_train_enh_dprnn_tasnet_raw/images/gpu_max_cached_mem_GB.png +0 -0
- models/Chenda_Li_wsj0_2mix_enh_dprnn_tasnet/exp/enh_train_enh_dprnn_tasnet_raw/images/iter_time.png +0 -0
- models/Chenda_Li_wsj0_2mix_enh_dprnn_tasnet/exp/enh_train_enh_dprnn_tasnet_raw/images/loss.png +0 -0
- models/Chenda_Li_wsj0_2mix_enh_dprnn_tasnet/exp/enh_train_enh_dprnn_tasnet_raw/images/optim0_lr0.png +0 -0
- models/Chenda_Li_wsj0_2mix_enh_dprnn_tasnet/exp/enh_train_enh_dprnn_tasnet_raw/images/optim_step_time.png +0 -0
- models/Chenda_Li_wsj0_2mix_enh_dprnn_tasnet/exp/enh_train_enh_dprnn_tasnet_raw/images/si_snr.png +0 -0
- models/Chenda_Li_wsj0_2mix_enh_dprnn_tasnet/exp/enh_train_enh_dprnn_tasnet_raw/images/train_time.png +0 -0
- models/Chenda_Li_wsj0_2mix_enh_dprnn_tasnet/meta.yaml +8 -0
- models/Chenda_Li_wsj0_2mix_enh_dprnn_tasnet/source.txt +1 -0
- models/DPRNNTasNet-ks16_WHAM_sepclean/.gitattributes +8 -0
- models/DPRNNTasNet-ks16_WHAM_sepclean/README.md +107 -0
- models/DPRNNTasNet-ks16_WHAM_sepclean/pytorch_model.bin +3 -0
- models/DPRNNTasNet-ks16_WHAM_sepclean/source.txt +1 -0
- models/DPRNNTasNet-ks2_Libri1Mix_enhsingle_16k/.gitattributes +8 -0
- models/DPRNNTasNet-ks2_Libri1Mix_enhsingle_16k/README.md +82 -0
- models/DPRNNTasNet-ks2_Libri1Mix_enhsingle_16k/pytorch_model.bin +3 -0
- models/DPRNNTasNet-ks2_Libri1Mix_enhsingle_16k/source.txt +1 -0
- models/DPRNNTasNet-ks2_WHAM_sepclean/.gitattributes +8 -0
- models/DPRNNTasNet-ks2_WHAM_sepclean/README.md +84 -0
- models/DPRNNTasNet-ks2_WHAM_sepclean/pytorch_model.bin +3 -0
- models/DPRNNTasNet-ks2_WHAM_sepclean/source.txt +1 -0
- models/DPRNNTasNet_LibriMix_sepclean/.gitattributes +35 -0
- models/DPRNNTasNet_LibriMix_sepclean/epoch=9-step=1000.ckpt +3 -0
- models/DPRNNTasNet_LibriMix_sepclean/source.txt +1 -0
- models/Yen-Ju_Lu_l3das22_enh_train_dprnntac_fasnet_valid.loss.ave/.gitattributes +27 -0
- models/Yen-Ju_Lu_l3das22_enh_train_dprnntac_fasnet_valid.loss.ave/README.md +251 -0
- models/Yen-Ju_Lu_l3das22_enh_train_dprnntac_fasnet_valid.loss.ave/exp/enh_stats_16k/train/feats_stats.npz +3 -0
- models/Yen-Ju_Lu_l3das22_enh_train_dprnntac_fasnet_valid.loss.ave/exp/enh_train_enh_dprnntac_fasnet_raw/299epoch.pth +3 -0
- models/Yen-Ju_Lu_l3das22_enh_train_dprnntac_fasnet_valid.loss.ave/exp/enh_train_enh_dprnntac_fasnet_raw/RESULTS.md +20 -0
- models/Yen-Ju_Lu_l3das22_enh_train_dprnntac_fasnet_valid.loss.ave/exp/enh_train_enh_dprnntac_fasnet_raw/config.yaml +149 -0
- models/Yen-Ju_Lu_l3das22_enh_train_dprnntac_fasnet_valid.loss.ave/exp/enh_train_enh_dprnntac_fasnet_raw/images/backward_time.png +0 -0
- models/Yen-Ju_Lu_l3das22_enh_train_dprnntac_fasnet_valid.loss.ave/exp/enh_train_enh_dprnntac_fasnet_raw/images/forward_time.png +0 -0
- models/Yen-Ju_Lu_l3das22_enh_train_dprnntac_fasnet_valid.loss.ave/exp/enh_train_enh_dprnntac_fasnet_raw/images/gpu_max_cached_mem_GB.png +0 -0
- models/Yen-Ju_Lu_l3das22_enh_train_dprnntac_fasnet_valid.loss.ave/exp/enh_train_enh_dprnntac_fasnet_raw/images/iter_time.png +0 -0
- models/Yen-Ju_Lu_l3das22_enh_train_dprnntac_fasnet_valid.loss.ave/exp/enh_train_enh_dprnntac_fasnet_raw/images/loss.png +0 -0
- models/Yen-Ju_Lu_l3das22_enh_train_dprnntac_fasnet_valid.loss.ave/exp/enh_train_enh_dprnntac_fasnet_raw/images/optim0_lr0.png +0 -0
- models/Yen-Ju_Lu_l3das22_enh_train_dprnntac_fasnet_valid.loss.ave/exp/enh_train_enh_dprnntac_fasnet_raw/images/optim_step_time.png +0 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
Dual-Path[[:space:]]Transformer[[:space:]]Network.[[:space:]]Direct[[:space:]]Context-Aware[[:space:]]Modeling[[:space:]]for[[:space:]]End-to-End[[:space:]]Monaural[[:space:]]Speech[[:space:]]Separation.pdf filter=lfs diff=lfs merge=lfs -text
|
Dual-Path Transformer Network. Direct Context-Aware Modeling for End-to-End Monaural Speech Separation.pdf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ff7d3877cae709c58afa97a92fa7acc5b8529b68e6aebdc171625c3021008044
|
| 3 |
+
size 478343
|
code/DPTNet [Anyuan96] +1 Dual-Path-Transformer-Network-PyTorch.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ce7da2cfb5c0e372222d04289ab7433c9ffbc1359e6615862141ea6b82704d78
|
| 3 |
+
size 4735940
|
code/Dual-Path-Transformer-Network-PyTorch [ramincre] +4.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1b173491858a88fcdb3e10c971e060005f543ba74ee5c75ec77b4908489e5c0f
|
| 3 |
+
size 5964111
|
code/Dual-Path-Transformer-Network-PyTorch.zip
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:721fae34b2ba68815ab05c4bd3dc3c7fa9c882e9f092f43748b15d22cfe7178d
|
| 3 |
+
size 4735712
|
models/Chenda_Li_wsj0_2mix_enh_dprnn_tasnet/.gitattributes
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bin.* filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.zstandard filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
models/Chenda_Li_wsj0_2mix_enh_dprnn_tasnet/README.md
ADDED
|
@@ -0,0 +1,251 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
tags:
|
| 3 |
+
- espnet
|
| 4 |
+
- audio
|
| 5 |
+
- audio-to-audio
|
| 6 |
+
language: en
|
| 7 |
+
datasets:
|
| 8 |
+
- wsj0_2mix
|
| 9 |
+
license: cc-by-4.0
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
## ESPnet2 ENH model
|
| 13 |
+
|
| 14 |
+
### `lichenda/Chenda_Li_wsj0_2mix_enh_dprnn_tasnet`
|
| 15 |
+
|
| 16 |
+
This model was trained by LiChenda using the wsj0_2mix recipe in [espnet](https://github.com/espnet/espnet/).
|
| 17 |
+
|
| 18 |
+
Imported from [zenodo](https://zenodo.org/record/4688000).
|
| 19 |
+
|
| 20 |
+
### Demo: How to use in ESPnet2
|
| 21 |
+
|
| 22 |
+
```bash
|
| 23 |
+
cd espnet
|
| 24 |
+
git checkout 54919e2529d6f58f4550d4a72960f57b83f66dc9
|
| 25 |
+
pip install -e .
|
| 26 |
+
cd egs2/wsj0_2mix/enh1
|
| 27 |
+
./run.sh --skip_data_prep false --skip_train true --download_model lichenda/Chenda_Li_wsj0_2mix_enh_dprnn_tasnet
|
| 28 |
+
```
|
| 29 |
+
|
| 30 |
+
<!-- Generated by ./scripts/utils/show_enh_score.sh -->
|
| 31 |
+
# RESULTS
|
| 32 |
+
## Environments
|
| 33 |
+
- date: `Thu Apr 15 00:03:19 CST 2021`
|
| 34 |
+
- python version: `3.7.10 (default, Feb 26 2021, 18:47:35) [GCC 7.3.0]`
|
| 35 |
+
- espnet version: `espnet 0.9.8`
|
| 36 |
+
- pytorch version: `pytorch 1.5.0`
|
| 37 |
+
- Git hash: `2aa2f151b5929dc9ffa4df39a8d8c26ca4dbdb85`
|
| 38 |
+
- Commit date: `Tue Mar 30 09:08:27 2021 +0900`
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
## enh_train_enh_dprnn_tasnet_raw
|
| 42 |
+
|
| 43 |
+
config: conf/tuning/train_enh_dprnn_tasnet.yaml
|
| 44 |
+
|
| 45 |
+
|dataset|STOI|SAR|SDR|SIR|
|
| 46 |
+
|---|---|---|---|---|
|
| 47 |
+
|enhanced_cv_min_8k|0.960037|19.0476|18.5438|29.1591|
|
| 48 |
+
|enhanced_tt_min_8k|0.968376|18.8209|18.2925|28.929|
|
| 49 |
+
|
| 50 |
+
## ENH config
|
| 51 |
+
|
| 52 |
+
<details><summary>expand</summary>
|
| 53 |
+
|
| 54 |
+
```
|
| 55 |
+
config: conf/tuning/train_enh_dprnn_tasnet.yaml
|
| 56 |
+
print_config: false
|
| 57 |
+
log_level: INFO
|
| 58 |
+
dry_run: false
|
| 59 |
+
iterator_type: chunk
|
| 60 |
+
output_dir: exp/enh_train_enh_dprnn_tasnet_raw
|
| 61 |
+
ngpu: 1
|
| 62 |
+
seed: 0
|
| 63 |
+
num_workers: 4
|
| 64 |
+
num_att_plot: 3
|
| 65 |
+
dist_backend: nccl
|
| 66 |
+
dist_init_method: env://
|
| 67 |
+
dist_world_size: 4
|
| 68 |
+
dist_rank: 0
|
| 69 |
+
local_rank: 0
|
| 70 |
+
dist_master_addr: localhost
|
| 71 |
+
dist_master_port: 45126
|
| 72 |
+
dist_launcher: null
|
| 73 |
+
multiprocessing_distributed: true
|
| 74 |
+
unused_parameters: false
|
| 75 |
+
sharded_ddp: false
|
| 76 |
+
cudnn_enabled: true
|
| 77 |
+
cudnn_benchmark: false
|
| 78 |
+
cudnn_deterministic: true
|
| 79 |
+
collect_stats: false
|
| 80 |
+
write_collected_feats: false
|
| 81 |
+
max_epoch: 150
|
| 82 |
+
patience: 4
|
| 83 |
+
val_scheduler_criterion:
|
| 84 |
+
- valid
|
| 85 |
+
- loss
|
| 86 |
+
early_stopping_criterion:
|
| 87 |
+
- valid
|
| 88 |
+
- loss
|
| 89 |
+
- min
|
| 90 |
+
best_model_criterion:
|
| 91 |
+
- - valid
|
| 92 |
+
- si_snr
|
| 93 |
+
- max
|
| 94 |
+
- - valid
|
| 95 |
+
- loss
|
| 96 |
+
- min
|
| 97 |
+
keep_nbest_models: 1
|
| 98 |
+
grad_clip: 5.0
|
| 99 |
+
grad_clip_type: 2.0
|
| 100 |
+
grad_noise: false
|
| 101 |
+
accum_grad: 1
|
| 102 |
+
no_forward_run: false
|
| 103 |
+
resume: true
|
| 104 |
+
train_dtype: float32
|
| 105 |
+
use_amp: false
|
| 106 |
+
log_interval: null
|
| 107 |
+
use_tensorboard: true
|
| 108 |
+
use_wandb: false
|
| 109 |
+
wandb_project: null
|
| 110 |
+
wandb_id: null
|
| 111 |
+
detect_anomaly: false
|
| 112 |
+
pretrain_path: null
|
| 113 |
+
init_param: []
|
| 114 |
+
freeze_param: []
|
| 115 |
+
num_iters_per_epoch: null
|
| 116 |
+
batch_size: 4
|
| 117 |
+
valid_batch_size: null
|
| 118 |
+
batch_bins: 1000000
|
| 119 |
+
valid_batch_bins: null
|
| 120 |
+
train_shape_file:
|
| 121 |
+
- exp/enh_stats_8k/train/speech_mix_shape
|
| 122 |
+
- exp/enh_stats_8k/train/speech_ref1_shape
|
| 123 |
+
- exp/enh_stats_8k/train/speech_ref2_shape
|
| 124 |
+
valid_shape_file:
|
| 125 |
+
- exp/enh_stats_8k/valid/speech_mix_shape
|
| 126 |
+
- exp/enh_stats_8k/valid/speech_ref1_shape
|
| 127 |
+
- exp/enh_stats_8k/valid/speech_ref2_shape
|
| 128 |
+
batch_type: folded
|
| 129 |
+
valid_batch_type: null
|
| 130 |
+
fold_length:
|
| 131 |
+
- 80000
|
| 132 |
+
- 80000
|
| 133 |
+
- 80000
|
| 134 |
+
sort_in_batch: descending
|
| 135 |
+
sort_batch: descending
|
| 136 |
+
multiple_iterator: false
|
| 137 |
+
chunk_length: 32000
|
| 138 |
+
chunk_shift_ratio: 0.5
|
| 139 |
+
num_cache_chunks: 1024
|
| 140 |
+
train_data_path_and_name_and_type:
|
| 141 |
+
- - dump/raw/tr_min_8k/wav.scp
|
| 142 |
+
- speech_mix
|
| 143 |
+
- sound
|
| 144 |
+
- - dump/raw/tr_min_8k/spk1.scp
|
| 145 |
+
- speech_ref1
|
| 146 |
+
- sound
|
| 147 |
+
- - dump/raw/tr_min_8k/spk2.scp
|
| 148 |
+
- speech_ref2
|
| 149 |
+
- sound
|
| 150 |
+
valid_data_path_and_name_and_type:
|
| 151 |
+
- - dump/raw/cv_min_8k/wav.scp
|
| 152 |
+
- speech_mix
|
| 153 |
+
- sound
|
| 154 |
+
- - dump/raw/cv_min_8k/spk1.scp
|
| 155 |
+
- speech_ref1
|
| 156 |
+
- sound
|
| 157 |
+
- - dump/raw/cv_min_8k/spk2.scp
|
| 158 |
+
- speech_ref2
|
| 159 |
+
- sound
|
| 160 |
+
allow_variable_data_keys: false
|
| 161 |
+
max_cache_size: 0.0
|
| 162 |
+
max_cache_fd: 32
|
| 163 |
+
valid_max_cache_size: null
|
| 164 |
+
optim: adam
|
| 165 |
+
optim_conf:
|
| 166 |
+
lr: 0.001
|
| 167 |
+
eps: 1.0e-08
|
| 168 |
+
weight_decay: 0
|
| 169 |
+
scheduler: reducelronplateau
|
| 170 |
+
scheduler_conf:
|
| 171 |
+
mode: min
|
| 172 |
+
factor: 0.7
|
| 173 |
+
patience: 1
|
| 174 |
+
init: xavier_uniform
|
| 175 |
+
model_conf:
|
| 176 |
+
loss_type: si_snr
|
| 177 |
+
use_preprocessor: false
|
| 178 |
+
encoder: conv
|
| 179 |
+
encoder_conf:
|
| 180 |
+
channel: 64
|
| 181 |
+
kernel_size: 2
|
| 182 |
+
stride: 1
|
| 183 |
+
separator: dprnn
|
| 184 |
+
separator_conf:
|
| 185 |
+
num_spk: 2
|
| 186 |
+
layer: 6
|
| 187 |
+
rnn_type: lstm
|
| 188 |
+
bidirectional: true
|
| 189 |
+
nonlinear: relu
|
| 190 |
+
unit: 128
|
| 191 |
+
segment_size: 250
|
| 192 |
+
dropout: 0.1
|
| 193 |
+
decoder: conv
|
| 194 |
+
decoder_conf:
|
| 195 |
+
channel: 64
|
| 196 |
+
kernel_size: 2
|
| 197 |
+
stride: 1
|
| 198 |
+
required:
|
| 199 |
+
- output_dir
|
| 200 |
+
version: 0.9.8
|
| 201 |
+
distributed: true
|
| 202 |
+
```
|
| 203 |
+
|
| 204 |
+
</details>
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
|
| 208 |
+
### Citing ESPnet
|
| 209 |
+
|
| 210 |
+
```BibTex
|
| 211 |
+
@inproceedings{watanabe2018espnet,
|
| 212 |
+
author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
|
| 213 |
+
title={{ESPnet}: End-to-End Speech Processing Toolkit},
|
| 214 |
+
year={2018},
|
| 215 |
+
booktitle={Proceedings of Interspeech},
|
| 216 |
+
pages={2207--2211},
|
| 217 |
+
doi={10.21437/Interspeech.2018-1456},
|
| 218 |
+
url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
|
| 219 |
+
}
|
| 220 |
+
|
| 221 |
+
|
| 222 |
+
@inproceedings{ESPnet-SE,
|
| 223 |
+
author = {Chenda Li and Jing Shi and Wangyou Zhang and Aswin Shanmugam Subramanian and Xuankai Chang and
|
| 224 |
+
Naoyuki Kamo and Moto Hira and Tomoki Hayashi and Christoph B{\"{o}}ddeker and Zhuo Chen and Shinji Watanabe},
|
| 225 |
+
title = {ESPnet-SE: End-To-End Speech Enhancement and Separation Toolkit Designed for {ASR} Integration},
|
| 226 |
+
booktitle = {{IEEE} Spoken Language Technology Workshop, {SLT} 2021, Shenzhen, China, January 19-22, 2021},
|
| 227 |
+
pages = {785--792},
|
| 228 |
+
publisher = {{IEEE}},
|
| 229 |
+
year = {2021},
|
| 230 |
+
url = {https://doi.org/10.1109/SLT48900.2021.9383615},
|
| 231 |
+
doi = {10.1109/SLT48900.2021.9383615},
|
| 232 |
+
timestamp = {Mon, 12 Apr 2021 17:08:59 +0200},
|
| 233 |
+
biburl = {https://dblp.org/rec/conf/slt/Li0ZSCKHHBC021.bib},
|
| 234 |
+
bibsource = {dblp computer science bibliography, https://dblp.org}
|
| 235 |
+
}
|
| 236 |
+
|
| 237 |
+
|
| 238 |
+
```
|
| 239 |
+
|
| 240 |
+
or arXiv:
|
| 241 |
+
|
| 242 |
+
```bibtex
|
| 243 |
+
@misc{watanabe2018espnet,
|
| 244 |
+
title={ESPnet: End-to-End Speech Processing Toolkit},
|
| 245 |
+
author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
|
| 246 |
+
year={2018},
|
| 247 |
+
eprint={1804.00015},
|
| 248 |
+
archivePrefix={arXiv},
|
| 249 |
+
primaryClass={cs.CL}
|
| 250 |
+
}
|
| 251 |
+
```
|
models/Chenda_Li_wsj0_2mix_enh_dprnn_tasnet/exp/enh_stats_8k/train/feats_stats.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d890c44023968991b362b31f39fcecc453f0d619071befb36205d610e8aabb8b
|
| 3 |
+
size 778
|
models/Chenda_Li_wsj0_2mix_enh_dprnn_tasnet/exp/enh_train_enh_dprnn_tasnet_raw/96epoch.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:338bc12bf9db30b178247f8b0b3ecbc24b1eff7739c4771f01aaaf1d456c5212
|
| 3 |
+
size 10393743
|
models/Chenda_Li_wsj0_2mix_enh_dprnn_tasnet/exp/enh_train_enh_dprnn_tasnet_raw/RESULTS.md
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!-- Generated by ./scripts/utils/show_enh_score.sh -->
|
| 2 |
+
# RESULTS
|
| 3 |
+
## Environments
|
| 4 |
+
- date: `Thu Apr 15 00:03:19 CST 2021`
|
| 5 |
+
- python version: `3.7.10 (default, Feb 26 2021, 18:47:35) [GCC 7.3.0]`
|
| 6 |
+
- espnet version: `espnet 0.9.8`
|
| 7 |
+
- pytorch version: `pytorch 1.5.0`
|
| 8 |
+
- Git hash: `2aa2f151b5929dc9ffa4df39a8d8c26ca4dbdb85`
|
| 9 |
+
- Commit date: `Tue Mar 30 09:08:27 2021 +0900`
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
## enh_train_enh_dprnn_tasnet_raw
|
| 13 |
+
|
| 14 |
+
config: conf/tuning/train_enh_dprnn_tasnet.yaml
|
| 15 |
+
|
| 16 |
+
|dataset|STOI|SAR|SDR|SIR|
|
| 17 |
+
|---|---|---|---|---|
|
| 18 |
+
|enhanced_cv_min_8k|0.960037|19.0476|18.5438|29.1591|
|
| 19 |
+
|enhanced_tt_min_8k|0.968376|18.8209|18.2925|28.929|
|
| 20 |
+
|
models/Chenda_Li_wsj0_2mix_enh_dprnn_tasnet/exp/enh_train_enh_dprnn_tasnet_raw/config.yaml
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
config: conf/tuning/train_enh_dprnn_tasnet.yaml
|
| 2 |
+
print_config: false
|
| 3 |
+
log_level: INFO
|
| 4 |
+
dry_run: false
|
| 5 |
+
iterator_type: chunk
|
| 6 |
+
output_dir: exp/enh_train_enh_dprnn_tasnet_raw
|
| 7 |
+
ngpu: 1
|
| 8 |
+
seed: 0
|
| 9 |
+
num_workers: 4
|
| 10 |
+
num_att_plot: 3
|
| 11 |
+
dist_backend: nccl
|
| 12 |
+
dist_init_method: env://
|
| 13 |
+
dist_world_size: 4
|
| 14 |
+
dist_rank: 0
|
| 15 |
+
local_rank: 0
|
| 16 |
+
dist_master_addr: localhost
|
| 17 |
+
dist_master_port: 45126
|
| 18 |
+
dist_launcher: null
|
| 19 |
+
multiprocessing_distributed: true
|
| 20 |
+
unused_parameters: false
|
| 21 |
+
sharded_ddp: false
|
| 22 |
+
cudnn_enabled: true
|
| 23 |
+
cudnn_benchmark: false
|
| 24 |
+
cudnn_deterministic: true
|
| 25 |
+
collect_stats: false
|
| 26 |
+
write_collected_feats: false
|
| 27 |
+
max_epoch: 150
|
| 28 |
+
patience: 4
|
| 29 |
+
val_scheduler_criterion:
|
| 30 |
+
- valid
|
| 31 |
+
- loss
|
| 32 |
+
early_stopping_criterion:
|
| 33 |
+
- valid
|
| 34 |
+
- loss
|
| 35 |
+
- min
|
| 36 |
+
best_model_criterion:
|
| 37 |
+
- - valid
|
| 38 |
+
- si_snr
|
| 39 |
+
- max
|
| 40 |
+
- - valid
|
| 41 |
+
- loss
|
| 42 |
+
- min
|
| 43 |
+
keep_nbest_models: 1
|
| 44 |
+
grad_clip: 5.0
|
| 45 |
+
grad_clip_type: 2.0
|
| 46 |
+
grad_noise: false
|
| 47 |
+
accum_grad: 1
|
| 48 |
+
no_forward_run: false
|
| 49 |
+
resume: true
|
| 50 |
+
train_dtype: float32
|
| 51 |
+
use_amp: false
|
| 52 |
+
log_interval: null
|
| 53 |
+
use_tensorboard: true
|
| 54 |
+
use_wandb: false
|
| 55 |
+
wandb_project: null
|
| 56 |
+
wandb_id: null
|
| 57 |
+
detect_anomaly: false
|
| 58 |
+
pretrain_path: null
|
| 59 |
+
init_param: []
|
| 60 |
+
freeze_param: []
|
| 61 |
+
num_iters_per_epoch: null
|
| 62 |
+
batch_size: 4
|
| 63 |
+
valid_batch_size: null
|
| 64 |
+
batch_bins: 1000000
|
| 65 |
+
valid_batch_bins: null
|
| 66 |
+
train_shape_file:
|
| 67 |
+
- exp/enh_stats_8k/train/speech_mix_shape
|
| 68 |
+
- exp/enh_stats_8k/train/speech_ref1_shape
|
| 69 |
+
- exp/enh_stats_8k/train/speech_ref2_shape
|
| 70 |
+
valid_shape_file:
|
| 71 |
+
- exp/enh_stats_8k/valid/speech_mix_shape
|
| 72 |
+
- exp/enh_stats_8k/valid/speech_ref1_shape
|
| 73 |
+
- exp/enh_stats_8k/valid/speech_ref2_shape
|
| 74 |
+
batch_type: folded
|
| 75 |
+
valid_batch_type: null
|
| 76 |
+
fold_length:
|
| 77 |
+
- 80000
|
| 78 |
+
- 80000
|
| 79 |
+
- 80000
|
| 80 |
+
sort_in_batch: descending
|
| 81 |
+
sort_batch: descending
|
| 82 |
+
multiple_iterator: false
|
| 83 |
+
chunk_length: 32000
|
| 84 |
+
chunk_shift_ratio: 0.5
|
| 85 |
+
num_cache_chunks: 1024
|
| 86 |
+
train_data_path_and_name_and_type:
|
| 87 |
+
- - dump/raw/tr_min_8k/wav.scp
|
| 88 |
+
- speech_mix
|
| 89 |
+
- sound
|
| 90 |
+
- - dump/raw/tr_min_8k/spk1.scp
|
| 91 |
+
- speech_ref1
|
| 92 |
+
- sound
|
| 93 |
+
- - dump/raw/tr_min_8k/spk2.scp
|
| 94 |
+
- speech_ref2
|
| 95 |
+
- sound
|
| 96 |
+
valid_data_path_and_name_and_type:
|
| 97 |
+
- - dump/raw/cv_min_8k/wav.scp
|
| 98 |
+
- speech_mix
|
| 99 |
+
- sound
|
| 100 |
+
- - dump/raw/cv_min_8k/spk1.scp
|
| 101 |
+
- speech_ref1
|
| 102 |
+
- sound
|
| 103 |
+
- - dump/raw/cv_min_8k/spk2.scp
|
| 104 |
+
- speech_ref2
|
| 105 |
+
- sound
|
| 106 |
+
allow_variable_data_keys: false
|
| 107 |
+
max_cache_size: 0.0
|
| 108 |
+
max_cache_fd: 32
|
| 109 |
+
valid_max_cache_size: null
|
| 110 |
+
optim: adam
|
| 111 |
+
optim_conf:
|
| 112 |
+
lr: 0.001
|
| 113 |
+
eps: 1.0e-08
|
| 114 |
+
weight_decay: 0
|
| 115 |
+
scheduler: reducelronplateau
|
| 116 |
+
scheduler_conf:
|
| 117 |
+
mode: min
|
| 118 |
+
factor: 0.7
|
| 119 |
+
patience: 1
|
| 120 |
+
init: xavier_uniform
|
| 121 |
+
model_conf:
|
| 122 |
+
loss_type: si_snr
|
| 123 |
+
use_preprocessor: false
|
| 124 |
+
encoder: conv
|
| 125 |
+
encoder_conf:
|
| 126 |
+
channel: 64
|
| 127 |
+
kernel_size: 2
|
| 128 |
+
stride: 1
|
| 129 |
+
separator: dprnn
|
| 130 |
+
separator_conf:
|
| 131 |
+
num_spk: 2
|
| 132 |
+
layer: 6
|
| 133 |
+
rnn_type: lstm
|
| 134 |
+
bidirectional: true
|
| 135 |
+
nonlinear: relu
|
| 136 |
+
unit: 128
|
| 137 |
+
segment_size: 250
|
| 138 |
+
dropout: 0.1
|
| 139 |
+
decoder: conv
|
| 140 |
+
decoder_conf:
|
| 141 |
+
channel: 64
|
| 142 |
+
kernel_size: 2
|
| 143 |
+
stride: 1
|
| 144 |
+
required:
|
| 145 |
+
- output_dir
|
| 146 |
+
version: 0.9.8
|
| 147 |
+
distributed: true
|
models/Chenda_Li_wsj0_2mix_enh_dprnn_tasnet/exp/enh_train_enh_dprnn_tasnet_raw/images/backward_time.png
ADDED
|
models/Chenda_Li_wsj0_2mix_enh_dprnn_tasnet/exp/enh_train_enh_dprnn_tasnet_raw/images/forward_time.png
ADDED
|
models/Chenda_Li_wsj0_2mix_enh_dprnn_tasnet/exp/enh_train_enh_dprnn_tasnet_raw/images/gpu_max_cached_mem_GB.png
ADDED
|
models/Chenda_Li_wsj0_2mix_enh_dprnn_tasnet/exp/enh_train_enh_dprnn_tasnet_raw/images/iter_time.png
ADDED
|
models/Chenda_Li_wsj0_2mix_enh_dprnn_tasnet/exp/enh_train_enh_dprnn_tasnet_raw/images/loss.png
ADDED
|
models/Chenda_Li_wsj0_2mix_enh_dprnn_tasnet/exp/enh_train_enh_dprnn_tasnet_raw/images/optim0_lr0.png
ADDED
|
models/Chenda_Li_wsj0_2mix_enh_dprnn_tasnet/exp/enh_train_enh_dprnn_tasnet_raw/images/optim_step_time.png
ADDED
|
models/Chenda_Li_wsj0_2mix_enh_dprnn_tasnet/exp/enh_train_enh_dprnn_tasnet_raw/images/si_snr.png
ADDED
|
models/Chenda_Li_wsj0_2mix_enh_dprnn_tasnet/exp/enh_train_enh_dprnn_tasnet_raw/images/train_time.png
ADDED
|
models/Chenda_Li_wsj0_2mix_enh_dprnn_tasnet/meta.yaml
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
espnet: 0.10.7a1
|
| 2 |
+
files:
|
| 3 |
+
model_file: exp/enh_train_enh_dprnn_tasnet_raw/96epoch.pth
|
| 4 |
+
python: "3.7.11 (default, Jul 27 2021, 14:32:16) \n[GCC 7.5.0]"
|
| 5 |
+
timestamp: 1649682775.265407
|
| 6 |
+
torch: 1.8.1
|
| 7 |
+
yaml_files:
|
| 8 |
+
train_config: exp/enh_train_enh_dprnn_tasnet_raw/config.yaml
|
models/Chenda_Li_wsj0_2mix_enh_dprnn_tasnet/source.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
https://huggingface.co/lichenda/Chenda_Li_wsj0_2mix_enh_dprnn_tasnet
|
models/DPRNNTasNet-ks16_WHAM_sepclean/.gitattributes
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.bin.* filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.tar.gz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
models/DPRNNTasNet-ks16_WHAM_sepclean/README.md
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
tags:
|
| 3 |
+
- audio-to-audio
|
| 4 |
+
- asteroid
|
| 5 |
+
- audio
|
| 6 |
+
- audio-source-separation
|
| 7 |
+
datasets:
|
| 8 |
+
- wham
|
| 9 |
+
- sep_clean
|
| 10 |
+
license: cc-by-sa-4.0
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
## Asteroid model `mpariente/DPRNNTasNet(ks=16)_WHAM!_sepclean`
|
| 14 |
+
|
| 15 |
+
♻️ Imported from https://zenodo.org/record/3903795#.X8pMBRNKjUI
|
| 16 |
+
|
| 17 |
+
This model was trained by Manuel Pariente using the wham/DPRNN recipe in [Asteroid](https://github.com/asteroid-team/asteroid). It was trained on the sep_clean task of the WHAM! dataset.
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
### Demo: How to use in Asteroid
|
| 21 |
+
|
| 22 |
+
```python
|
| 23 |
+
# coming soon
|
| 24 |
+
```
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
### Training config
|
| 28 |
+
|
| 29 |
+
- data:
|
| 30 |
+
- mode: min
|
| 31 |
+
- nondefault_nsrc: None
|
| 32 |
+
- sample_rate: 8000
|
| 33 |
+
- segment: 2.0
|
| 34 |
+
- task: sep_clean
|
| 35 |
+
- train_dir: data/wav8k/min/tr
|
| 36 |
+
- valid_dir: data/wav8k/min/cv
|
| 37 |
+
- filterbank:
|
| 38 |
+
- kernel_size: 16
|
| 39 |
+
- n_filters: 64
|
| 40 |
+
- stride: 8
|
| 41 |
+
- main_args:
|
| 42 |
+
- exp_dir: exp/train_dprnn_ks16/
|
| 43 |
+
- help: None
|
| 44 |
+
- masknet:
|
| 45 |
+
- bidirectional: True
|
| 46 |
+
- bn_chan: 128
|
| 47 |
+
- chunk_size: 100
|
| 48 |
+
- dropout: 0
|
| 49 |
+
- hid_size: 128
|
| 50 |
+
- hop_size: 50
|
| 51 |
+
- in_chan: 64
|
| 52 |
+
- mask_act: sigmoid
|
| 53 |
+
- n_repeats: 6
|
| 54 |
+
- n_src: 2
|
| 55 |
+
- out_chan: 64
|
| 56 |
+
- optim:
|
| 57 |
+
- lr: 0.001
|
| 58 |
+
- optimizer: adam
|
| 59 |
+
- weight_decay: 1e-05
|
| 60 |
+
- positional arguments:
|
| 61 |
+
- training:
|
| 62 |
+
- batch_size: 6
|
| 63 |
+
- early_stop: True
|
| 64 |
+
- epochs: 200
|
| 65 |
+
- gradient_clipping: 5
|
| 66 |
+
- half_lr: True
|
| 67 |
+
- num_workers: 6
|
| 68 |
+
|
| 69 |
+
#### Results
|
| 70 |
+
|
| 71 |
+
- `si_sdr`: 18.227683982688003
|
| 72 |
+
- `si_sdr_imp`: 18.22883576588251
|
| 73 |
+
- `sdr`: 18.617789605060587
|
| 74 |
+
- `sdr_imp`: 18.466745426438173
|
| 75 |
+
- `sir`: 29.22773720052717
|
| 76 |
+
- `sir_imp`: 29.07669302190474
|
| 77 |
+
- `sar`: 19.116352171914485
|
| 78 |
+
- `sar_imp`: -130.06009796503054
|
| 79 |
+
- `stoi`: 0.9722025377865715
|
| 80 |
+
- `stoi_imp`: 0.23415680987800583
|
| 81 |
+
|
| 82 |
+
### Citing Asteroid
|
| 83 |
+
|
| 84 |
+
```BibTex
|
| 85 |
+
@inproceedings{Pariente2020Asteroid,
|
| 86 |
+
title={Asteroid: the {PyTorch}-based audio source separation toolkit for researchers},
|
| 87 |
+
author={Manuel Pariente and Samuele Cornell and Joris Cosentino and Sunit Sivasankaran and
|
| 88 |
+
Efthymios Tzinis and Jens Heitkaemper and Michel Olvera and Fabian-Robert Stöter and
|
| 89 |
+
Mathieu Hu and Juan M. Martín-Doñas and David Ditter and Ariel Frank and Antoine Deleforge
|
| 90 |
+
and Emmanuel Vincent},
|
| 91 |
+
year={2020},
|
| 92 |
+
booktitle={Proc. Interspeech},
|
| 93 |
+
}
|
| 94 |
+
```
|
| 95 |
+
|
| 96 |
+
Or on arXiv:
|
| 97 |
+
|
| 98 |
+
```bibtex
|
| 99 |
+
@misc{pariente2020asteroid,
|
| 100 |
+
title={Asteroid: the PyTorch-based audio source separation toolkit for researchers},
|
| 101 |
+
author={Manuel Pariente and Samuele Cornell and Joris Cosentino and Sunit Sivasankaran and Efthymios Tzinis and Jens Heitkaemper and Michel Olvera and Fabian-Robert Stöter and Mathieu Hu and Juan M. Martín-Doñas and David Ditter and Ariel Frank and Antoine Deleforge and Emmanuel Vincent},
|
| 102 |
+
year={2020},
|
| 103 |
+
eprint={2005.04132},
|
| 104 |
+
archivePrefix={arXiv},
|
| 105 |
+
primaryClass={eess.AS}
|
| 106 |
+
}
|
| 107 |
+
```
|
models/DPRNNTasNet-ks16_WHAM_sepclean/pytorch_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:aca83e3a61eb6414f78e87350631cfcd77e04737c2c9bf7844dcde6ac0c576d8
|
| 3 |
+
size 14671835
|
models/DPRNNTasNet-ks16_WHAM_sepclean/source.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
https://huggingface.co/julien-c/DPRNNTasNet-ks16_WHAM_sepclean
|
models/DPRNNTasNet-ks2_Libri1Mix_enhsingle_16k/.gitattributes
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.bin.* filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.tar.gz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
models/DPRNNTasNet-ks2_Libri1Mix_enhsingle_16k/README.md
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
tags:
|
| 3 |
+
- asteroid
|
| 4 |
+
- audio
|
| 5 |
+
- DPRNNTasNet
|
| 6 |
+
- audio-to-audio
|
| 7 |
+
datasets:
|
| 8 |
+
- Libri1Mix
|
| 9 |
+
- enh_single
|
| 10 |
+
license: cc-by-sa-4.0
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
## Asteroid model `JorisCos/DPRNNTasNet-ks2_Libri1Mix_enhsingle_16k`
|
| 14 |
+
|
| 15 |
+
Description:
|
| 16 |
+
|
| 17 |
+
This model was trained by Joris Cosentino using the librimix recipe in [Asteroid](https://github.com/asteroid-team/asteroid).
|
| 18 |
+
It was trained on the `enh_single` task of the Libri1Mix dataset.
|
| 19 |
+
|
| 20 |
+
Training config:
|
| 21 |
+
|
| 22 |
+
```yml
|
| 23 |
+
data:
|
| 24 |
+
n_src: 1
|
| 25 |
+
sample_rate: 16000
|
| 26 |
+
segment: 1
|
| 27 |
+
task: enh_single
|
| 28 |
+
train_dir: data/wav16k/min/train-360
|
| 29 |
+
valid_dir: data/wav16k/min/dev
|
| 30 |
+
filterbank:
|
| 31 |
+
kernel_size: 2
|
| 32 |
+
n_filters: 64
|
| 33 |
+
stride: 1
|
| 34 |
+
masknet:
|
| 35 |
+
bidirectional: true
|
| 36 |
+
bn_chan: 128
|
| 37 |
+
chunk_size: 250
|
| 38 |
+
dropout: 0
|
| 39 |
+
hid_size: 128
|
| 40 |
+
hop_size: 125
|
| 41 |
+
in_chan: 64
|
| 42 |
+
mask_act: sigmoid
|
| 43 |
+
n_repeats: 6
|
| 44 |
+
n_src: 1
|
| 45 |
+
out_chan: 64
|
| 46 |
+
optim:
|
| 47 |
+
lr: 0.001
|
| 48 |
+
optimizer: adam
|
| 49 |
+
weight_decay: 1.0e-05
|
| 50 |
+
training:
|
| 51 |
+
batch_size: 2
|
| 52 |
+
early_stop: true
|
| 53 |
+
epochs: 200
|
| 54 |
+
gradient_clipping: 5
|
| 55 |
+
half_lr: true
|
| 56 |
+
num_workers: 4
|
| 57 |
+
```
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
Results:
|
| 61 |
+
|
| 62 |
+
On Libri1Mix min test set:
|
| 63 |
+
```yml
|
| 64 |
+
si_sdr: 14.7228101708889
|
| 65 |
+
si_sdr_imp: 11.2730288650292
|
| 66 |
+
sdr: 15.35661405197161
|
| 67 |
+
sdr_imp: 11.853951252758595
|
| 68 |
+
sir: Infinity
|
| 69 |
+
sir_imp: NaN
|
| 70 |
+
sar: 15.35661405197161
|
| 71 |
+
sar_imp: 11.853951252758595
|
| 72 |
+
stoi: 0.9300461826351578
|
| 73 |
+
stoi_imp: 0.13412635909461715
|
| 74 |
+
```
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
License notice:
|
| 78 |
+
|
| 79 |
+
This work "DPRNNTasNet-ks2_Libri1Mix_enhsingle_16k" is a derivative of [LibriSpeech ASR corpus](http://www.openslr.org/12) by Vassil Panayotov,
|
| 80 |
+
used under [CC BY 4.0](https://creativecommons.org/licenses/by/4.0/); of The WSJ0 Hipster Ambient Mixtures
|
| 81 |
+
dataset by [Whisper.ai](http://wham.whisper.ai/), used under [CC BY-NC 4.0](https://creativecommons.org/licenses/by-nc/4.0/) (Research only).
|
| 82 |
+
"DPRNNTasNet-ks2_Libri1Mix_enhsingle_16k" is licensed under [Attribution-ShareAlike 3.0 Unported](https://creativecommons.org/licenses/by-sa/3.0/) by Joris Cosentino.
|
models/DPRNNTasNet-ks2_Libri1Mix_enhsingle_16k/pytorch_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6b510c07fae3a3db18473b5749316cb9df8dc4f78164c3cdfbb50d3783ee779d
|
| 3 |
+
size 14595773
|
models/DPRNNTasNet-ks2_Libri1Mix_enhsingle_16k/source.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
https://huggingface.co/JorisCos/DPRNNTasNet-ks2_Libri1Mix_enhsingle_16k
|
models/DPRNNTasNet-ks2_WHAM_sepclean/.gitattributes
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.bin.* filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.tar.gz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
models/DPRNNTasNet-ks2_WHAM_sepclean/README.md
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
tags:
|
| 3 |
+
- asteroid
|
| 4 |
+
- audio
|
| 5 |
+
- DPRNNTasNet
|
| 6 |
+
- audio-to-audio
|
| 7 |
+
datasets:
|
| 8 |
+
- wham
|
| 9 |
+
- sep_clean
|
| 10 |
+
license: cc-by-sa-4.0
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
## Asteroid model `mpariente/DPRNNTasNet-ks2_WHAM_sepclean`
|
| 14 |
+
Imported from [Zenodo](https://zenodo.org/record/3862942)
|
| 15 |
+
|
| 16 |
+
### Description:
|
| 17 |
+
This model was trained by Manuel Pariente
|
| 18 |
+
using the wham/DPRNN recipe in [Asteroid](https://github.com/asteroid-team/asteroid).
|
| 19 |
+
It was trained on the `sep_clean` task of the WHAM! dataset.
|
| 20 |
+
|
| 21 |
+
### Training config:
|
| 22 |
+
```yaml
|
| 23 |
+
data:
|
| 24 |
+
mode: min
|
| 25 |
+
nondefault_nsrc: None
|
| 26 |
+
sample_rate: 8000
|
| 27 |
+
segment: 2.0
|
| 28 |
+
task: sep_clean
|
| 29 |
+
train_dir: data/wav8k/min/tr
|
| 30 |
+
valid_dir: data/wav8k/min/cv
|
| 31 |
+
filterbank:
|
| 32 |
+
kernel_size: 2
|
| 33 |
+
n_filters: 64
|
| 34 |
+
stride: 1
|
| 35 |
+
main_args:
|
| 36 |
+
exp_dir: exp/train_dprnn_new/
|
| 37 |
+
gpus: -1
|
| 38 |
+
help: None
|
| 39 |
+
masknet:
|
| 40 |
+
bidirectional: True
|
| 41 |
+
bn_chan: 128
|
| 42 |
+
chunk_size: 250
|
| 43 |
+
dropout: 0
|
| 44 |
+
hid_size: 128
|
| 45 |
+
hop_size: 125
|
| 46 |
+
in_chan: 64
|
| 47 |
+
mask_act: sigmoid
|
| 48 |
+
n_repeats: 6
|
| 49 |
+
n_src: 2
|
| 50 |
+
out_chan: 64
|
| 51 |
+
optim:
|
| 52 |
+
lr: 0.001
|
| 53 |
+
optimizer: adam
|
| 54 |
+
weight_decay: 1e-05
|
| 55 |
+
positional arguments:
|
| 56 |
+
training:
|
| 57 |
+
batch_size: 3
|
| 58 |
+
early_stop: True
|
| 59 |
+
epochs: 200
|
| 60 |
+
gradient_clipping: 5
|
| 61 |
+
half_lr: True
|
| 62 |
+
num_workers: 8
|
| 63 |
+
```
|
| 64 |
+
|
| 65 |
+
### Results:
|
| 66 |
+
```yaml
|
| 67 |
+
si_sdr: 19.316743490695334
|
| 68 |
+
si_sdr_imp: 19.317895273889842
|
| 69 |
+
sdr: 19.68085347190952
|
| 70 |
+
sdr_imp: 19.5298092932871
|
| 71 |
+
sir: 30.362213998701232
|
| 72 |
+
sir_imp: 30.21116982007881
|
| 73 |
+
sar: 20.15553251343315
|
| 74 |
+
sar_imp: -129.02091762351188
|
| 75 |
+
stoi: 0.97772664309074
|
| 76 |
+
stoi_imp: 0.23968091518217424
|
| 77 |
+
```
|
| 78 |
+
|
| 79 |
+
### License notice:
|
| 80 |
+
This work "DPRNNTasNet-ks2_WHAM_sepclean" is a derivative of [CSR-I (WSJ0) Complete](https://catalog.ldc.upenn.edu/LDC93S6A)
|
| 81 |
+
by [LDC](https://www.ldc.upenn.edu/), used under [LDC User Agreement for
|
| 82 |
+
Non-Members](https://catalog.ldc.upenn.edu/license/ldc-non-members-agreement.pdf) (Research only).
|
| 83 |
+
"DPRNNTasNet-ks2_WHAM_sepclean" is licensed under [Attribution-ShareAlike 3.0 Unported](https://creativecommons.org/licenses/by-sa/3.0/)
|
| 84 |
+
by Manuel Pariente.
|
models/DPRNNTasNet-ks2_WHAM_sepclean/pytorch_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cc1f1fecf24ea3e486521029dc0e1444686bd4b6fdf9715e7757936cbd9ffdf6
|
| 3 |
+
size 14664381
|
models/DPRNNTasNet-ks2_WHAM_sepclean/source.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
https://huggingface.co/mpariente/DPRNNTasNet-ks2_WHAM_sepclean
|
models/DPRNNTasNet_LibriMix_sepclean/.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
models/DPRNNTasNet_LibriMix_sepclean/epoch=9-step=1000.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e95100fd3792c3a80ed7f9655e55b85c05eadaf944e31549e05af2910b2fba2d
|
| 3 |
+
size 44009512
|
models/DPRNNTasNet_LibriMix_sepclean/source.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
https://huggingface.co/Ehsanshr/DPRNNTasNet_LibriMix_sepclean
|
models/Yen-Ju_Lu_l3das22_enh_train_dprnntac_fasnet_valid.loss.ave/.gitattributes
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
*.zstandard filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
models/Yen-Ju_Lu_l3das22_enh_train_dprnntac_fasnet_valid.loss.ave/README.md
ADDED
|
@@ -0,0 +1,251 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
tags:
|
| 3 |
+
- espnet
|
| 4 |
+
- audio
|
| 5 |
+
- audio-to-audio
|
| 6 |
+
language: noinfo
|
| 7 |
+
datasets:
|
| 8 |
+
- l3das22
|
| 9 |
+
license: cc-by-4.0
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
## ESPnet2 ENH model
|
| 13 |
+
|
| 14 |
+
### `espnet/Yen-Ju_Lu_l3das22_enh_train_dprnntac_fasnet_valid.loss.ave`
|
| 15 |
+
|
| 16 |
+
This model was trained by neillu23 using the l3das22 recipe in [espnet](https://github.com/espnet/espnet/).
|
| 17 |
+
|
| 18 |
+
### Demo: How to use in ESPnet2
|
| 19 |
+
|
| 20 |
+
```bash
|
| 21 |
+
cd espnet
|
| 22 |
+
git checkout da2266fea920e22bb74471565e1a41a89f4cf62c
|
| 23 |
+
pip install -e .
|
| 24 |
+
cd egs2/l3das22/enh1
|
| 25 |
+
./run.sh --skip_data_prep false --skip_train true --download_model espnet/Yen-Ju_Lu_l3das22_enh_train_dprnntac_fasnet_valid.loss.ave
|
| 26 |
+
```
|
| 27 |
+
|
| 28 |
+
<!-- Generated by ./scripts/utils/show_enh_score.sh -->
|
| 29 |
+
# RESULTS
|
| 30 |
+
## Environments
|
| 31 |
+
- date: `Thu Jun 16 09:52:57 UTC 2022`
|
| 32 |
+
- python version: `3.8.13 (default, Mar 28 2022, 11:38:47) [GCC 7.5.0]`
|
| 33 |
+
- espnet version: `espnet 202204`
|
| 34 |
+
- pytorch version: `pytorch 1.8.1`
|
| 35 |
+
- Git hash: `da2266fea920e22bb74471565e1a41a89f4cf62c`
|
| 36 |
+
- Commit date: `Wed Jun 15 11:46:35 2022 +0000`
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
## enh_train_enh_dprnntac_fasnet_raw
|
| 40 |
+
|
| 41 |
+
config: conf/tuning/train_enh_dprnntac_fasnet.yaml
|
| 42 |
+
|
| 43 |
+
|dataset|STOI|SAR|SDR|SIR|SI_SNR|
|
| 44 |
+
|---|---|---|---|---|---|
|
| 45 |
+
|enhanced_dev_multich|73.58|3.52|3.52|0.00|-3.47|
|
| 46 |
+
|enhanced_test_multich|73.93|2.83|2.83|0.00|-4.79|
|
| 47 |
+
|
| 48 |
+
## ENH config
|
| 49 |
+
|
| 50 |
+
<details><summary>expand</summary>
|
| 51 |
+
|
| 52 |
+
```
|
| 53 |
+
config: conf/tuning/train_enh_dprnntac_fasnet.yaml
|
| 54 |
+
print_config: false
|
| 55 |
+
log_level: INFO
|
| 56 |
+
dry_run: false
|
| 57 |
+
iterator_type: chunk
|
| 58 |
+
output_dir: exp/enh_train_enh_dprnntac_fasnet_raw
|
| 59 |
+
ngpu: 1
|
| 60 |
+
seed: 0
|
| 61 |
+
num_workers: 4
|
| 62 |
+
num_att_plot: 3
|
| 63 |
+
dist_backend: nccl
|
| 64 |
+
dist_init_method: env://
|
| 65 |
+
dist_world_size: 2
|
| 66 |
+
dist_rank: 0
|
| 67 |
+
local_rank: 0
|
| 68 |
+
dist_master_addr: localhost
|
| 69 |
+
dist_master_port: 51533
|
| 70 |
+
dist_launcher: null
|
| 71 |
+
multiprocessing_distributed: true
|
| 72 |
+
unused_parameters: false
|
| 73 |
+
sharded_ddp: false
|
| 74 |
+
cudnn_enabled: true
|
| 75 |
+
cudnn_benchmark: false
|
| 76 |
+
cudnn_deterministic: true
|
| 77 |
+
collect_stats: false
|
| 78 |
+
write_collected_feats: false
|
| 79 |
+
max_epoch: 300
|
| 80 |
+
patience: 10
|
| 81 |
+
val_scheduler_criterion:
|
| 82 |
+
- valid
|
| 83 |
+
- loss
|
| 84 |
+
early_stopping_criterion:
|
| 85 |
+
- valid
|
| 86 |
+
- loss
|
| 87 |
+
- min
|
| 88 |
+
best_model_criterion:
|
| 89 |
+
- - valid
|
| 90 |
+
- si_snr
|
| 91 |
+
- max
|
| 92 |
+
- - valid
|
| 93 |
+
- loss
|
| 94 |
+
- min
|
| 95 |
+
keep_nbest_models: 1
|
| 96 |
+
nbest_averaging_interval: 0
|
| 97 |
+
grad_clip: 5.0
|
| 98 |
+
grad_clip_type: 2.0
|
| 99 |
+
grad_noise: false
|
| 100 |
+
accum_grad: 1
|
| 101 |
+
no_forward_run: false
|
| 102 |
+
resume: true
|
| 103 |
+
train_dtype: float32
|
| 104 |
+
use_amp: false
|
| 105 |
+
log_interval: null
|
| 106 |
+
use_matplotlib: true
|
| 107 |
+
use_tensorboard: true
|
| 108 |
+
use_wandb: false
|
| 109 |
+
wandb_project: null
|
| 110 |
+
wandb_id: null
|
| 111 |
+
wandb_entity: null
|
| 112 |
+
wandb_name: null
|
| 113 |
+
wandb_model_log_interval: -1
|
| 114 |
+
detect_anomaly: false
|
| 115 |
+
pretrain_path: null
|
| 116 |
+
init_param: []
|
| 117 |
+
ignore_init_mismatch: false
|
| 118 |
+
freeze_param: []
|
| 119 |
+
num_iters_per_epoch: null
|
| 120 |
+
batch_size: 24
|
| 121 |
+
valid_batch_size: null
|
| 122 |
+
batch_bins: 1000000
|
| 123 |
+
valid_batch_bins: null
|
| 124 |
+
train_shape_file:
|
| 125 |
+
- exp/enh_stats_16k/train/speech_mix_shape
|
| 126 |
+
- exp/enh_stats_16k/train/speech_ref1_shape
|
| 127 |
+
valid_shape_file:
|
| 128 |
+
- exp/enh_stats_16k/valid/speech_mix_shape
|
| 129 |
+
- exp/enh_stats_16k/valid/speech_ref1_shape
|
| 130 |
+
batch_type: folded
|
| 131 |
+
valid_batch_type: null
|
| 132 |
+
fold_length:
|
| 133 |
+
- 80000
|
| 134 |
+
- 80000
|
| 135 |
+
sort_in_batch: descending
|
| 136 |
+
sort_batch: descending
|
| 137 |
+
multiple_iterator: false
|
| 138 |
+
chunk_length: 32000
|
| 139 |
+
chunk_shift_ratio: 0.5
|
| 140 |
+
num_cache_chunks: 1024
|
| 141 |
+
train_data_path_and_name_and_type:
|
| 142 |
+
- - dump/raw/train_multich/wav.scp
|
| 143 |
+
- speech_mix
|
| 144 |
+
- sound
|
| 145 |
+
- - dump/raw/train_multich/spk1.scp
|
| 146 |
+
- speech_ref1
|
| 147 |
+
- sound
|
| 148 |
+
valid_data_path_and_name_and_type:
|
| 149 |
+
- - dump/raw/dev_multich/wav.scp
|
| 150 |
+
- speech_mix
|
| 151 |
+
- sound
|
| 152 |
+
- - dump/raw/dev_multich/spk1.scp
|
| 153 |
+
- speech_ref1
|
| 154 |
+
- sound
|
| 155 |
+
allow_variable_data_keys: false
|
| 156 |
+
max_cache_size: 0.0
|
| 157 |
+
max_cache_fd: 32
|
| 158 |
+
valid_max_cache_size: null
|
| 159 |
+
optim: adam
|
| 160 |
+
optim_conf:
|
| 161 |
+
lr: 0.001
|
| 162 |
+
eps: 1.0e-08
|
| 163 |
+
weight_decay: 0
|
| 164 |
+
scheduler: steplr
|
| 165 |
+
scheduler_conf:
|
| 166 |
+
step_size: 2
|
| 167 |
+
gamma: 0.98
|
| 168 |
+
init: xavier_uniform
|
| 169 |
+
model_conf:
|
| 170 |
+
stft_consistency: false
|
| 171 |
+
loss_type: mask_mse
|
| 172 |
+
mask_type: null
|
| 173 |
+
criterions:
|
| 174 |
+
- name: si_snr
|
| 175 |
+
conf:
|
| 176 |
+
eps: 1.0e-07
|
| 177 |
+
wrapper: fixed_order
|
| 178 |
+
wrapper_conf:
|
| 179 |
+
weight: 1.0
|
| 180 |
+
use_preprocessor: false
|
| 181 |
+
encoder: same
|
| 182 |
+
encoder_conf: {}
|
| 183 |
+
separator: fasnet
|
| 184 |
+
separator_conf:
|
| 185 |
+
enc_dim: 64
|
| 186 |
+
feature_dim: 64
|
| 187 |
+
hidden_dim: 128
|
| 188 |
+
layer: 6
|
| 189 |
+
segment_size: 24
|
| 190 |
+
num_spk: 1
|
| 191 |
+
win_len: 16
|
| 192 |
+
context_len: 16
|
| 193 |
+
sr: 16000
|
| 194 |
+
fasnet_type: fasnet
|
| 195 |
+
dropout: 0.2
|
| 196 |
+
decoder: same
|
| 197 |
+
decoder_conf: {}
|
| 198 |
+
required:
|
| 199 |
+
- output_dir
|
| 200 |
+
version: '202204'
|
| 201 |
+
distributed: true
|
| 202 |
+
```
|
| 203 |
+
|
| 204 |
+
</details>
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
|
| 208 |
+
### Citing ESPnet
|
| 209 |
+
|
| 210 |
+
```BibTex
|
| 211 |
+
@inproceedings{watanabe2018espnet,
|
| 212 |
+
author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
|
| 213 |
+
title={{ESPnet}: End-to-End Speech Processing Toolkit},
|
| 214 |
+
year={2018},
|
| 215 |
+
booktitle={Proceedings of Interspeech},
|
| 216 |
+
pages={2207--2211},
|
| 217 |
+
doi={10.21437/Interspeech.2018-1456},
|
| 218 |
+
url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
|
| 219 |
+
}
|
| 220 |
+
|
| 221 |
+
|
| 222 |
+
@inproceedings{ESPnet-SE,
|
| 223 |
+
author = {Chenda Li and Jing Shi and Wangyou Zhang and Aswin Shanmugam Subramanian and Xuankai Chang and
|
| 224 |
+
Naoyuki Kamo and Moto Hira and Tomoki Hayashi and Christoph B{\"{o}}ddeker and Zhuo Chen and Shinji Watanabe},
|
| 225 |
+
title = {ESPnet-SE: End-To-End Speech Enhancement and Separation Toolkit Designed for {ASR} Integration},
|
| 226 |
+
booktitle = {{IEEE} Spoken Language Technology Workshop, {SLT} 2021, Shenzhen, China, January 19-22, 2021},
|
| 227 |
+
pages = {785--792},
|
| 228 |
+
publisher = {{IEEE}},
|
| 229 |
+
year = {2021},
|
| 230 |
+
url = {https://doi.org/10.1109/SLT48900.2021.9383615},
|
| 231 |
+
doi = {10.1109/SLT48900.2021.9383615},
|
| 232 |
+
timestamp = {Mon, 12 Apr 2021 17:08:59 +0200},
|
| 233 |
+
biburl = {https://dblp.org/rec/conf/slt/Li0ZSCKHHBC021.bib},
|
| 234 |
+
bibsource = {dblp computer science bibliography, https://dblp.org}
|
| 235 |
+
}
|
| 236 |
+
|
| 237 |
+
|
| 238 |
+
```
|
| 239 |
+
|
| 240 |
+
or arXiv:
|
| 241 |
+
|
| 242 |
+
```bibtex
|
| 243 |
+
@misc{watanabe2018espnet,
|
| 244 |
+
title={ESPnet: End-to-End Speech Processing Toolkit},
|
| 245 |
+
author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
|
| 246 |
+
year={2018},
|
| 247 |
+
eprint={1804.00015},
|
| 248 |
+
archivePrefix={arXiv},
|
| 249 |
+
primaryClass={cs.CL}
|
| 250 |
+
}
|
| 251 |
+
```
|
models/Yen-Ju_Lu_l3das22_enh_train_dprnntac_fasnet_valid.loss.ave/exp/enh_stats_16k/train/feats_stats.npz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3f92b851bbc9340886cd3cb2a322006ca939d3221047336847ff2093861f7db9
|
| 3 |
+
size 826
|
models/Yen-Ju_Lu_l3das22_enh_train_dprnntac_fasnet_valid.loss.ave/exp/enh_train_enh_dprnntac_fasnet_raw/299epoch.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8ef3ddd09d01e8fa6903e5a879c0635b7dcf3c9b4847233cc2a43efc022ee12b
|
| 3 |
+
size 16366144
|
models/Yen-Ju_Lu_l3das22_enh_train_dprnntac_fasnet_valid.loss.ave/exp/enh_train_enh_dprnntac_fasnet_raw/RESULTS.md
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!-- Generated by ./scripts/utils/show_enh_score.sh -->
|
| 2 |
+
# RESULTS
|
| 3 |
+
## Environments
|
| 4 |
+
- date: `Thu Jun 16 09:52:57 UTC 2022`
|
| 5 |
+
- python version: `3.8.13 (default, Mar 28 2022, 11:38:47) [GCC 7.5.0]`
|
| 6 |
+
- espnet version: `espnet 202204`
|
| 7 |
+
- pytorch version: `pytorch 1.8.1`
|
| 8 |
+
- Git hash: `da2266fea920e22bb74471565e1a41a89f4cf62c`
|
| 9 |
+
- Commit date: `Wed Jun 15 11:46:35 2022 +0000`
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
## enh_train_enh_dprnntac_fasnet_raw
|
| 13 |
+
|
| 14 |
+
config: conf/tuning/train_enh_dprnntac_fasnet.yaml
|
| 15 |
+
|
| 16 |
+
|dataset|STOI|SAR|SDR|SIR|SI_SNR|
|
| 17 |
+
|---|---|---|---|---|---|
|
| 18 |
+
|enhanced_dev_multich|73.58|3.52|3.52|0.00|-3.47|
|
| 19 |
+
|enhanced_test_multich|73.93|2.83|2.83|0.00|-4.79|
|
| 20 |
+
|
models/Yen-Ju_Lu_l3das22_enh_train_dprnntac_fasnet_valid.loss.ave/exp/enh_train_enh_dprnntac_fasnet_raw/config.yaml
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
config: conf/tuning/train_enh_dprnntac_fasnet.yaml
|
| 2 |
+
print_config: false
|
| 3 |
+
log_level: INFO
|
| 4 |
+
dry_run: false
|
| 5 |
+
iterator_type: chunk
|
| 6 |
+
output_dir: exp/enh_train_enh_dprnntac_fasnet_raw
|
| 7 |
+
ngpu: 1
|
| 8 |
+
seed: 0
|
| 9 |
+
num_workers: 4
|
| 10 |
+
num_att_plot: 3
|
| 11 |
+
dist_backend: nccl
|
| 12 |
+
dist_init_method: env://
|
| 13 |
+
dist_world_size: 2
|
| 14 |
+
dist_rank: 0
|
| 15 |
+
local_rank: 0
|
| 16 |
+
dist_master_addr: localhost
|
| 17 |
+
dist_master_port: 51533
|
| 18 |
+
dist_launcher: null
|
| 19 |
+
multiprocessing_distributed: true
|
| 20 |
+
unused_parameters: false
|
| 21 |
+
sharded_ddp: false
|
| 22 |
+
cudnn_enabled: true
|
| 23 |
+
cudnn_benchmark: false
|
| 24 |
+
cudnn_deterministic: true
|
| 25 |
+
collect_stats: false
|
| 26 |
+
write_collected_feats: false
|
| 27 |
+
max_epoch: 300
|
| 28 |
+
patience: 10
|
| 29 |
+
val_scheduler_criterion:
|
| 30 |
+
- valid
|
| 31 |
+
- loss
|
| 32 |
+
early_stopping_criterion:
|
| 33 |
+
- valid
|
| 34 |
+
- loss
|
| 35 |
+
- min
|
| 36 |
+
best_model_criterion:
|
| 37 |
+
- - valid
|
| 38 |
+
- si_snr
|
| 39 |
+
- max
|
| 40 |
+
- - valid
|
| 41 |
+
- loss
|
| 42 |
+
- min
|
| 43 |
+
keep_nbest_models: 1
|
| 44 |
+
nbest_averaging_interval: 0
|
| 45 |
+
grad_clip: 5.0
|
| 46 |
+
grad_clip_type: 2.0
|
| 47 |
+
grad_noise: false
|
| 48 |
+
accum_grad: 1
|
| 49 |
+
no_forward_run: false
|
| 50 |
+
resume: true
|
| 51 |
+
train_dtype: float32
|
| 52 |
+
use_amp: false
|
| 53 |
+
log_interval: null
|
| 54 |
+
use_matplotlib: true
|
| 55 |
+
use_tensorboard: true
|
| 56 |
+
use_wandb: false
|
| 57 |
+
wandb_project: null
|
| 58 |
+
wandb_id: null
|
| 59 |
+
wandb_entity: null
|
| 60 |
+
wandb_name: null
|
| 61 |
+
wandb_model_log_interval: -1
|
| 62 |
+
detect_anomaly: false
|
| 63 |
+
pretrain_path: null
|
| 64 |
+
init_param: []
|
| 65 |
+
ignore_init_mismatch: false
|
| 66 |
+
freeze_param: []
|
| 67 |
+
num_iters_per_epoch: null
|
| 68 |
+
batch_size: 24
|
| 69 |
+
valid_batch_size: null
|
| 70 |
+
batch_bins: 1000000
|
| 71 |
+
valid_batch_bins: null
|
| 72 |
+
train_shape_file:
|
| 73 |
+
- exp/enh_stats_16k/train/speech_mix_shape
|
| 74 |
+
- exp/enh_stats_16k/train/speech_ref1_shape
|
| 75 |
+
valid_shape_file:
|
| 76 |
+
- exp/enh_stats_16k/valid/speech_mix_shape
|
| 77 |
+
- exp/enh_stats_16k/valid/speech_ref1_shape
|
| 78 |
+
batch_type: folded
|
| 79 |
+
valid_batch_type: null
|
| 80 |
+
fold_length:
|
| 81 |
+
- 80000
|
| 82 |
+
- 80000
|
| 83 |
+
sort_in_batch: descending
|
| 84 |
+
sort_batch: descending
|
| 85 |
+
multiple_iterator: false
|
| 86 |
+
chunk_length: 32000
|
| 87 |
+
chunk_shift_ratio: 0.5
|
| 88 |
+
num_cache_chunks: 1024
|
| 89 |
+
train_data_path_and_name_and_type:
|
| 90 |
+
- - dump/raw/train_multich/wav.scp
|
| 91 |
+
- speech_mix
|
| 92 |
+
- sound
|
| 93 |
+
- - dump/raw/train_multich/spk1.scp
|
| 94 |
+
- speech_ref1
|
| 95 |
+
- sound
|
| 96 |
+
valid_data_path_and_name_and_type:
|
| 97 |
+
- - dump/raw/dev_multich/wav.scp
|
| 98 |
+
- speech_mix
|
| 99 |
+
- sound
|
| 100 |
+
- - dump/raw/dev_multich/spk1.scp
|
| 101 |
+
- speech_ref1
|
| 102 |
+
- sound
|
| 103 |
+
allow_variable_data_keys: false
|
| 104 |
+
max_cache_size: 0.0
|
| 105 |
+
max_cache_fd: 32
|
| 106 |
+
valid_max_cache_size: null
|
| 107 |
+
optim: adam
|
| 108 |
+
optim_conf:
|
| 109 |
+
lr: 0.001
|
| 110 |
+
eps: 1.0e-08
|
| 111 |
+
weight_decay: 0
|
| 112 |
+
scheduler: steplr
|
| 113 |
+
scheduler_conf:
|
| 114 |
+
step_size: 2
|
| 115 |
+
gamma: 0.98
|
| 116 |
+
init: xavier_uniform
|
| 117 |
+
model_conf:
|
| 118 |
+
stft_consistency: false
|
| 119 |
+
loss_type: mask_mse
|
| 120 |
+
mask_type: null
|
| 121 |
+
criterions:
|
| 122 |
+
- name: si_snr
|
| 123 |
+
conf:
|
| 124 |
+
eps: 1.0e-07
|
| 125 |
+
wrapper: fixed_order
|
| 126 |
+
wrapper_conf:
|
| 127 |
+
weight: 1.0
|
| 128 |
+
use_preprocessor: false
|
| 129 |
+
encoder: same
|
| 130 |
+
encoder_conf: {}
|
| 131 |
+
separator: fasnet
|
| 132 |
+
separator_conf:
|
| 133 |
+
enc_dim: 64
|
| 134 |
+
feature_dim: 64
|
| 135 |
+
hidden_dim: 128
|
| 136 |
+
layer: 6
|
| 137 |
+
segment_size: 24
|
| 138 |
+
num_spk: 1
|
| 139 |
+
win_len: 16
|
| 140 |
+
context_len: 16
|
| 141 |
+
sr: 16000
|
| 142 |
+
fasnet_type: fasnet
|
| 143 |
+
dropout: 0.2
|
| 144 |
+
decoder: same
|
| 145 |
+
decoder_conf: {}
|
| 146 |
+
required:
|
| 147 |
+
- output_dir
|
| 148 |
+
version: '202204'
|
| 149 |
+
distributed: true
|
models/Yen-Ju_Lu_l3das22_enh_train_dprnntac_fasnet_valid.loss.ave/exp/enh_train_enh_dprnntac_fasnet_raw/images/backward_time.png
ADDED
|
models/Yen-Ju_Lu_l3das22_enh_train_dprnntac_fasnet_valid.loss.ave/exp/enh_train_enh_dprnntac_fasnet_raw/images/forward_time.png
ADDED
|
models/Yen-Ju_Lu_l3das22_enh_train_dprnntac_fasnet_valid.loss.ave/exp/enh_train_enh_dprnntac_fasnet_raw/images/gpu_max_cached_mem_GB.png
ADDED
|
models/Yen-Ju_Lu_l3das22_enh_train_dprnntac_fasnet_valid.loss.ave/exp/enh_train_enh_dprnntac_fasnet_raw/images/iter_time.png
ADDED
|
models/Yen-Ju_Lu_l3das22_enh_train_dprnntac_fasnet_valid.loss.ave/exp/enh_train_enh_dprnntac_fasnet_raw/images/loss.png
ADDED
|
models/Yen-Ju_Lu_l3das22_enh_train_dprnntac_fasnet_valid.loss.ave/exp/enh_train_enh_dprnntac_fasnet_raw/images/optim0_lr0.png
ADDED
|
models/Yen-Ju_Lu_l3das22_enh_train_dprnntac_fasnet_valid.loss.ave/exp/enh_train_enh_dprnntac_fasnet_raw/images/optim_step_time.png
ADDED
|