niobures committed on
Commit
2351781
·
verified ·
1 Parent(s): 3281d86

DPTNet (code, models, paper)

Browse files
Files changed (37) hide show
  1. .gitattributes +1 -0
  2. DPTNet. A Dual-Path Transformer Architecture for Scene Text Detection.pdf +3 -0
  3. code/DPTNet [ilyakava] +3.zip +3 -0
  4. code/DPTNet.zip +3 -0
  5. code/TargetSpeakerEnhance.zip +3 -0
  6. code/dptnet_mindspore.zip +3 -0
  7. models/DPTNet_Libri1Mix_enhsingle_16k/.gitattributes +8 -0
  8. models/DPTNet_Libri1Mix_enhsingle_16k/README.md +86 -0
  9. models/DPTNet_Libri1Mix_enhsingle_16k/pytorch_model.bin +3 -0
  10. models/DPTNet_Libri1Mix_enhsingle_16k/source.txt +1 -0
  11. models/DPTNet_WHAMR_enhsingle_16k/.gitattributes +27 -0
  12. models/DPTNet_WHAMR_enhsingle_16k/README.md +82 -0
  13. models/DPTNet_WHAMR_enhsingle_16k/pytorch_model.bin +3 -0
  14. models/DPTNet_WHAMR_enhsingle_16k/source.txt +1 -0
  15. models/DPTNet_jaCappella_VES_48k/.gitattributes +34 -0
  16. models/DPTNet_jaCappella_VES_48k/README.md +80 -0
  17. models/DPTNet_jaCappella_VES_48k/best_model.pth +3 -0
  18. models/DPTNet_jaCappella_VES_48k/conf.yml +46 -0
  19. models/DPTNet_jaCappella_VES_48k/source.txt +1 -0
  20. models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/.gitattributes +27 -0
  21. models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/README.md +253 -0
  22. models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_stats_8k/train/feats_stats.npz +3 -0
  23. models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_train_enh_dptnet_raw/99epoch.pth +3 -0
  24. models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_train_enh_dptnet_raw/RESULTS.md +20 -0
  25. models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_train_enh_dptnet_raw/config.yaml +169 -0
  26. models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_train_enh_dptnet_raw/images/backward_time.png +0 -0
  27. models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_train_enh_dptnet_raw/images/forward_time.png +0 -0
  28. models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_train_enh_dptnet_raw/images/gpu_max_cached_mem_GB.png +0 -0
  29. models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_train_enh_dptnet_raw/images/iter_time.png +0 -0
  30. models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_train_enh_dptnet_raw/images/loss.png +0 -0
  31. models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_train_enh_dptnet_raw/images/optim0_lr0.png +0 -0
  32. models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_train_enh_dptnet_raw/images/optim_step_time.png +0 -0
  33. models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_train_enh_dptnet_raw/images/si_snr_loss.png +0 -0
  34. models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_train_enh_dptnet_raw/images/train_time.png +0 -0
  35. models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_train_enh_dptnet_raw/valid.loss.best.pth +3 -0
  36. models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/meta.yaml +8 -0
  37. models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/source.txt +1 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ DPTNet.[[:space:]]A[[:space:]]Dual-Path[[:space:]]Transformer[[:space:]]Architecture[[:space:]]for[[:space:]]Scene[[:space:]]Text[[:space:]]Detection.pdf filter=lfs diff=lfs merge=lfs -text
DPTNet. A Dual-Path Transformer Architecture for Scene Text Detection.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8419807e87010892c99f4def064fad96cb0bcfd25ec692c0d40426900e34921c
3
+ size 1920411
code/DPTNet [ilyakava] +3.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e1a744916e8e5ba210a5768f0c218e031911b044b618ed129ac158dcc52ab0e
3
+ size 74404
code/DPTNet.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d9e69327140cf7f62d3c6f2c41f4636ecf65d35524575c9c249dd3eb9a326ec
3
+ size 51997
code/TargetSpeakerEnhance.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d51a6516e72917aca4a9d43a7df92c568b49b26e40bd28a0ef909e3ab8eb8139
3
+ size 2300975
code/dptnet_mindspore.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe820cc93c5b217a14e9f516d4d156ff1334261ac1eb8b56d61fcb4214573b2c
3
+ size 78533
models/DPTNet_Libri1Mix_enhsingle_16k/.gitattributes ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
2
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.h5 filter=lfs diff=lfs merge=lfs -text
5
+ *.tflite filter=lfs diff=lfs merge=lfs -text
6
+ *.tar.gz filter=lfs diff=lfs merge=lfs -text
7
+ *.ot filter=lfs diff=lfs merge=lfs -text
8
+ *.onnx filter=lfs diff=lfs merge=lfs -text
models/DPTNet_Libri1Mix_enhsingle_16k/README.md ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - asteroid
4
+ - audio
5
+ - DPTNet
6
+ - audio-to-audio
7
+ datasets:
8
+ - Libri1Mix
9
+ - enh_single
10
+ license: cc-by-sa-4.0
11
+ ---
12
+
13
+ ## Asteroid model `JorisCos/DPTNet_Libri1Mix_enhsingle_16k`
14
+
15
+ Description:
16
+
17
+ This model was trained by Joris Cosentino using the librimix recipe in [Asteroid](https://github.com/asteroid-team/asteroid).
18
+ It was trained on the `enh_single` task of the Libri1Mix dataset.
19
+
20
+ Training config:
21
+
22
+ ```yml
23
+ data:
24
+ n_src: 1
25
+ sample_rate: 16000
26
+ segment: 3
27
+ task: enh_single
28
+ train_dir: data/wav16k/min/train-360
29
+ valid_dir: data/wav16k/min/dev
30
+ filterbank:
31
+ kernel_size: 16
32
+ n_filters: 64
33
+ stride: 8
34
+ masknet:
35
+ bidirectional: true
36
+ chunk_size: 100
37
+ dropout: 0
38
+ ff_activation: relu
39
+ ff_hid: 256
40
+ hop_size: 50
41
+ in_chan: 64
42
+ mask_act: sigmoid
43
+ n_repeats: 2
44
+ n_src: 1
45
+ norm_type: gLN
46
+ out_chan: 64
47
+ optim:
48
+ lr: 0.001
49
+ optimizer: adam
50
+ weight_decay: 1.0e-05
51
+ scheduler:
52
+ d_model: 64
53
+ steps_per_epoch: 10000
54
+ training:
55
+ batch_size: 4
56
+ early_stop: true
57
+ epochs: 200
58
+ gradient_clipping: 5
59
+ half_lr: true
60
+ num_workers: 4
61
+ ```
62
+
63
+
64
+ Results:
65
+
66
+ On Libri1Mix min test set :
67
+ ```yml
68
+ si_sdr: 14.829670037349064
69
+ si_sdr_imp: 11.379888731489366
70
+ sdr: 15.395712644737149
71
+ sdr_imp: 11.893049845524112
72
+ sir: Infinity
73
+ sir_imp: NaN
74
+ sar: 15.395712644737149
75
+ sar_imp: 11.893049845524112
76
+ stoi: 0.9301948391058859
77
+ stoi_imp: 0.13427501556534832
78
+ ```
79
+
80
+
81
+ License notice:
82
+
83
+ This work "DPTNet_Libri1Mix_enhsingle_16k" is a derivative of [LibriSpeech ASR corpus](http://www.openslr.org/12) by Vassil Panayotov,
84
+ used under [CC BY 4.0](https://creativecommons.org/licenses/by/4.0/); of The WSJ0 Hipster Ambient Mixtures
85
+ dataset by [Whisper.ai](http://wham.whisper.ai/), used under [CC BY-NC 4.0](https://creativecommons.org/licenses/by-nc/4.0/) (Research only).
86
+ "DPTNet_Libri1Mix_enhsingle_16k" is licensed under [Attribution-ShareAlike 3.0 Unported](https://creativecommons.org/licenses/by-sa/3.0/) by Joris Cosentino
models/DPTNet_Libri1Mix_enhsingle_16k/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0f4f38dc0be2bcb479364b4b49fdc0c92d77fc3f1aa6049090cd3ea0db95019f
3
+ size 11437018
models/DPTNet_Libri1Mix_enhsingle_16k/source.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://huggingface.co/JorisCos/DPTNet_Libri1Mix_enhsingle_16k
models/DPTNet_WHAMR_enhsingle_16k/.gitattributes ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
5
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.model filter=lfs diff=lfs merge=lfs -text
12
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
13
+ *.onnx filter=lfs diff=lfs merge=lfs -text
14
+ *.ot filter=lfs diff=lfs merge=lfs -text
15
+ *.parquet filter=lfs diff=lfs merge=lfs -text
16
+ *.pb filter=lfs diff=lfs merge=lfs -text
17
+ *.pt filter=lfs diff=lfs merge=lfs -text
18
+ *.pth filter=lfs diff=lfs merge=lfs -text
19
+ *.rar filter=lfs diff=lfs merge=lfs -text
20
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
21
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
22
+ *.tflite filter=lfs diff=lfs merge=lfs -text
23
+ *.tgz filter=lfs diff=lfs merge=lfs -text
24
+ *.xz filter=lfs diff=lfs merge=lfs -text
25
+ *.zip filter=lfs diff=lfs merge=lfs -text
26
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
27
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
models/DPTNet_WHAMR_enhsingle_16k/README.md ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - asteroid
4
+ - audio
5
+ - DPTNet
6
+ - audio-to-audio
7
+ datasets:
8
+ - Libri1Mix
9
+ - enh_single
10
+ license: cc-by-sa-4.0
11
+ ---
12
+ ## Asteroid model `cankeles/DPTNet_WHAMR_enhsingle_16k`
13
+
14
+ Description:
15
+
16
+ This model was trained by M. Can Keleş using the librimix recipe in [Asteroid](https://github.com/asteroid-team/asteroid).
17
+ It was trained on the `enh_single` task of the Libri1Mix dataset.
18
+
19
+ Training config:
20
+
21
+ ```yml
22
+ data:
23
+ mode: min
24
+ nondefault_nsrc: null
25
+ sample_rate: 16000
26
+ segment: 2.0
27
+ task: enh_single
28
+ train_dir: wav16k/min/tr/
29
+ valid_dir: wav16k/min/cv/
30
+ filterbank:
31
+ kernel_size: 16
32
+ n_filters: 64
33
+ stride: 8
34
+ main_args:
35
+ exp_dir: exp/tmp
36
+ help: null
37
+ masknet:
38
+ bidirectional: true
39
+ chunk_size: 100
40
+ dropout: 0
41
+ ff_activation: relu
42
+ ff_hid: 256
43
+ hop_size: 50
44
+ in_chan: 64
45
+ mask_act: sigmoid
46
+ n_repeats: 2
47
+ n_src: 1
48
+ norm_type: gLN
49
+ out_chan: 64
50
+ optim:
51
+ lr: 0.001
52
+ optimizer: adam
53
+ weight_decay: 1.0e-05
54
+ positional arguments: {}
55
+ scheduler:
56
+ d_model: 64
57
+ steps_per_epoch: 10000
58
+ training:
59
+ batch_size: 4
60
+ early_stop: true
61
+ epochs: 60
62
+ gradient_clipping: 5
63
+ half_lr: true
64
+ num_workers: 4
65
+ ```
66
+
67
+
68
+ Results:
69
+
70
+ On custom min test set :
71
+ ```yml
72
+ 'sar': 12.853384266251018,
73
+ 'sar_imp': 8.950332361953906,
74
+ 'sdr': 12.853384266251018,
75
+ 'sdr_imp': 8.950332361953906,
76
+ 'si_sdr': 12.247012621312548,
77
+ 'si_sdr_imp': 8.429646186633407,
78
+ 'sir': inf,
79
+ 'sir_imp': nan,
80
+ 'stoi': 0.9022338865380519,
81
+ 'stoi_imp': 0.09735707619500522
82
+ ```
models/DPTNet_WHAMR_enhsingle_16k/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:950d7ec1c41e498f3300f9322a019c370509cb72b1f36826bfcf40d6af7c4101
3
+ size 11434540
models/DPTNet_WHAMR_enhsingle_16k/source.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://huggingface.co/cankeles/DPTNet_WHAMR_enhsingle_16k
models/DPTNet_jaCappella_VES_48k/.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
models/DPTNet_jaCappella_VES_48k/README.md ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: cc-by-nc-4.0
3
+ language:
4
+ - ja
5
+ tags:
6
+ - music
7
+ - speech
8
+ - audio
9
+ - audio-to-audio
10
+ - a cappella
11
+ - vocal ensemble
12
+ datasets:
13
+ - jaCappella
14
+ metrics:
15
+ - SI-SDR
16
+ ---
17
+
18
+ # DPTNet trained with the jaCappella corpus for vocal ensemble separation
19
+
20
+ This model was trained by Tomohiko Nakamura using [the codebase](https://github.com/TomohikoNakamura/asteroid_jaCappella).
21
+ It was trained on the vocal ensemble separation task of [the jaCappella dataset](https://tomohikonakamura.github.io/jaCappella_corpus/).
22
+ [The paper](https://doi.org/10.1109/ICASSP49357.2023.10095569) was published in ICASSP 2023 ([arXiv](https://arxiv.org/abs/2211.16028)).
23
+
24
+ # License
25
+ See [the jaCappella dataset page](https://tomohikonakamura.github.io/jaCappella_corpus/).
26
+
27
+ # Citation
28
+ See [the jaCappella dataset page](https://tomohikonakamura.github.io/jaCappella_corpus/).
29
+
30
+ # Configuration
31
+ ```yaml
32
+ data:
33
+ num_workers: 12
34
+ sample_rate: 48000
35
+ samples_per_track: 13
36
+ seed: 42
37
+ seq_dur: 5.046
38
+ source_augmentations:
39
+ - gain
40
+ sources:
41
+ - vocal_percussion
42
+ - bass
43
+ - alto
44
+ - tenor
45
+ - soprano
46
+ - lead_vocal
47
+ filterbank:
48
+ kernel_size: 32
49
+ n_filters: 64
50
+ stride: 16
51
+ masknet:
52
+ bidirectional: true
53
+ chunk_size: 174
54
+ dropout: 0
55
+ ff_activation: relu
56
+ ff_hid: 256
57
+ hop_size: 128
58
+ in_chan: 64
59
+ mask_act: sigmoid
60
+ n_repeats: 8
61
+ n_src: 6
62
+ norm_type: gLN
63
+ out_chan: 64
64
+ optim:
65
+ lr: 0.005
66
+ optimizer: adam
67
+ weight_decay: 1.0e-05
68
+ training:
69
+ batch_size: 1
70
+ early_stop: true
71
+ epochs: 600
72
+ gradient_clipping: 5
73
+ half_lr: true
74
+ loss_func: pit_sisdr
75
+ ```
76
+ # Results (SI-SDR [dB]) on vocal ensemble separation
77
+
78
+ | Method | Lead vocal | Soprano | Alto | Tenor | Bass |Vocal percussion|
79
+ |:---------------:|:--------------:|:--------------:|:--------------:|:--------------:|:--------------:|:--------------:|
80
+ | DPTNet | 8.9 | 8.5 | 11.9 | 14.9 | 19.7 | 21.9 |
models/DPTNet_jaCappella_VES_48k/best_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d0738ae01145bdf074e8e3a2312f1a21ff2e0c96f2a4f42b1cd0d2c7f4780ac
3
+ size 45639083
models/DPTNet_jaCappella_VES_48k/conf.yml ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ data:
2
+ num_workers: 12
3
+ sample_rate: 48000
4
+ samples_per_track: 13
5
+ seed: 42
6
+ seq_dur: 5.046
7
+ source_augmentations:
8
+ - gain
9
+ sources:
10
+ - vocal_percussion
11
+ - bass
12
+ - alto
13
+ - tenor
14
+ - soprano
15
+ - lead_vocal
16
+ filterbank:
17
+ kernel_size: 32
18
+ n_filters: 64
19
+ stride: 16
20
+ main_args:
21
+ help: null
22
+ masknet:
23
+ bidirectional: true
24
+ chunk_size: 174
25
+ dropout: 0
26
+ ff_activation: relu
27
+ ff_hid: 256
28
+ hop_size: 128
29
+ in_chan: 64
30
+ mask_act: sigmoid
31
+ n_repeats: 8
32
+ n_src: 6
33
+ norm_type: gLN
34
+ out_chan: 64
35
+ optim:
36
+ lr: 0.005
37
+ optimizer: adam
38
+ weight_decay: 1.0e-05
39
+ positional arguments: {}
40
+ training:
41
+ batch_size: 1
42
+ early_stop: true
43
+ epochs: 600
44
+ gradient_clipping: 5
45
+ half_lr: true
46
+ loss_func: pit_sisdr
models/DPTNet_jaCappella_VES_48k/source.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://huggingface.co/jaCappella/DPTNet_jaCappella_VES_48k
models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/.gitattributes ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ftz filter=lfs diff=lfs merge=lfs -text
6
+ *.gz filter=lfs diff=lfs merge=lfs -text
7
+ *.h5 filter=lfs diff=lfs merge=lfs -text
8
+ *.joblib filter=lfs diff=lfs merge=lfs -text
9
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
10
+ *.model filter=lfs diff=lfs merge=lfs -text
11
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
12
+ *.onnx filter=lfs diff=lfs merge=lfs -text
13
+ *.ot filter=lfs diff=lfs merge=lfs -text
14
+ *.parquet filter=lfs diff=lfs merge=lfs -text
15
+ *.pb filter=lfs diff=lfs merge=lfs -text
16
+ *.pt filter=lfs diff=lfs merge=lfs -text
17
+ *.pth filter=lfs diff=lfs merge=lfs -text
18
+ *.rar filter=lfs diff=lfs merge=lfs -text
19
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
20
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
21
+ *.tflite filter=lfs diff=lfs merge=lfs -text
22
+ *.tgz filter=lfs diff=lfs merge=lfs -text
23
+ *.wasm filter=lfs diff=lfs merge=lfs -text
24
+ *.xz filter=lfs diff=lfs merge=lfs -text
25
+ *.zip filter=lfs diff=lfs merge=lfs -text
26
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
27
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/README.md ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - audio-to-audio
6
+ language:
7
+ datasets:
8
+ - wsj0-2mix
9
+ license: cc-by-4.0
10
+ ---
11
+
12
+ ## ESPnet2 ENH model
13
+
14
+ ### `espnet/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw`
15
+
16
+ This model was trained by Wangyou Zhang using wsj0_2mix recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+ ### Demo: How to use in ESPnet2
19
+
20
+ ```bash
21
+ cd espnet
22
+
23
+ pip install -e .
24
+ cd egs2/wsj0_2mix/enh1
25
+ ./run.sh --skip_data_prep false --skip_train true --download_model espnet/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw
26
+ ```
27
+
28
+
29
+
30
+ ## ENH config
31
+
32
+ <details><summary>expand</summary>
33
+
34
+ ```
35
+ config: conf/tuning/train_enh_dptnet.yaml
36
+ print_config: false
37
+ log_level: INFO
38
+ dry_run: false
39
+ iterator_type: chunk
40
+ output_dir: exp/enh_train_enh_dptnet_raw
41
+ ngpu: 1
42
+ seed: 0
43
+ num_workers: 4
44
+ num_att_plot: 3
45
+ dist_backend: nccl
46
+ dist_init_method: env://
47
+ dist_world_size: 4
48
+ dist_rank: 0
49
+ local_rank: 0
50
+ dist_master_addr: localhost
51
+ dist_master_port: 53094
52
+ dist_launcher: null
53
+ multiprocessing_distributed: true
54
+ unused_parameters: true
55
+ sharded_ddp: false
56
+ cudnn_enabled: true
57
+ cudnn_benchmark: false
58
+ cudnn_deterministic: true
59
+ collect_stats: false
60
+ write_collected_feats: false
61
+ validate_train_iter: false
62
+ max_epoch: 150
63
+ patience: 10
64
+ val_scheduler_criterion:
65
+ - valid
66
+ - loss
67
+ early_stopping_criterion:
68
+ - valid
69
+ - loss
70
+ - min
71
+ best_model_criterion:
72
+ - - valid
73
+ - si_snr
74
+ - max
75
+ - - valid
76
+ - loss
77
+ - min
78
+ keep_nbest_models: 1
79
+ nbest_averaging_interval: 0
80
+ grad_clip: 5
81
+ grad_clip_type: 2.0
82
+ grad_noise: false
83
+ accum_grad: 1
84
+ no_forward_run: false
85
+ resume: true
86
+ train_dtype: float32
87
+ use_amp: false
88
+ log_interval: null
89
+ use_matplotlib: true
90
+ use_tensorboard: true
91
+ use_wandb: false
92
+ wandb_project: null
93
+ wandb_id: null
94
+ wandb_entity: null
95
+ wandb_name: null
96
+ wandb_model_log_interval: -1
97
+ detect_anomaly: false
98
+ pretrain_path: null
99
+ init_param: []
100
+ ignore_init_mismatch: false
101
+ freeze_param: []
102
+ num_iters_per_epoch: null
103
+ batch_size: 4
104
+ valid_batch_size: null
105
+ batch_bins: 1000000
106
+ valid_batch_bins: null
107
+ train_shape_file:
108
+ - exp/enh_stats_8k/train/speech_mix_shape
109
+ - exp/enh_stats_8k/train/speech_ref1_shape
110
+ - exp/enh_stats_8k/train/speech_ref2_shape
111
+ valid_shape_file:
112
+ - exp/enh_stats_8k/valid/speech_mix_shape
113
+ - exp/enh_stats_8k/valid/speech_ref1_shape
114
+ - exp/enh_stats_8k/valid/speech_ref2_shape
115
+ batch_type: folded
116
+ valid_batch_type: null
117
+ fold_length:
118
+ - 80000
119
+ - 80000
120
+ - 80000
121
+ sort_in_batch: descending
122
+ sort_batch: descending
123
+ multiple_iterator: false
124
+ chunk_length: 20000
125
+ chunk_shift_ratio: 0.5
126
+ num_cache_chunks: 1024
127
+ train_data_path_and_name_and_type:
128
+ - - dump/raw/tr_min_8k/wav.scp
129
+ - speech_mix
130
+ - sound
131
+ - - dump/raw/tr_min_8k/spk1.scp
132
+ - speech_ref1
133
+ - sound
134
+ - - dump/raw/tr_min_8k/spk2.scp
135
+ - speech_ref2
136
+ - sound
137
+ valid_data_path_and_name_and_type:
138
+ - - dump/raw/cv_min_8k/wav.scp
139
+ - speech_mix
140
+ - sound
141
+ - - dump/raw/cv_min_8k/spk1.scp
142
+ - speech_ref1
143
+ - sound
144
+ - - dump/raw/cv_min_8k/spk2.scp
145
+ - speech_ref2
146
+ - sound
147
+ allow_variable_data_keys: false
148
+ max_cache_size: 0.0
149
+ max_cache_fd: 32
150
+ valid_max_cache_size: null
151
+ optim: adam
152
+ optim_conf:
153
+ lr: 0.0004
154
+ eps: 1.0e-08
155
+ weight_decay: 1.0e-05
156
+ scheduler: warmupsteplr
157
+ scheduler_conf:
158
+ warmup_steps: 4000
159
+ steps_per_epoch: 14273
160
+ step_size: 2
161
+ gamma: 0.98
162
+ init: null
163
+ model_conf:
164
+ stft_consistency: false
165
+ loss_type: mask_mse
166
+ mask_type: null
167
+ criterions:
168
+ - name: si_snr
169
+ conf:
170
+ eps: 1.0e-07
171
+ wrapper: pit
172
+ wrapper_conf:
173
+ weight: 1.0
174
+ independent_perm: true
175
+ use_preprocessor: false
176
+ encoder: conv
177
+ encoder_conf:
178
+ channel: 64
179
+ kernel_size: 2
180
+ stride: 1
181
+ separator: dptnet
182
+ separator_conf:
183
+ num_spk: 2
184
+ post_enc_relu: true
185
+ layer: 6
186
+ rnn_type: lstm
187
+ bidirectional: true
188
+ unit: 128
189
+ att_heads: 4
190
+ dropout: 0.0
191
+ activation: relu
192
+ norm_type: gLN
193
+ segment_size: 250
194
+ nonlinear: relu
195
+ decoder: conv
196
+ decoder_conf:
197
+ channel: 64
198
+ kernel_size: 2
199
+ stride: 1
200
+ required:
201
+ - output_dir
202
+ version: 0.10.7a1
203
+ distributed: true
204
+ ```
205
+
206
+ </details>
207
+
208
+
209
+
210
+ ### Citing ESPnet
211
+
212
+ ```BibTex
213
+ @inproceedings{watanabe2018espnet,
214
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
215
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
216
+ year={2018},
217
+ booktitle={Proceedings of Interspeech},
218
+ pages={2207--2211},
219
+ doi={10.21437/Interspeech.2018-1456},
220
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
221
+ }
222
+
223
+ @inproceedings{li2021espnetse,
224
+ title={{ESPnet-SE}: End-to-End Speech Enhancement and Separation Toolkit Designed for {ASR} Integration},
225
+ author={Li, Chenda and Shi, Jing and Zhang, Wangyou and Subramanian, Aswin Shanmugam and Chang, Xuankai and Kamo, Naoyuki and Hira, Moto and Hayashi, Tomoki and Boeddeker, Christoph and Chen, Zhuo and Watanabe, Shinji},
226
+ booktitle={Proc. IEEE Spoken Language Technology Workshop (SLT)},
227
+ pages={785--792},
228
+ year={2021},
229
+ }
230
+
231
+ ```
232
+
233
+ or arXiv:
234
+
235
+ ```bibtex
236
+ @misc{watanabe2018espnet,
237
+ title={ESPnet: End-to-End Speech Processing Toolkit},
238
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
239
+ year={2018},
240
+ eprint={1804.00015},
241
+ archivePrefix={arXiv},
242
+ primaryClass={cs.CL}
243
+ }
244
+
245
+ @inproceedings{li2021espnetse,
246
+ title={{ESPnet-SE}: End-to-End Speech Enhancement and Separation Toolkit Designed for {ASR} Integration},
247
+ author={Li, Chenda and Shi, Jing and Zhang, Wangyou and Subramanian, Aswin Shanmugam and Chang, Xuankai and Kamo, Naoyuki and Hira, Moto and Hayashi, Tomoki and Boeddeker, Christoph and Chen, Zhuo and Watanabe, Shinji},
248
+ year={2020},
249
+ eprint={2011.03706},
250
+ archivePrefix={arXiv},
251
+ primaryClass={eess.AS}
252
+ }
253
+ ```
models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_stats_8k/train/feats_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d890c44023968991b362b31f39fcecc453f0d619071befb36205d610e8aabb8b
3
+ size 778
models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_train_enh_dptnet_raw/99epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:34bbfa87de88766844af4c3d313e34ef99e15194e6f394df354ca5fb6564bb0c
3
+ size 11274659
models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_train_enh_dptnet_raw/RESULTS.md ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- Generated by ./scripts/utils/show_enh_score.sh -->
2
+ # RESULTS
3
+ ## Environments
4
+ - date: `Tue Jun 21 20:50:00 CST 2022`
5
+ - python version: `3.8.12 (default, Oct 12 2021, 13:49:34) [GCC 7.5.0]`
6
+ - espnet version: `espnet 0.10.7a1`
7
+ - pytorch version: `pytorch 1.10.2+cu102`
8
+ - Git hash: `9c24b3adddbde3402530080cb58ae08a6f4dd642`
9
+ - Commit date: `Wed Feb 23 14:49:15 2022 -0500`
10
+
11
+
12
+ ## enh_train_enh_dptnet_orig_raw
13
+
14
+ config: conf/tuning/train_enh_dptnet.yaml
15
+
16
+ |dataset|STOI|SAR|SDR|SIR|SI_SNR|
17
+ |---|---|---|---|---|---|
18
+ |enhanced_cv_min_8k|97.43|21.39|20.98|32.17|20.63|
19
+ |enhanced_tt_min_8k|98.18|21.47|21.06|32.48|20.72|
20
+
models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_train_enh_dptnet_raw/config.yaml ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_enh_dptnet.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: chunk
6
+ output_dir: exp/enh_train_enh_dptnet_raw
7
+ ngpu: 1
8
+ seed: 0
9
+ num_workers: 4
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: 4
14
+ dist_rank: 0
15
+ local_rank: 0
16
+ dist_master_addr: localhost
17
+ dist_master_port: 53094
18
+ dist_launcher: null
19
+ multiprocessing_distributed: true
20
+ unused_parameters: true
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: true
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ validate_train_iter: false
28
+ max_epoch: 150
29
+ patience: 10
30
+ val_scheduler_criterion:
31
+ - valid
32
+ - loss
33
+ early_stopping_criterion:
34
+ - valid
35
+ - loss
36
+ - min
37
+ best_model_criterion:
38
+ - - valid
39
+ - si_snr
40
+ - max
41
+ - - valid
42
+ - loss
43
+ - min
44
+ keep_nbest_models: 1
45
+ nbest_averaging_interval: 0
46
+ grad_clip: 5
47
+ grad_clip_type: 2.0
48
+ grad_noise: false
49
+ accum_grad: 1
50
+ no_forward_run: false
51
+ resume: true
52
+ train_dtype: float32
53
+ use_amp: false
54
+ log_interval: null
55
+ use_matplotlib: true
56
+ use_tensorboard: true
57
+ use_wandb: false
58
+ wandb_project: null
59
+ wandb_id: null
60
+ wandb_entity: null
61
+ wandb_name: null
62
+ wandb_model_log_interval: -1
63
+ detect_anomaly: false
64
+ pretrain_path: null
65
+ init_param: []
66
+ ignore_init_mismatch: false
67
+ freeze_param: []
68
+ num_iters_per_epoch: null
69
+ batch_size: 4
70
+ valid_batch_size: null
71
+ batch_bins: 1000000
72
+ valid_batch_bins: null
73
+ train_shape_file:
74
+ - exp/enh_stats_8k/train/speech_mix_shape
75
+ - exp/enh_stats_8k/train/speech_ref1_shape
76
+ - exp/enh_stats_8k/train/speech_ref2_shape
77
+ valid_shape_file:
78
+ - exp/enh_stats_8k/valid/speech_mix_shape
79
+ - exp/enh_stats_8k/valid/speech_ref1_shape
80
+ - exp/enh_stats_8k/valid/speech_ref2_shape
81
+ batch_type: folded
82
+ valid_batch_type: null
83
+ fold_length:
84
+ - 80000
85
+ - 80000
86
+ - 80000
87
+ sort_in_batch: descending
88
+ sort_batch: descending
89
+ multiple_iterator: false
90
+ chunk_length: 20000
91
+ chunk_shift_ratio: 0.5
92
+ num_cache_chunks: 1024
93
+ train_data_path_and_name_and_type:
94
+ - - dump/raw/tr_min_8k/wav.scp
95
+ - speech_mix
96
+ - sound
97
+ - - dump/raw/tr_min_8k/spk1.scp
98
+ - speech_ref1
99
+ - sound
100
+ - - dump/raw/tr_min_8k/spk2.scp
101
+ - speech_ref2
102
+ - sound
103
+ valid_data_path_and_name_and_type:
104
+ - - dump/raw/cv_min_8k/wav.scp
105
+ - speech_mix
106
+ - sound
107
+ - - dump/raw/cv_min_8k/spk1.scp
108
+ - speech_ref1
109
+ - sound
110
+ - - dump/raw/cv_min_8k/spk2.scp
111
+ - speech_ref2
112
+ - sound
113
+ allow_variable_data_keys: false
114
+ max_cache_size: 0.0
115
+ max_cache_fd: 32
116
+ valid_max_cache_size: null
117
+ optim: adam
118
+ optim_conf:
119
+ lr: 0.0004
120
+ eps: 1.0e-08
121
+ weight_decay: 1.0e-05
122
+ scheduler: warmupsteplr
123
+ scheduler_conf:
124
+ warmup_steps: 4000
125
+ steps_per_epoch: 14273
126
+ step_size: 2
127
+ gamma: 0.98
128
+ init: null
129
+ model_conf:
130
+ stft_consistency: false
131
+ loss_type: mask_mse
132
+ mask_type: null
133
+ criterions:
134
+ - name: si_snr
135
+ conf:
136
+ eps: 1.0e-07
137
+ wrapper: pit
138
+ wrapper_conf:
139
+ weight: 1.0
140
+ independent_perm: true
141
+ use_preprocessor: false
142
+ encoder: conv
143
+ encoder_conf:
144
+ channel: 64
145
+ kernel_size: 2
146
+ stride: 1
147
+ separator: dptnet
148
+ separator_conf:
149
+ num_spk: 2
150
+ post_enc_relu: true
151
+ layer: 6
152
+ rnn_type: lstm
153
+ bidirectional: true
154
+ unit: 128
155
+ att_heads: 4
156
+ dropout: 0.0
157
+ activation: relu
158
+ norm_type: gLN
159
+ segment_size: 250
160
+ nonlinear: relu
161
+ decoder: conv
162
+ decoder_conf:
163
+ channel: 64
164
+ kernel_size: 2
165
+ stride: 1
166
+ required:
167
+ - output_dir
168
+ version: 0.10.7a1
169
+ distributed: true
models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_train_enh_dptnet_raw/images/backward_time.png ADDED
models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_train_enh_dptnet_raw/images/forward_time.png ADDED
models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_train_enh_dptnet_raw/images/gpu_max_cached_mem_GB.png ADDED
models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_train_enh_dptnet_raw/images/iter_time.png ADDED
models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_train_enh_dptnet_raw/images/loss.png ADDED
models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_train_enh_dptnet_raw/images/optim0_lr0.png ADDED
models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_train_enh_dptnet_raw/images/optim_step_time.png ADDED
models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_train_enh_dptnet_raw/images/si_snr_loss.png ADDED
models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_train_enh_dptnet_raw/images/train_time.png ADDED
models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/exp/enh_train_enh_dptnet_raw/valid.loss.best.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a799382546fa95a82961f2e0d7a35ccfc3c984d6c2ecbd294ff8cf198b0357fa
3
+ size 11
models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: 0.10.7a1
2
+ files:
3
+ model_file: exp/enh_train_enh_dptnet_raw/99epoch.pth
4
+ python: "3.8.12 (default, Oct 12 2021, 13:49:34) \n[GCC 7.5.0]"
5
+ timestamp: 1655818843.663898
6
+ torch: 1.10.2+cu102
7
+ yaml_files:
8
+ train_config: exp/enh_train_enh_dptnet_raw/config.yaml
models/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw/source.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://huggingface.co/espnet/Wangyou_Zhang_wsj0_2mix_enh_train_enh_dptnet_raw