Politrees committed on
Commit
c9b303e
·
verified ·
1 Parent(s): 1aa25f6
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. Roformer_models/BS_Inst_EXP_VRL.ckpt +0 -3
  2. Roformer_models/BS_Inst_EXP_VRL.yaml +0 -124
  3. Roformer_models/MelBandRoformerBigSYHFTV1.ckpt +0 -3
  4. Roformer_models/MelBandRoformerSYHFT.ckpt +0 -3
  5. Roformer_models/MelBandRoformerSYHFTV2.5.ckpt +0 -3
  6. Roformer_models/MelBandRoformerSYHFTV2.ckpt +0 -3
  7. Roformer_models/MelBandRoformerSYHFTV3Epsilon.ckpt +0 -3
  8. Roformer_models/aspiration_mel_band_roformer_less_aggr_sdr_18.1201.ckpt +0 -3
  9. Roformer_models/aspiration_mel_band_roformer_sdr_18.9845.ckpt +0 -3
  10. Roformer_models/bs_roformer_male_female_by_aufr33_sdr_7.2889.ckpt +0 -3
  11. Roformer_models/bs_roformer_voc_gabox.ckpt +0 -3
  12. Roformer_models/config_aspiration_mel_band_roformer.yaml +0 -76
  13. Roformer_models/config_bs_roformer_voc_gabox.yaml +0 -133
  14. Roformer_models/config_chorus_male_female_bs_roformer.yaml +0 -125
  15. Roformer_models/config_dereverb-echo_mel_band_roformer.yaml +0 -76
  16. Roformer_models/config_dereverb-echo_mel_band_roformer_sdr_13.4843_v2.yaml +0 -64
  17. Roformer_models/config_dereverb_echo_mel_band_roformer_v2.yaml +0 -64
  18. Roformer_models/config_mel_band_roformer_bleed_suppressor_v1.yaml +0 -51
  19. Roformer_models/config_mel_band_roformer_inst_gabox.yaml +0 -51
  20. Roformer_models/config_mel_band_roformer_instrumental_becruily.yaml +0 -72
  21. Roformer_models/config_mel_band_roformer_kim_ft_unwa.yaml +0 -72
  22. Roformer_models/config_mel_band_roformer_voc_gabox.yaml +0 -51
  23. Roformer_models/config_mel_band_roformer_vocals_becruily.yaml +0 -72
  24. Roformer_models/config_mel_band_roformer_vocals_fullness_aname.yaml +0 -54
  25. Roformer_models/config_melband_roformer_big_beta5e.yaml +0 -51
  26. Roformer_models/config_melband_roformer_big_beta6.yaml +0 -72
  27. Roformer_models/config_melband_roformer_big_beta6x.yaml +0 -72
  28. Roformer_models/config_melband_roformer_small_by_aname.yaml +0 -52
  29. Roformer_models/config_melbandroformer_big_beta4.yaml +0 -51
  30. Roformer_models/config_melbandroformer_inst.yaml +0 -51
  31. Roformer_models/config_melbandroformer_inst_v2.yaml +0 -51
  32. Roformer_models/config_melbandroformer_instvoc_duality.yaml +0 -51
  33. Roformer_models/config_vocals_mel_band_roformer_big_v1_ft.yaml +0 -51
  34. Roformer_models/config_vocals_mel_band_roformer_ft.yaml +0 -72
  35. Roformer_models/denoise_mel_band_roformer_aufr33_aggr_sdr_27.9768.ckpt +0 -3
  36. Roformer_models/denoise_mel_band_roformer_aufr33_aggr_sdr_27.9768_config.yaml +0 -71
  37. Roformer_models/denoise_mel_band_roformer_aufr33_sdr_27.9959.ckpt +0 -3
  38. Roformer_models/denoise_mel_band_roformer_aufr33_sdr_27.9959_config.yaml +0 -71
  39. Roformer_models/dereverb-echo_mel_band_roformer_sdr_10.0169.ckpt +0 -3
  40. Roformer_models/dereverb-echo_mel_band_roformer_sdr_13.4843_v2.ckpt +0 -3
  41. Roformer_models/dereverb_big_mbr_ep_362.ckpt +0 -3
  42. Roformer_models/dereverb_echo_mbr_fused.ckpt +0 -3
  43. Roformer_models/dereverb_mel_band_roformer_anvuew.yaml +0 -76
  44. Roformer_models/dereverb_mel_band_roformer_anvuew_sdr_19.1729.ckpt +0 -3
  45. Roformer_models/dereverb_mel_band_roformer_less_aggressive_anvuew_sdr_18.8050.ckpt +0 -3
  46. Roformer_models/dereverb_mel_band_roformer_mono_anvuew_sdr_20.4029.ckpt +0 -3
  47. Roformer_models/dereverb_super_big_mbr_ep_346.ckpt +0 -3
  48. Roformer_models/deverb_bs_roformer_8_384dim_10depth.ckpt +0 -3
  49. Roformer_models/deverb_bs_roformer_8_384dim_10depth_config.yaml +0 -137
  50. Roformer_models/mel_band_roformer_bleed_suppressor_v1.ckpt +0 -3
Roformer_models/BS_Inst_EXP_VRL.ckpt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:c035e2a102243405e45bf33faa175f62fd7118f63b62771fafdf81062b804131
3
- size 393351501
 
 
 
 
Roformer_models/BS_Inst_EXP_VRL.yaml DELETED
@@ -1,124 +0,0 @@
1
- audio:
2
- chunk_size: 485100 #352800 #485100
3
- dim_f: 1024
4
- dim_t: 801
5
- hop_length: 441
6
- n_fft: 2048
7
- num_channels: 2
8
- sample_rate: 44100
9
- min_mean_abs: 0.000
10
-
11
- model:
12
- dim: 384
13
- depth: 12
14
- stereo: true
15
- num_stems: 1
16
- time_transformer_depth: 1
17
- freq_transformer_depth: 1
18
- freqs_per_bands: !!python/tuple
19
- - 2
20
- - 2
21
- - 2
22
- - 2
23
- - 2
24
- - 2
25
- - 2
26
- - 2
27
- - 2
28
- - 2
29
- - 2
30
- - 2
31
- - 2
32
- - 2
33
- - 2
34
- - 2
35
- - 2
36
- - 2
37
- - 2
38
- - 2
39
- - 2
40
- - 2
41
- - 2
42
- - 2
43
- - 4
44
- - 4
45
- - 4
46
- - 4
47
- - 4
48
- - 4
49
- - 4
50
- - 4
51
- - 4
52
- - 4
53
- - 4
54
- - 4
55
- - 12
56
- - 12
57
- - 12
58
- - 12
59
- - 12
60
- - 12
61
- - 12
62
- - 12
63
- - 24
64
- - 24
65
- - 24
66
- - 24
67
- - 24
68
- - 24
69
- - 24
70
- - 24
71
- - 48
72
- - 48
73
- - 48
74
- - 48
75
- - 48
76
- - 48
77
- - 48
78
- - 48
79
- - 128
80
- - 129
81
- dim_head: 64
82
- heads: 8
83
- attn_dropout: 0
84
- ff_dropout: 0
85
- flash_attn: true
86
- dim_freqs_in: 1025
87
- stft_n_fft: 2048
88
- stft_hop_length: 441
89
- stft_win_length: 2048
90
- stft_normalized: false
91
- mask_estimator_depth: 2
92
- multi_stft_resolution_loss_weight: 1.0
93
- multi_stft_resolutions_window_sizes: !!python/tuple
94
- - 4096
95
- - 2048
96
- - 1024
97
- - 512
98
- - 256
99
- multi_stft_hop_size: 147
100
- multi_stft_normalized: False
101
- training:
102
- batch_size: 1
103
- gradient_accumulation_steps: 1
104
- grad_clip: 0
105
- instruments:
106
- - Vocals
107
- - Instrumental
108
- lr: 1.0e-04
109
- patience: 2
110
- reduce_factor: 0.95
111
- target_instrument: Instrumental
112
- num_epochs: 1
113
- num_steps: 1000
114
- q: 0.95
115
- coarse_loss_clip: true
116
- ema_momentum: 0.999
117
- optimizer: adamw
118
- other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
119
- use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
120
-
121
- inference:
122
- batch_size: 1
123
- dim_t: 1101
124
- num_overlap: 2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Roformer_models/MelBandRoformerBigSYHFTV1.ckpt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e2327e3e81f19e67c307f8c830c54267c09ecb0e9c6ad2b40a80c310899c955f
3
- size 1479738496
 
 
 
 
Roformer_models/MelBandRoformerSYHFT.ckpt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f319dfcde4396ea3106658f457f5eb0bc577e113491f61ae8bab216fe84b0c0c
3
- size 913096702
 
 
 
 
Roformer_models/MelBandRoformerSYHFTV2.5.ckpt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:916e3a2c1e63b1457bcad823b98ca705e4933deffd2a5ab3a370e10f68bf47e2
3
- size 913090472
 
 
 
 
Roformer_models/MelBandRoformerSYHFTV2.ckpt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:2e99f8efa5315300c197295592bd7e56c21c1d77e1884c904b5128c54a2a4632
3
- size 913095346
 
 
 
 
Roformer_models/MelBandRoformerSYHFTV3Epsilon.ckpt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:4c886092e4aae13aa089263a0d54d483643f58c16ec221aed37268e2c1031397
3
- size 913090472
 
 
 
 
Roformer_models/aspiration_mel_band_roformer_less_aggr_sdr_18.1201.ckpt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:83bfe991cec4fbadde9f30d1f79cd5293ad0b1f936256be327bba5cbb4883374
3
- size 835982664
 
 
 
 
Roformer_models/aspiration_mel_band_roformer_sdr_18.9845.ckpt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:9e791258c866c6c8da66052693d8cc3b64f1f42c01e052dbdc570cd278380cc5
3
- size 835983746
 
 
 
 
Roformer_models/bs_roformer_male_female_by_aufr33_sdr_7.2889.ckpt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:3cf11736d1b42a11ae55d8299316585921477dd2a671b24b663660846ca9861b
3
- size 527119779
 
 
 
 
Roformer_models/bs_roformer_voc_gabox.ckpt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:18d58efe5e949e70fab11b875329af6d06ef11ccc29574bfe943fb57cc827f38
3
- size 639254584
 
 
 
 
Roformer_models/config_aspiration_mel_band_roformer.yaml DELETED
@@ -1,76 +0,0 @@
1
- audio:
2
- chunk_size: 352800
3
- dim_f: 1024
4
- dim_t: 801 # don't work (use in model)
5
- hop_length: 441 # don't work (use in model)
6
- n_fft: 2048
7
- num_channels: 2
8
- sample_rate: 44100
9
- min_mean_abs: 0.000
10
-
11
- model:
12
- dim: 256
13
- depth: 8
14
- stereo: true
15
- num_stems: 2
16
- time_transformer_depth: 1
17
- freq_transformer_depth: 1
18
- num_bands: 60
19
- dim_head: 64
20
- heads: 8
21
- attn_dropout: 0.1
22
- ff_dropout: 0.1
23
- flash_attn: True
24
- dim_freqs_in: 1025
25
- sample_rate: 44100 # needed for mel filter bank from librosa
26
- stft_n_fft: 2048
27
- stft_hop_length: 441
28
- stft_win_length: 2048
29
- stft_normalized: False
30
- mask_estimator_depth: 2
31
- multi_stft_resolution_loss_weight: 1.0
32
- multi_stft_resolutions_window_sizes: !!python/tuple
33
- - 4096
34
- - 2048
35
- - 1024
36
- - 512
37
- - 256
38
- multi_stft_hop_size: 147
39
- multi_stft_normalized: False
40
-
41
- training:
42
- batch_size: 1
43
- gradient_accumulation_steps: 8
44
- grad_clip: 0
45
- instruments:
46
- - aspiration
47
- - other
48
- lr: 4.0e-05
49
- patience: 2
50
- reduce_factor: 0.95
51
- target_instrument: null
52
- num_epochs: 1000
53
- num_steps: 1000
54
- q: 0.95
55
- coarse_loss_clip: true
56
- ema_momentum: 0.999
57
- optimizer: adam
58
- other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
59
- use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
60
-
61
- augmentations:
62
- enable: true # enable or disable all augmentations (to fast disable if needed)
63
- loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
64
- loudness_min: 0.5
65
- loudness_max: 1.5
66
- mixup: false # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
67
- mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
68
- - 0.2
69
- - 0.02
70
- mixup_loudness_min: 0.5
71
- mixup_loudness_max: 1.5
72
-
73
- inference:
74
- batch_size: 4
75
- dim_t: 801
76
- num_overlap: 2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Roformer_models/config_bs_roformer_voc_gabox.yaml DELETED
@@ -1,133 +0,0 @@
1
- audio:
2
- chunk_size: 352800
3
- dim_f: 1024
4
- dim_t: 801 # don't work (use in model)
5
- hop_length: 441 # don't work (use in model)
6
- n_fft: 2048
7
- num_channels: 2
8
- sample_rate: 44100
9
- min_mean_abs: 0.001
10
-
11
- model:
12
- dim: 512
13
- depth: 12
14
- stereo: true
15
- num_stems: 1
16
- time_transformer_depth: 1
17
- freq_transformer_depth: 1
18
- freqs_per_bands: !!python/tuple
19
- - 2
20
- - 2
21
- - 2
22
- - 2
23
- - 2
24
- - 2
25
- - 2
26
- - 2
27
- - 2
28
- - 2
29
- - 2
30
- - 2
31
- - 2
32
- - 2
33
- - 2
34
- - 2
35
- - 2
36
- - 2
37
- - 2
38
- - 2
39
- - 2
40
- - 2
41
- - 2
42
- - 2
43
- - 4
44
- - 4
45
- - 4
46
- - 4
47
- - 4
48
- - 4
49
- - 4
50
- - 4
51
- - 4
52
- - 4
53
- - 4
54
- - 4
55
- - 12
56
- - 12
57
- - 12
58
- - 12
59
- - 12
60
- - 12
61
- - 12
62
- - 12
63
- - 24
64
- - 24
65
- - 24
66
- - 24
67
- - 24
68
- - 24
69
- - 24
70
- - 24
71
- - 48
72
- - 48
73
- - 48
74
- - 48
75
- - 48
76
- - 48
77
- - 48
78
- - 48
79
- - 128
80
- - 129
81
- dim_head: 64
82
- heads: 8
83
- attn_dropout: 0.1
84
- ff_dropout: 0.1
85
- flash_attn: true
86
- dim_freqs_in: 1025
87
- stft_n_fft: 2048
88
- stft_hop_length: 441
89
- stft_win_length: 2048
90
- stft_normalized: false
91
- mask_estimator_depth: 2
92
- multi_stft_resolution_loss_weight: 1.0
93
- multi_stft_resolutions_window_sizes: !!python/tuple
94
- - 4096
95
- - 2048
96
- - 1024
97
- - 512
98
- - 256
99
- multi_stft_hop_size: 147
100
- multi_stft_normalized: False
101
-
102
- training:
103
- batch_size: 16
104
- gradient_accumulation_steps: 1
105
- grad_clip: 0
106
- instruments:
107
- - Vocals
108
- - Instrumental
109
- lr: 5.0e-05
110
- patience: 2
111
- reduce_factor: 0.95
112
- target_instrument: Vocals
113
- num_epochs: 1000
114
- num_steps: 1000
115
- augmentation: false # enable augmentations by audiomentations and pedalboard
116
- augmentation_type: simple1
117
- use_mp3_compress: false # Deprecated
118
- augmentation_mix: true # Mix several stems of the same type with some probability
119
- augmentation_loudness: true # randomly change loudness of each stem
120
- augmentation_loudness_type: 1 # Type 1 or 2
121
- augmentation_loudness_min: 0.5
122
- augmentation_loudness_max: 1.5
123
- q: 0.95
124
- coarse_loss_clip: true
125
- ema_momentum: 0.999
126
- optimizer: adam
127
- other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
128
- use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
129
-
130
- inference:
131
- batch_size: 1
132
- dim_t: 801
133
- num_overlap: 4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Roformer_models/config_chorus_male_female_bs_roformer.yaml DELETED
@@ -1,125 +0,0 @@
1
- audio:
2
- chunk_size: 352800
3
- dim_f: 1024
4
- dim_t: 801 # don't work (use in model)
5
- hop_length: 441 # don't work (use in model)
6
- n_fft: 2048
7
- num_channels: 2
8
- sample_rate: 44100
9
- min_mean_abs: 0.000
10
-
11
- model:
12
- dim: 384
13
- depth: 8
14
- stereo: true
15
- num_stems: 2
16
- time_transformer_depth: 1
17
- freq_transformer_depth: 1
18
- freqs_per_bands: !!python/tuple
19
- - 2
20
- - 2
21
- - 2
22
- - 2
23
- - 2
24
- - 2
25
- - 2
26
- - 2
27
- - 2
28
- - 2
29
- - 2
30
- - 2
31
- - 2
32
- - 2
33
- - 2
34
- - 2
35
- - 2
36
- - 2
37
- - 2
38
- - 2
39
- - 2
40
- - 2
41
- - 2
42
- - 2
43
- - 4
44
- - 4
45
- - 4
46
- - 4
47
- - 4
48
- - 4
49
- - 4
50
- - 4
51
- - 4
52
- - 4
53
- - 4
54
- - 4
55
- - 12
56
- - 12
57
- - 12
58
- - 12
59
- - 12
60
- - 12
61
- - 12
62
- - 12
63
- - 24
64
- - 24
65
- - 24
66
- - 24
67
- - 24
68
- - 24
69
- - 24
70
- - 24
71
- - 48
72
- - 48
73
- - 48
74
- - 48
75
- - 48
76
- - 48
77
- - 48
78
- - 48
79
- - 128
80
- - 129
81
- dim_head: 64
82
- heads: 8
83
- attn_dropout: 0.0
84
- ff_dropout: 0.0
85
- flash_attn: true
86
- dim_freqs_in: 1025
87
- stft_n_fft: 2048
88
- stft_hop_length: 441
89
- stft_win_length: 2048
90
- stft_normalized: false
91
- mask_estimator_depth: 2
92
- multi_stft_resolution_loss_weight: 1.0
93
- multi_stft_resolutions_window_sizes: !!python/tuple
94
- - 4096
95
- - 2048
96
- - 1024
97
- - 512
98
- - 256
99
- multi_stft_hop_size: 147
100
- multi_stft_normalized: False
101
-
102
- training:
103
- batch_size: 1
104
- gradient_accumulation_steps: 1
105
- grad_clip: 0
106
- instruments:
107
- - male
108
- - female
109
- lr: 1.0e-05
110
- patience: 2
111
- reduce_factor: 0.95
112
- target_instrument: null
113
- num_epochs: 1000
114
- num_steps: 1000
115
- q: 0.95
116
- coarse_loss_clip: true
117
- ema_momentum: 0.999
118
- optimizer: adam
119
- other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
120
- use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
121
-
122
- inference:
123
- batch_size: 1
124
- dim_t: 801
125
- num_overlap: 2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Roformer_models/config_dereverb-echo_mel_band_roformer.yaml DELETED
@@ -1,76 +0,0 @@
1
- audio:
2
- chunk_size: 352800
3
- dim_f: 1024
4
- dim_t: 801 # don't work (use in model)
5
- hop_length: 441 # don't work (use in model)
6
- n_fft: 2048
7
- num_channels: 2
8
- sample_rate: 44100
9
- min_mean_abs: 0.000
10
-
11
- model:
12
- dim: 256
13
- depth: 8
14
- stereo: true
15
- num_stems: 2
16
- time_transformer_depth: 1
17
- freq_transformer_depth: 1
18
- num_bands: 60
19
- dim_head: 64
20
- heads: 8
21
- attn_dropout: 0.1
22
- ff_dropout: 0.1
23
- flash_attn: True
24
- dim_freqs_in: 1025
25
- sample_rate: 44100 # needed for mel filter bank from librosa
26
- stft_n_fft: 2048
27
- stft_hop_length: 441
28
- stft_win_length: 2048
29
- stft_normalized: False
30
- mask_estimator_depth: 2
31
- multi_stft_resolution_loss_weight: 1.0
32
- multi_stft_resolutions_window_sizes: !!python/tuple
33
- - 4096
34
- - 2048
35
- - 1024
36
- - 512
37
- - 256
38
- multi_stft_hop_size: 147
39
- multi_stft_normalized: False
40
-
41
- training:
42
- batch_size: 1
43
- gradient_accumulation_steps: 8
44
- grad_clip: 0
45
- instruments:
46
- - dry
47
- - No dry
48
- lr: 4.0e-05
49
- patience: 2
50
- reduce_factor: 0.95
51
- target_instrument: null
52
- num_epochs: 1000
53
- num_steps: 1000
54
- q: 0.95
55
- coarse_loss_clip: true
56
- ema_momentum: 0.999
57
- optimizer: adam
58
- other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
59
- use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
60
-
61
- augmentations:
62
- enable: true # enable or disable all augmentations (to fast disable if needed)
63
- loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
64
- loudness_min: 0.5
65
- loudness_max: 1.5
66
- mixup: false # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
67
- mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
68
- - 0.2
69
- - 0.02
70
- mixup_loudness_min: 0.5
71
- mixup_loudness_max: 1.5
72
-
73
- inference:
74
- batch_size: 4
75
- dim_t: 801
76
- num_overlap: 4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Roformer_models/config_dereverb-echo_mel_band_roformer_sdr_13.4843_v2.yaml DELETED
@@ -1,64 +0,0 @@
1
- audio:
2
- chunk_size: 352800
3
- dim_f: 1024
4
- dim_t: 801
5
- hop_length: 441
6
- n_fft: 2048
7
- num_channels: 2
8
- sample_rate: 44100
9
- min_mean_abs: 0.000
10
-
11
- model:
12
- dim: 256
13
- depth: 8
14
- stereo: true
15
- num_stems: 1
16
- time_transformer_depth: 1
17
- freq_transformer_depth: 1
18
- num_bands: 60
19
- dim_head: 64
20
- heads: 8
21
- attn_dropout: 0.1
22
- ff_dropout: 0.1
23
- flash_attn: True
24
- dim_freqs_in: 1025
25
- sample_rate: 44100
26
- stft_n_fft: 2048
27
- stft_hop_length: 441
28
- stft_win_length: 2048
29
- stft_normalized: False
30
- mask_estimator_depth: 2
31
- multi_stft_resolution_loss_weight: 1.0
32
- multi_stft_resolutions_window_sizes: !!python/tuple
33
- - 4096
34
- - 2048
35
- - 1024
36
- - 512
37
- - 256
38
- multi_stft_hop_size: 147
39
- multi_stft_normalized: False
40
-
41
- training:
42
- batch_size: 1
43
- gradient_accumulation_steps: 8
44
- grad_clip: 0
45
- instruments:
46
- - dry
47
- - No dry
48
- lr: 1.0e-05
49
- patience: 2
50
- reduce_factor: 0.95
51
- target_instrument: dry
52
- num_epochs: 1000
53
- num_steps: 1000
54
- q: 0.95
55
- coarse_loss_clip: true
56
- ema_momentum: 0.999
57
- optimizer: adam
58
- other_fix: false
59
- use_amp: true
60
-
61
- inference:
62
- batch_size: 1
63
- dim_t: 801
64
- num_overlap: 4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Roformer_models/config_dereverb_echo_mel_band_roformer_v2.yaml DELETED
@@ -1,64 +0,0 @@
1
- audio:
2
- chunk_size: 352800
3
- dim_f: 1024
4
- dim_t: 801
5
- hop_length: 441
6
- n_fft: 2048
7
- num_channels: 2
8
- sample_rate: 44100
9
- min_mean_abs: 0.000
10
-
11
- model:
12
- dim: 256
13
- depth: 8
14
- stereo: true
15
- num_stems: 1
16
- time_transformer_depth: 1
17
- freq_transformer_depth: 1
18
- num_bands: 60
19
- dim_head: 64
20
- heads: 8
21
- attn_dropout: 0.1
22
- ff_dropout: 0.1
23
- flash_attn: True
24
- dim_freqs_in: 1025
25
- sample_rate: 44100
26
- stft_n_fft: 2048
27
- stft_hop_length: 441
28
- stft_win_length: 2048
29
- stft_normalized: False
30
- mask_estimator_depth: 2
31
- multi_stft_resolution_loss_weight: 1.0
32
- multi_stft_resolutions_window_sizes: !!python/tuple
33
- - 4096
34
- - 2048
35
- - 1024
36
- - 512
37
- - 256
38
- multi_stft_hop_size: 147
39
- multi_stft_normalized: False
40
-
41
- training:
42
- batch_size: 1
43
- gradient_accumulation_steps: 8
44
- grad_clip: 0
45
- instruments:
46
- - dry
47
- - other
48
- lr: 1.0e-05
49
- patience: 2
50
- reduce_factor: 0.95
51
- target_instrument: dry
52
- num_epochs: 1000
53
- num_steps: 1000
54
- q: 0.95
55
- coarse_loss_clip: true
56
- ema_momentum: 0.999
57
- optimizer: adam
58
- other_fix: false
59
- use_amp: true
60
-
61
- inference:
62
- batch_size: 1
63
- dim_t: 801
64
- num_overlap: 4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Roformer_models/config_mel_band_roformer_bleed_suppressor_v1.yaml DELETED
@@ -1,51 +0,0 @@
1
- audio:
2
- chunk_size: 485100
3
- dim_f: 1024
4
- dim_t: 801
5
- hop_length: 441
6
- n_fft: 2048
7
- num_channels: 2
8
- sample_rate: 44100
9
- min_mean_abs: 0.000
10
-
11
- model:
12
- dim: 384
13
- depth: 6
14
- stereo: true
15
- num_stems: 1
16
- time_transformer_depth: 1
17
- freq_transformer_depth: 1
18
- num_bands: 60
19
- dim_head: 64
20
- heads: 8
21
- attn_dropout: 0
22
- ff_dropout: 0
23
- flash_attn: True
24
- dim_freqs_in: 1025
25
- sample_rate: 44100 # needed for mel filter bank from librosa
26
- stft_n_fft: 2048
27
- stft_hop_length: 441
28
- stft_win_length: 2048
29
- stft_normalized: False
30
- mask_estimator_depth: 2
31
- multi_stft_resolution_loss_weight: 1.0
32
- multi_stft_resolutions_window_sizes: !!python/tuple
33
- - 4096
34
- - 2048
35
- - 1024
36
- - 512
37
- - 256
38
- multi_stft_hop_size: 147
39
- multi_stft_normalized: False
40
-
41
- training:
42
- instruments:
43
- - Instrumental
44
- - Bleed
45
- target_instrument: Instrumental
46
- use_amp: True
47
-
48
- inference:
49
- batch_size: 1
50
- dim_t: 801
51
- num_overlap: 2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Roformer_models/config_mel_band_roformer_inst_gabox.yaml DELETED
@@ -1,51 +0,0 @@
1
- audio:
2
- chunk_size: 485100
3
- dim_f: 1024
4
- dim_t: 1101
5
- hop_length: 441
6
- n_fft: 2048
7
- num_channels: 2
8
- sample_rate: 44100
9
- min_mean_abs: 0.000
10
-
11
- model:
12
- dim: 384
13
- depth: 6
14
- stereo: true
15
- num_stems: 1
16
- time_transformer_depth: 1
17
- freq_transformer_depth: 1
18
- num_bands: 60
19
- dim_head: 64
20
- heads: 8
21
- attn_dropout: 0
22
- ff_dropout: 0
23
- flash_attn: True
24
- dim_freqs_in: 1025
25
- sample_rate: 44100 # needed for mel filter bank from librosa
26
- stft_n_fft: 2048
27
- stft_hop_length: 441
28
- stft_win_length: 2048
29
- stft_normalized: False
30
- mask_estimator_depth: 2
31
- multi_stft_resolution_loss_weight: 1.0
32
- multi_stft_resolutions_window_sizes: !!python/tuple
33
- - 4096
34
- - 2048
35
- - 1024
36
- - 512
37
- - 256
38
- multi_stft_hop_size: 147
39
- multi_stft_normalized: False
40
-
41
- training:
42
- instruments:
43
- - Instrumental
44
- - Vocals
45
- target_instrument: Instrumental
46
- use_amp: True
47
-
48
- inference:
49
- batch_size: 1
50
- dim_t: 1101
51
- num_overlap: 2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Roformer_models/config_mel_band_roformer_instrumental_becruily.yaml DELETED
@@ -1,72 +0,0 @@
1
- audio:
2
- chunk_size: 352800
3
- dim_f: 1024
4
- dim_t: 256
5
- hop_length: 441
6
- n_fft: 2048
7
- num_channels: 2
8
- sample_rate: 44100
9
- min_mean_abs: 0.000
10
-
11
- model:
12
- dim: 384
13
- depth: 6
14
- stereo: true
15
- num_stems: 1
16
- time_transformer_depth: 1
17
- freq_transformer_depth: 1
18
- num_bands: 60
19
- dim_head: 64
20
- heads: 8
21
- attn_dropout: 0
22
- ff_dropout: 0
23
- flash_attn: True
24
- dim_freqs_in: 1025
25
- sample_rate: 44100 # needed for mel filter bank from librosa
26
- stft_n_fft: 2048
27
- stft_hop_length: 441
28
- stft_win_length: 2048
29
- stft_normalized: False
30
- mask_estimator_depth: 2
31
- multi_stft_resolution_loss_weight: 1.0
32
- multi_stft_resolutions_window_sizes: !!python/tuple
33
- - 4096
34
- - 2048
35
- - 1024
36
- - 512
37
- - 256
38
- multi_stft_hop_size: 147
39
- multi_stft_normalized: False
40
-
41
- training:
42
- batch_size: 1
43
- gradient_accumulation_steps: 1
44
- grad_clip: 0
45
- instruments:
46
- - Instrumental
47
- - Vocals
48
- lr: 0.0005
49
- patience: 2
50
- reduce_factor: 0.95
51
- target_instrument: Instrumental
52
- num_epochs: 1000
53
- num_steps: 1000
54
- augmentation: false # enable augmentations by audiomentations and pedalboard
55
- augmentation_type: null
56
- use_mp3_compress: false # Deprecated
57
- augmentation_mix: false # Mix several stems of the same type with some probability
58
- augmentation_loudness: false # randomly change loudness of each stem
59
- augmentation_loudness_type: 1 # Type 1 or 2
60
- augmentation_loudness_min: 0
61
- augmentation_loudness_max: 0
62
- q: 0.95
63
- coarse_loss_clip: false
64
- ema_momentum: 0.999
65
- optimizer: adamw
66
- other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
67
- use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
68
-
69
- inference:
70
- batch_size: 1
71
- dim_t: 1101
72
- num_overlap: 2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Roformer_models/config_mel_band_roformer_kim_ft_unwa.yaml DELETED
@@ -1,72 +0,0 @@
1
- audio:
2
- chunk_size: 485100
3
- dim_f: 1024
4
- dim_t: 801
5
- hop_length: 441
6
- n_fft: 2048
7
- num_channels: 2
8
- sample_rate: 44100
9
- min_mean_abs: 0.000
10
-
11
- model:
12
- dim: 384
13
- depth: 6
14
- stereo: true
15
- num_stems: 1
16
- time_transformer_depth: 1
17
- freq_transformer_depth: 1
18
- num_bands: 60
19
- dim_head: 64
20
- heads: 8
21
- attn_dropout: 0
22
- ff_dropout: 0
23
- flash_attn: True
24
- dim_freqs_in: 1025
25
- sample_rate: 44100 # needed for mel filter bank from librosa
26
- stft_n_fft: 2048
27
- stft_hop_length: 441
28
- stft_win_length: 2048
29
- stft_normalized: False
30
- mask_estimator_depth: 2
31
- multi_stft_resolution_loss_weight: 1.0
32
- multi_stft_resolutions_window_sizes: !!python/tuple
33
- - 4096
34
- - 2048
35
- - 1024
36
- - 512
37
- - 256
38
- multi_stft_hop_size: 147
39
- multi_stft_normalized: False
40
-
41
- training:
42
- batch_size: 1
43
- gradient_accumulation_steps: 1
44
- grad_clip: 0
45
- instruments:
46
- - vocals
47
- - other
48
- lr: 1.0e-05
49
- patience: 2
50
- reduce_factor: 0.95
51
- target_instrument: vocals
52
- num_epochs: 1000
53
- num_steps: 1000
54
- augmentation: false # enable augmentations by audiomentations and pedalboard
55
- augmentation_type: null
56
- use_mp3_compress: false # Deprecated
57
- augmentation_mix: false # Mix several stems of the same type with some probability
58
- augmentation_loudness: false # randomly change loudness of each stem
59
- augmentation_loudness_type: 1 # Type 1 or 2
60
- augmentation_loudness_min: 0
61
- augmentation_loudness_max: 0
62
- q: 0.95
63
- coarse_loss_clip: false
64
- ema_momentum: 0.999
65
- optimizer: adam
66
- other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
67
- use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
68
-
69
- inference:
70
- batch_size: 1
71
- dim_t: 801
72
- num_overlap: 8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Roformer_models/config_mel_band_roformer_voc_gabox.yaml DELETED
@@ -1,51 +0,0 @@
1
- audio:
2
- chunk_size: 352800
3
- dim_f: 1024
4
- dim_t: 256
5
- hop_length: 441
6
- n_fft: 2048
7
- num_channels: 2
8
- sample_rate: 44100
9
- min_mean_abs: 0.001
10
-
11
- model:
12
- dim: 384
13
- depth: 6
14
- stereo: true
15
- num_stems: 1
16
- time_transformer_depth: 1
17
- freq_transformer_depth: 1
18
- num_bands: 60
19
- dim_head: 64
20
- heads: 8
21
- attn_dropout: 0
22
- ff_dropout: 0
23
- flash_attn: True
24
- dim_freqs_in: 1025
25
- sample_rate: 44100 # needed for mel filter bank from librosa
26
- stft_n_fft: 2048
27
- stft_hop_length: 441
28
- stft_win_length: 2048
29
- stft_normalized: False
30
- mask_estimator_depth: 2
31
- multi_stft_resolution_loss_weight: 1.0
32
- multi_stft_resolutions_window_sizes: !!python/tuple
33
- - 4096
34
- - 2048
35
- - 1024
36
- - 512
37
- - 256
38
- multi_stft_hop_size: 147
39
- multi_stft_normalized: False
40
-
41
- training:
42
- instruments:
43
- - Vocals
44
- - Instrumental
45
- target_instrument: Vocals
46
-
47
- inference:
48
- batch_size: 1
49
- dim_t: 1101
50
- num_overlap: 1
51
- chunk_size: 352800
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Roformer_models/config_mel_band_roformer_vocals_becruily.yaml DELETED
@@ -1,72 +0,0 @@
1
- audio:
2
- chunk_size: 352800
3
- dim_f: 1024
4
- dim_t: 256
5
- hop_length: 441
6
- n_fft: 2048
7
- num_channels: 2
8
- sample_rate: 44100
9
- min_mean_abs: 0.000
10
-
11
- model:
12
- dim: 384
13
- depth: 6
14
- stereo: true
15
- num_stems: 1
16
- time_transformer_depth: 1
17
- freq_transformer_depth: 1
18
- num_bands: 60
19
- dim_head: 64
20
- heads: 8
21
- attn_dropout: 0
22
- ff_dropout: 0
23
- flash_attn: True
24
- dim_freqs_in: 1025
25
- sample_rate: 44100 # needed for mel filter bank from librosa
26
- stft_n_fft: 2048
27
- stft_hop_length: 441
28
- stft_win_length: 2048
29
- stft_normalized: False
30
- mask_estimator_depth: 2
31
- multi_stft_resolution_loss_weight: 1.0
32
- multi_stft_resolutions_window_sizes: !!python/tuple
33
- - 4096
34
- - 2048
35
- - 1024
36
- - 512
37
- - 256
38
- multi_stft_hop_size: 147
39
- multi_stft_normalized: False
40
-
41
- training:
42
- batch_size: 1
43
- gradient_accumulation_steps: 1
44
- grad_clip: 0
45
- instruments:
46
- - vocals
47
- - other
48
- lr: 0.0005
49
- patience: 2
50
- reduce_factor: 0.95
51
- target_instrument: vocals
52
- num_epochs: 1000
53
- num_steps: 1000
54
- augmentation: false # enable augmentations by audiomentations and pedalboard
55
- augmentation_type: null
56
- use_mp3_compress: false # Deprecated
57
- augmentation_mix: false # Mix several stems of the same type with some probability
58
- augmentation_loudness: false # randomly change loudness of each stem
59
- augmentation_loudness_type: 1 # Type 1 or 2
60
- augmentation_loudness_min: 0
61
- augmentation_loudness_max: 0
62
- q: 0.95
63
- coarse_loss_clip: false
64
- ema_momentum: 0.999
65
- optimizer: adamw
66
- other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
67
- use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
68
-
69
- inference:
70
- batch_size: 1
71
- dim_t: 1101
72
- num_overlap: 2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Roformer_models/config_mel_band_roformer_vocals_fullness_aname.yaml DELETED
@@ -1,54 +0,0 @@
1
- audio:
2
- chunk_size: 661500
3
- dim_f: 1024
4
- dim_t: 256
5
- hop_length: 441
6
- n_fft: 2048
7
- num_channels: 2
8
- sample_rate: 44100
9
- min_mean_abs: 0.001
10
-
11
- model:
12
- dim: 384
13
- depth: 6
14
- stereo: true
15
- num_stems: 1
16
- time_transformer_depth: 1
17
- freq_transformer_depth: 1
18
- num_bands: 60
19
- dim_head: 64
20
- heads: 8
21
- attn_dropout: 0
22
- ff_dropout: 0
23
- flash_attn: True
24
- dim_freqs_in: 1025
25
- sample_rate: 44100
26
- stft_n_fft: 2048
27
- stft_hop_length: 441
28
- stft_win_length: 2048
29
- stft_normalized: False
30
- mask_estimator_depth: 2
31
- multi_stft_resolution_loss_weight: 1.0
32
- multi_stft_resolutions_window_sizes: !!python/tuple
33
- - 4096
34
- - 2048
35
- - 1024
36
- - 512
37
- - 256
38
- multi_stft_hop_size: 147
39
- multi_stft_normalized: False
40
-
41
- training:
42
- batch_size: 1
43
- gradient_accumulation_steps: 1
44
- grad_clip: 0
45
- instruments:
46
- - vocals
47
- - other
48
- target_instrument: vocals
49
- use_amp: true
50
-
51
- inference:
52
- batch_size: 4
53
- dim_t: 1101
54
- num_overlap: 4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Roformer_models/config_melband_roformer_big_beta5e.yaml DELETED
@@ -1,51 +0,0 @@
1
- audio:
2
- chunk_size: 485100
3
- dim_f: 1024
4
- dim_t: 801
5
- hop_length: 441
6
- n_fft: 2048
7
- num_channels: 2
8
- sample_rate: 44100
9
- min_mean_abs: 0.000
10
-
11
- model:
12
- dim: 384
13
- depth: 6
14
- stereo: true
15
- num_stems: 1
16
- time_transformer_depth: 1
17
- freq_transformer_depth: 1
18
- num_bands: 60
19
- dim_head: 64
20
- heads: 8
21
- attn_dropout: 0
22
- ff_dropout: 0
23
- flash_attn: True
24
- dim_freqs_in: 1025
25
- sample_rate: 44100 # needed for mel filter bank from librosa
26
- stft_n_fft: 2048
27
- stft_hop_length: 441
28
- stft_win_length: 2048
29
- stft_normalized: False
30
- mask_estimator_depth: 3
31
- multi_stft_resolution_loss_weight: 1.0
32
- multi_stft_resolutions_window_sizes: !!python/tuple
33
- - 4096
34
- - 2048
35
- - 1024
36
- - 512
37
- - 256
38
- multi_stft_hop_size: 147
39
- multi_stft_normalized: False
40
-
41
- training:
42
- instruments:
43
- - vocals
44
- - other
45
- target_instrument: vocals
46
- use_amp: True
47
-
48
- inference:
49
- batch_size: 1
50
- dim_t: 801
51
- num_overlap: 2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Roformer_models/config_melband_roformer_big_beta6.yaml DELETED
@@ -1,72 +0,0 @@
1
- audio:
2
- chunk_size: 529200
3
- dim_f: 1024
4
- dim_t: 256
5
- hop_length: 441
6
- n_fft: 2048
7
- num_channels: 2
8
- sample_rate: 44100
9
- min_mean_abs: 0.000
10
-
11
- model:
12
- dim: 512
13
- depth: 6
14
- stereo: true
15
- num_stems: 1
16
- time_transformer_depth: 1
17
- freq_transformer_depth: 1
18
- num_bands: 60
19
- dim_head: 64
20
- heads: 8
21
- attn_dropout: 0
22
- ff_dropout: 0
23
- flash_attn: True
24
- dim_freqs_in: 1025
25
- sample_rate: 44100 # needed for mel filter bank from librosa
26
- stft_n_fft: 2048
27
- stft_hop_length: 441
28
- stft_win_length: 2048
29
- stft_normalized: False
30
- mask_estimator_depth: 2
31
- multi_stft_resolution_loss_weight: 1.0
32
- multi_stft_resolutions_window_sizes: !!python/tuple
33
- - 4096
34
- - 2048
35
- - 1024
36
- - 512
37
- - 256
38
- multi_stft_hop_size: 147
39
- multi_stft_normalized: False
40
-
41
- training:
42
- batch_size: 1
43
- gradient_accumulation_steps: 1
44
- grad_clip: 0
45
- instruments:
46
- - vocals
47
- - other
48
- lr: 1.0e-05
49
- patience: 2
50
- reduce_factor: 0.95
51
- target_instrument: vocals
52
- num_epochs: 1000
53
- num_steps: 1000
54
- augmentation: false # enable augmentations by audiomentations and pedalboard
55
- augmentation_type: null
56
- use_mp3_compress: false # Deprecated
57
- augmentation_mix: false # Mix several stems of the same type with some probability
58
- augmentation_loudness: false # randomly change loudness of each stem
59
- augmentation_loudness_type: 1 # Type 1 or 2
60
- augmentation_loudness_min: 0
61
- augmentation_loudness_max: 0
62
- q: 0.95
63
- coarse_loss_clip: false
64
- ema_momentum: 0.999
65
- optimizer: adam
66
- other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
67
- use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
68
-
69
- inference:
70
- batch_size: 2
71
- dim_t: 1201
72
- num_overlap: 2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Roformer_models/config_melband_roformer_big_beta6x.yaml DELETED
@@ -1,72 +0,0 @@
1
- audio:
2
- chunk_size: 529200
3
- dim_f: 1024
4
- dim_t: 256
5
- hop_length: 441
6
- n_fft: 2048
7
- num_channels: 2
8
- sample_rate: 44100
9
- min_mean_abs: 0.000
10
-
11
- model:
12
- dim: 512
13
- depth: 12
14
- stereo: true
15
- num_stems: 1
16
- time_transformer_depth: 1
17
- freq_transformer_depth: 1
18
- num_bands: 60
19
- dim_head: 64
20
- heads: 8
21
- attn_dropout: 0
22
- ff_dropout: 0
23
- flash_attn: True
24
- dim_freqs_in: 1025
25
- sample_rate: 44100 # needed for mel filter bank from librosa
26
- stft_n_fft: 2048
27
- stft_hop_length: 441
28
- stft_win_length: 2048
29
- stft_normalized: False
30
- mask_estimator_depth: 2
31
- multi_stft_resolution_loss_weight: 1.0
32
- multi_stft_resolutions_window_sizes: !!python/tuple
33
- - 4096
34
- - 2048
35
- - 1024
36
- - 512
37
- - 256
38
- multi_stft_hop_size: 147
39
- multi_stft_normalized: False
40
-
41
- training:
42
- batch_size: 1
43
- gradient_accumulation_steps: 1
44
- grad_clip: 0
45
- instruments:
46
- - vocals
47
- - other
48
- lr: 1.0e-05
49
- patience: 2
50
- reduce_factor: 0.95
51
- target_instrument: vocals
52
- num_epochs: 1000
53
- num_steps: 1000
54
- augmentation: false # enable augmentations by audiomentations and pedalboard
55
- augmentation_type: null
56
- use_mp3_compress: false # Deprecated
57
- augmentation_mix: false # Mix several stems of the same type with some probability
58
- augmentation_loudness: false # randomly change loudness of each stem
59
- augmentation_loudness_type: 1 # Type 1 or 2
60
- augmentation_loudness_min: 0
61
- augmentation_loudness_max: 0
62
- q: 0.95
63
- coarse_loss_clip: false
64
- ema_momentum: 0.999
65
- optimizer: adam
66
- other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
67
- use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
68
-
69
- inference:
70
- batch_size: 2
71
- dim_t: 1201
72
- num_overlap: 2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Roformer_models/config_melband_roformer_small_by_aname.yaml DELETED
@@ -1,52 +0,0 @@
1
- audio:
2
- chunk_size: 485100
3
- dim_f: 1024
4
- dim_t: 1101
5
- hop_length: 441
6
- n_fft: 2048
7
- num_channels: 2
8
- sample_rate: 44100
9
- min_mean_abs: 0.0
10
-
11
- model:
12
- dim: 384
13
- depth: 6
14
- stereo: true
15
- num_stems: 1
16
- time_transformer_depth: 1
17
- freq_transformer_depth: 1
18
- num_bands: 60
19
- dim_head: 64
20
- heads: 8
21
- attn_dropout: 0
22
- ff_dropout: 0
23
- flash_attn: True
24
- dim_freqs_in: 1025
25
- sample_rate: 44100
26
- stft_n_fft: 2048
27
- stft_hop_length: 441
28
- stft_win_length: 2048
29
- stft_normalized: False
30
- mask_estimator_depth: 2
31
- multi_stft_resolution_loss_weight: 1.0
32
- multi_stft_resolutions_window_sizes: !!python/tuple
33
- - 4096
34
- - 2048
35
- - 1024
36
- - 512
37
- - 256
38
- multi_stft_hop_size: 147
39
- multi_stft_normalized: False
40
- mlp_expansion_factor: 1
41
-
42
- training:
43
- instruments:
44
- - Instrumental
45
- - Vocals
46
- target_instrument: null
47
- use_amp: true
48
-
49
- inference:
50
- batch_size: 2
51
- dim_t: 1101
52
- num_overlap: 4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Roformer_models/config_melbandroformer_big_beta4.yaml DELETED
@@ -1,51 +0,0 @@
1
- audio:
2
- chunk_size: 485100
3
- dim_f: 1024
4
- dim_t: 1101
5
- hop_length: 441
6
- n_fft: 2048
7
- num_channels: 2
8
- sample_rate: 44100
9
- min_mean_abs: 0.000
10
-
11
- model:
12
- dim: 384
13
- depth: 12
14
- stereo: true
15
- num_stems: 1
16
- time_transformer_depth: 1
17
- freq_transformer_depth: 1
18
- num_bands: 60
19
- dim_head: 64
20
- heads: 8
21
- attn_dropout: 0
22
- ff_dropout: 0
23
- flash_attn: True
24
- dim_freqs_in: 1025
25
- sample_rate: 44100 # needed for mel filter bank from librosa
26
- stft_n_fft: 2048
27
- stft_hop_length: 441
28
- stft_win_length: 2048
29
- stft_normalized: False
30
- mask_estimator_depth: 3
31
- multi_stft_resolution_loss_weight: 1.0
32
- multi_stft_resolutions_window_sizes: !!python/tuple
33
- - 4096
34
- - 2048
35
- - 1024
36
- - 512
37
- - 256
38
- multi_stft_hop_size: 147
39
- multi_stft_normalized: False
40
-
41
- training:
42
- instruments:
43
- - vocals
44
- - other
45
- target_instrument: vocals
46
- use_amp: True
47
-
48
- inference:
49
- batch_size: 1
50
- dim_t: 1101
51
- num_overlap: 2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Roformer_models/config_melbandroformer_inst.yaml DELETED
@@ -1,51 +0,0 @@
1
- audio:
2
- chunk_size: 485100
3
- dim_f: 1024
4
- dim_t: 1101
5
- hop_length: 441
6
- n_fft: 2048
7
- num_channels: 2
8
- sample_rate: 44100
9
- min_mean_abs: 0.000
10
-
11
- model:
12
- dim: 384
13
- depth: 6
14
- stereo: true
15
- num_stems: 1
16
- time_transformer_depth: 1
17
- freq_transformer_depth: 1
18
- num_bands: 60
19
- dim_head: 64
20
- heads: 8
21
- attn_dropout: 0
22
- ff_dropout: 0
23
- flash_attn: True
24
- dim_freqs_in: 1025
25
- sample_rate: 44100 # needed for mel filter bank from librosa
26
- stft_n_fft: 2048
27
- stft_hop_length: 441
28
- stft_win_length: 2048
29
- stft_normalized: False
30
- mask_estimator_depth: 2
31
- multi_stft_resolution_loss_weight: 1.0
32
- multi_stft_resolutions_window_sizes: !!python/tuple
33
- - 4096
34
- - 2048
35
- - 1024
36
- - 512
37
- - 256
38
- multi_stft_hop_size: 147
39
- multi_stft_normalized: False
40
-
41
- training:
42
- instruments:
43
- - other
44
- - vocals
45
- target_instrument: other
46
- use_amp: True
47
-
48
- inference:
49
- batch_size: 1
50
- dim_t: 1101
51
- num_overlap: 2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Roformer_models/config_melbandroformer_inst_v2.yaml DELETED
@@ -1,51 +0,0 @@
1
- audio:
2
- chunk_size: 485100
3
- dim_f: 1024
4
- dim_t: 1101
5
- hop_length: 441
6
- n_fft: 2048
7
- num_channels: 2
8
- sample_rate: 44100
9
- min_mean_abs: 0.000
10
-
11
- model:
12
- dim: 384
13
- depth: 12
14
- stereo: true
15
- num_stems: 1
16
- time_transformer_depth: 1
17
- freq_transformer_depth: 1
18
- num_bands: 60
19
- dim_head: 64
20
- heads: 8
21
- attn_dropout: 0
22
- ff_dropout: 0
23
- flash_attn: True
24
- dim_freqs_in: 1025
25
- sample_rate: 44100 # needed for mel filter bank from librosa
26
- stft_n_fft: 2048
27
- stft_hop_length: 441
28
- stft_win_length: 2048
29
- stft_normalized: False
30
- mask_estimator_depth: 3
31
- multi_stft_resolution_loss_weight: 1.0
32
- multi_stft_resolutions_window_sizes: !!python/tuple
33
- - 4096
34
- - 2048
35
- - 1024
36
- - 512
37
- - 256
38
- multi_stft_hop_size: 147
39
- multi_stft_normalized: False
40
-
41
- training:
42
- instruments:
43
- - Instrumental
44
- - Vocals
45
- target_instrument: Instrumental
46
- use_amp: True
47
-
48
- inference:
49
- batch_size: 1
50
- dim_t: 1101
51
- num_overlap: 2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Roformer_models/config_melbandroformer_instvoc_duality.yaml DELETED
@@ -1,51 +0,0 @@
1
- audio:
2
- chunk_size: 485100
3
- dim_f: 1024
4
- dim_t: 256
5
- hop_length: 441
6
- n_fft: 2048
7
- num_channels: 2
8
- sample_rate: 44100
9
- min_mean_abs: 0.000
10
-
11
- model:
12
- dim: 384
13
- depth: 6
14
- stereo: true
15
- num_stems: 2
16
- time_transformer_depth: 1
17
- freq_transformer_depth: 1
18
- num_bands: 60
19
- dim_head: 64
20
- heads: 8
21
- attn_dropout: 0
22
- ff_dropout: 0
23
- flash_attn: True
24
- dim_freqs_in: 1025
25
- sample_rate: 44100 # needed for mel filter bank from librosa
26
- stft_n_fft: 2048
27
- stft_hop_length: 441
28
- stft_win_length: 2048
29
- stft_normalized: False
30
- mask_estimator_depth: 2
31
- multi_stft_resolution_loss_weight: 1.0
32
- multi_stft_resolutions_window_sizes: !!python/tuple
33
- - 4096
34
- - 2048
35
- - 1024
36
- - 512
37
- - 256
38
- multi_stft_hop_size: 147
39
- multi_stft_normalized: False
40
-
41
- training:
42
- instruments:
43
- - Vocals
44
- - Instrumental
45
- target_instrument: null
46
- use_amp: True
47
-
48
- inference:
49
- batch_size: 1
50
- dim_t: 1101
51
- num_overlap: 2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Roformer_models/config_vocals_mel_band_roformer_big_v1_ft.yaml DELETED
@@ -1,51 +0,0 @@
1
- audio:
2
- chunk_size: 485100
3
- dim_f: 1024
4
- dim_t: 801
5
- hop_length: 441
6
- n_fft: 2048
7
- num_channels: 2
8
- sample_rate: 44100
9
- min_mean_abs: 0.000
10
-
11
- model:
12
- dim: 384
13
- depth: 6
14
- stereo: true
15
- num_stems: 1
16
- time_transformer_depth: 1
17
- freq_transformer_depth: 1
18
- num_bands: 60
19
- dim_head: 64
20
- heads: 8
21
- attn_dropout: 0
22
- ff_dropout: 0
23
- flash_attn: True
24
- dim_freqs_in: 1025
25
- sample_rate: 44100 # needed for mel filter bank from librosa
26
- stft_n_fft: 2048
27
- stft_hop_length: 441
28
- stft_win_length: 2048
29
- stft_normalized: False
30
- mask_estimator_depth: 3
31
- multi_stft_resolution_loss_weight: 1.0
32
- multi_stft_resolutions_window_sizes: !!python/tuple
33
- - 4096
34
- - 2048
35
- - 1024
36
- - 512
37
- - 256
38
- multi_stft_hop_size: 147
39
- multi_stft_normalized: False
40
-
41
- training:
42
- instruments:
43
- - vocals
44
- - other
45
- target_instrument: vocals
46
- use_amp: True
47
-
48
- inference:
49
- batch_size: 1
50
- dim_t: 801
51
- num_overlap: 2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Roformer_models/config_vocals_mel_band_roformer_ft.yaml DELETED
@@ -1,72 +0,0 @@
1
- audio:
2
- chunk_size: 352800
3
- dim_f: 1024
4
- dim_t: 256
5
- hop_length: 441
6
- n_fft: 2048
7
- num_channels: 2
8
- sample_rate: 44100
9
- min_mean_abs: 0.000
10
-
11
- model:
12
- dim: 384
13
- depth: 6
14
- stereo: true
15
- num_stems: 1
16
- time_transformer_depth: 1
17
- freq_transformer_depth: 1
18
- num_bands: 60
19
- dim_head: 64
20
- heads: 8
21
- attn_dropout: 0
22
- ff_dropout: 0
23
- flash_attn: True
24
- dim_freqs_in: 1025
25
- sample_rate: 44100 # needed for mel filter bank from librosa
26
- stft_n_fft: 2048
27
- stft_hop_length: 441
28
- stft_win_length: 2048
29
- stft_normalized: False
30
- mask_estimator_depth: 2
31
- multi_stft_resolution_loss_weight: 1.0
32
- multi_stft_resolutions_window_sizes: !!python/tuple
33
- - 4096
34
- - 2048
35
- - 1024
36
- - 512
37
- - 256
38
- multi_stft_hop_size: 147
39
- multi_stft_normalized: False
40
-
41
- training:
42
- batch_size: 1
43
- gradient_accumulation_steps: 8
44
- grad_clip: 0
45
- instruments:
46
- - vocals
47
- - other
48
- lr: 1.0e-04
49
- patience: 2
50
- reduce_factor: 0.95
51
- target_instrument: vocals
52
- num_epochs: 1000
53
- num_steps: 100
54
- augmentation: true # enable augmentations by audiomentations and pedalboard
55
- augmentation_type: null
56
- use_mp3_compress: false # Deprecated
57
- augmentation_mix: true # Mix several stems of the same type with some probability
58
- augmentation_loudness: true # randomly change loudness of each stem
59
- augmentation_loudness_type: 1 # Type 1 or 2
60
- augmentation_loudness_min: 0
61
- augmentation_loudness_max: 0
62
- q: 0.95
63
- coarse_loss_clip: false
64
- ema_momentum: 0.999
65
- optimizer: adamw8bit
66
- other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
67
- use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
68
-
69
- inference:
70
- batch_size: 4
71
- dim_t: 256
72
- num_overlap: 2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Roformer_models/denoise_mel_band_roformer_aufr33_aggr_sdr_27.9768.ckpt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:a25e3b233722cd81e2de7b8e798a3fef29d4b9799ccacda60b0dc958a1e2a5bb
3
- size 913097300
 
 
 
 
Roformer_models/denoise_mel_band_roformer_aufr33_aggr_sdr_27.9768_config.yaml DELETED
@@ -1,71 +0,0 @@
1
- audio:
2
- chunk_size: 352800
3
- dim_f: 1024
4
- dim_t: 801
5
- hop_length: 441
6
- n_fft: 2048
7
- num_channels: 2
8
- sample_rate: 44100
9
- min_mean_abs: 000
10
-
11
- model:
12
- dim: 384
13
- depth: 6
14
- stereo: true
15
- num_stems: 1
16
- time_transformer_depth: 1
17
- freq_transformer_depth: 1
18
- num_bands: 60
19
- dim_head: 64
20
- heads: 8
21
- attn_dropout: 0
22
- ff_dropout: 0
23
- flash_attn: True
24
- dim_freqs_in: 1025
25
- sample_rate: 44100 # needed for mel filter bank from librosa
26
- stft_n_fft: 2048
27
- stft_hop_length: 441
28
- stft_win_length: 2048
29
- stft_normalized: False
30
- mask_estimator_depth: 2
31
- multi_stft_resolution_loss_weight: 1.0
32
- multi_stft_resolutions_window_sizes: !!python/tuple
33
- - 4096
34
- - 2048
35
- - 1024
36
- - 512
37
- - 256
38
- multi_stft_hop_size: 147
39
- multi_stft_normalized: False
40
-
41
- training:
42
- batch_size: 2
43
- gradient_accumulation_steps: 1
44
- grad_clip: 0
45
- instruments:
46
- - dry
47
- - other
48
- lr: 1.0e-05
49
- patience: 8
50
- reduce_factor: 0.95
51
- target_instrument: dry
52
- num_epochs: 1000
53
- num_steps: 4032
54
- augmentation: false # enable augmentations by audiomentations and pedalboard
55
- augmentation_type: null
56
- use_mp3_compress: false # Deprecated
57
- augmentation_mix: false # Mix several stems of the same type with some probability
58
- augmentation_loudness: false # randomly change loudness of each stem
59
- augmentation_loudness_type: 1 # Type 1 or 2
60
- augmentation_loudness_min: 0
61
- augmentation_loudness_max: 0
62
- q: 0.95
63
- coarse_loss_clip: false
64
- ema_momentum: 0.999
65
- optimizer: adam
66
- other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
67
-
68
- inference:
69
- batch_size: 2
70
- dim_t: 801
71
- num_overlap: 4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Roformer_models/denoise_mel_band_roformer_aufr33_sdr_27.9959.ckpt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:7c1c39191edc34e942ca7f2346ce6b6c0e1208a5f76349ffce6f696bd12910de
3
- size 913097300
 
 
 
 
Roformer_models/denoise_mel_band_roformer_aufr33_sdr_27.9959_config.yaml DELETED
@@ -1,71 +0,0 @@
1
- audio:
2
- chunk_size: 352800
3
- dim_f: 1024
4
- dim_t: 801
5
- hop_length: 441
6
- n_fft: 2048
7
- num_channels: 2
8
- sample_rate: 44100
9
- min_mean_abs: 000
10
-
11
- model:
12
- dim: 384
13
- depth: 6
14
- stereo: true
15
- num_stems: 1
16
- time_transformer_depth: 1
17
- freq_transformer_depth: 1
18
- num_bands: 60
19
- dim_head: 64
20
- heads: 8
21
- attn_dropout: 0
22
- ff_dropout: 0
23
- flash_attn: True
24
- dim_freqs_in: 1025
25
- sample_rate: 44100 # needed for mel filter bank from librosa
26
- stft_n_fft: 2048
27
- stft_hop_length: 441
28
- stft_win_length: 2048
29
- stft_normalized: False
30
- mask_estimator_depth: 2
31
- multi_stft_resolution_loss_weight: 1.0
32
- multi_stft_resolutions_window_sizes: !!python/tuple
33
- - 4096
34
- - 2048
35
- - 1024
36
- - 512
37
- - 256
38
- multi_stft_hop_size: 147
39
- multi_stft_normalized: False
40
-
41
- training:
42
- batch_size: 2
43
- gradient_accumulation_steps: 1
44
- grad_clip: 0
45
- instruments:
46
- - dry
47
- - other
48
- lr: 1.0e-05
49
- patience: 8
50
- reduce_factor: 0.95
51
- target_instrument: dry
52
- num_epochs: 1000
53
- num_steps: 4032
54
- augmentation: false # enable augmentations by audiomentations and pedalboard
55
- augmentation_type: null
56
- use_mp3_compress: false # Deprecated
57
- augmentation_mix: false # Mix several stems of the same type with some probability
58
- augmentation_loudness: false # randomly change loudness of each stem
59
- augmentation_loudness_type: 1 # Type 1 or 2
60
- augmentation_loudness_min: 0
61
- augmentation_loudness_max: 0
62
- q: 0.95
63
- coarse_loss_clip: false
64
- ema_momentum: 0.999
65
- optimizer: adam
66
- other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
67
-
68
- inference:
69
- batch_size: 2
70
- dim_t: 801
71
- num_overlap: 4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Roformer_models/dereverb-echo_mel_band_roformer_sdr_10.0169.ckpt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:cd2b737a394cfb80cd48cc9fcbaf89f5f4062f6b93066c2911617a06d8b7860a
3
- size 835997896
 
 
 
 
Roformer_models/dereverb-echo_mel_band_roformer_sdr_13.4843_v2.ckpt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:396432f5af25992fe82d0286634bd879027c073721db6ab10199e75459708b9f
3
- size 455862568
 
 
 
 
Roformer_models/dereverb_big_mbr_ep_362.ckpt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:0506455e74ffc02bbec700df9863ae243597034003815f1418227c6dee33b6ea
3
- size 455864012
 
 
 
 
Roformer_models/dereverb_echo_mbr_fused.ckpt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:1596b1063238f487d54a0510a8c92cb28c000c803a271dd618ac49efc99ef3f7
3
- size 455776577
 
 
 
 
Roformer_models/dereverb_mel_band_roformer_anvuew.yaml DELETED
@@ -1,76 +0,0 @@
1
- audio:
2
- chunk_size: 352800
3
- dim_f: 1024
4
- dim_t: 801
5
- hop_length: 441
6
- n_fft: 2048
7
- num_channels: 2
8
- sample_rate: 44100
9
- min_mean_abs: 0.000
10
-
11
- model:
12
- dim: 384
13
- depth: 6
14
- stereo: true
15
- num_stems: 1
16
- time_transformer_depth: 1
17
- freq_transformer_depth: 1
18
- num_bands: 60
19
- dim_head: 64
20
- heads: 8
21
- attn_dropout: 0
22
- ff_dropout: 0
23
- flash_attn: True
24
- dim_freqs_in: 1025
25
- sample_rate: 44100 # needed for mel filter bank from librosa
26
- stft_n_fft: 2048
27
- stft_hop_length: 441
28
- stft_win_length: 2048
29
- stft_normalized: False
30
- mask_estimator_depth: 2
31
- multi_stft_resolution_loss_weight: 1.0
32
- multi_stft_resolutions_window_sizes: !!python/tuple
33
- - 4096
34
- - 2048
35
- - 1024
36
- - 512
37
- - 256
38
- multi_stft_hop_size: 147
39
- multi_stft_normalized: False
40
-
41
- training:
42
- batch_size: 3
43
- gradient_accumulation_steps: 1
44
- grad_clip: 0
45
- instruments:
46
- - noreverb
47
- - reverb
48
- lr: 5.0e-05
49
- patience: 2
50
- reduce_factor: 0.95
51
- target_instrument: noreverb
52
- num_epochs: 1000
53
- num_steps: 4000
54
- q: 0.95
55
- coarse_loss_clip: false
56
- ema_momentum: 0.999
57
- optimizer: adamw
58
- other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
59
- use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
60
-
61
- augmentations:
62
- enable: true # enable or disable all augmentations (to fast disable if needed)
63
- loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
64
- loudness_min: 0.1
65
- loudness_max: 1.0
66
- mixup: false # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
67
- mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
68
- - 0.2
69
- - 0.02
70
- mixup_loudness_min: 0.5
71
- mixup_loudness_max: 1.5
72
-
73
- inference:
74
- batch_size: 1
75
- dim_t: 801
76
- num_overlap: 2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Roformer_models/dereverb_mel_band_roformer_anvuew_sdr_19.1729.ckpt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:9262877b87e9ebb0fb808a456b0a411fa677f5df31c8383c1254af531c078970
3
- size 913107578
 
 
 
 
Roformer_models/dereverb_mel_band_roformer_less_aggressive_anvuew_sdr_18.8050.ckpt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:0db8f1b41c00cead1112e967262a12802fd32e76c0c3a8eb207e772bae25d07b
3
- size 913107578
 
 
 
 
Roformer_models/dereverb_mel_band_roformer_mono_anvuew_sdr_20.4029.ckpt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f099ee717eb57fb0ad5eb0e7c9ad6787c36168140b61ce2b158b90c2c4ecee79
3
- size 913097978
 
 
 
 
Roformer_models/dereverb_super_big_mbr_ep_346.ckpt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:26dda242bce4405555f2d6086d079fe8cc23f1f04e02e501d2689bfe3ece0489
3
- size 455864012
 
 
 
 
Roformer_models/deverb_bs_roformer_8_384dim_10depth.ckpt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:9c38653aaa5e49f2f7b84dd3be2b6b679e0cbea23978e6b48389ee6f0a914768
3
- size 361499604
 
 
 
 
Roformer_models/deverb_bs_roformer_8_384dim_10depth_config.yaml DELETED
@@ -1,137 +0,0 @@
1
- audio:
2
- chunk_size: 352768
3
- dim_f: 1024
4
- dim_t: 801
5
- hop_length: 441
6
- n_fft: 2048
7
- num_channels: 2
8
- sample_rate: 44100
9
- min_mean_abs: 0.001
10
-
11
- model:
12
- dim: 384
13
- depth: 10
14
- stereo: true
15
- num_stems: 1
16
- time_transformer_depth: 1
17
- freq_transformer_depth: 1
18
- freqs_per_bands: !!python/tuple
19
- - 2
20
- - 2
21
- - 2
22
- - 2
23
- - 2
24
- - 2
25
- - 2
26
- - 2
27
- - 2
28
- - 2
29
- - 2
30
- - 2
31
- - 2
32
- - 2
33
- - 2
34
- - 2
35
- - 2
36
- - 2
37
- - 2
38
- - 2
39
- - 2
40
- - 2
41
- - 2
42
- - 2
43
- - 4
44
- - 4
45
- - 4
46
- - 4
47
- - 4
48
- - 4
49
- - 4
50
- - 4
51
- - 4
52
- - 4
53
- - 4
54
- - 4
55
- - 12
56
- - 12
57
- - 12
58
- - 12
59
- - 12
60
- - 12
61
- - 12
62
- - 12
63
- - 24
64
- - 24
65
- - 24
66
- - 24
67
- - 24
68
- - 24
69
- - 24
70
- - 24
71
- - 48
72
- - 48
73
- - 48
74
- - 48
75
- - 48
76
- - 48
77
- - 48
78
- - 48
79
- - 128
80
- - 129
81
- dim_head: 64
82
- heads: 8
83
- attn_dropout: 0.1
84
- ff_dropout: 0.1
85
- flash_attn: true
86
- dim_freqs_in: 1025
87
- stft_n_fft: 2048
88
- stft_hop_length: 441
89
- stft_win_length: 2048
90
- stft_normalized: false
91
- mask_estimator_depth: 2
92
- multi_stft_resolution_loss_weight: 1.0
93
- multi_stft_resolutions_window_sizes: !!python/tuple
94
- - 4096
95
- - 2048
96
- - 1024
97
- - 512
98
- - 256
99
- multi_stft_hop_size: 147
100
- multi_stft_normalized: False
101
-
102
- training:
103
- batch_size: 1
104
- gradient_accumulation_steps: 1
105
- grad_clip: 0
106
- instruments:
107
- - noreverb
108
- - reverb
109
- lr: 5.0e-05
110
- patience: 2
111
- reduce_factor: 0.95
112
- target_instrument: noreverb
113
- num_epochs: 1000
114
- num_steps: 1000
115
- q: 0.95
116
- coarse_loss_clip: true
117
- ema_momentum: 0.999
118
- optimizer: adam
119
- other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
120
- use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
121
-
122
- augmentations:
123
- enable: true # enable or disable all augmentations (to fast disable if needed)
124
- loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
125
- loudness_min: 0.5
126
- loudness_max: 1.5
127
- mixup: false # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
128
- mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
129
- - 0.2
130
- - 0.02
131
- mixup_loudness_min: 0.5
132
- mixup_loudness_max: 1.5
133
-
134
- inference:
135
- batch_size: 4
136
- dim_t: 801
137
- num_overlap: 4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Roformer_models/mel_band_roformer_bleed_suppressor_v1.ckpt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:a9a9d10faa7f8997676a78e66d741d7acb9cc449334763f3c8f626d68ec6e575
3
- size 913102724