noblebarkrr commited on
Commit
370198f
·
verified ·
1 Parent(s): f76ba0e

Add files using upload-large-folder tool

Browse files
Files changed (50) hide show
  1. bs_roformer/bs_4stem_aname_config.yaml +191 -0
  2. bs_roformer/bs_4stem_zfturbo.ckpt +3 -0
  3. bs_roformer/bs_4stem_zfturbo_config.yaml +191 -0
  4. bs_roformer/bs_4stemft_syh99999_config.yaml +191 -0
  5. bs_roformer/bs_6stem_config.yaml +33 -0
  6. bs_roformer/bs_6stem_fixed.ckpt +3 -0
  7. bs_roformer/bs_6stem_fixed_config.yaml +194 -0
  8. bs_roformer/bs_bass_beatloo_labs_config.yaml +128 -0
  9. bs_roformer/bs_cr_4stem_zf_turbo_config.yaml +208 -0
  10. bs_roformer/bs_deverb_256_8_anvuew.ckpt +3 -0
  11. bs_roformer/bs_deverb_256_8_anvuew_config.yaml +134 -0
  12. bs_roformer/bs_deverb_384_10_anvuew.ckpt +3 -0
  13. bs_roformer/bs_deverb_384_10_anvuew_config.yaml +134 -0
  14. bs_roformer/bs_deverb_room_anvuew.ckpt +3 -0
  15. bs_roformer/bs_deverb_room_anvuew_config.yaml +127 -0
  16. bs_roformer/bs_drums_beatloo_labs.ckpt +3 -0
  17. bs_roformer/bs_drums_beatloo_labs_config.yaml +128 -0
  18. bs_roformer/bs_inst_fno_unwa.ckpt +3 -0
  19. bs_roformer/bs_inst_fno_unwa_config.yaml +134 -0
  20. bs_roformer/bs_inst_hyperace2_unwa_config.yaml +127 -0
  21. bs_roformer/bs_inst_hyperace_unwa_config.yaml +127 -0
  22. bs_roformer/bs_karaoke_anvuew.ckpt +3 -0
  23. bs_roformer/bs_karaoke_anvuew_config.yaml +126 -0
  24. bs_roformer/bs_karaoke_becruily.ckpt +3 -0
  25. bs_roformer/bs_karaoke_becruily_config.yaml +125 -0
  26. bs_roformer/bs_karaoke_gabox.ckpt +3 -0
  27. bs_roformer/bs_karaoke_gabox_config.yaml +127 -0
  28. bs_roformer/bs_logic_6stem_config.yaml +194 -0
  29. bs_roformer/bs_male_female_146_sucial_config.yaml +123 -0
  30. bs_roformer/bs_male_female_267_sucial.ckpt +3 -0
  31. bs_roformer/bs_male_female_267_sucial_config.yaml +123 -0
  32. bs_roformer/bs_male_female_aufr33_config.yaml +123 -0
  33. bs_roformer/bs_other_viperx.ckpt +3 -0
  34. bs_roformer/bs_other_viperx_config.yaml +134 -0
  35. bs_roformer/bs_resurrection_inst_unwa.ckpt +3 -0
  36. bs_roformer/bs_resurrection_inst_unwa_config.yaml +135 -0
  37. bs_roformer/bs_resurrection_unwa.ckpt +3 -0
  38. bs_roformer/bs_resurrection_unwa_config.yaml +135 -0
  39. bs_roformer/bs_revive1_unwa_config.yaml +131 -0
  40. bs_roformer/bs_revive2_unwa.ckpt +3 -0
  41. bs_roformer/bs_revive2_unwa_config.yaml +131 -0
  42. bs_roformer/bs_revive3e_unwa.ckpt +3 -0
  43. bs_roformer/bs_revive3e_unwa_config.yaml +131 -0
  44. bs_roformer/bs_voc_hyperace2_unwa_config.yaml +127 -0
  45. bs_roformer/bs_vocals_1296_viperx.ckpt +3 -0
  46. bs_roformer/bs_vocals_1296_viperx_config.yaml +130 -0
  47. bs_roformer/bs_vocals_anvuew.ckpt +3 -0
  48. bs_roformer/bs_vocals_anvuew_config.yaml +126 -0
  49. bs_roformer/bs_voctest_gabox.ckpt +3 -0
  50. bs_roformer/bs_voctest_gabox_config.yaml +130 -0
bs_roformer/bs_4stem_aname_config.yaml ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 485100
3
+ dim_f: 1024
4
+ dim_t: 801
5
+ hop_length: 441
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.0
10
+ model:
11
+ dim: 384
12
+ depth: 8
13
+ stereo: true
14
+ num_stems: 4
15
+ time_transformer_depth: 1
16
+ freq_transformer_depth: 1
17
+ linear_transformer_depth: 0
18
+ freqs_per_bands: !!python/tuple
19
+ - 2
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 4
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 12
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 24
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 48
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 128
80
+ - 129
81
+ dim_head: 64
82
+ heads: 8
83
+ attn_dropout: 0.1
84
+ ff_dropout: 0.1
85
+ flash_attn: true
86
+ dim_freqs_in: 1025
87
+ stft_n_fft: 2048
88
+ stft_hop_length: 441
89
+ stft_win_length: 2048
90
+ stft_normalized: false
91
+ mask_estimator_depth: 2
92
+ multi_stft_resolution_loss_weight: 1.0
93
+ multi_stft_resolutions_window_sizes: !!python/tuple
94
+ - 4096
95
+ - 2048
96
+ - 1024
97
+ - 512
98
+ - 256
99
+ multi_stft_hop_size: 147
100
+ multi_stft_normalized: false
101
+ mlp_expansion_factor: 2
102
+ use_torch_checkpoint: false
103
+ skip_connection: false
104
+ training:
105
+ batch_size: 1
106
+ gradient_accumulation_steps: 16
107
+ grad_clip: 0
108
+ instruments:
109
+ - drums
110
+ - bass
111
+ - other
112
+ - vocals
113
+ patience: 3
114
+ reduce_factor: 0.95
115
+ target_instrument: null
116
+ num_epochs: 10000
117
+ num_steps: 100
118
+ augmentation: true
119
+ augmentation_type: simple1
120
+ use_mp3_compress: false
121
+ augmentation_mix: true
122
+ augmentation_loudness: true
123
+ augmentation_loudness_type: 1
124
+ augmentation_loudness_min: 0.25
125
+ augmentation_loudness_max: 2
126
+ q: 0.95
127
+ coarse_loss_clip: true
128
+ ema_momentum: 0.999
129
+ optimizer: prodigy
130
+ lr: 1.0
131
+ other_fix: false
132
+ use_amp: true
133
+ augmentations:
134
+ enable: true
135
+ loudness: true
136
+ loudness_min: 0.5
137
+ loudness_max: 1.5
138
+ mixup: true
139
+ mixup_probs: !!python/tuple
140
+ - 0.2
141
+ - 0.02
142
+ mixup_loudness_min: 0.5
143
+ mixup_loudness_max: 1.5
144
+ all:
145
+ channel_shuffle: 0.5
146
+ random_inverse: 0.1
147
+ random_polarity: 0.5
148
+ vocals:
149
+ pitch_shift: 0.1
150
+ pitch_shift_min_semitones: -5
151
+ pitch_shift_max_semitones: 5
152
+ seven_band_parametric_eq: 0.1
153
+ seven_band_parametric_eq_min_gain_db: -9
154
+ seven_band_parametric_eq_max_gain_db: 9
155
+ tanh_distortion: 0.1
156
+ tanh_distortion_min: 0.1
157
+ tanh_distortion_max: 0.7
158
+ bass:
159
+ pitch_shift: 0.1
160
+ pitch_shift_min_semitones: -2
161
+ pitch_shift_max_semitones: 2
162
+ seven_band_parametric_eq: 0.1
163
+ seven_band_parametric_eq_min_gain_db: -3
164
+ seven_band_parametric_eq_max_gain_db: 6
165
+ tanh_distortion: 0.1
166
+ tanh_distortion_min: 0.1
167
+ tanh_distortion_max: 0.5
168
+ drums:
169
+ pitch_shift: 0.1
170
+ pitch_shift_min_semitones: -5
171
+ pitch_shift_max_semitones: 5
172
+ seven_band_parametric_eq: 0.1
173
+ seven_band_parametric_eq_min_gain_db: -9
174
+ seven_band_parametric_eq_max_gain_db: 9
175
+ tanh_distortion: 0.1
176
+ tanh_distortion_min: 0.1
177
+ tanh_distortion_max: 0.6
178
+ other:
179
+ pitch_shift: 0.1
180
+ pitch_shift_min_semitones: -4
181
+ pitch_shift_max_semitones: 4
182
+ gaussian_noise: 0.1
183
+ gaussian_noise_min_amplitude: 0.001
184
+ gaussian_noise_max_amplitude: 0.015
185
+ time_stretch: 0.1
186
+ time_stretch_min_rate: 0.8
187
+ time_stretch_max_rate: 1.25
188
+ inference:
189
+ batch_size: 1
190
+ dim_t: 1101
191
+ num_overlap: 2
bs_roformer/bs_4stem_zfturbo.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e9daecd70aaed5b5a0d1f861cc4d77eaa45afb3fc6301b1cf32c1be0f5868fb
3
+ size 527385512
bs_roformer/bs_4stem_zfturbo_config.yaml ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 485100
3
+ dim_f: 1024
4
+ dim_t: 801
5
+ hop_length: 441
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.0
10
+ model:
11
+ dim: 384
12
+ depth: 8
13
+ stereo: true
14
+ num_stems: 4
15
+ time_transformer_depth: 1
16
+ freq_transformer_depth: 1
17
+ linear_transformer_depth: 0
18
+ freqs_per_bands: !!python/tuple
19
+ - 2
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 4
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 12
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 24
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 48
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 128
80
+ - 129
81
+ dim_head: 64
82
+ heads: 8
83
+ attn_dropout: 0.1
84
+ ff_dropout: 0.1
85
+ flash_attn: true
86
+ dim_freqs_in: 1025
87
+ stft_n_fft: 2048
88
+ stft_hop_length: 441
89
+ stft_win_length: 2048
90
+ stft_normalized: false
91
+ mask_estimator_depth: 2
92
+ multi_stft_resolution_loss_weight: 1.0
93
+ multi_stft_resolutions_window_sizes: !!python/tuple
94
+ - 4096
95
+ - 2048
96
+ - 1024
97
+ - 512
98
+ - 256
99
+ multi_stft_hop_size: 147
100
+ multi_stft_normalized: false
101
+ mlp_expansion_factor: 2
102
+ use_torch_checkpoint: false
103
+ skip_connection: false
104
+ training:
105
+ batch_size: 2
106
+ gradient_accumulation_steps: 1
107
+ grad_clip: 0
108
+ instruments:
109
+ - drums
110
+ - bass
111
+ - other
112
+ - vocals
113
+ patience: 3
114
+ reduce_factor: 0.95
115
+ target_instrument: null
116
+ num_epochs: 1000
117
+ num_steps: 1000
118
+ augmentation: false
119
+ augmentation_type: simple1
120
+ use_mp3_compress: false
121
+ augmentation_mix: true
122
+ augmentation_loudness: true
123
+ augmentation_loudness_type: 1
124
+ augmentation_loudness_min: 0.5
125
+ augmentation_loudness_max: 1.5
126
+ q: 0.95
127
+ coarse_loss_clip: true
128
+ ema_momentum: 0.999
129
+ optimizer: adam
130
+ lr: 1.0e-05
131
+ other_fix: false
132
+ use_amp: true
133
+ augmentations:
134
+ enable: true
135
+ loudness: true
136
+ loudness_min: 0.5
137
+ loudness_max: 1.5
138
+ mixup: true
139
+ mixup_probs: !!python/tuple
140
+ - 0.2
141
+ - 0.02
142
+ mixup_loudness_min: 0.5
143
+ mixup_loudness_max: 1.5
144
+ all:
145
+ channel_shuffle: 0.5
146
+ random_inverse: 0.1
147
+ random_polarity: 0.5
148
+ vocals:
149
+ pitch_shift: 0.1
150
+ pitch_shift_min_semitones: -5
151
+ pitch_shift_max_semitones: 5
152
+ seven_band_parametric_eq: 0.1
153
+ seven_band_parametric_eq_min_gain_db: -9
154
+ seven_band_parametric_eq_max_gain_db: 9
155
+ tanh_distortion: 0.1
156
+ tanh_distortion_min: 0.1
157
+ tanh_distortion_max: 0.7
158
+ bass:
159
+ pitch_shift: 0.1
160
+ pitch_shift_min_semitones: -2
161
+ pitch_shift_max_semitones: 2
162
+ seven_band_parametric_eq: 0.1
163
+ seven_band_parametric_eq_min_gain_db: -3
164
+ seven_band_parametric_eq_max_gain_db: 6
165
+ tanh_distortion: 0.1
166
+ tanh_distortion_min: 0.1
167
+ tanh_distortion_max: 0.5
168
+ drums:
169
+ pitch_shift: 0.1
170
+ pitch_shift_min_semitones: -5
171
+ pitch_shift_max_semitones: 5
172
+ seven_band_parametric_eq: 0.1
173
+ seven_band_parametric_eq_min_gain_db: -9
174
+ seven_band_parametric_eq_max_gain_db: 9
175
+ tanh_distortion: 0.1
176
+ tanh_distortion_min: 0.1
177
+ tanh_distortion_max: 0.6
178
+ other:
179
+ pitch_shift: 0.1
180
+ pitch_shift_min_semitones: -4
181
+ pitch_shift_max_semitones: 4
182
+ gaussian_noise: 0.1
183
+ gaussian_noise_min_amplitude: 0.001
184
+ gaussian_noise_max_amplitude: 0.015
185
+ time_stretch: 0.1
186
+ time_stretch_min_rate: 0.8
187
+ time_stretch_max_rate: 1.25
188
+ inference:
189
+ batch_size: 1
190
+ dim_t: 1101
191
+ num_overlap: 2
bs_roformer/bs_4stemft_syh99999_config.yaml ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 485100
3
+ dim_f: 1024
4
+ dim_t: 801
5
+ hop_length: 441
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.0
10
+ model:
11
+ dim: 384
12
+ depth: 8
13
+ stereo: true
14
+ num_stems: 4
15
+ time_transformer_depth: 1
16
+ freq_transformer_depth: 1
17
+ linear_transformer_depth: 0
18
+ freqs_per_bands: !!python/tuple
19
+ - 2
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 4
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 12
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 24
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 48
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 128
80
+ - 129
81
+ dim_head: 64
82
+ heads: 8
83
+ attn_dropout: 0.1
84
+ ff_dropout: 0.1
85
+ flash_attn: true
86
+ dim_freqs_in: 1025
87
+ stft_n_fft: 2048
88
+ stft_hop_length: 441
89
+ stft_win_length: 2048
90
+ stft_normalized: false
91
+ mask_estimator_depth: 2
92
+ multi_stft_resolution_loss_weight: 1.0
93
+ multi_stft_resolutions_window_sizes: !!python/tuple
94
+ - 4096
95
+ - 2048
96
+ - 1024
97
+ - 512
98
+ - 256
99
+ multi_stft_hop_size: 147
100
+ multi_stft_normalized: false
101
+ mlp_expansion_factor: 2
102
+ use_torch_checkpoint: false
103
+ skip_connection: false
104
+ training:
105
+ batch_size: 2
106
+ gradient_accumulation_steps: 1
107
+ grad_clip: 0
108
+ instruments:
109
+ - drums
110
+ - bass
111
+ - other
112
+ - vocals
113
+ patience: 3
114
+ reduce_factor: 0.95
115
+ target_instrument: null
116
+ num_epochs: 1000
117
+ num_steps: 1000
118
+ augmentation: false
119
+ augmentation_type: simple1
120
+ use_mp3_compress: false
121
+ augmentation_mix: true
122
+ augmentation_loudness: true
123
+ augmentation_loudness_type: 1
124
+ augmentation_loudness_min: 0.5
125
+ augmentation_loudness_max: 1.5
126
+ q: 0.95
127
+ coarse_loss_clip: true
128
+ ema_momentum: 0.999
129
+ optimizer: adam
130
+ lr: 1.0e-05
131
+ other_fix: false
132
+ use_amp: true
133
+ augmentations:
134
+ enable: true
135
+ loudness: true
136
+ loudness_min: 0.5
137
+ loudness_max: 1.5
138
+ mixup: true
139
+ mixup_probs: !!python/tuple
140
+ - 0.2
141
+ - 0.02
142
+ mixup_loudness_min: 0.5
143
+ mixup_loudness_max: 1.5
144
+ all:
145
+ channel_shuffle: 0.5
146
+ random_inverse: 0.1
147
+ random_polarity: 0.5
148
+ vocals:
149
+ pitch_shift: 0.1
150
+ pitch_shift_min_semitones: -5
151
+ pitch_shift_max_semitones: 5
152
+ seven_band_parametric_eq: 0.1
153
+ seven_band_parametric_eq_min_gain_db: -9
154
+ seven_band_parametric_eq_max_gain_db: 9
155
+ tanh_distortion: 0.1
156
+ tanh_distortion_min: 0.1
157
+ tanh_distortion_max: 0.7
158
+ bass:
159
+ pitch_shift: 0.1
160
+ pitch_shift_min_semitones: -2
161
+ pitch_shift_max_semitones: 2
162
+ seven_band_parametric_eq: 0.1
163
+ seven_band_parametric_eq_min_gain_db: -3
164
+ seven_band_parametric_eq_max_gain_db: 6
165
+ tanh_distortion: 0.1
166
+ tanh_distortion_min: 0.1
167
+ tanh_distortion_max: 0.5
168
+ drums:
169
+ pitch_shift: 0.1
170
+ pitch_shift_min_semitones: -5
171
+ pitch_shift_max_semitones: 5
172
+ seven_band_parametric_eq: 0.1
173
+ seven_band_parametric_eq_min_gain_db: -9
174
+ seven_band_parametric_eq_max_gain_db: 9
175
+ tanh_distortion: 0.1
176
+ tanh_distortion_min: 0.1
177
+ tanh_distortion_max: 0.6
178
+ other:
179
+ pitch_shift: 0.1
180
+ pitch_shift_min_semitones: -4
181
+ pitch_shift_max_semitones: 4
182
+ gaussian_noise: 0.1
183
+ gaussian_noise_min_amplitude: 0.001
184
+ gaussian_noise_max_amplitude: 0.015
185
+ time_stretch: 0.1
186
+ time_stretch_min_rate: 0.8
187
+ time_stretch_max_rate: 1.25
188
+ inference:
189
+ batch_size: 1
190
+ dim_t: 2048
191
+ num_overlap: 2
bs_roformer/bs_6stem_config.yaml ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ sw: true
2
+ audio:
3
+ chunk_size: 588800
4
+ dim_f: 1024
5
+ dim_t: 801
6
+ hop_length: 441
7
+ n_fft: 2048
8
+ num_channels: 2
9
+ sample_rate: 44100
10
+ min_mean_abs: 0.0
11
+ model:
12
+ dim: 256
13
+ depth: 12
14
+ stereo: true
15
+ num_stems: 6
16
+ time_transformer_depth: 1
17
+ freq_transformer_depth: 1
18
+ use_shared_bias: true
19
+ training:
20
+ instruments:
21
+ - bass
22
+ - drums
23
+ - other
24
+ - vocals
25
+ - guitar
26
+ - piano
27
+ use_amp: true
28
+ target_instrument: null
29
+ inference:
30
+ batch_size: 1
31
+ dim_t: 1101
32
+ num_overlap: 2
33
+ normalize: false
bs_roformer/bs_6stem_fixed.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24e7d35ee9c64415673d3fd33e06a67cac2c103c5df6267ba1576459c775916e
3
+ size 699412152
bs_roformer/bs_6stem_fixed_config.yaml ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 588800
3
+ dim_f: 1024
4
+ dim_t: 801
5
+ hop_length: 441
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.0
10
+ model:
11
+ dim: 256
12
+ depth: 12
13
+ stereo: true
14
+ num_stems: 6
15
+ time_transformer_depth: 1
16
+ freq_transformer_depth: 1
17
+ linear_transformer_depth: 0
18
+ freqs_per_bands: !!python/tuple
19
+ - 2
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 4
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 12
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 24
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 48
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 128
80
+ - 129
81
+ dim_head: 64
82
+ heads: 8
83
+ attn_dropout: 0.1
84
+ ff_dropout: 0.1
85
+ flash_attn: true
86
+ dim_freqs_in: 1025
87
+ stft_n_fft: 2048
88
+ stft_hop_length: 512
89
+ stft_win_length: 2048
90
+ stft_normalized: false
91
+ mask_estimator_depth: 2
92
+ multi_stft_resolution_loss_weight: 1.0
93
+ multi_stft_resolutions_window_sizes: !!python/tuple
94
+ - 4096
95
+ - 2048
96
+ - 1024
97
+ - 512
98
+ - 256
99
+ multi_stft_hop_size: 147
100
+ multi_stft_normalized: false
101
+ mlp_expansion_factor: 4
102
+ use_torch_checkpoint: false
103
+ skip_connection: false
104
+ training:
105
+ batch_size: 2
106
+ gradient_accumulation_steps: 1
107
+ grad_clip: 0
108
+ instruments:
109
+ - bass
110
+ - drums
111
+ - other
112
+ - vocals
113
+ - guitar
114
+ - piano
115
+ patience: 3
116
+ reduce_factor: 0.95
117
+ target_instrument: null
118
+ num_epochs: 1000
119
+ num_steps: 1000
120
+ augmentation: false
121
+ augmentation_type: simple1
122
+ use_mp3_compress: false
123
+ augmentation_mix: true
124
+ augmentation_loudness: true
125
+ augmentation_loudness_type: 1
126
+ augmentation_loudness_min: 0.5
127
+ augmentation_loudness_max: 1.5
128
+ q: 0.95
129
+ coarse_loss_clip: true
130
+ ema_momentum: 0.999
131
+ optimizer: adam
132
+ lr: 1.0e-05
133
+ other_fix: false
134
+ use_amp: true
135
+ augmentations:
136
+ enable: true
137
+ loudness: true
138
+ loudness_min: 0.5
139
+ loudness_max: 1.5
140
+ mixup: true
141
+ mixup_probs: !!python/tuple
142
+ - 0.2
143
+ - 0.02
144
+ mixup_loudness_min: 0.5
145
+ mixup_loudness_max: 1.5
146
+ all:
147
+ channel_shuffle: 0.5
148
+ random_inverse: 0.1
149
+ random_polarity: 0.5
150
+ vocals:
151
+ pitch_shift: 0.1
152
+ pitch_shift_min_semitones: -5
153
+ pitch_shift_max_semitones: 5
154
+ seven_band_parametric_eq: 0.1
155
+ seven_band_parametric_eq_min_gain_db: -9
156
+ seven_band_parametric_eq_max_gain_db: 9
157
+ tanh_distortion: 0.1
158
+ tanh_distortion_min: 0.1
159
+ tanh_distortion_max: 0.7
160
+ bass:
161
+ pitch_shift: 0.1
162
+ pitch_shift_min_semitones: -2
163
+ pitch_shift_max_semitones: 2
164
+ seven_band_parametric_eq: 0.1
165
+ seven_band_parametric_eq_min_gain_db: -3
166
+ seven_band_parametric_eq_max_gain_db: 6
167
+ tanh_distortion: 0.1
168
+ tanh_distortion_min: 0.1
169
+ tanh_distortion_max: 0.5
170
+ drums:
171
+ pitch_shift: 0.1
172
+ pitch_shift_min_semitones: -5
173
+ pitch_shift_max_semitones: 5
174
+ seven_band_parametric_eq: 0.1
175
+ seven_band_parametric_eq_min_gain_db: -9
176
+ seven_band_parametric_eq_max_gain_db: 9
177
+ tanh_distortion: 0.1
178
+ tanh_distortion_min: 0.1
179
+ tanh_distortion_max: 0.6
180
+ other:
181
+ pitch_shift: 0.1
182
+ pitch_shift_min_semitones: -4
183
+ pitch_shift_max_semitones: 4
184
+ gaussian_noise: 0.1
185
+ gaussian_noise_min_amplitude: 0.001
186
+ gaussian_noise_max_amplitude: 0.015
187
+ time_stretch: 0.1
188
+ time_stretch_min_rate: 0.8
189
+ time_stretch_max_rate: 1.25
190
+ inference:
191
+ batch_size: 1
192
+ dim_t: 1101
193
+ num_overlap: 2
194
+ normalize: false
bs_roformer/bs_bass_beatloo_labs_config.yaml ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 131584
3
+ dim_f: 1024
4
+ dim_t: 256
5
+ hop_length: 512
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.0
10
+ model:
11
+ dim: 192
12
+ depth: 6
13
+ stereo: true
14
+ num_stems: 1
15
+ time_transformer_depth: 1
16
+ freq_transformer_depth: 1
17
+ linear_transformer_depth: 0
18
+ freqs_per_bands: !!python/tuple
19
+ - 2
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 4
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 12
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 24
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 48
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 128
80
+ - 129
81
+ dim_head: 64
82
+ heads: 8
83
+ attn_dropout: 0.1
84
+ ff_dropout: 0.1
85
+ flash_attn: true
86
+ dim_freqs_in: 1025
87
+ stft_n_fft: 2048
88
+ stft_hop_length: 512
89
+ stft_win_length: 2048
90
+ stft_normalized: false
91
+ mask_estimator_depth: 2
92
+ multi_stft_resolution_loss_weight: 1.0
93
+ multi_stft_resolutions_window_sizes: !!python/tuple
94
+ - 4096
95
+ - 2048
96
+ - 1024
97
+ - 512
98
+ - 256
99
+ multi_stft_hop_size: 147
100
+ multi_stft_normalized: false
101
+ training:
102
+ batch_size: 4
103
+ gradient_accumulation_steps: 1
104
+ grad_clip: 0
105
+ instruments:
106
+ - bass
107
+ - other
108
+ lr: 5.0e-05
109
+ patience: 2
110
+ reduce_factor: 0.95
111
+ target_instrument: bass
112
+ num_epochs: 1000
113
+ num_steps: 1000
114
+ q: 0.95
115
+ coarse_loss_clip: true
116
+ ema_momentum: 0.999
117
+ optimizer: adam
118
+ other_fix: false
119
+ use_amp: true
120
+ augmentations:
121
+ enable: true
122
+ loudness: true
123
+ loudness_min: 0.5
124
+ loudness_max: 1.5
125
+ inference:
126
+ batch_size: 1
127
+ dim_t: 256
128
+ num_overlap: 2
bs_roformer/bs_cr_4stem_zf_turbo_config.yaml ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ conformer: true
2
+ audio:
3
+ chunk_size: 352800
4
+ dim_f: 1024
5
+ dim_t: 256
6
+ hop_length: 441
7
+ n_fft: 2048
8
+ num_channels: 2
9
+ sample_rate: 44100
10
+ min_mean_abs: 0.0
11
+ model:
12
+ dim: 256
13
+ depth: 6
14
+ stereo: true
15
+ num_stems: 4
16
+ time_conformer_depth: 1
17
+ freq_conformer_depth: 1
18
+ freqs_per_bands: !!python/tuple
19
+ - 2
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 4
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 12
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 24
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 48
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 128
80
+ - 129
81
+ dim_head: 64
82
+ heads: 8
83
+ attn_dropout: 0
84
+ ff_dropout: 0
85
+ flash_attn: true
86
+ dim_freqs_in: 1025
87
+ stft_n_fft: 2048
88
+ stft_hop_length: 512
89
+ stft_win_length: 2048
90
+ stft_normalized: false
91
+ mask_estimator_depth: 2
92
+ multi_stft_resolution_loss_weight: 1.0
93
+ multi_stft_resolutions_window_sizes: !!python/tuple
94
+ - 4096
95
+ - 2048
96
+ - 1024
97
+ - 512
98
+ - 256
99
+ multi_stft_hop_size: 147
100
+ multi_stft_normalized: false
101
+ mlp_expansion_factor: 2
102
+ ff_mult: 4
103
+ conv_expansion_factor: 2
104
+ conv_kernel_size: 31
105
+ use_torch_checkpoint: false
106
+ skip_connection: false
107
+ sage_attention: false
108
+ training:
109
+ batch_size: 2
110
+ gradient_accumulation_steps: 1
111
+ grad_clip: 0.0
112
+ instruments:
113
+ - drums
114
+ - bass
115
+ - other
116
+ - vocals
117
+ patience: 3
118
+ reduce_factor: 0.95
119
+ target_instrument: null
120
+ num_epochs: 1000
121
+ num_steps: 1000
122
+ q: 0.95
123
+ coarse_loss_clip: false
124
+ ema_momentum: 0.999
125
+ optimizer: adamw
126
+ lr: 1.0e-05
127
+ other_fix: false
128
+ use_amp: true
129
+ optimizer1:
130
+ muon_group:
131
+ lr: 0.001
132
+ weight_decay: 0.0
133
+ momentum: 0.95
134
+ adam_group:
135
+ lr: 0.0001
136
+ weight_decay: 0.0
137
+ betas:
138
+ - 0.9
139
+ - 0.99
140
+ eps: 1.0e-08
141
+ augmentations:
142
+ enable: true
143
+ loudness: true
144
+ loudness_min: 0.5
145
+ loudness_max: 1.5
146
+ mixup: true
147
+ mixup_probs: !!python/tuple
148
+ - 0.2
149
+ - 0.02
150
+ - 0.002
151
+ - 0.0002
152
+ - 2.0e-05
153
+ mixup_loudness_min: 0.5
154
+ mixup_loudness_max: 1.5
155
+ mp3_compression_on_mixture: 0.1
156
+ mp3_compression_on_mixture_bitrate_min: 32
157
+ mp3_compression_on_mixture_bitrate_max: 320
158
+ mp3_compression_on_mixture_backend: lameenc
159
+ all:
160
+ channel_shuffle: 0.5
161
+ random_inverse: 0.1
162
+ random_polarity: 0.5
163
+ vocals:
164
+ pitch_shift: 0.1
165
+ pitch_shift_min_semitones: -5
166
+ pitch_shift_max_semitones: 5
167
+ seven_band_parametric_eq: 0.1
168
+ seven_band_parametric_eq_min_gain_db: -9
169
+ seven_band_parametric_eq_max_gain_db: 9
170
+ tanh_distortion: 0.1
171
+ tanh_distortion_min: 0.1
172
+ tanh_distortion_max: 0.7
173
+ bass:
174
+ pitch_shift: 0.1
175
+ pitch_shift_min_semitones: -2
176
+ pitch_shift_max_semitones: 2
177
+ seven_band_parametric_eq: 0.1
178
+ seven_band_parametric_eq_min_gain_db: -3
179
+ seven_band_parametric_eq_max_gain_db: 6
180
+ tanh_distortion: 0.1
181
+ tanh_distortion_min: 0.1
182
+ tanh_distortion_max: 0.5
183
+ drums:
184
+ pitch_shift: 0.1
185
+ pitch_shift_min_semitones: -5
186
+ pitch_shift_max_semitones: 5
187
+ seven_band_parametric_eq: 0.1
188
+ seven_band_parametric_eq_min_gain_db: -9
189
+ seven_band_parametric_eq_max_gain_db: 9
190
+ tanh_distortion: 0.1
191
+ tanh_distortion_min: 0.1
192
+ tanh_distortion_max: 0.6
193
+ other:
194
+ pitch_shift: 0.1
195
+ pitch_shift_min_semitones: -4
196
+ pitch_shift_max_semitones: 4
197
+ gaussian_noise: 0.1
198
+ gaussian_noise_min_amplitude: 0.001
199
+ gaussian_noise_max_amplitude: 0.015
200
+ time_stretch: 0.1
201
+ time_stretch_min_rate: 0.8
202
+ time_stretch_max_rate: 1.25
203
+ inference:
204
+ chunk_size: 882000
205
+ batch_size: 1
206
+ dim_t: 801
207
+ num_overlap: 2
208
+ normalize: false
bs_roformer/bs_deverb_256_8_anvuew.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee204fc59fa4111674536d47bd1ef3759acb9f7cf5a759ec4b867a828bb76c64
3
+ size 170770820
bs_roformer/bs_deverb_256_8_anvuew_config.yaml ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 352768
3
+ dim_f: 1024
4
+ dim_t: 801
5
+ hop_length: 441
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.001
10
+ model:
11
+ dim: 256
12
+ depth: 8
13
+ stereo: true
14
+ num_stems: 1
15
+ time_transformer_depth: 1
16
+ freq_transformer_depth: 1
17
+ linear_transformer_depth: 0
18
+ freqs_per_bands: !!python/tuple
19
+ - 2
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 4
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 12
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 24
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 48
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 128
80
+ - 129
81
+ dim_head: 64
82
+ heads: 8
83
+ attn_dropout: 0.1
84
+ ff_dropout: 0.1
85
+ flash_attn: true
86
+ dim_freqs_in: 1025
87
+ stft_n_fft: 2048
88
+ stft_hop_length: 512
89
+ stft_win_length: 2048
90
+ stft_normalized: false
91
+ mask_estimator_depth: 2
92
+ multi_stft_resolution_loss_weight: 1.0
93
+ multi_stft_resolutions_window_sizes: !!python/tuple
94
+ - 4096
95
+ - 2048
96
+ - 1024
97
+ - 512
98
+ - 256
99
+ multi_stft_hop_size: 147
100
+ multi_stft_normalized: false
101
+ training:
102
+ batch_size: 1
103
+ gradient_accumulation_steps: 1
104
+ grad_clip: 0
105
+ instruments:
106
+ - noreverb
107
+ - reverb
108
+ lr: 5.0e-05
109
+ patience: 2
110
+ reduce_factor: 0.95
111
+ target_instrument: noreverb
112
+ num_epochs: 1000
113
+ num_steps: 7600
114
+ q: 0.95
115
+ coarse_loss_clip: true
116
+ ema_momentum: 0.999
117
+ optimizer: adam
118
+ other_fix: false
119
+ use_amp: true
120
+ augmentations:
121
+ enable: true
122
+ loudness: true
123
+ loudness_min: 0.5
124
+ loudness_max: 1.5
125
+ mixup: false
126
+ mixup_probs: !!python/tuple
127
+ - 0.2
128
+ - 0.02
129
+ mixup_loudness_min: 0.5
130
+ mixup_loudness_max: 1.5
131
+ inference:
132
+ batch_size: 1
133
+ dim_t: 801
134
+ num_overlap: 2
bs_roformer/bs_deverb_384_10_anvuew.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c38653aaa5e49f2f7b84dd3be2b6b679e0cbea23978e6b48389ee6f0a914768
3
+ size 361499604
bs_roformer/bs_deverb_384_10_anvuew_config.yaml ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 352768
3
+ dim_f: 1024
4
+ dim_t: 801
5
+ hop_length: 441
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.001
10
+ model:
11
+ dim: 384
12
+ depth: 10
13
+ stereo: true
14
+ num_stems: 1
15
+ time_transformer_depth: 1
16
+ freq_transformer_depth: 1
17
+ linear_transformer_depth: 0
18
+ freqs_per_bands: !!python/tuple
19
+ - 2
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 4
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 12
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 24
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 48
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 128
80
+ - 129
81
+ dim_head: 64
82
+ heads: 8
83
+ attn_dropout: 0.1
84
+ ff_dropout: 0.1
85
+ flash_attn: true
86
+ dim_freqs_in: 1025
87
+ stft_n_fft: 2048
88
+ stft_hop_length: 512
89
+ stft_win_length: 2048
90
+ stft_normalized: false
91
+ mask_estimator_depth: 2
92
+ multi_stft_resolution_loss_weight: 1.0
93
+ multi_stft_resolutions_window_sizes: !!python/tuple
94
+ - 4096
95
+ - 2048
96
+ - 1024
97
+ - 512
98
+ - 256
99
+ multi_stft_hop_size: 147
100
+ multi_stft_normalized: false
101
+ training:
102
+ batch_size: 1
103
+ gradient_accumulation_steps: 1
104
+ grad_clip: 0
105
+ instruments:
106
+ - noreverb
107
+ - reverb
108
+ lr: 5.0e-05
109
+ patience: 2
110
+ reduce_factor: 0.95
111
+ target_instrument: noreverb
112
+ num_epochs: 1000
113
+ num_steps: 1000
114
+ q: 0.95
115
+ coarse_loss_clip: true
116
+ ema_momentum: 0.999
117
+ optimizer: adam
118
+ other_fix: false
119
+ use_amp: true
120
+ augmentations:
121
+ enable: true
122
+ loudness: true
123
+ loudness_min: 0.5
124
+ loudness_max: 1.5
125
+ mixup: false
126
+ mixup_probs: !!python/tuple
127
+ - 0.2
128
+ - 0.02
129
+ mixup_loudness_min: 0.5
130
+ mixup_loudness_max: 1.5
131
+ inference:
132
+ batch_size: 1
133
+ dim_t: 801
134
+ num_overlap: 2
bs_roformer/bs_deverb_room_anvuew.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2edec521f09e26341c1923dc82c8c52dbc86478b42b9999f679535743c970cb3
3
+ size 118128452
bs_roformer/bs_deverb_room_anvuew_config.yaml ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 384000
3
+ dim_f: 1024
4
+ dim_t: 801
5
+ hop_length: 441
6
+ n_fft: 2048
7
+ num_channels: 1
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.0
10
+ model:
11
+ dim: 128
12
+ depth: 16
13
+ stereo: false
14
+ num_stems: 1
15
+ time_transformer_depth: 1
16
+ freq_transformer_depth: 1
17
+ linear_transformer_depth: 0
18
+ freqs_per_bands: !!python/tuple
19
+ - 2
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 3
26
+ - 3
27
+ - 3
28
+ - 3
29
+ - 3
30
+ - 4
31
+ - 4
32
+ - 4
33
+ - 4
34
+ - 4
35
+ - 5
36
+ - 5
37
+ - 5
38
+ - 5
39
+ - 6
40
+ - 6
41
+ - 6
42
+ - 6
43
+ - 7
44
+ - 7
45
+ - 7
46
+ - 8
47
+ - 8
48
+ - 8
49
+ - 9
50
+ - 9
51
+ - 10
52
+ - 10
53
+ - 11
54
+ - 12
55
+ - 13
56
+ - 14
57
+ - 15
58
+ - 16
59
+ - 17
60
+ - 18
61
+ - 19
62
+ - 20
63
+ - 21
64
+ - 22
65
+ - 23
66
+ - 24
67
+ - 25
68
+ - 27
69
+ - 29
70
+ - 31
71
+ - 33
72
+ - 35
73
+ - 37
74
+ - 39
75
+ - 41
76
+ - 43
77
+ - 45
78
+ - 48
79
+ - 52
80
+ - 57
81
+ - 64
82
+ dim_head: 16
83
+ heads: 8
84
+ attn_dropout: 0.0
85
+ ff_dropout: 0.0
86
+ flash_attn: true
87
+ dim_freqs_in: 1025
88
+ stft_n_fft: 2048
89
+ stft_hop_length: 512
90
+ stft_win_length: 2048
91
+ stft_normalized: false
92
+ mask_estimator_depth: 3
93
+ multi_stft_resolution_loss_weight: 1.0
94
+ multi_stft_resolutions_window_sizes: !!python/tuple
95
+ - 4096
96
+ - 2048
97
+ - 1024
98
+ - 512
99
+ - 256
100
+ multi_stft_hop_size: 147
101
+ multi_stft_normalized: false
102
+ mlp_expansion_factor: 4
103
+ use_torch_checkpoint: true
104
+ skip_connection: false
105
+ training:
106
+ batch_size: 4
107
+ gradient_accumulation_steps: 1
108
+ grad_clip: 1000.0
109
+ instruments:
110
+ - noreverb
111
+ - reverb
112
+ lr: 5.0e-05
113
+ patience: 5
114
+ reduce_factor: 0.75
115
+ target_instrument: noreverb
116
+ num_epochs: 1000
117
+ num_steps: 1000
118
+ q: 0.95
119
+ coarse_loss_clip: true
120
+ ema_momentum: 0.999
121
+ optimizer: adam
122
+ other_fix: false
123
+ use_amp: true
124
+ inference:
125
+ batch_size: 1
126
+ dim_t: 871
127
+ num_overlap: 2
bs_roformer/bs_drums_beatloo_labs.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb534cb6b4b90e7dcbcbb741ba4111393ccf6083b372b554bba7b556121d104e
3
+ size 98603241
bs_roformer/bs_drums_beatloo_labs_config.yaml ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 131584
3
+ dim_f: 1024
4
+ dim_t: 256
5
+ hop_length: 512
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.0
10
+ model:
11
+ dim: 192
12
+ depth: 6
13
+ stereo: true
14
+ num_stems: 1
15
+ time_transformer_depth: 1
16
+ freq_transformer_depth: 1
17
+ linear_transformer_depth: 0
18
+ freqs_per_bands: !!python/tuple
19
+ - 2
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 4
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 12
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 24
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 48
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 128
80
+ - 129
81
+ dim_head: 64
82
+ heads: 8
83
+ attn_dropout: 0.1
84
+ ff_dropout: 0.1
85
+ flash_attn: true
86
+ dim_freqs_in: 1025
87
+ stft_n_fft: 2048
88
+ stft_hop_length: 512
89
+ stft_win_length: 2048
90
+ stft_normalized: false
91
+ mask_estimator_depth: 2
92
+ multi_stft_resolution_loss_weight: 1.0
93
+ multi_stft_resolutions_window_sizes: !!python/tuple
94
+ - 4096
95
+ - 2048
96
+ - 1024
97
+ - 512
98
+ - 256
99
+ multi_stft_hop_size: 147
100
+ multi_stft_normalized: false
101
+ training:
102
+ batch_size: 4
103
+ gradient_accumulation_steps: 1
104
+ grad_clip: 0
105
+ instruments:
106
+ - drums
107
+ - other
108
+ lr: 5.0e-05
109
+ patience: 2
110
+ reduce_factor: 0.95
111
+ target_instrument: drums
112
+ num_epochs: 1000
113
+ num_steps: 1000
114
+ q: 0.95
115
+ coarse_loss_clip: true
116
+ ema_momentum: 0.999
117
+ optimizer: adam
118
+ other_fix: false
119
+ use_amp: true
120
+ augmentations:
121
+ enable: true
122
+ loudness: true
123
+ loudness_min: 0.5
124
+ loudness_max: 1.5
125
+ inference:
126
+ batch_size: 1
127
+ dim_t: 256
128
+ num_overlap: 2
bs_roformer/bs_inst_fno_unwa.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f35bf6d87b2863372388e85c2d9679e5b7651e5c2ddd23aab1480f7af10b90ca
3
+ size 332004435
bs_roformer/bs_inst_fno_unwa_config.yaml ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fno: true
2
+ audio:
3
+ chunk_size: 749259
4
+ dim_f: 1024
5
+ dim_t: 1700
6
+ hop_length: 441
7
+ n_fft: 2048
8
+ num_channels: 2
9
+ sample_rate: 44100
10
+ min_mean_abs: 0.0
11
+ model:
12
+ dim: 256
13
+ depth: 12
14
+ stereo: true
15
+ num_stems: 1
16
+ time_transformer_depth: 1
17
+ freq_transformer_depth: 1
18
+ linear_transformer_depth: 0
19
+ freqs_per_bands: !!python/tuple
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 2
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 4
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 12
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 24
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 48
80
+ - 128
81
+ - 129
82
+ dim_head: 64
83
+ heads: 8
84
+ attn_dropout: 0.0
85
+ ff_dropout: 0.0
86
+ flash_attn: true
87
+ dim_freqs_in: 1025
88
+ stft_n_fft: 2048
89
+ stft_hop_length: 441
90
+ stft_win_length: 2048
91
+ stft_normalized: false
92
+ mask_estimator_depth: 2
93
+ multi_stft_resolution_loss_weight: 1.0
94
+ multi_stft_resolutions_window_sizes: !!python/tuple
95
+ - 4096
96
+ - 2048
97
+ - 1024
98
+ - 512
99
+ - 256
100
+ multi_stft_hop_size: 147
101
+ multi_stft_normalized: false
102
+ mlp_expansion_factor: 4
103
+ training:
104
+ batch_size: 2
105
+ gradient_accumulation_steps: 1
106
+ grad_clip: 0
107
+ instruments:
108
+ - vocals
109
+ - other
110
+ patience: 3
111
+ reduce_factor: 0.95
112
+ target_instrument: other
113
+ num_epochs: 1000
114
+ num_steps: 1000
115
+ augmentation: false
116
+ augmentation_type: simple1
117
+ use_mp3_compress: false
118
+ augmentation_mix: true
119
+ augmentation_loudness: true
120
+ augmentation_loudness_type: 1
121
+ augmentation_loudness_min: 0.5
122
+ augmentation_loudness_max: 1.5
123
+ q: 0.95
124
+ coarse_loss_clip: true
125
+ ema_momentum: 0.999
126
+ optimizer: adam
127
+ lr: 1.0e-05
128
+ other_fix: false
129
+ use_amp: true
130
+ inference:
131
+ batch_size: 1
132
+ dim_t: 1700
133
+ num_overlap: 2
134
+ normalize: false
bs_roformer/bs_inst_hyperace2_unwa_config.yaml ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hyperace2: true
2
+ audio:
3
+ chunk_size: 960000
4
+ dim_f: 1024
5
+ dim_t: 801
6
+ hop_length: 441
7
+ n_fft: 2048
8
+ num_channels: 2
9
+ sample_rate: 44100
10
+ min_mean_abs: 0.0001
11
+ model:
12
+ dim: 256
13
+ depth: 12
14
+ stereo: true
15
+ num_stems: 1
16
+ time_transformer_depth: 1
17
+ freq_transformer_depth: 1
18
+ linear_transformer_depth: 0
19
+ freqs_per_bands: !!python/tuple
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 2
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 4
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 12
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 24
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 48
80
+ - 128
81
+ - 129
82
+ dim_head: 64
83
+ heads: 8
84
+ attn_dropout: 0.0
85
+ ff_dropout: 0.0
86
+ flash_attn: true
87
+ dim_freqs_in: 1025
88
+ stft_n_fft: 2048
89
+ stft_hop_length: 512
90
+ stft_win_length: 2048
91
+ stft_normalized: false
92
+ mask_estimator_depth: 2
93
+ multi_stft_resolution_loss_weight: 1.0
94
+ multi_stft_resolutions_window_sizes: !!python/tuple
95
+ - 4096
96
+ - 2048
97
+ - 1024
98
+ - 512
99
+ - 256
100
+ multi_stft_hop_size: 147
101
+ multi_stft_normalized: false
102
+ mlp_expansion_factor: 4
103
+ use_torch_checkpoint: true
104
+ skip_connection: false
105
+ training:
106
+ batch_size: 1
107
+ gradient_accumulation_steps: 1
108
+ grad_clip: 0
109
+ instruments:
110
+ - vocals
111
+ - instrument
112
+ lr: 1.0e-05
113
+ patience: 5
114
+ reduce_factor: 0.9
115
+ target_instrument: instrument
116
+ num_epochs: 1000
117
+ num_steps: 1000
118
+ q: 0.95
119
+ coarse_loss_clip: true
120
+ ema_momentum: 0.999
121
+ optimizer: adam
122
+ other_fix: false
123
+ use_amp: true
124
+ inference:
125
+ batch_size: 1
126
+ dim_t: 1876
127
+ num_overlap: 2
bs_roformer/bs_inst_hyperace_unwa_config.yaml ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hyperace: true
2
+ audio:
3
+ chunk_size: 960000
4
+ dim_f: 1024
5
+ dim_t: 801
6
+ hop_length: 441
7
+ n_fft: 2048
8
+ num_channels: 2
9
+ sample_rate: 44100
10
+ min_mean_abs: 0.0001
11
+ model:
12
+ dim: 256
13
+ depth: 12
14
+ stereo: true
15
+ num_stems: 1
16
+ time_transformer_depth: 1
17
+ freq_transformer_depth: 1
18
+ linear_transformer_depth: 0
19
+ freqs_per_bands: !!python/tuple
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 2
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 4
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 12
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 24
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 48
80
+ - 128
81
+ - 129
82
+ dim_head: 64
83
+ heads: 8
84
+ attn_dropout: 0.0
85
+ ff_dropout: 0.0
86
+ flash_attn: true
87
+ dim_freqs_in: 1025
88
+ stft_n_fft: 2048
89
+ stft_hop_length: 512
90
+ stft_win_length: 2048
91
+ stft_normalized: false
92
+ mask_estimator_depth: 2
93
+ multi_stft_resolution_loss_weight: 1.0
94
+ multi_stft_resolutions_window_sizes: !!python/tuple
95
+ - 4096
96
+ - 2048
97
+ - 1024
98
+ - 512
99
+ - 256
100
+ multi_stft_hop_size: 147
101
+ multi_stft_normalized: false
102
+ mlp_expansion_factor: 4
103
+ use_torch_checkpoint: true
104
+ skip_connection: false
105
+ training:
106
+ batch_size: 1
107
+ gradient_accumulation_steps: 1
108
+ grad_clip: 0
109
+ instruments:
110
+ - vocals
111
+ - instrument
112
+ lr: 1.0e-05
113
+ patience: 5
114
+ reduce_factor: 0.9
115
+ target_instrument: instrument
116
+ num_epochs: 1000
117
+ num_steps: 1000
118
+ q: 0.95
119
+ coarse_loss_clip: true
120
+ ema_momentum: 0.999
121
+ optimizer: adam
122
+ other_fix: false
123
+ use_amp: true
124
+ inference:
125
+ batch_size: 1
126
+ dim_t: 1876
127
+ num_overlap: 2
bs_roformer/bs_karaoke_anvuew.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:206d04757cb5f75ca3b55f8a0a48f5c26aa2351d4ff3c7adbfc9affa30ea3ae4
3
+ size 204486925
bs_roformer/bs_karaoke_anvuew_config.yaml ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 640000
3
+ dim_f: 1024
4
+ dim_t: 801
5
+ hop_length: 441
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.0
10
+ model:
11
+ dim: 256
12
+ depth: 12
13
+ stereo: true
14
+ num_stems: 1
15
+ time_transformer_depth: 1
16
+ freq_transformer_depth: 1
17
+ linear_transformer_depth: 0
18
+ freqs_per_bands: !!python/tuple
19
+ - 2
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 4
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 12
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 24
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 48
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 128
80
+ - 129
81
+ dim_head: 64
82
+ heads: 8
83
+ attn_dropout: 0.0
84
+ ff_dropout: 0.0
85
+ flash_attn: true
86
+ dim_freqs_in: 1025
87
+ stft_n_fft: 2048
88
+ stft_hop_length: 512
89
+ stft_win_length: 2048
90
+ stft_normalized: false
91
+ mask_estimator_depth: 2
92
+ multi_stft_resolution_loss_weight: 1.0
93
+ multi_stft_resolutions_window_sizes: !!python/tuple
94
+ - 4096
95
+ - 2048
96
+ - 1024
97
+ - 512
98
+ - 256
99
+ multi_stft_hop_size: 147
100
+ multi_stft_normalized: false
101
+ mlp_expansion_factor: 4
102
+ use_torch_checkpoint: true
103
+ skip_connection: false
104
+ training:
105
+ batch_size: 1
106
+ gradient_accumulation_steps: 1
107
+ grad_clip: 0
108
+ instruments:
109
+ - Vocals
110
+ - Instrumental
111
+ lr: 5.0e-05
112
+ patience: 7
113
+ reduce_factor: 0.75
114
+ target_instrument: Vocals
115
+ num_epochs: 1000
116
+ num_steps: 1000
117
+ q: 0.95
118
+ coarse_loss_clip: true
119
+ ema_momentum: 0.999
120
+ optimizer: adam
121
+ other_fix: false
122
+ use_amp: true
123
+ inference:
124
+ batch_size: 1
125
+ dim_t: 1251
126
+ num_overlap: 2
bs_roformer/bs_karaoke_becruily.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb90ee24c1154d83fbcfd27e96182f19e061557cc6e4746953125e08c29389f9
3
+ size 204436907
bs_roformer/bs_karaoke_becruily_config.yaml ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 882000
3
+ dim_f: 1024
4
+ dim_t: 801
5
+ hop_length: 441
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.0
10
+ model:
11
+ dim: 256
12
+ depth: 12
13
+ stereo: true
14
+ num_stems: 1
15
+ time_transformer_depth: 1
16
+ freq_transformer_depth: 1
17
+ linear_transformer_depth: 0
18
+ freqs_per_bands: !!python/tuple
19
+ - 2
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 4
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 12
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 24
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 48
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 128
80
+ - 129
81
+ dim_head: 64
82
+ heads: 8
83
+ attn_dropout: 0
84
+ ff_dropout: 0
85
+ flash_attn: true
86
+ dim_freqs_in: 1025
87
+ stft_n_fft: 2048
88
+ stft_hop_length: 512
89
+ stft_win_length: 2048
90
+ stft_normalized: false
91
+ mask_estimator_depth: 2
92
+ multi_stft_resolution_loss_weight: 1.0
93
+ multi_stft_resolutions_window_sizes: !!python/tuple
94
+ - 4096
95
+ - 2048
96
+ - 1024
97
+ - 512
98
+ - 256
99
+ multi_stft_hop_size: 147
100
+ multi_stft_normalized: false
101
+ mlp_expansion_factor: 4
102
+ training:
103
+ batch_size: 1
104
+ gradient_accumulation_steps: 1
105
+ grad_clip: 0
106
+ instruments:
107
+ - Vocals
108
+ - Instrumental
109
+ patience: 2
110
+ reduce_factor: 0.95
111
+ target_instrument: Vocals
112
+ num_epochs: 1000
113
+ num_steps: 1000
114
+ q: 0.95
115
+ coarse_loss_clip: true
116
+ ema_momentum: 0.999
117
+ optimizer: adam
118
+ lr: 1.0e-05
119
+ other_fix: false
120
+ use_amp: true
121
+ inference:
122
+ batch_size: 1
123
+ dim_t: 2001
124
+ num_overlap: 2
125
+ normalize: false
bs_roformer/bs_karaoke_gabox.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db8357825398d4231031ad1ab4aa12a94bcaad8d67e8ce5e4b3c5b48fdee1d4f
3
+ size 204483448
bs_roformer/bs_karaoke_gabox_config.yaml ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 352800
3
+ dim_f: 1024
4
+ dim_t: 801
5
+ hop_length: 441
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.0
10
+ model:
11
+ dim: 256
12
+ depth: 12
13
+ stereo: true
14
+ num_stems: 1
15
+ time_transformer_depth: 1
16
+ freq_transformer_depth: 1
17
+ linear_transformer_depth: 0
18
+ freqs_per_bands: !!python/tuple
19
+ - 2
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 4
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 12
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 24
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 48
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 128
80
+ - 129
81
+ dim_head: 64
82
+ heads: 8
83
+ attn_dropout: 0.0
84
+ ff_dropout: 0.0
85
+ flash_attn: true
86
+ dim_freqs_in: 1025
87
+ stft_n_fft: 2048
88
+ stft_hop_length: 512
89
+ stft_win_length: 2048
90
+ stft_normalized: false
91
+ mask_estimator_depth: 2
92
+ multi_stft_resolution_loss_weight: 1.0
93
+ multi_stft_resolutions_window_sizes: !!python/tuple
94
+ - 4096
95
+ - 2048
96
+ - 1024
97
+ - 512
98
+ - 256
99
+ multi_stft_hop_size: 147
100
+ multi_stft_normalized: false
101
+ mlp_expansion_factor: 4
102
+ use_torch_checkpoint: true
103
+ skip_connection: false
104
+ training:
105
+ batch_size: 1
106
+ gradient_accumulation_steps: 999
107
+ grad_clip: 1
108
+ instruments:
109
+ - vocals
110
+ - other
111
+ lr: 1.0e-05
112
+ patience: 1000000
113
+ reduce_factor: 0.75
114
+ target_instrument: vocals
115
+ num_epochs: 1000
116
+ num_steps: 1000
117
+ q: 0.95
118
+ coarse_loss_clip: true
119
+ ema_momentum: 0.999
120
+ optimizer: Fira
121
+ other_fix: true
122
+ use_amp: true
123
+ use_torch_checkpoint: true
124
+ inference:
125
+ batch_size: 1
126
+ dim_t: 1251
127
+ num_overlap: 2
bs_roformer/bs_logic_6stem_config.yaml ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 588800
3
+ dim_f: 1024
4
+ dim_t: 801
5
+ hop_length: 441
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.0
10
+ model:
11
+ dim: 256
12
+ depth: 12
13
+ stereo: true
14
+ num_stems: 6
15
+ time_transformer_depth: 1
16
+ freq_transformer_depth: 1
17
+ linear_transformer_depth: 0
18
+ freqs_per_bands: !!python/tuple
19
+ - 2
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 4
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 12
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 24
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 48
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 128
80
+ - 129
81
+ dim_head: 64
82
+ heads: 8
83
+ attn_dropout: 0.1
84
+ ff_dropout: 0.1
85
+ flash_attn: true
86
+ dim_freqs_in: 1025
87
+ stft_n_fft: 2048
88
+ stft_hop_length: 512
89
+ stft_win_length: 2048
90
+ stft_normalized: false
91
+ mask_estimator_depth: 2
92
+ multi_stft_resolution_loss_weight: 1.0
93
+ multi_stft_resolutions_window_sizes: !!python/tuple
94
+ - 4096
95
+ - 2048
96
+ - 1024
97
+ - 512
98
+ - 256
99
+ multi_stft_hop_size: 147
100
+ multi_stft_normalized: false
101
+ mlp_expansion_factor: 4
102
+ use_torch_checkpoint: false
103
+ skip_connection: false
104
+ training:
105
+ batch_size: 2
106
+ gradient_accumulation_steps: 1
107
+ grad_clip: 0
108
+ instruments:
109
+ - bass
110
+ - drums
111
+ - other
112
+ - vocals
113
+ - guitar
114
+ - piano
115
+ patience: 3
116
+ reduce_factor: 0.95
117
+ target_instrument: null
118
+ num_epochs: 1000
119
+ num_steps: 1000
120
+ augmentation: false
121
+ augmentation_type: simple1
122
+ use_mp3_compress: false
123
+ augmentation_mix: true
124
+ augmentation_loudness: true
125
+ augmentation_loudness_type: 1
126
+ augmentation_loudness_min: 0.5
127
+ augmentation_loudness_max: 1.5
128
+ q: 0.95
129
+ coarse_loss_clip: true
130
+ ema_momentum: 0.999
131
+ optimizer: adam
132
+ lr: 1.0e-05
133
+ other_fix: false
134
+ use_amp: true
135
+ augmentations:
136
+ enable: true
137
+ loudness: true
138
+ loudness_min: 0.5
139
+ loudness_max: 1.5
140
+ mixup: true
141
+ mixup_probs: !!python/tuple
142
+ - 0.2
143
+ - 0.02
144
+ mixup_loudness_min: 0.5
145
+ mixup_loudness_max: 1.5
146
+ all:
147
+ channel_shuffle: 0.5
148
+ random_inverse: 0.1
149
+ random_polarity: 0.5
150
+ vocals:
151
+ pitch_shift: 0.1
152
+ pitch_shift_min_semitones: -5
153
+ pitch_shift_max_semitones: 5
154
+ seven_band_parametric_eq: 0.1
155
+ seven_band_parametric_eq_min_gain_db: -9
156
+ seven_band_parametric_eq_max_gain_db: 9
157
+ tanh_distortion: 0.1
158
+ tanh_distortion_min: 0.1
159
+ tanh_distortion_max: 0.7
160
+ bass:
161
+ pitch_shift: 0.1
162
+ pitch_shift_min_semitones: -2
163
+ pitch_shift_max_semitones: 2
164
+ seven_band_parametric_eq: 0.1
165
+ seven_band_parametric_eq_min_gain_db: -3
166
+ seven_band_parametric_eq_max_gain_db: 6
167
+ tanh_distortion: 0.1
168
+ tanh_distortion_min: 0.1
169
+ tanh_distortion_max: 0.5
170
+ drums:
171
+ pitch_shift: 0.1
172
+ pitch_shift_min_semitones: -5
173
+ pitch_shift_max_semitones: 5
174
+ seven_band_parametric_eq: 0.1
175
+ seven_band_parametric_eq_min_gain_db: -9
176
+ seven_band_parametric_eq_max_gain_db: 9
177
+ tanh_distortion: 0.1
178
+ tanh_distortion_min: 0.1
179
+ tanh_distortion_max: 0.6
180
+ other:
181
+ pitch_shift: 0.1
182
+ pitch_shift_min_semitones: -4
183
+ pitch_shift_max_semitones: 4
184
+ gaussian_noise: 0.1
185
+ gaussian_noise_min_amplitude: 0.001
186
+ gaussian_noise_max_amplitude: 0.015
187
+ time_stretch: 0.1
188
+ time_stretch_min_rate: 0.8
189
+ time_stretch_max_rate: 1.25
190
+ inference:
191
+ batch_size: 1
192
+ dim_t: 1101
193
+ num_overlap: 2
194
+ normalize: false
bs_roformer/bs_male_female_146_sucial_config.yaml ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 352800
3
+ dim_f: 1024
4
+ dim_t: 801
5
+ hop_length: 441
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.0
10
+ model:
11
+ dim: 384
12
+ depth: 8
13
+ stereo: true
14
+ num_stems: 2
15
+ time_transformer_depth: 1
16
+ freq_transformer_depth: 1
17
+ linear_transformer_depth: 0
18
+ freqs_per_bands: !!python/tuple
19
+ - 2
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 4
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 12
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 24
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 48
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 128
80
+ - 129
81
+ dim_head: 64
82
+ heads: 8
83
+ attn_dropout: 0.0
84
+ ff_dropout: 0.0
85
+ flash_attn: true
86
+ dim_freqs_in: 1025
87
+ stft_n_fft: 2048
88
+ stft_hop_length: 441
89
+ stft_win_length: 2048
90
+ stft_normalized: false
91
+ mask_estimator_depth: 2
92
+ multi_stft_resolution_loss_weight: 1.0
93
+ multi_stft_resolutions_window_sizes: !!python/tuple
94
+ - 4096
95
+ - 2048
96
+ - 1024
97
+ - 512
98
+ - 256
99
+ multi_stft_hop_size: 147
100
+ multi_stft_normalized: false
101
+ training:
102
+ batch_size: 1
103
+ gradient_accumulation_steps: 1
104
+ grad_clip: 0
105
+ instruments:
106
+ - male
107
+ - female
108
+ lr: 1.0e-05
109
+ patience: 2
110
+ reduce_factor: 0.95
111
+ target_instrument: null
112
+ num_epochs: 1000
113
+ num_steps: 1000
114
+ q: 0.95
115
+ coarse_loss_clip: true
116
+ ema_momentum: 0.999
117
+ optimizer: adam
118
+ other_fix: true
119
+ use_amp: true
120
+ inference:
121
+ batch_size: 1
122
+ dim_t: 801
123
+ num_overlap: 2
bs_roformer/bs_male_female_267_sucial.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:123c00786bdbc6bd462dddb35cd21fd6ae99ab8319f93f63a8abc1012e593d94
3
+ size 527121477
bs_roformer/bs_male_female_267_sucial_config.yaml ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 352800
3
+ dim_f: 1024
4
+ dim_t: 801
5
+ hop_length: 441
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.0
10
+ model:
11
+ dim: 384
12
+ depth: 8
13
+ stereo: true
14
+ num_stems: 2
15
+ time_transformer_depth: 1
16
+ freq_transformer_depth: 1
17
+ linear_transformer_depth: 0
18
+ freqs_per_bands: !!python/tuple
19
+ - 2
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 4
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 12
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 24
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 48
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 128
80
+ - 129
81
+ dim_head: 64
82
+ heads: 8
83
+ attn_dropout: 0.0
84
+ ff_dropout: 0.0
85
+ flash_attn: true
86
+ dim_freqs_in: 1025
87
+ stft_n_fft: 2048
88
+ stft_hop_length: 441
89
+ stft_win_length: 2048
90
+ stft_normalized: false
91
+ mask_estimator_depth: 2
92
+ multi_stft_resolution_loss_weight: 1.0
93
+ multi_stft_resolutions_window_sizes: !!python/tuple
94
+ - 4096
95
+ - 2048
96
+ - 1024
97
+ - 512
98
+ - 256
99
+ multi_stft_hop_size: 147
100
+ multi_stft_normalized: false
101
+ training:
102
+ batch_size: 1
103
+ gradient_accumulation_steps: 1
104
+ grad_clip: 0
105
+ instruments:
106
+ - male
107
+ - female
108
+ lr: 1.0e-05
109
+ patience: 2
110
+ reduce_factor: 0.95
111
+ target_instrument: null
112
+ num_epochs: 1000
113
+ num_steps: 1000
114
+ q: 0.95
115
+ coarse_loss_clip: true
116
+ ema_momentum: 0.999
117
+ optimizer: adam
118
+ other_fix: true
119
+ use_amp: true
120
+ inference:
121
+ batch_size: 1
122
+ dim_t: 801
123
+ num_overlap: 2
bs_roformer/bs_male_female_aufr33_config.yaml ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 352800
3
+ dim_f: 1024
4
+ dim_t: 801
5
+ hop_length: 441
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.0
10
+ model:
11
+ dim: 384
12
+ depth: 8
13
+ stereo: true
14
+ num_stems: 2
15
+ time_transformer_depth: 1
16
+ freq_transformer_depth: 1
17
+ linear_transformer_depth: 0
18
+ freqs_per_bands: !!python/tuple
19
+ - 2
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 4
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 12
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 24
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 48
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 128
80
+ - 129
81
+ dim_head: 64
82
+ heads: 8
83
+ attn_dropout: 0.0
84
+ ff_dropout: 0.0
85
+ flash_attn: true
86
+ dim_freqs_in: 1025
87
+ stft_n_fft: 2048
88
+ stft_hop_length: 441
89
+ stft_win_length: 2048
90
+ stft_normalized: false
91
+ mask_estimator_depth: 2
92
+ multi_stft_resolution_loss_weight: 1.0
93
+ multi_stft_resolutions_window_sizes: !!python/tuple
94
+ - 4096
95
+ - 2048
96
+ - 1024
97
+ - 512
98
+ - 256
99
+ multi_stft_hop_size: 147
100
+ multi_stft_normalized: false
101
+ training:
102
+ batch_size: 1
103
+ gradient_accumulation_steps: 1
104
+ grad_clip: 0
105
+ instruments:
106
+ - male
107
+ - female
108
+ lr: 1.0e-05
109
+ patience: 2
110
+ reduce_factor: 0.95
111
+ target_instrument: null
112
+ num_epochs: 1000
113
+ num_steps: 1000
114
+ q: 0.95
115
+ coarse_loss_clip: true
116
+ ema_momentum: 0.999
117
+ optimizer: adam
118
+ other_fix: true
119
+ use_amp: true
120
+ inference:
121
+ batch_size: 1
122
+ dim_t: 801
123
+ num_overlap: 2
bs_roformer/bs_other_viperx.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2e825a03bc908cb04dbd88eddeefbf5147dd1cf1f95cebf453d9dbfabec494b
3
+ size 393068365
bs_roformer/bs_other_viperx_config.yaml ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 131584
3
+ dim_f: 1024
4
+ dim_t: 256
5
+ hop_length: 512
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.001
10
+ model:
11
+ dim: 384
12
+ depth: 12
13
+ stereo: true
14
+ num_stems: 1
15
+ time_transformer_depth: 1
16
+ freq_transformer_depth: 1
17
+ linear_transformer_depth: 0
18
+ freqs_per_bands: !!python/tuple
19
+ - 2
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 4
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 12
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 24
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 48
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 128
80
+ - 129
81
+ dim_head: 64
82
+ heads: 8
83
+ attn_dropout: 0.1
84
+ ff_dropout: 0.1
85
+ flash_attn: true
86
+ dim_freqs_in: 1025
87
+ stft_n_fft: 2048
88
+ stft_hop_length: 512
89
+ stft_win_length: 2048
90
+ stft_normalized: false
91
+ mask_estimator_depth: 2
92
+ multi_stft_resolution_loss_weight: 1.0
93
+ multi_stft_resolutions_window_sizes: !!python/tuple
94
+ - 4096
95
+ - 2048
96
+ - 1024
97
+ - 512
98
+ - 256
99
+ multi_stft_hop_size: 147
100
+ multi_stft_normalized: false
101
+ training:
102
+ batch_size: 4
103
+ gradient_accumulation_steps: 1
104
+ grad_clip: 0
105
+ instruments:
106
+ - vocals
107
+ - other
108
+ lr: 5.0e-05
109
+ patience: 2
110
+ reduce_factor: 0.95
111
+ target_instrument: other
112
+ num_epochs: 1000
113
+ num_steps: 1000
114
+ q: 0.95
115
+ coarse_loss_clip: true
116
+ ema_momentum: 0.999
117
+ optimizer: adam
118
+ other_fix: false
119
+ use_amp: true
120
+ augmentations:
121
+ enable: true
122
+ loudness: true
123
+ loudness_min: 0.5
124
+ loudness_max: 1.5
125
+ mixup: true
126
+ mixup_probs: !!python/tuple
127
+ - 0.2
128
+ - 0.02
129
+ mixup_loudness_min: 0.5
130
+ mixup_loudness_max: 1.5
131
+ inference:
132
+ batch_size: 1
133
+ dim_t: 512
134
+ num_overlap: 2
bs_roformer/bs_resurrection_inst_unwa.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16311025a5133ae6411760ccfe9e3e66b31a01d9d8bec0a03fa7ec4bedac7a15
3
+ size 204483033
bs_roformer/bs_resurrection_inst_unwa_config.yaml ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 749259
3
+ dim_f: 1024
4
+ dim_t: 1700
5
+ hop_length: 441
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.0
10
+ model:
11
+ dim: 256
12
+ depth: 12
13
+ stereo: true
14
+ num_stems: 1
15
+ time_transformer_depth: 1
16
+ freq_transformer_depth: 1
17
+ linear_transformer_depth: 0
18
+ freqs_per_bands: !!python/tuple
19
+ - 2
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 4
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 12
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 24
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 48
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 128
80
+ - 129
81
+ dim_head: 64
82
+ heads: 8
83
+ attn_dropout: 0.0
84
+ ff_dropout: 0.0
85
+ flash_attn: true
86
+ dim_freqs_in: 1025
87
+ stft_n_fft: 2048
88
+ stft_hop_length: 441
89
+ stft_win_length: 2048
90
+ stft_normalized: false
91
+ mask_estimator_depth: 2
92
+ multi_stft_resolution_loss_weight: 1.0
93
+ multi_stft_resolutions_window_sizes: !!python/tuple
94
+ - 4096
95
+ - 2048
96
+ - 1024
97
+ - 512
98
+ - 256
99
+ multi_stft_hop_size: 147
100
+ multi_stft_normalized: false
101
+ mlp_expansion_factor: 4
102
+ use_torch_checkpoint: false
103
+ skip_connection: false
104
+ training:
105
+ batch_size: 2
106
+ gradient_accumulation_steps: 1
107
+ grad_clip: 0
108
+ instruments:
109
+ - vocals
110
+ - other
111
+ patience: 3
112
+ reduce_factor: 0.95
113
+ target_instrument: other
114
+ num_epochs: 1000
115
+ num_steps: 1000
116
+ augmentation: false
117
+ augmentation_type: simple1
118
+ use_mp3_compress: false
119
+ augmentation_mix: true
120
+ augmentation_loudness: true
121
+ augmentation_loudness_type: 1
122
+ augmentation_loudness_min: 0.5
123
+ augmentation_loudness_max: 1.5
124
+ q: 0.95
125
+ coarse_loss_clip: true
126
+ ema_momentum: 0.999
127
+ optimizer: adam
128
+ lr: 1.0e-05
129
+ other_fix: false
130
+ use_amp: true
131
+ inference:
132
+ batch_size: 1
133
+ dim_t: 1700
134
+ num_overlap: 2
135
+ normalize: false
bs_roformer/bs_resurrection_unwa.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9dbfe5cb572e4ed32a15ec727d7bd06c8d7aba97509e6fda5bc008bb1e0b2dd5
3
+ size 204510749
bs_roformer/bs_resurrection_unwa_config.yaml ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 785920
3
+ dim_f: 1024
4
+ dim_t: 1536
5
+ hop_length: 441
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.0
10
+ model:
11
+ dim: 256
12
+ depth: 12
13
+ stereo: true
14
+ num_stems: 1
15
+ time_transformer_depth: 1
16
+ freq_transformer_depth: 1
17
+ linear_transformer_depth: 0
18
+ freqs_per_bands: !!python/tuple
19
+ - 2
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 4
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 12
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 24
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 48
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 128
80
+ - 129
81
+ dim_head: 64
82
+ heads: 8
83
+ attn_dropout: 0.0
84
+ ff_dropout: 0.0
85
+ flash_attn: true
86
+ dim_freqs_in: 1025
87
+ stft_n_fft: 2048
88
+ stft_hop_length: 512
89
+ stft_win_length: 2048
90
+ stft_normalized: false
91
+ mask_estimator_depth: 2
92
+ multi_stft_resolution_loss_weight: 1.0
93
+ multi_stft_resolutions_window_sizes: !!python/tuple
94
+ - 4096
95
+ - 2048
96
+ - 1024
97
+ - 512
98
+ - 256
99
+ multi_stft_hop_size: 147
100
+ multi_stft_normalized: false
101
+ mlp_expansion_factor: 4
102
+ use_torch_checkpoint: false
103
+ skip_connection: false
104
+ training:
105
+ batch_size: 2
106
+ gradient_accumulation_steps: 1
107
+ grad_clip: 0
108
+ instruments:
109
+ - vocals
110
+ - other
111
+ patience: 3
112
+ reduce_factor: 0.95
113
+ target_instrument: vocals
114
+ num_epochs: 1000
115
+ num_steps: 1000
116
+ augmentation: false
117
+ augmentation_type: simple1
118
+ use_mp3_compress: false
119
+ augmentation_mix: true
120
+ augmentation_loudness: true
121
+ augmentation_loudness_type: 1
122
+ augmentation_loudness_min: 0.5
123
+ augmentation_loudness_max: 1.5
124
+ q: 0.95
125
+ coarse_loss_clip: true
126
+ ema_momentum: 0.999
127
+ optimizer: adam
128
+ lr: 1.0e-05
129
+ other_fix: false
130
+ use_amp: true
131
+ inference:
132
+ batch_size: 1
133
+ dim_t: 1536
134
+ num_overlap: 2
135
+ normalize: false
bs_roformer/bs_revive1_unwa_config.yaml ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 485100
3
+ dim_f: 1024
4
+ dim_t: 1101
5
+ hop_length: 441
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.0
10
+ model:
11
+ dim: 512
12
+ depth: 12
13
+ stereo: true
14
+ num_stems: 1
15
+ time_transformer_depth: 1
16
+ freq_transformer_depth: 1
17
+ linear_transformer_depth: 0
18
+ freqs_per_bands: !!python/tuple
19
+ - 2
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 4
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 12
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 24
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 48
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 128
80
+ - 129
81
+ dim_head: 64
82
+ heads: 8
83
+ attn_dropout: 0.0
84
+ ff_dropout: 0.0
85
+ flash_attn: true
86
+ dim_freqs_in: 1025
87
+ stft_n_fft: 2048
88
+ stft_hop_length: 441
89
+ stft_win_length: 2048
90
+ stft_normalized: false
91
+ mask_estimator_depth: 2
92
+ multi_stft_resolution_loss_weight: 1.0
93
+ multi_stft_resolutions_window_sizes: !!python/tuple
94
+ - 4096
95
+ - 2048
96
+ - 1024
97
+ - 512
98
+ - 256
99
+ multi_stft_hop_size: 147
100
+ multi_stft_normalized: false
101
+ training:
102
+ batch_size: 1
103
+ gradient_accumulation_steps: 1
104
+ grad_clip: 0
105
+ instruments:
106
+ - vocals
107
+ - other
108
+ lr: 1.0e-05
109
+ patience: 2
110
+ reduce_factor: 0.95
111
+ target_instrument: vocals
112
+ num_epochs: 1000
113
+ num_steps: 1000
114
+ augmentation: false
115
+ augmentation_type: null
116
+ use_mp3_compress: false
117
+ augmentation_mix: false
118
+ augmentation_loudness: false
119
+ augmentation_loudness_type: 1
120
+ augmentation_loudness_min: 0
121
+ augmentation_loudness_max: 0
122
+ q: 0.95
123
+ coarse_loss_clip: false
124
+ ema_momentum: 0.999
125
+ optimizer: adam
126
+ other_fix: true
127
+ use_amp: true
128
+ inference:
129
+ batch_size: 1
130
+ dim_t: 1101
131
+ num_overlap: 2
bs_roformer/bs_revive2_unwa.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:58098850c882a7472dad39f99fb8040ce6eaafe671cfe9881d89aea276bbb5f5
3
+ size 639326600
bs_roformer/bs_revive2_unwa_config.yaml ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 485100
3
+ dim_f: 1024
4
+ dim_t: 1101
5
+ hop_length: 441
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.0
10
+ model:
11
+ dim: 512
12
+ depth: 12
13
+ stereo: true
14
+ num_stems: 1
15
+ time_transformer_depth: 1
16
+ freq_transformer_depth: 1
17
+ linear_transformer_depth: 0
18
+ freqs_per_bands: !!python/tuple
19
+ - 2
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 4
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 12
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 24
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 48
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 128
80
+ - 129
81
+ dim_head: 64
82
+ heads: 8
83
+ attn_dropout: 0.0
84
+ ff_dropout: 0.0
85
+ flash_attn: true
86
+ dim_freqs_in: 1025
87
+ stft_n_fft: 2048
88
+ stft_hop_length: 441
89
+ stft_win_length: 2048
90
+ stft_normalized: false
91
+ mask_estimator_depth: 2
92
+ multi_stft_resolution_loss_weight: 1.0
93
+ multi_stft_resolutions_window_sizes: !!python/tuple
94
+ - 4096
95
+ - 2048
96
+ - 1024
97
+ - 512
98
+ - 256
99
+ multi_stft_hop_size: 147
100
+ multi_stft_normalized: false
101
+ training:
102
+ batch_size: 1
103
+ gradient_accumulation_steps: 1
104
+ grad_clip: 0
105
+ instruments:
106
+ - vocals
107
+ - other
108
+ lr: 1.0e-05
109
+ patience: 2
110
+ reduce_factor: 0.95
111
+ target_instrument: vocals
112
+ num_epochs: 1000
113
+ num_steps: 1000
114
+ augmentation: false
115
+ augmentation_type: null
116
+ use_mp3_compress: false
117
+ augmentation_mix: false
118
+ augmentation_loudness: false
119
+ augmentation_loudness_type: 1
120
+ augmentation_loudness_min: 0
121
+ augmentation_loudness_max: 0
122
+ q: 0.95
123
+ coarse_loss_clip: false
124
+ ema_momentum: 0.999
125
+ optimizer: adam
126
+ other_fix: true
127
+ use_amp: true
128
+ inference:
129
+ batch_size: 1
130
+ dim_t: 1101
131
+ num_overlap: 2
bs_roformer/bs_revive3e_unwa.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b0751b9a15c591407c3b77f08eb4ad3005e42e96051f3f2b39760f1130c467b
3
+ size 639326600
bs_roformer/bs_revive3e_unwa_config.yaml ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 485100
3
+ dim_f: 1024
4
+ dim_t: 1101
5
+ hop_length: 441
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.0
10
+ model:
11
+ dim: 512
12
+ depth: 12
13
+ stereo: true
14
+ num_stems: 1
15
+ time_transformer_depth: 1
16
+ freq_transformer_depth: 1
17
+ linear_transformer_depth: 0
18
+ freqs_per_bands: !!python/tuple
19
+ - 2
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 4
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 12
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 24
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 48
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 128
80
+ - 129
81
+ dim_head: 64
82
+ heads: 8
83
+ attn_dropout: 0.0
84
+ ff_dropout: 0.0
85
+ flash_attn: true
86
+ dim_freqs_in: 1025
87
+ stft_n_fft: 2048
88
+ stft_hop_length: 441
89
+ stft_win_length: 2048
90
+ stft_normalized: false
91
+ mask_estimator_depth: 2
92
+ multi_stft_resolution_loss_weight: 1.0
93
+ multi_stft_resolutions_window_sizes: !!python/tuple
94
+ - 4096
95
+ - 2048
96
+ - 1024
97
+ - 512
98
+ - 256
99
+ multi_stft_hop_size: 147
100
+ multi_stft_normalized: false
101
+ training:
102
+ batch_size: 1
103
+ gradient_accumulation_steps: 1
104
+ grad_clip: 0
105
+ instruments:
106
+ - vocals
107
+ - other
108
+ lr: 1.0e-05
109
+ patience: 2
110
+ reduce_factor: 0.95
111
+ target_instrument: vocals
112
+ num_epochs: 1000
113
+ num_steps: 1000
114
+ augmentation: false
115
+ augmentation_type: null
116
+ use_mp3_compress: false
117
+ augmentation_mix: false
118
+ augmentation_loudness: false
119
+ augmentation_loudness_type: 1
120
+ augmentation_loudness_min: 0
121
+ augmentation_loudness_max: 0
122
+ q: 0.95
123
+ coarse_loss_clip: false
124
+ ema_momentum: 0.999
125
+ optimizer: adam
126
+ other_fix: true
127
+ use_amp: true
128
+ inference:
129
+ batch_size: 1
130
+ dim_t: 1101
131
+ num_overlap: 2
bs_roformer/bs_voc_hyperace2_unwa_config.yaml ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hyperace2: true
2
+ audio:
3
+ chunk_size: 960000
4
+ dim_f: 1024
5
+ dim_t: 801
6
+ hop_length: 441
7
+ n_fft: 2048
8
+ num_channels: 2
9
+ sample_rate: 44100
10
+ min_mean_abs: 0.0001
11
+ model:
12
+ dim: 256
13
+ depth: 12
14
+ stereo: true
15
+ num_stems: 1
16
+ time_transformer_depth: 1
17
+ freq_transformer_depth: 1
18
+ linear_transformer_depth: 0
19
+ freqs_per_bands: !!python/tuple
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 2
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 4
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 12
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 24
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 48
80
+ - 128
81
+ - 129
82
+ dim_head: 64
83
+ heads: 8
84
+ attn_dropout: 0.0
85
+ ff_dropout: 0.0
86
+ flash_attn: true
87
+ dim_freqs_in: 1025
88
+ stft_n_fft: 2048
89
+ stft_hop_length: 512
90
+ stft_win_length: 2048
91
+ stft_normalized: false
92
+ mask_estimator_depth: 2
93
+ multi_stft_resolution_loss_weight: 1.0
94
+ multi_stft_resolutions_window_sizes: !!python/tuple
95
+ - 4096
96
+ - 2048
97
+ - 1024
98
+ - 512
99
+ - 256
100
+ multi_stft_hop_size: 147
101
+ multi_stft_normalized: false
102
+ mlp_expansion_factor: 4
103
+ use_torch_checkpoint: true
104
+ skip_connection: false
105
+ training:
106
+ batch_size: 1
107
+ gradient_accumulation_steps: 1
108
+ grad_clip: 0
109
+ instruments:
110
+ - vocals
111
+ - instrument
112
+ lr: 1.0e-05
113
+ patience: 5
114
+ reduce_factor: 0.9
115
+ target_instrument: vocals
116
+ num_epochs: 1000
117
+ num_steps: 1000
118
+ q: 0.95
119
+ coarse_loss_clip: true
120
+ ema_momentum: 0.999
121
+ optimizer: adam
122
+ other_fix: false
123
+ use_amp: true
124
+ inference:
125
+ batch_size: 1
126
+ dim_t: 1876
127
+ num_overlap: 2
bs_roformer/bs_vocals_1296_viperx.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f6c94864adfb73bbb0ca58ec14d58dd0b364549e9fb61433ae51916f3e2f8d0b
3
+ size 639317465
bs_roformer/bs_vocals_1296_viperx_config.yaml ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 352800
3
+ dim_f: 1024
4
+ dim_t: 801
5
+ hop_length: 441
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.001
10
+ model:
11
+ dim: 512
12
+ depth: 12
13
+ stereo: true
14
+ num_stems: 1
15
+ time_transformer_depth: 1
16
+ freq_transformer_depth: 1
17
+ freqs_per_bands: !!python/tuple
18
+ - 2
19
+ - 2
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 4
43
+ - 4
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 12
55
+ - 12
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 24
63
+ - 24
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 48
71
+ - 48
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 128
79
+ - 129
80
+ dim_head: 64
81
+ heads: 8
82
+ attn_dropout: 0.1
83
+ ff_dropout: 0.1
84
+ flash_attn: true
85
+ dim_freqs_in: 1025
86
+ stft_n_fft: 2048
87
+ stft_hop_length: 441
88
+ stft_win_length: 2048
89
+ stft_normalized: false
90
+ mask_estimator_depth: 2
91
+ multi_stft_resolution_loss_weight: 1.0
92
+ multi_stft_resolutions_window_sizes: !!python/tuple
93
+ - 4096
94
+ - 2048
95
+ - 1024
96
+ - 512
97
+ - 256
98
+ multi_stft_hop_size: 147
99
+ multi_stft_normalized: false
100
+ training:
101
+ batch_size: 16
102
+ gradient_accumulation_steps: 1
103
+ grad_clip: 0
104
+ instruments:
105
+ - Vocals
106
+ - Instrumental
107
+ lr: 5.0e-05
108
+ patience: 2
109
+ reduce_factor: 0.95
110
+ target_instrument: Vocals
111
+ num_epochs: 1000
112
+ num_steps: 1000
113
+ augmentation: false
114
+ augmentation_type: simple1
115
+ use_mp3_compress: false
116
+ augmentation_mix: true
117
+ augmentation_loudness: true
118
+ augmentation_loudness_type: 1
119
+ augmentation_loudness_min: 0.5
120
+ augmentation_loudness_max: 1.5
121
+ q: 0.95
122
+ coarse_loss_clip: true
123
+ ema_momentum: 0.999
124
+ optimizer: adam
125
+ other_fix: false
126
+ use_amp: true
127
+ inference:
128
+ batch_size: 1
129
+ dim_t: 801
130
+ num_overlap: 2
bs_roformer/bs_vocals_anvuew.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d0f72ad0ac4154f5f4c3cdf230880bbb9a5bd01dc610241a2abfe787f0d7784
3
+ size 204485563
bs_roformer/bs_vocals_anvuew_config.yaml ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 960000
3
+ dim_f: 1024
4
+ dim_t: 801
5
+ hop_length: 441
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.0001
10
+ model:
11
+ dim: 256
12
+ depth: 12
13
+ stereo: true
14
+ num_stems: 1
15
+ time_transformer_depth: 1
16
+ freq_transformer_depth: 1
17
+ linear_transformer_depth: 0
18
+ freqs_per_bands: !!python/tuple
19
+ - 2
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 4
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 12
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 24
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 48
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 128
80
+ - 129
81
+ dim_head: 64
82
+ heads: 8
83
+ attn_dropout: 0.0
84
+ ff_dropout: 0.0
85
+ flash_attn: true
86
+ dim_freqs_in: 1025
87
+ stft_n_fft: 2048
88
+ stft_hop_length: 512
89
+ stft_win_length: 2048
90
+ stft_normalized: false
91
+ mask_estimator_depth: 2
92
+ multi_stft_resolution_loss_weight: 1.0
93
+ multi_stft_resolutions_window_sizes: !!python/tuple
94
+ - 4096
95
+ - 2048
96
+ - 1024
97
+ - 512
98
+ - 256
99
+ multi_stft_hop_size: 147
100
+ multi_stft_normalized: false
101
+ mlp_expansion_factor: 4
102
+ use_torch_checkpoint: true
103
+ skip_connection: false
104
+ training:
105
+ batch_size: 1
106
+ gradient_accumulation_steps: 1
107
+ grad_clip: 0
108
+ instruments:
109
+ - vocals
110
+ - instrument
111
+ lr: 1.0e-05
112
+ patience: 5
113
+ reduce_factor: 0.9
114
+ target_instrument: vocals
115
+ num_epochs: 1000
116
+ num_steps: 1000
117
+ q: 0.95
118
+ coarse_loss_clip: true
119
+ ema_momentum: 0.999
120
+ optimizer: adam
121
+ other_fix: false
122
+ use_amp: true
123
+ inference:
124
+ batch_size: 1
125
+ dim_t: 1876
126
+ num_overlap: 2
bs_roformer/bs_voctest_gabox.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18d58efe5e949e70fab11b875329af6d06ef11ccc29574bfe943fb57cc827f38
3
+ size 639254584
bs_roformer/bs_voctest_gabox_config.yaml ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 352800
3
+ dim_f: 1024
4
+ dim_t: 801
5
+ hop_length: 441
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.001
10
+ model:
11
+ dim: 512
12
+ depth: 12
13
+ stereo: true
14
+ num_stems: 1
15
+ time_transformer_depth: 1
16
+ freq_transformer_depth: 1
17
+ freqs_per_bands: !!python/tuple
18
+ - 2
19
+ - 2
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 4
43
+ - 4
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 12
55
+ - 12
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 24
63
+ - 24
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 48
71
+ - 48
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 128
79
+ - 129
80
+ dim_head: 64
81
+ heads: 8
82
+ attn_dropout: 0.1
83
+ ff_dropout: 0.1
84
+ flash_attn: true
85
+ dim_freqs_in: 1025
86
+ stft_n_fft: 2048
87
+ stft_hop_length: 441
88
+ stft_win_length: 2048
89
+ stft_normalized: false
90
+ mask_estimator_depth: 2
91
+ multi_stft_resolution_loss_weight: 1.0
92
+ multi_stft_resolutions_window_sizes: !!python/tuple
93
+ - 4096
94
+ - 2048
95
+ - 1024
96
+ - 512
97
+ - 256
98
+ multi_stft_hop_size: 147
99
+ multi_stft_normalized: false
100
+ training:
101
+ batch_size: 16
102
+ gradient_accumulation_steps: 1
103
+ grad_clip: 0
104
+ instruments:
105
+ - Vocals
106
+ - Instrumental
107
+ lr: 5.0e-05
108
+ patience: 2
109
+ reduce_factor: 0.95
110
+ target_instrument: Vocals
111
+ num_epochs: 1000
112
+ num_steps: 1000
113
+ augmentation: false
114
+ augmentation_type: simple1
115
+ use_mp3_compress: false
116
+ augmentation_mix: true
117
+ augmentation_loudness: true
118
+ augmentation_loudness_type: 1
119
+ augmentation_loudness_min: 0.5
120
+ augmentation_loudness_max: 1.5
121
+ q: 0.95
122
+ coarse_loss_clip: true
123
+ ema_momentum: 0.999
124
+ optimizer: adam
125
+ other_fix: false
126
+ use_amp: true
127
+ inference:
128
+ batch_size: 1
129
+ dim_t: 801
130
+ num_overlap: 2