noblebarkrr committed on
Commit
3a44c0a
·
verified ·
1 Parent(s): a97accd

Upload folder using huggingface_hub

Browse files
bs_roformer/bs_karaoke_3stem_giantailab.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:364964794b0c82c843c53099d80b6ecfee852cf2bf42f098515276c779623bb6
3
+ size 1295074954
bs_roformer/bs_karaoke_3stem_giantailab_config.yaml ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ conditional: true
2
+ audio:
3
+ chunk_size: 352800
4
+ dim_f: 1024
5
+   dim_t: 801 # has no effect here (used in the model)
6
+   hop_length: 441 # has no effect here (used in the model)
7
+ n_fft: 2048
8
+ num_channels: 2
9
+ sample_rate: 44100
10
+ min_mean_abs: 0.000
11
+
12
+ model:
13
+ dim: 512
14
+ depth: 12
15
+ stereo: true
16
+ num_stems: 3
17
+ time_transformer_depth: 1
18
+ freq_transformer_depth: 1
19
+ linear_transformer_depth: 0
20
+ freqs_per_bands: !!python/tuple
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 2
44
+ - 2
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 4
56
+ - 4
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 12
64
+ - 12
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 24
72
+ - 24
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 48
80
+ - 48
81
+ - 128
82
+ - 129
83
+ dim_head: 64
84
+ heads: 8
85
+ attn_dropout: 0.1
86
+ ff_dropout: 0.1
87
+ flash_attn: true
88
+ dim_freqs_in: 1025
89
+ stft_n_fft: 2048
90
+ stft_hop_length: 441
91
+ stft_win_length: 2048
92
+ stft_normalized: false
93
+ mask_estimator_depth: 2
94
+ multi_stft_resolution_loss_weight: 1.0
95
+ multi_stft_resolutions_window_sizes: !!python/tuple
96
+ - 4096
97
+ - 2048
98
+ - 1024
99
+ - 512
100
+ - 256
101
+ multi_stft_hop_size: 147
102
+   multi_stft_normalized: false
103
+
104
+ training:
105
+ batch_size: 1
106
+ gradient_accumulation_steps: 1
107
+ grad_clip: 0
108
+ instruments:
109
+ - vocals
110
+ - backing_vocal
111
+ - instrumental
112
+ gan_model:
113
+ - music
114
+ - music
115
+ - none
116
+ mix_instruments:
117
+ - vocals
118
+ - backing_vocal
119
+ - instrumental
120
+ diffusion_model:
121
+ - dit
122
+ - none
123
+ - none
124
+ lr: 1.0e-05
125
+ patience: 2
126
+ reduce_factor: 0.95
127
+ target_instrument: null
128
+ num_epochs: 1000
129
+ num_steps: 1000
130
+ q: 0.95
131
+ coarse_loss_clip: true
132
+ ema_momentum: 0.999
133
+ optimizer: adam
134
+   other_fix: true # needed for checking on the multisong dataset if `other` is actually the instrumental
135
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
136
+
137
+
138
+ augmentations:
139
+   enable: true # enable or disable all augmentations (to quickly disable them if needed)
140
+   loudness: true # randomly change the loudness of each stem within the range (loudness_min; loudness_max)
141
+ loudness_min: 0.5
142
+ loudness_max: 1.5
143
+   mixup: false # mix several stems of the same type with some probability (only works for dataset types: 1, 2, 3)
144
+ mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
145
+ - 0.2
146
+ - 0.02
147
+ mixup_loudness_min: 0.5
148
+ mixup_loudness_max: 1.5
149
+
150
+ # apply mp3 compression to mixture only (emulate downloading mp3 from internet)
151
+ mp3_compression_on_mixture: 0.01
152
+ mp3_compression_on_mixture_bitrate_min: 32
153
+ mp3_compression_on_mixture_bitrate_max: 320
154
+ mp3_compression_on_mixture_backend: "lameenc"
155
+
156
+ all:
157
+     channel_shuffle: 0.5 # set to 0 or lower to disable
158
+ random_inverse: 0.1 # inverse track (better lower probability)
159
+     random_polarity: 0.5 # polarity change (multiply waveform by -1)
160
+
161
+ inference:
162
+ batch_size: 4
163
+ dim_t: 801
164
+ num_overlap: 2
165
+
166
+
167
+ loss_multistft:
168
+ fft_sizes:
169
+ - 1024
170
+ - 2048
171
+ - 4096
172
+ hop_sizes:
173
+ - 147
174
+ - 256
175
+ - 512
176
+ win_lengths:
177
+ - 1024
178
+ - 2048
179
+ - 4096
180
+ window: "hann_window"
181
+ scale: "mel"
182
+ n_bins: 128
183
+ sample_rate: 44100
184
+ perceptual_weighting: true
185
+ w_sc: 1.0
186
+ w_log_mag: 1.0
187
+ w_lin_mag: 0.0
188
+ w_phs: 0.0
189
+ mag_distance: "L1"