lainlives committed on
Commit
53afdfa
·
verified ·
1 Parent(s): 801f7dd

Add files using upload-large-folder tool

Browse files
BS_Inst_EXP_VRL.yaml ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 485100 #352800 #485100
3
+ dim_f: 1024
4
+ dim_t: 801
5
+ hop_length: 441
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.000
10
+
11
+ model:
12
+ dim: 384
13
+ depth: 12
14
+ stereo: true
15
+ num_stems: 1
16
+ time_transformer_depth: 1
17
+ freq_transformer_depth: 1
18
+ freqs_per_bands: !!python/tuple
19
+ - 2
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 4
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 12
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 24
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 48
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 128
80
+ - 129
81
+ dim_head: 64
82
+ heads: 8
83
+ attn_dropout: 0
84
+ ff_dropout: 0
85
+ flash_attn: true
86
+ dim_freqs_in: 1025
87
+ stft_n_fft: 2048
88
+ stft_hop_length: 441
89
+ stft_win_length: 2048
90
+ stft_normalized: false
91
+ mask_estimator_depth: 2
92
+ multi_stft_resolution_loss_weight: 1.0
93
+ multi_stft_resolutions_window_sizes: !!python/tuple
94
+ - 4096
95
+ - 2048
96
+ - 1024
97
+ - 512
98
+ - 256
99
+ multi_stft_hop_size: 147
100
+ multi_stft_normalized: False
101
+ training:
102
+ batch_size: 1
103
+ gradient_accumulation_steps: 1
104
+ grad_clip: 0
105
+ instruments:
106
+ - Vocals
107
+ - Instrumental
108
+ lr: 1.0e-04
109
+ patience: 2
110
+ reduce_factor: 0.95
111
+ target_instrument: Instrumental
112
+ num_epochs: 1
113
+ num_steps: 1000
114
+ q: 0.95
115
+ coarse_loss_clip: true
116
+ ema_momentum: 0.999
117
+ optimizer: adamw
118
+ other_fix: true # needed for checking on the multisong dataset when "other" is actually the instrumental
119
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
120
+
121
+ inference:
122
+ batch_size: 1
123
+ dim_t: 1101
124
+ num_overlap: 2
aufr33-jarredou_DrumSep_model_mdx23c_ep_141_sdr_10.8059.yaml ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 130560
3
+ dim_f: 1024
4
+ dim_t: 256
5
+ hop_length: 512
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.001
10
+
11
+ model:
12
+ act: gelu
13
+ bottleneck_factor: 4
14
+ growth: 128
15
+ norm: InstanceNorm
16
+ num_blocks_per_scale: 2
17
+ num_channels: 128
18
+ num_scales: 5
19
+ num_subbands: 4
20
+ scale:
21
+ - 2
22
+ - 2
23
+
24
+ training:
25
+ batch_size: 12
26
+ gradient_accumulation_steps: 1
27
+ grad_clip: 0
28
+ instruments:
29
+ - Kick
30
+ - Snare
31
+ - Toms
32
+ - Hh
33
+ - Ride
34
+ - Crash
35
+ lr: 9.0e-05
36
+ patience: 30
37
+ reduce_factor: 0.95
38
+ target_instrument: null
39
+ num_epochs: 1000
40
+ num_steps: 1268
41
+ q: 0.95
42
+ coarse_loss_clip: true
43
+ ema_momentum: 0.999
44
+ optimizer: adam
45
+ other_fix: false # needed for checking on the multisong dataset when "other" is actually the instrumental
46
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
47
+
48
+ augmentations:
49
+ enable: true # enable or disable all augmentations (to quickly disable them if needed)
50
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
51
+ loudness_min: 0.5
52
+ loudness_max: 1.5
53
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
54
+ mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
55
+ - 0.2
56
+ - 0.02
57
+ mixup_loudness_min: 0.5
58
+ mixup_loudness_max: 1.5
59
+
60
+ # apply mp3 compression to mixture only (emulate downloading mp3 from internet)
61
+ mp3_compression_on_mixture: 0.0
62
+ mp3_compression_on_mixture_bitrate_min: 32
63
+ mp3_compression_on_mixture_bitrate_max: 320
64
+ mp3_compression_on_mixture_backend: "lameenc"
65
+
66
+ all:
67
+ channel_shuffle: 0.5 # Set 0 or lower to disable
68
+ random_inverse: 0.01 # inverse track (a lower probability is better)
69
+ random_polarity: 0.5 # polarity change (multiply waveform by -1)
70
+ mp3_compression: 0.0
71
+ mp3_compression_min_bitrate: 32
72
+ mp3_compression_max_bitrate: 320
73
+ mp3_compression_backend: "lameenc"
74
+ pitch_shift: 0.1
75
+ pitch_shift_min_semitones: -3
76
+ pitch_shift_max_semitones: 3
77
+ seven_band_parametric_eq: 0.5
78
+ seven_band_parametric_eq_min_gain_db: -6
79
+ seven_band_parametric_eq_max_gain_db: 6
80
+ tanh_distortion: 0.2
81
+ tanh_distortion_min: 0.1
82
+ tanh_distortion_max: 0.5
83
+
84
+ inference:
85
+ batch_size: 1
86
+ dim_t: 256
87
+ num_overlap: 4
config_dnr_bandit_bsrnn_multi_mus64.yaml ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: "MultiMaskMultiSourceBandSplitRNN"
2
+ audio:
3
+ chunk_size: 264600
4
+ num_channels: 2
5
+ sample_rate: 44100
6
+ min_mean_abs: 0.001
7
+
8
+ model:
9
+ in_channel: 1
10
+ stems: ['speech', 'music', 'effects']
11
+ band_specs: "musical"
12
+ n_bands: 64
13
+ fs: 44100
14
+ require_no_overlap: false
15
+ require_no_gap: true
16
+ normalize_channel_independently: false
17
+ treat_channel_as_feature: true
18
+ n_sqm_modules: 8
19
+ emb_dim: 128
20
+ rnn_dim: 256
21
+ bidirectional: true
22
+ rnn_type: "GRU"
23
+ mlp_dim: 512
24
+ hidden_activation: "Tanh"
25
+ hidden_activation_kwargs: null
26
+ complex_mask: true
27
+ n_fft: 2048
28
+ win_length: 2048
29
+ hop_length: 512
30
+ window_fn: "hann_window"
31
+ wkwargs: null
32
+ power: null
33
+ center: true
34
+ normalized: true
35
+ pad_mode: "constant"
36
+ onesided: true
37
+
38
+ training:
39
+ batch_size: 4
40
+ gradient_accumulation_steps: 4
41
+ grad_clip: 0
42
+ instruments:
43
+ - Speech
44
+ - Music
45
+ - Effects
46
+ lr: 9.0e-05
47
+ patience: 2
48
+ reduce_factor: 0.95
49
+ target_instrument: null
50
+ num_epochs: 1000
51
+ num_steps: 1000
52
+ q: 0.95
53
+ coarse_loss_clip: true
54
+ ema_momentum: 0.999
55
+ optimizer: adam
56
+ other_fix: true # needed for checking on the multisong dataset when "other" is actually the instrumental
57
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
58
+
59
+ augmentations:
60
+ enable: true # enable or disable all augmentations (to quickly disable them if needed)
61
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
62
+ loudness_min: 0.5
63
+ loudness_max: 1.5
64
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
65
+ mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
66
+ - 0.2
67
+ - 0.02
68
+ mixup_loudness_min: 0.5
69
+ mixup_loudness_max: 1.5
70
+ all:
71
+ channel_shuffle: 0.5 # Set 0 or lower to disable
72
+ random_inverse: 0.1 # inverse track (a lower probability is better)
73
+ random_polarity: 0.5 # polarity change (multiply waveform by -1)
74
+
75
+ inference:
76
+ batch_size: 1
77
+ dim_t: 256
78
+ num_overlap: 4
config_dnr_bandit_v2_mus64.yaml ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ cls: Bandit
2
+
3
+ audio:
4
+ chunk_size: 384000
5
+ num_channels: 2
6
+ sample_rate: 48000
7
+ min_mean_abs: 0.000
8
+
9
+ kwargs:
10
+ in_channels: 1
11
+ stems: ['speech', 'music', 'sfx']
12
+ band_type: musical
13
+ n_bands: 64
14
+ normalize_channel_independently: false
15
+ treat_channel_as_feature: true
16
+ n_sqm_modules: 8
17
+ emb_dim: 128
18
+ rnn_dim: 256
19
+ bidirectional: true
20
+ rnn_type: "GRU"
21
+ mlp_dim: 512
22
+ hidden_activation: "Tanh"
23
+ hidden_activation_kwargs:
24
+ complex_mask: true
25
+ use_freq_weights: true
26
+ n_fft: 2048
27
+ win_length: 2048
28
+ hop_length: 512
29
+ window_fn: "hann_window"
30
+ wkwargs:
31
+ power:
32
+ center: true
33
+ normalized: true
34
+ pad_mode: "reflect"
35
+ onesided: true
36
+
37
+ training:
38
+ batch_size: 4
39
+ gradient_accumulation_steps: 4
40
+ grad_clip: 0
41
+ instruments:
42
+ - Speech
43
+ - Music
44
+ - Sfx
45
+ lr: 9.0e-05
46
+ patience: 2
47
+ reduce_factor: 0.95
48
+ target_instrument:
49
+ num_epochs: 1000
50
+ num_steps: 1000
51
+ q: 0.95
52
+ coarse_loss_clip: true
53
+ ema_momentum: 0.999
54
+ optimizer: adam
55
+ other_fix: true # needed for checking on the multisong dataset when "other" is actually the instrumental
56
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
57
+
58
+ augmentations:
59
+ enable: true # enable or disable all augmentations (to quickly disable them if needed)
60
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
61
+ loudness_min: 0.5
62
+ loudness_max: 1.5
63
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
64
+ mixup_probs: !!python/tuple
65
+ # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
66
+ - 0.2
67
+ - 0.02
68
+ mixup_loudness_min: 0.5
69
+ mixup_loudness_max: 1.5
70
+ all:
71
+ channel_shuffle: 0.5 # Set 0 or lower to disable
72
+ random_inverse: 0.1 # inverse track (a lower probability is better)
73
+ random_polarity: 0.5 # polarity change (multiply waveform by -1)
74
+
75
+ inference:
76
+ batch_size: 8
77
+ dim_t: 256
78
+ num_overlap: 4
config_mel_band_roformer_karaoke.yaml ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 352800
3
+ dim_f: 1024
4
+ dim_t: 256
5
+ hop_length: 441
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 000
10
+
11
+ model:
12
+ dim: 384
13
+ depth: 6
14
+ stereo: true
15
+ num_stems: 1
16
+ time_transformer_depth: 1
17
+ freq_transformer_depth: 1
18
+ num_bands: 60
19
+ dim_head: 64
20
+ heads: 8
21
+ attn_dropout: 0
22
+ ff_dropout: 0
23
+ flash_attn: true
24
+ dim_freqs_in: 1025
25
+ sample_rate: 44100 # needed for mel filter bank from librosa
26
+ stft_n_fft: 2048
27
+ stft_hop_length: 441
28
+ stft_win_length: 2048
29
+ stft_normalized: false
30
+ mask_estimator_depth: 2
31
+ multi_stft_resolution_loss_weight: 1.0
32
+ multi_stft_resolutions_window_sizes: !!python/tuple
33
+ - 4096
34
+ - 2048
35
+ - 1024
36
+ - 512
37
+ - 256
38
+ multi_stft_hop_size: 147
39
+ multi_stft_normalized: false
40
+
41
+ training:
42
+ batch_size: 4
43
+ gradient_accumulation_steps: 1
44
+ grad_clip: 0
45
+ instruments:
46
+ - Vocals
47
+ - Instrumental
48
+ lr: 1.0e-05
49
+ patience: 2
50
+ reduce_factor: 0.95
51
+ target_instrument: Vocals
52
+ num_epochs: 1000
53
+ num_steps: 2000
54
+ augmentation: false # enable augmentations by audiomentations and pedalboard
55
+ augmentation_type:
56
+ use_mp3_compress: false # Deprecated
57
+ augmentation_mix: false # Mix several stems of the same type with some probability
58
+ augmentation_loudness: false # randomly change loudness of each stem
59
+ augmentation_loudness_type: 1 # Type 1 or 2
60
+ augmentation_loudness_min: 0
61
+ augmentation_loudness_max: 0
62
+ q: 0.95
63
+ coarse_loss_clip: false
64
+ ema_momentum: 0.999
65
+ optimizer: adam
66
+ other_fix: false # needed for checking on the multisong dataset when "other" is actually the instrumental
67
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
68
+ inference:
69
+ batch_size: 1
70
+ dim_t: 1101
71
+ num_overlap: 4
config_musdb18_scnet.yaml ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 485100 # 44100 * 11
3
+ num_channels: 2
4
+ sample_rate: 44100
5
+ min_mean_abs: 0.000
6
+
7
+ model:
8
+ sources:
9
+ - drums
10
+ - bass
11
+ - other
12
+ - vocals
13
+ audio_channels: 2
14
+ dims:
15
+ - 4
16
+ - 32
17
+ - 64
18
+ - 128
19
+ nfft: 4096
20
+ hop_size: 1024
21
+ win_size: 4096
22
+ normalized: True
23
+ band_SR:
24
+ - 0.175
25
+ - 0.392
26
+ - 0.433
27
+ band_stride:
28
+ - 1
29
+ - 4
30
+ - 16
31
+ band_kernel:
32
+ - 3
33
+ - 4
34
+ - 16
35
+ conv_depths:
36
+ - 3
37
+ - 2
38
+ - 1
39
+ compress: 4
40
+ conv_kernel: 3
41
+ num_dplayer: 6
42
+ expand: 1
43
+
44
+ training:
45
+ batch_size: 10
46
+ gradient_accumulation_steps: 1
47
+ grad_clip: 0
48
+ instruments:
49
+ - Drums
50
+ - Bass
51
+ - Other
52
+ - Vocals
53
+ lr: 5.0e-04
54
+ patience: 2
55
+ reduce_factor: 0.95
56
+ target_instrument: null
57
+ num_epochs: 1000
58
+ num_steps: 1000
59
+ q: 0.95
60
+ coarse_loss_clip: true
61
+ ema_momentum: 0.999
62
+ optimizer: adam
63
+ other_fix: false # needed for checking on the multisong dataset when "other" is actually the instrumental
64
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
65
+
66
+ augmentations:
67
+ enable: true # enable or disable all augmentations (to quickly disable them if needed)
68
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
69
+ loudness_min: 0.5
70
+ loudness_max: 1.5
71
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
72
+ mixup_probs:
73
+ !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
74
+ - 0.2
75
+ - 0.02
76
+ mixup_loudness_min: 0.5
77
+ mixup_loudness_max: 1.5
78
+
79
+ inference:
80
+ batch_size: 8
81
+ dim_t: 256
82
+ num_overlap: 4
83
+ normalize: true
config_musdb18_scnet_large.yaml ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 485100 # 44100 * 11
3
+ num_channels: 2
4
+ sample_rate: 44100
5
+ min_mean_abs: 0.000
6
+
7
+ model:
8
+ sources:
9
+ - drums
10
+ - bass
11
+ - other
12
+ - vocals
13
+ audio_channels: 2
14
+ dims:
15
+ - 4
16
+ - 64
17
+ - 128
18
+ - 256
19
+ nfft: 4096
20
+ hop_size: 1024
21
+ win_size: 4096
22
+ normalized: True
23
+ band_SR:
24
+ - 0.225
25
+ - 0.372
26
+ - 0.403
27
+ band_stride:
28
+ - 1
29
+ - 4
30
+ - 16
31
+ band_kernel:
32
+ - 3
33
+ - 4
34
+ - 16
35
+ conv_depths:
36
+ - 3
37
+ - 2
38
+ - 1
39
+ compress: 4
40
+ conv_kernel: 3
41
+ num_dplayer: 6
42
+ expand: 1
43
+
44
+ training:
45
+ batch_size: 6
46
+ gradient_accumulation_steps: 1
47
+ grad_clip: 0
48
+ instruments:
49
+ - Drums
50
+ - Bass
51
+ - Other
52
+ - Vocals
53
+ # lr: 1.0e-04
54
+ lr: 1.0
55
+ patience: 2
56
+ reduce_factor: 0.95
57
+ target_instrument: null
58
+ num_epochs: 1000
59
+ num_steps: 1000
60
+ q: 0.95
61
+ coarse_loss_clip: true
62
+ ema_momentum: 0.999
63
+ optimizer: prodigy
64
+ other_fix: false # needed for checking on the multisong dataset when "other" is actually the instrumental
65
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
66
+
67
+ augmentations:
68
+ enable: true # enable or disable all augmentations (to quickly disable them if needed)
69
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
70
+ loudness_min: 0.5
71
+ loudness_max: 1.5
72
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
73
+ mixup_probs:
74
+ !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
75
+ - 0.2
76
+ - 0.02
77
+ mixup_loudness_min: 0.5
78
+ mixup_loudness_max: 1.5
79
+ all:
80
+ channel_shuffle: 0.5 # Set 0 or lower to disable
81
+ random_inverse: 0.1 # inverse track (a lower probability is better)
82
+ random_polarity: 0.5 # polarity change (multiply waveform by -1)
83
+
84
+ inference:
85
+ batch_size: 8
86
+ dim_t: 256
87
+ num_overlap: 4
88
+ normalize: false
config_musdb18_scnet_large_starrytong.yaml ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 485100 # 44100 * 11
3
+ num_channels: 2
4
+ sample_rate: 44100
5
+ min_mean_abs: 0.000
6
+
7
+ model:
8
+ sources:
9
+ - drums
10
+ - bass
11
+ - other
12
+ - vocals
13
+ audio_channels: 2
14
+ dims:
15
+ - 4
16
+ - 64
17
+ - 128
18
+ - 256
19
+ nfft: 4096
20
+ hop_size: 1024
21
+ win_size: 4096
22
+ normalized: True
23
+ band_SR:
24
+ - 0.225
25
+ - 0.372
26
+ - 0.403
27
+ band_stride:
28
+ - 1
29
+ - 4
30
+ - 16
31
+ band_kernel:
32
+ - 3
33
+ - 4
34
+ - 16
35
+ conv_depths:
36
+ - 3
37
+ - 2
38
+ - 1
39
+ compress: 4
40
+ conv_kernel: 3
41
+ num_dplayer: 6
42
+ expand: 1
43
+
44
+ training:
45
+ batch_size: 6
46
+ gradient_accumulation_steps: 1
47
+ grad_clip: 0
48
+ instruments:
49
+ - Drums
50
+ - Bass
51
+ - Other
52
+ - Vocals
53
+ # lr: 1.0e-04
54
+ lr: 1.0
55
+ patience: 2
56
+ reduce_factor: 0.95
57
+ target_instrument: null
58
+ num_epochs: 1000
59
+ num_steps: 1000
60
+ q: 0.95
61
+ coarse_loss_clip: true
62
+ ema_momentum: 0.999
63
+ optimizer: prodigy
64
+ other_fix: false # needed for checking on the multisong dataset when "other" is actually the instrumental
65
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
66
+
67
+ augmentations:
68
+ enable: true # enable or disable all augmentations (to quickly disable them if needed)
69
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
70
+ loudness_min: 0.5
71
+ loudness_max: 1.5
72
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
73
+ mixup_probs:
74
+ !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
75
+ - 0.2
76
+ - 0.02
77
+ mixup_loudness_min: 0.5
78
+ mixup_loudness_max: 1.5
79
+ all:
80
+ channel_shuffle: 0.5 # Set 0 or lower to disable
81
+ random_inverse: 0.1 # inverse track (a lower probability is better)
82
+ random_polarity: 0.5 # polarity change (multiply waveform by -1)
83
+
84
+ inference:
85
+ batch_size: 8
86
+ dim_t: 256
87
+ num_overlap: 4
88
+ normalize: true
config_musdb18_scnet_xl.yaml ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 485100 # 44100 * 11
3
+ num_channels: 2
4
+ sample_rate: 44100
5
+ min_mean_abs: 0.000
6
+
7
+ model:
8
+ sources:
9
+ - drums
10
+ - bass
11
+ - other
12
+ - vocals
13
+ audio_channels: 2
14
+ dims:
15
+ - 4
16
+ - 64
17
+ - 128
18
+ - 256
19
+ nfft: 4096
20
+ hop_size: 1024
21
+ win_size: 4096
22
+ normalized: True
23
+ band_SR:
24
+ - 0.230
25
+ - 0.370
26
+ - 0.400
27
+ band_stride:
28
+ - 1
29
+ - 4
30
+ - 16
31
+ band_kernel:
32
+ - 3
33
+ - 4
34
+ - 16
35
+ conv_depths:
36
+ - 3
37
+ - 2
38
+ - 1
39
+ compress: 4
40
+ conv_kernel: 3
41
+ num_dplayer: 8
42
+ expand: 1
43
+
44
+ training:
45
+ batch_size: 4
46
+ gradient_accumulation_steps: 1
47
+ grad_clip: 0
48
+ instruments:
49
+ - Drums
50
+ - Bass
51
+ - Other
52
+ - Vocals
53
+ patience: 2
54
+ reduce_factor: 0.95
55
+ target_instrument: null
56
+ num_epochs: 1000
57
+ num_steps: 1000
58
+ q: 0.95
59
+ coarse_loss_clip: true
60
+ ema_momentum: 0.999
61
+ # optimizer: prodigy
62
+ optimizer: adam
63
+ lr: 1.0e-05
64
+ # lr: 1.0
65
+ normalize: false # perform normalization on input of model (use the same for inference!)
66
+ other_fix: false # needed for checking on the multisong dataset when "other" is actually the instrumental
67
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
68
+
69
+
70
+ augmentations:
71
+ enable: false # enable or disable all augmentations (to quickly disable them if needed)
72
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
73
+ loudness_min: 0.5
74
+ loudness_max: 1.5
75
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
76
+ mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
77
+ - 0.2
78
+ - 0.02
79
+ mixup_loudness_min: 0.5
80
+ mixup_loudness_max: 1.5
81
+
82
+ # apply mp3 compression to mixture only (emulate downloading mp3 from internet)
83
+ mp3_compression_on_mixture: 0.01
84
+ mp3_compression_on_mixture_bitrate_min: 32
85
+ mp3_compression_on_mixture_bitrate_max: 320
86
+ mp3_compression_on_mixture_backend: "lameenc"
87
+
88
+ all:
89
+ channel_shuffle: 0.5 # Set 0 or lower to disable
90
+ random_inverse: 0.1 # inverse track (a lower probability is better)
91
+ random_polarity: 0.5 # polarity change (multiply waveform by -1)
92
+
93
+ mp3_compression: 0.01
94
+ mp3_compression_min_bitrate: 32
95
+ mp3_compression_max_bitrate: 320
96
+ mp3_compression_backend: "lameenc"
97
+
98
+ # pedalboard reverb block
99
+ pedalboard_reverb: 0.01
100
+ pedalboard_reverb_room_size_min: 0.1
101
+ pedalboard_reverb_room_size_max: 0.9
102
+ pedalboard_reverb_damping_min: 0.1
103
+ pedalboard_reverb_damping_max: 0.9
104
+ pedalboard_reverb_wet_level_min: 0.1
105
+ pedalboard_reverb_wet_level_max: 0.9
106
+ pedalboard_reverb_dry_level_min: 0.1
107
+ pedalboard_reverb_dry_level_max: 0.9
108
+ pedalboard_reverb_width_min: 0.9
109
+ pedalboard_reverb_width_max: 1.0
110
+
111
+ # pedalboard chorus block
112
+ pedalboard_chorus: 0.01
113
+ pedalboard_chorus_rate_hz_min: 1.0
114
+ pedalboard_chorus_rate_hz_max: 7.0
115
+ pedalboard_chorus_depth_min: 0.25
116
+ pedalboard_chorus_depth_max: 0.95
117
+ pedalboard_chorus_centre_delay_ms_min: 3
118
+ pedalboard_chorus_centre_delay_ms_max: 10
119
+ pedalboard_chorus_feedback_min: 0.0
120
+ pedalboard_chorus_feedback_max: 0.5
121
+ pedalboard_chorus_mix_min: 0.1
122
+ pedalboard_chorus_mix_max: 0.9
123
+
124
+ # pedalboard phazer block
125
+ pedalboard_phazer: 0.01
126
+ pedalboard_phazer_rate_hz_min: 1.0
127
+ pedalboard_phazer_rate_hz_max: 10.0
128
+ pedalboard_phazer_depth_min: 0.25
129
+ pedalboard_phazer_depth_max: 0.95
130
+ pedalboard_phazer_centre_frequency_hz_min: 200
131
+ pedalboard_phazer_centre_frequency_hz_max: 12000
132
+ pedalboard_phazer_feedback_min: 0.0
133
+ pedalboard_phazer_feedback_max: 0.5
134
+ pedalboard_phazer_mix_min: 0.1
135
+ pedalboard_phazer_mix_max: 0.9
136
+
137
+ # pedalboard distortion block
138
+ pedalboard_distortion: 0.01
139
+ pedalboard_distortion_drive_db_min: 1.0
140
+ pedalboard_distortion_drive_db_max: 25.0
141
+
142
+ # pedalboard pitch shift block
143
+ pedalboard_pitch_shift: 0.01
144
+ pedalboard_pitch_shift_semitones_min: -7
145
+ pedalboard_pitch_shift_semitones_max: 7
146
+
147
+ # pedalboard resample block
148
+ pedalboard_resample: 0.01
149
+ pedalboard_resample_target_sample_rate_min: 4000
150
+ pedalboard_resample_target_sample_rate_max: 44100
151
+
152
+ # pedalboard bitcrash block
153
+ pedalboard_bitcrash: 0.01
154
+ pedalboard_bitcrash_bit_depth_min: 4
155
+ pedalboard_bitcrash_bit_depth_max: 16
156
+
157
+ # pedalboard mp3 compressor block
158
+ pedalboard_mp3_compressor: 0.01
159
+ pedalboard_mp3_compressor_pedalboard_mp3_compressor_min: 0
160
+ pedalboard_mp3_compressor_pedalboard_mp3_compressor_max: 9.999
161
+
162
+ vocals:
163
+ pitch_shift: 0.1
164
+ pitch_shift_min_semitones: -5
165
+ pitch_shift_max_semitones: 5
166
+ seven_band_parametric_eq: 0.25
167
+ seven_band_parametric_eq_min_gain_db: -9
168
+ seven_band_parametric_eq_max_gain_db: 9
169
+ tanh_distortion: 0.1
170
+ tanh_distortion_min: 0.1
171
+ tanh_distortion_max: 0.7
172
+ bass:
173
+ pitch_shift: 0.1
174
+ pitch_shift_min_semitones: -2
175
+ pitch_shift_max_semitones: 2
176
+ seven_band_parametric_eq: 0.25
177
+ seven_band_parametric_eq_min_gain_db: -3
178
+ seven_band_parametric_eq_max_gain_db: 6
179
+ tanh_distortion: 0.2
180
+ tanh_distortion_min: 0.1
181
+ tanh_distortion_max: 0.5
182
+ drums:
183
+ pitch_shift: 0.33
184
+ pitch_shift_min_semitones: -5
185
+ pitch_shift_max_semitones: 5
186
+ seven_band_parametric_eq: 0.25
187
+ seven_band_parametric_eq_min_gain_db: -9
188
+ seven_band_parametric_eq_max_gain_db: 9
189
+ tanh_distortion: 0.33
190
+ tanh_distortion_min: 0.1
191
+ tanh_distortion_max: 0.6
192
+ other:
193
+ pitch_shift: 0.1
194
+ pitch_shift_min_semitones: -4
195
+ pitch_shift_max_semitones: 4
196
+ gaussian_noise: 0.1
197
+ gaussian_noise_min_amplitude: 0.001
198
+ gaussian_noise_max_amplitude: 0.015
199
+ time_stretch: 0.01
200
+ time_stretch_min_rate: 0.8
201
+ time_stretch_max_rate: 1.25
202
+
203
+ inference:
204
+ batch_size: 4
205
+ dim_t: 256
206
+ num_overlap: 4
207
+ normalize: false
deverb_bs_roformer_8_256dim_8depth.yaml ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 352768
3
+ dim_f: 1024
4
+ dim_t: 801
5
+ hop_length: 441
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.001
10
+
11
+ model:
12
+ dim: 256
13
+ depth: 8
14
+ stereo: true
15
+ num_stems: 1
16
+ time_transformer_depth: 1
17
+ freq_transformer_depth: 1
18
+ linear_transformer_depth: 0
19
+ freqs_per_bands: !!python/tuple
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 2
43
+ - 2
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 4
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 12
63
+ - 12
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 24
71
+ - 24
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 48
79
+ - 48
80
+ - 128
81
+ - 129
82
+ dim_head: 64
83
+ heads: 8
84
+ attn_dropout: 0.1
85
+ ff_dropout: 0.1
86
+ flash_attn: true
87
+ dim_freqs_in: 1025
88
+ stft_n_fft: 2048
89
+ stft_hop_length: 512
90
+ stft_win_length: 2048
91
+ stft_normalized: false
92
+ mask_estimator_depth: 2
93
+ multi_stft_resolution_loss_weight: 1.0
94
+ multi_stft_resolutions_window_sizes: !!python/tuple
95
+ - 4096
96
+ - 2048
97
+ - 1024
98
+ - 512
99
+ - 256
100
+ multi_stft_hop_size: 147
101
+ multi_stft_normalized: False
102
+
103
+ training:
104
+ batch_size: 1
105
+ gradient_accumulation_steps: 1
106
+ grad_clip: 0
107
+ instruments:
108
+ - No Reverb
109
+ - Reverb
110
+ lr: 5.0e-05
111
+ patience: 2
112
+ reduce_factor: 0.95
113
+ target_instrument: No Reverb
114
+ num_epochs: 1000
115
+ num_steps: 7600
116
+ q: 0.95
117
+ coarse_loss_clip: true
118
+ ema_momentum: 0.999
119
+ optimizer: adam
120
+ other_fix: false # needed for checking on the multisong dataset when "other" is actually the instrumental
121
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
122
+
123
+ augmentations:
124
+ enable: true # enable or disable all augmentations (to quickly disable them if needed)
125
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
126
+ loudness_min: 0.5
127
+ loudness_max: 1.5
128
+ mixup: false # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
129
+ mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
130
+ - 0.2
131
+ - 0.02
132
+ mixup_loudness_min: 0.5
133
+ mixup_loudness_max: 1.5
134
+
135
+ inference:
136
+ batch_size: 1
137
+ dim_t: 801
138
+ num_overlap: 4
mdx.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ models: ['0d19c1c6', '7ecf8ec1', 'c511e2ab', '7d865c68']
2
+ weights: [
3
+ [1., 1., 0., 0.],
4
+ [0., 1., 0., 0.],
5
+ [1., 0., 1., 1.],
6
+ [1., 0., 1., 1.],
7
+ ]
8
+ segment: 44
mdx_extra.yaml ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ models: ['e51eebcc', 'a1d90b5c', '5d2d6c55', 'cfa93e08']
2
+ segment: 44
mdx_extra_q.yaml ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ models: ['83fc094f', '464b36d7', '14fc6a69', '7fd6ef75']
2
+ segment: 44
mdx_q.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ models: ['6b9c2ca1', 'b72baf4e', '42e558d4', '305bc58f']
2
+ weights: [
3
+ [1., 1., 0., 0.],
4
+ [0., 1., 0., 0.],
5
+ [1., 0., 1., 1.],
6
+ [1., 0., 1., 1.],
7
+ ]
8
+ segment: 44