noblebarkrr committed on
Commit
bb45c79
·
verified ·
1 Parent(s): 52b8291

Upload folder using huggingface_hub

Browse files
bs_roformer/bs_inst_exp_vlp_unwa.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c035e2a102243405e45bf33faa175f62fd7118f63b62771fafdf81062b804131
3
+ size 393351501
bs_roformer/bs_inst_exp_vlp_unwa_config.yaml ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 485100
3
+ dim_f: 1024
4
+ dim_t: 801
5
+ hop_length: 441
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.0
10
+ model:
11
+ dim: 384
12
+ depth: 12
13
+ stereo: true
14
+ num_stems: 1
15
+ time_transformer_depth: 1
16
+ freq_transformer_depth: 1
17
+ freqs_per_bands: !!python/tuple
18
+ - 2
19
+ - 2
20
+ - 2
21
+ - 2
22
+ - 2
23
+ - 2
24
+ - 2
25
+ - 2
26
+ - 2
27
+ - 2
28
+ - 2
29
+ - 2
30
+ - 2
31
+ - 2
32
+ - 2
33
+ - 2
34
+ - 2
35
+ - 2
36
+ - 2
37
+ - 2
38
+ - 2
39
+ - 2
40
+ - 2
41
+ - 2
42
+ - 4
43
+ - 4
44
+ - 4
45
+ - 4
46
+ - 4
47
+ - 4
48
+ - 4
49
+ - 4
50
+ - 4
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 12
55
+ - 12
56
+ - 12
57
+ - 12
58
+ - 12
59
+ - 12
60
+ - 12
61
+ - 12
62
+ - 24
63
+ - 24
64
+ - 24
65
+ - 24
66
+ - 24
67
+ - 24
68
+ - 24
69
+ - 24
70
+ - 48
71
+ - 48
72
+ - 48
73
+ - 48
74
+ - 48
75
+ - 48
76
+ - 48
77
+ - 48
78
+ - 128
79
+ - 129
80
+ dim_head: 64
81
+ heads: 8
82
+ attn_dropout: 0
83
+ ff_dropout: 0
84
+ flash_attn: true
85
+ dim_freqs_in: 1025
86
+ stft_n_fft: 2048
87
+ stft_hop_length: 441
88
+ stft_win_length: 2048
89
+ stft_normalized: false
90
+ mask_estimator_depth: 2
91
+ multi_stft_resolution_loss_weight: 1.0
92
+ multi_stft_resolutions_window_sizes: !!python/tuple
93
+ - 4096
94
+ - 2048
95
+ - 1024
96
+ - 512
97
+ - 256
98
+ multi_stft_hop_size: 147
99
+ multi_stft_normalized: false
100
+ training:
101
+ batch_size: 1
102
+ gradient_accumulation_steps: 1
103
+ grad_clip: 0
104
+ instruments:
105
+ - vocals
106
+ - other
107
+ lr: 0.0001
108
+ patience: 2
109
+ reduce_factor: 0.95
110
+ target_instrument: other
111
+ num_epochs: 1
112
+ num_steps: 1000
113
+ q: 0.95
114
+ coarse_loss_clip: true
115
+ ema_momentum: 0.999
116
+ optimizer: adamw
117
+ other_fix: true
118
+ use_amp: true
119
+ inference:
120
+ batch_size: 1
121
+ dim_t: 1101
122
+ num_overlap: 2
mel_band_roformer/mbr_lead_rhythm_guitar_listra92_config.yaml CHANGED
@@ -1,108 +1,102 @@
1
- audio:
2
- chunk_size: 132300
3
- dim_f: 1024
4
- dim_t: 256
5
- hop_length: 441
6
- n_fft: 2048
7
- num_channels: 2
8
- sample_rate: 44100
9
- min_mean_abs: 000
10
-
11
- model:
12
- dim: 384
13
- depth: 4
14
- stereo: true
15
- num_stems: 1
16
- time_transformer_depth: 1
17
- freq_transformer_depth: 1
18
- num_bands: 60
19
- dim_head: 64
20
- heads: 8
21
- attn_dropout: 0
22
- ff_dropout: 0
23
- flash_attn: true
24
- dim_freqs_in: 1025
25
- sample_rate: 44100 # needed for mel filter bank from librosa
26
- stft_n_fft: 2048
27
- stft_hop_length: 441
28
- stft_win_length: 2048
29
- stft_normalized: false
30
- mask_estimator_depth: 2
31
- multi_stft_resolution_loss_weight: 2.0
32
- multi_stft_resolutions_window_sizes: !!python/tuple
33
- - 4096
34
- - 2048
35
- - 1024
36
- - 512
37
- - 256
38
- multi_stft_hop_size: 147
39
- multi_stft_normalized: false
40
- mlp_expansion_factor: 2 # Probably too big (requires a lot of memory for weights)
41
- use_torch_checkpoint: false # it allows to greatly reduce GPU memory consumption during training (not fully tested)
42
- skip_connection: false # Enable skip connection between transformer blocks - can solve problem with gradients and probably faster training
43
-
44
- loss_multistft:
45
- fft_sizes:
46
- - 1024
47
- - 2048
48
- - 4096
49
- hop_sizes:
50
- - 512
51
- - 1024
52
- - 2048
53
- win_lengths:
54
- - 1024
55
- - 2048
56
- - 4096
57
- window: "hann_window"
58
- scale: "mel"
59
- n_bins: 128
60
- sample_rate: 44100
61
- perceptual_weighting: true
62
- w_sc: 1.0
63
- w_log_mag: 1.0
64
- w_lin_mag: 0.0
65
- w_phs: 0.0
66
- mag_distance: "L1"
67
-
68
- training:
69
- batch_size: 2
70
- gradient_accumulation_steps: 2
71
- grad_clip: 0
72
- instruments:
73
- - Lead
74
- - Rhythm
75
- lr: 1.0e-04
76
- patience: 5
77
- reduce_factor: 0.95
78
- target_instrument: Lead
79
- num_epochs: 1000
80
- num_steps: 1000
81
- q: 0.95
82
- coarse_loss_clip: true
83
- ema_momentum: 0.999
84
- optimizer: adamw
85
- other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
86
- use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
87
-
88
- augmentations:
89
- enable: true # enable or disable all augmentations (to fast disable if needed)
90
- loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
91
- loudness_min: 0.5
92
- loudness_max: 1.5
93
- difference:
94
- channel_shuffle: 0.5 # Set 0 or lower to disable
95
- random_inverse: 0.01 # inverse track (better lower probability)
96
- random_polarity: 0.5 # polarity change (multiply waveform to -1)
97
-
98
- inference:
99
- batch_size: 12
100
- dim_t: 256
101
- num_overlap: 1
102
-
103
- lora:
104
- r: 8
105
- lora_alpha: 16. #alpha / rank > 1
106
- lora_dropout: 0.05
107
- merge_weights: true
108
- fan_in_fan_out: false
 
1
+ audio:
2
+ chunk_size: 132300
3
+ dim_f: 1024
4
+ dim_t: 256
5
+ hop_length: 441
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0
10
+ model:
11
+ dim: 384
12
+ depth: 4
13
+ stereo: true
14
+ num_stems: 1
15
+ time_transformer_depth: 1
16
+ freq_transformer_depth: 1
17
+ num_bands: 60
18
+ dim_head: 64
19
+ heads: 8
20
+ attn_dropout: 0
21
+ ff_dropout: 0
22
+ flash_attn: true
23
+ dim_freqs_in: 1025
24
+ sample_rate: 44100
25
+ stft_n_fft: 2048
26
+ stft_hop_length: 441
27
+ stft_win_length: 2048
28
+ stft_normalized: false
29
+ mask_estimator_depth: 2
30
+ multi_stft_resolution_loss_weight: 2.0
31
+ multi_stft_resolutions_window_sizes: !!python/tuple
32
+ - 4096
33
+ - 2048
34
+ - 1024
35
+ - 512
36
+ - 256
37
+ multi_stft_hop_size: 147
38
+ multi_stft_normalized: false
39
+ mlp_expansion_factor: 2
40
+ use_torch_checkpoint: false
41
+ skip_connection: false
42
+ loss_multistft:
43
+ fft_sizes:
44
+ - 1024
45
+ - 2048
46
+ - 4096
47
+ hop_sizes:
48
+ - 512
49
+ - 1024
50
+ - 2048
51
+ win_lengths:
52
+ - 1024
53
+ - 2048
54
+ - 4096
55
+ window: hann_window
56
+ scale: mel
57
+ n_bins: 128
58
+ sample_rate: 44100
59
+ perceptual_weighting: true
60
+ w_sc: 1.0
61
+ w_log_mag: 1.0
62
+ w_lin_mag: 0.0
63
+ w_phs: 0.0
64
+ mag_distance: L1
65
+ training:
66
+ batch_size: 2
67
+ gradient_accumulation_steps: 2
68
+ grad_clip: 0
69
+ instruments:
70
+ - Lead
71
+ - Rhythm
72
+ lr: 0.0001
73
+ patience: 5
74
+ reduce_factor: 0.95
75
+ target_instrument: Lead
76
+ num_epochs: 1000
77
+ num_steps: 1000
78
+ q: 0.95
79
+ coarse_loss_clip: true
80
+ ema_momentum: 0.999
81
+ optimizer: adamw
82
+ other_fix: false
83
+ use_amp: true
84
+ augmentations:
85
+ enable: true
86
+ loudness: true
87
+ loudness_min: 0.5
88
+ loudness_max: 1.5
89
+ difference:
90
+ channel_shuffle: 0.5
91
+ random_inverse: 0.01
92
+ random_polarity: 0.5
93
+ inference:
94
+ batch_size: 1
95
+ dim_t: 256
96
+ num_overlap: 2
97
+ lora:
98
+ r: 8
99
+ lora_alpha: 16.0
100
+ lora_dropout: 0.05
101
+ merge_weights: true
102
+ fan_in_fan_out: false
 
 
 
 
 
 
mel_band_roformer/mbr_vocalsfv7_gabox.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:77b9e4b54802d670c02daceb30c61ec825dc54ca6c29c34b03cdd5e9f78382b6
3
+ size 489571079
mel_band_roformer/mbr_vocalsfv7_gabox_config.yaml ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 352800
3
+ dim_f: 1024
4
+ dim_t: 256
5
+ hop_length: 441
6
+ n_fft: 2048
7
+ num_channels: 2
8
+ sample_rate: 44100
9
+ min_mean_abs: 0.0
10
+ model:
11
+ dim: 256
12
+ depth: 12
13
+ stereo: true
14
+ num_stems: 1
15
+ time_transformer_depth: 1
16
+ freq_transformer_depth: 1
17
+ num_bands: 60
18
+ dim_head: 64
19
+ heads: 8
20
+ attn_dropout: 0
21
+ ff_dropout: 0
22
+ flash_attn: true
23
+ dim_freqs_in: 1025
24
+ sample_rate: 44100
25
+ stft_n_fft: 2048
26
+ stft_hop_length: 441
27
+ stft_win_length: 2048
28
+ stft_normalized: false
29
+ mask_estimator_depth: 2
30
+ multi_stft_resolution_loss_weight: 1.0
31
+ multi_stft_resolutions_window_sizes: !!python/tuple
32
+ - 4096
33
+ - 2048
34
+ - 1024
35
+ - 512
36
+ - 256
37
+ multi_stft_hop_size: 147
38
+ multi_stft_normalized: false
39
+ use_torch_checkpoint: true
40
+ training:
41
+ batch_size: 1
42
+ gradient_accumulation_steps: 1
43
+ grad_clip: 0
44
+ instruments:
45
+ - Vocals
46
+ - Instrumental
47
+ lr: 1.0e-05
48
+ patience: 100000000
49
+ reduce_factor: 0.95
50
+ target_instrument: Vocals
51
+ num_epochs: 1000
52
+ num_steps: 1000
53
+ augmentation: false
54
+ augmentation_type: simple1
55
+ use_mp3_compress: false
56
+ augmentation_mix: false
57
+ augmentation_loudness: true
58
+ augmentation_loudness_type: 1
59
+ augmentation_loudness_min: 0
60
+ augmentation_loudness_max: 0
61
+ q: 0.95
62
+ coarse_loss_clip: false
63
+ ema_momentum: 0.999
64
+ optimizer: adamw
65
+ other_fix: false
66
+ use_amp: true
67
+ inference:
68
+ batch_size: 1
69
+ dim_t: 1101
70
+ num_overlap: 2