listra92 committed on
Commit
092b341
·
verified ·
1 Parent(s): c0aa228

Upload config_mel_band_roformer_Lead_Rhythm_Guitar.yaml

Browse files
misc/config_mel_band_roformer_Lead_Rhythm_Guitar.yaml ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
---
# Mel-band RoFormer config for Lead / Rhythm guitar separation
# (reconstructed from a scraped diff; indentation restored, values unchanged).

audio:
  chunk_size: 132300
  dim_f: 1024
  dim_t: 256
  hop_length: 441
  n_fft: 2048
  num_channels: 2
  sample_rate: 44100
  min_mean_abs: 0.000  # was garbled as "000" (parses as int 0 — numerically identical)

model:
  dim: 384
  depth: 4
  stereo: true
  num_stems: 1
  time_transformer_depth: 1
  freq_transformer_depth: 1
  num_bands: 60
  dim_head: 64
  heads: 8
  attn_dropout: 0
  ff_dropout: 0
  flash_attn: true
  dim_freqs_in: 1025
  sample_rate: 44100  # needed for mel filter bank from librosa
  stft_n_fft: 2048
  stft_hop_length: 441
  stft_win_length: 2048
  stft_normalized: false
  mask_estimator_depth: 2
  multi_stft_resolution_loss_weight: 2.0
  # NOTE(review): !!python/tuple is a PyYAML-specific tag — it requires
  # yaml.load with FullLoader/UnsafeLoader and will break yaml.safe_load.
  # Kept because the consuming model code expects a tuple here.
  multi_stft_resolutions_window_sizes: !!python/tuple
    - 4096
    - 2048
    - 1024
    - 512
    - 256
  multi_stft_hop_size: 147
  multi_stft_normalized: false
  mlp_expansion_factor: 2  # Probably too big (requires a lot of memory for weights)
  use_torch_checkpoint: false  # it allows to greatly reduce GPU memory consumption during training (not fully tested)
  skip_connection: false  # Enable skip connection between transformer blocks - can solve problem with gradients and probably faster training

loss_multistft:
  fft_sizes:
    - 1024
    - 2048
    - 4096
  hop_sizes:
    - 512
    - 1024
    - 2048
  win_lengths:
    - 1024
    - 2048
    - 4096
  window: "hann_window"
  scale: "mel"
  n_bins: 128
  sample_rate: 44100
  perceptual_weighting: true
  w_sc: 1.0
  w_log_mag: 1.0
  w_lin_mag: 0.0
  w_phs: 0.0
  mag_distance: "L1"

training:
  batch_size: 2
  gradient_accumulation_steps: 2
  grad_clip: 0
  instruments:
    - Lead
    - Rhythm
  lr: 1.0e-04
  patience: 5
  reduce_factor: 0.95
  target_instrument: Lead
  num_epochs: 1000
  num_steps: 1000
  q: 0.95
  coarse_loss_clip: true
  ema_momentum: 0.999
  optimizer: adamw
  other_fix: false  # it's needed for checking on multisong dataset if other is actually instrumental
  use_amp: true  # enable or disable usage of mixed precision (float16) - usually it must be true

augmentations:
  enable: true  # enable or disable all augmentations (to fast disable if needed)
  loudness: true  # randomly change loudness of each stem on the range (loudness_min; loudness_max)
  loudness_min: 0.5
  loudness_max: 1.5
  # assumes the three keys below belong under "difference" (augmentation group,
  # analogous to the conventional "all:" group) — indentation was lost in the
  # scrape; TODO confirm against the consuming dataset code
  difference:
    channel_shuffle: 0.5  # Set 0 or lower to disable
    random_inverse: 0.01  # inverse track (better lower probability)
    random_polarity: 0.5  # polarity change (multiply waveform to -1)

inference:
  batch_size: 12
  dim_t: 256
  num_overlap: 1

lora:
  r: 8
  lora_alpha: 16.0  # alpha / rank > 1 (was "16." — canonicalized trailing-dot float)
  lora_dropout: 0.05
  merge_weights: true
  fan_in_fan_out: false