am2460162 committed on
Commit
0308576
·
verified ·
1 Parent(s): 830c0d7

added choirsep models by concert.isolations.business@gmail.com

Browse files
demucs_choirsep/config_htdemucs_choirsep.yaml ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 132300 # samplerate * segment
3
+ min_mean_abs: 0.001
4
+ hop_length: 1024
5
+
6
+ training:
7
+ batch_size: 4
8
+ gradient_accumulation_steps: 1
9
+ grad_clip: 0
10
+ segment: 3
11
+ shift: 1
12
+ samplerate: 44100
13
+ channels: 2
14
+ normalize: true
15
+ instruments: ['alto', 'bass', 'soprano', 'tenor']
16
+ target_instrument: null
17
+ num_epochs: 1000
18
+ num_steps: 1000
19
+ optimizer: adam
20
+ lr: 1.0e-04
21
+ patience: 2
22
+ reduce_factor: 0.95
23
+ q: 0.95
24
+ coarse_loss_clip: true
25
+ ema_momentum: 0.999
26
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
27
+ use_amp: false # enable or disable usage of mixed precision (float16) - usually it must be true
28
+
29
+ loss_multistft:
30
+ fft_sizes:
31
+ - 1024
32
+ - 2048
33
+ - 4096
34
+ hop_sizes:
35
+ - 512
36
+ - 1024
37
+ - 2048
38
+ win_lengths:
39
+ - 1024
40
+ - 2048
41
+ - 4096
42
+ window: "hann_window"
43
+ scale: "mel"
44
+ n_bins: 128
45
+ sample_rate: 44100
46
+ perceptual_weighting: true
47
+ w_sc: 1.0
48
+ w_log_mag: 1.0
49
+ w_lin_mag: 0.0
50
+ w_phs: 0.0
51
+ mag_distance: "L1"
52
+
53
+ augmentations:
54
+ enable: false # master switch for all augmentations (set to false to disable them all quickly)
55
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
56
+ loudness_min: 0.5
57
+ loudness_max: 1.5
58
+ mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
59
+ mixup_probs: [0.2, 0.02]
60
+ mixup_loudness_min: 0.5
61
+ mixup_loudness_max: 1.5
62
+ all:
63
+ channel_shuffle: 0.5 # Set 0 or lower to disable
64
+ random_inverse: 0.1 # inverse track (better lower probability)
65
+ random_polarity: 0.5 # polarity change (multiply waveform to -1)
66
+
67
+ inference:
68
+ num_overlap: 4
69
+ batch_size: 8
70
+
71
+ model: htdemucs
72
+
73
+ htdemucs: # see demucs/htdemucs.py for a detailed description
74
+ # Channels
75
+ channels: 48
76
+ channels_time:
77
+ growth: 2
78
+ # STFT
79
+ num_subbands: 1
80
+ nfft: 4096
81
+ wiener_iters: 0
82
+ end_iters: 0
83
+ wiener_residual: false
84
+ cac: true
85
+ # Main structure
86
+ depth: 4
87
+ rewrite: true
88
+ # Frequency Branch
89
+ multi_freqs: []
90
+ multi_freqs_depth: 3
91
+ freq_emb: 0.2
92
+ emb_scale: 10
93
+ emb_smooth: true
94
+ # Convolutions
95
+ kernel_size: 8
96
+ stride: 4
97
+ time_stride: 2
98
+ context: 1
99
+ context_enc: 0
100
+ # normalization
101
+ norm_starts: 4
102
+ norm_groups: 4
103
+ # DConv residual branch
104
+ dconv_mode: 3
105
+ dconv_depth: 2
106
+ dconv_comp: 8
107
+ dconv_init: 1e-3
108
+ # Before the Transformer
109
+ bottom_channels: 0
110
+ # CrossTransformer
111
+ # ------ Common to all
112
+ # Regular parameters
113
+ t_layers: 5
114
+ t_hidden_scale: 4.0
115
+ t_heads: 8
116
+ t_dropout: 0.0
117
+ t_layer_scale: True
118
+ t_gelu: True
119
+ # ------------- Positional Embedding
120
+ t_emb: sin
121
+ t_max_positions: 10000 # for the scaled embedding
122
+ t_max_period: 10000.0
123
+ t_weight_pos_embed: 1.0
124
+ t_cape_mean_normalize: True
125
+ t_cape_augment: True
126
+ t_cape_glob_loc_scale: [5000.0, 1.0, 1.4]
127
+ t_sin_random_shift: 0
128
+ # ------------- norm before a transformer encoder
129
+ t_norm_in: True
130
+ t_norm_in_group: False
131
+ # ------------- norm inside the encoder
132
+ t_group_norm: False
133
+ t_norm_first: True
134
+ t_norm_out: True
135
+ # ------------- optim
136
+ t_weight_decay: 0.0
137
+ t_lr:
138
+ # ------------- sparsity
139
+ t_sparse_self_attn: False
140
+ t_sparse_cross_attn: False
141
+ t_mask_type: diag
142
+ t_mask_random_seed: 42
143
+ t_sparse_attn_window: 400
144
+ t_global_window: 100
145
+ t_sparsity: 0.95
146
+ t_auto_sparsity: False
147
+ # Cross Encoder First (False)
148
+ t_cross_first: False
149
+ # Weight init
150
+ rescale: 0.1
151
+
scnet_choirsep/config_scnet_choirsep.yaml ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ audio:
2
+ chunk_size: 131072 # 2**17 samples (~2.97 s at 44100 Hz)
3
+ num_channels: 2
4
+ sample_rate: 44100
5
+ min_mean_abs: 0.000
6
+
7
+ model:
8
+ sources:
9
+ - alto
10
+ - bass
11
+ - soprano
12
+ - tenor
13
+ audio_channels: 2
14
+ dims:
15
+ - 4
16
+ - 32
17
+ - 64
18
+ - 128
19
+ nfft: 4096
20
+ hop_size: 1024
21
+ win_size: 4096
22
+ normalized: True
23
+ band_SR:
24
+ - 0.175
25
+ - 0.392
26
+ - 0.433
27
+ band_stride:
28
+ - 1
29
+ - 4
30
+ - 16
31
+ band_kernel:
32
+ - 3
33
+ - 4
34
+ - 16
35
+ conv_depths:
36
+ - 3
37
+ - 2
38
+ - 1
39
+ compress: 4
40
+ conv_kernel: 3
41
+ num_dplayer: 6
42
+ expand: 1
43
+
44
+ training:
45
+ batch_size: 9
46
+ gradient_accumulation_steps: 1
47
+ grad_clip: 0
48
+ instruments:
49
+ - alto
50
+ - bass
51
+ - soprano
52
+ - tenor
53
+ lr: 5.0e-4
54
+ patience: 6
55
+ reduce_factor: 0.95
56
+ target_instrument: null
57
+ num_epochs: 1000
58
+ num_steps: 1000
59
+ q: 0.95
60
+ coarse_loss_clip: true
61
+ ema_momentum: 0.999
62
+ optimizer: adamw8bit
63
+ other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
64
+ use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
65
+
66
+ loss_multistft:
67
+ fft_sizes:
68
+ - 1024
69
+ - 2048
70
+ - 4096
71
+ hop_sizes:
72
+ - 512
73
+ - 1024
74
+ - 2048
75
+ win_lengths:
76
+ - 1024
77
+ - 2048
78
+ - 4096
79
+ window: "hann_window"
80
+ scale: "mel"
81
+ n_bins: 128
82
+ sample_rate: 44100
83
+ perceptual_weighting: true
84
+ w_sc: 1.0
85
+ w_log_mag: 1.0
86
+ w_lin_mag: 0.0
87
+ w_phs: 0.0
88
+ mag_distance: "L1"
89
+
90
+ augmentations:
91
+ enable: false # master switch for all augmentations (set to false to disable them all quickly)
92
+ loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
93
+ loudness_min: 0.5
94
+ loudness_max: 1.5
95
+ mixup: false # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
96
+ mixup_probs:
97
+ !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
98
+ - 0.2
99
+ - 0.02
100
+ mixup_loudness_min: 0.5
101
+ mixup_loudness_max: 1.5
102
+
103
+ inference:
104
+ batch_size: 16
105
+ dim_t: 256
106
+ num_overlap: 1
107
+ normalize: false