Add files using upload-large-folder tool
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +6 -0
- BS-Roformer-SW.yaml +198 -0
- UVR-MDX-NET_Crowd_HQ_1.onnx +3 -0
- UVR_Demucs_Model_1.yaml +2 -0
- UVR_MDXNET_1_9703.onnx +3 -0
- UVR_MDXNET_3_9662.onnx +3 -0
- assets/__pycache__/model_tools.cpython-310.pyc +0 -0
- assets/__pycache__/model_tools.cpython-313.pyc +0 -0
- assets/calculate-model-hashes.py +21 -0
- assets/delete_duplicate_models.py +8 -0
- assets/list_duplicate_remove.py +7 -0
- assets/model_data/mdx_model_data.json +384 -0
- assets/model_tools.py +380 -0
- config_aspiration_mel_band_roformer.yaml +76 -0
- config_bs_roformer_instrumental_resurrection_unwa.yaml +135 -0
- config_bs_roformer_karaoke_frazer_becruily.yaml +129 -0
- config_bs_roformer_vocals_gabox.yaml +133 -0
- config_bs_roformer_vocals_resurrection_unwa.yaml +135 -0
- config_bs_roformer_vocals_revive_unwa.yaml +134 -0
- config_dereverb-echo_mel_band_roformer.yaml +76 -0
- config_mdx23c_similarity.yaml +47 -0
- config_mel_band_roformer_instrumental_becruily.yaml +72 -0
- config_mel_band_roformer_instrumental_gabox.yaml +51 -0
- config_mel_band_roformer_karaoke_becruily.yaml +72 -0
- config_mel_band_roformer_kim_ft_unwa.yaml +72 -0
- config_mel_band_roformer_vocal_fullness_aname.yaml +54 -0
- config_mel_band_roformer_vocals_becruily.yaml +72 -0
- config_mel_band_roformer_vocals_gabox.yaml +51 -0
- config_melbandroformer_inst.yaml +51 -0
- config_melbandroformer_inst_v2.yaml +51 -0
- config_melbandroformer_instvoc_duality.yaml +51 -0
- denoise_mel_band_roformer_aufr33_aggr_sdr_27.9768_config.yaml +71 -0
- denoise_mel_band_roformer_aufr33_sdr_27.9959_config.yaml +71 -0
- deverb_bs_roformer_8_384dim_10depth_config.yaml +137 -0
- hdemucs_mmi.yaml +2 -0
- htdemucs.yaml +1 -0
- htdemucs_6s.yaml +1 -0
- htdemucs_ft.yaml +7 -0
- kuielab_b_other.onnx +3 -0
- kuielab_b_vocals.onnx +3 -0
- mdx_model_data.json +384 -0
- mel_band_roformer_crowd_aufr33_viperx_sdr_8.7144_config.yaml +71 -0
- mel_band_roformer_karaoke_aufr33_viperx_sdr_10.1956_config.yaml +71 -0
- model_bs_roformer_ep_317_sdr_12.9755.yaml +133 -0
- model_bs_roformer_ep_368_sdr_12.9628.yaml +133 -0
- model_bs_roformer_ep_937_sdr_10.5309.yaml +138 -0
- model_mel_band_roformer_ep_3005_sdr_11.4360.yaml +72 -0
- repro_mdx_a_time_only.yaml +2 -0
- scnet_checkpoint_musdb18.ckpt +3 -0
- vocals_mel_band_roformer.yaml +50 -0
.gitattributes
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
kuielab_b_other.onnx filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
kuielab_b_vocals.onnx filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
UVR_MDXNET_3_9662.onnx filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
UVR_MDXNET_1_9703.onnx filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
scnet_checkpoint_musdb18.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
UVR-MDX-NET_Crowd_HQ_1.onnx filter=lfs diff=lfs merge=lfs -text
|
BS-Roformer-SW.yaml
ADDED
|
@@ -0,0 +1,198 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
audio:
|
| 2 |
+
chunk_size: 588800 #882000
|
| 3 |
+
dim_f: 1024
|
| 4 |
+
dim_t: 801 # doesn't work (use the value in model)
|
| 5 |
+
hop_length: 441 # doesn't work (use the value in model)
|
| 6 |
+
n_fft: 2048
|
| 7 |
+
num_channels: 2
|
| 8 |
+
sample_rate: 44100
|
| 9 |
+
min_mean_abs: 0.000
|
| 10 |
+
|
| 11 |
+
model:
|
| 12 |
+
dim: 256
|
| 13 |
+
depth: 12
|
| 14 |
+
stereo: true
|
| 15 |
+
num_stems: 6
|
| 16 |
+
time_transformer_depth: 1
|
| 17 |
+
freq_transformer_depth: 1
|
| 18 |
+
linear_transformer_depth: 0
|
| 19 |
+
freqs_per_bands: !!python/tuple
|
| 20 |
+
- 2
|
| 21 |
+
- 2
|
| 22 |
+
- 2
|
| 23 |
+
- 2
|
| 24 |
+
- 2
|
| 25 |
+
- 2
|
| 26 |
+
- 2
|
| 27 |
+
- 2
|
| 28 |
+
- 2
|
| 29 |
+
- 2
|
| 30 |
+
- 2
|
| 31 |
+
- 2
|
| 32 |
+
- 2
|
| 33 |
+
- 2
|
| 34 |
+
- 2
|
| 35 |
+
- 2
|
| 36 |
+
- 2
|
| 37 |
+
- 2
|
| 38 |
+
- 2
|
| 39 |
+
- 2
|
| 40 |
+
- 2
|
| 41 |
+
- 2
|
| 42 |
+
- 2
|
| 43 |
+
- 2
|
| 44 |
+
- 4
|
| 45 |
+
- 4
|
| 46 |
+
- 4
|
| 47 |
+
- 4
|
| 48 |
+
- 4
|
| 49 |
+
- 4
|
| 50 |
+
- 4
|
| 51 |
+
- 4
|
| 52 |
+
- 4
|
| 53 |
+
- 4
|
| 54 |
+
- 4
|
| 55 |
+
- 4
|
| 56 |
+
- 12
|
| 57 |
+
- 12
|
| 58 |
+
- 12
|
| 59 |
+
- 12
|
| 60 |
+
- 12
|
| 61 |
+
- 12
|
| 62 |
+
- 12
|
| 63 |
+
- 12
|
| 64 |
+
- 24
|
| 65 |
+
- 24
|
| 66 |
+
- 24
|
| 67 |
+
- 24
|
| 68 |
+
- 24
|
| 69 |
+
- 24
|
| 70 |
+
- 24
|
| 71 |
+
- 24
|
| 72 |
+
- 48
|
| 73 |
+
- 48
|
| 74 |
+
- 48
|
| 75 |
+
- 48
|
| 76 |
+
- 48
|
| 77 |
+
- 48
|
| 78 |
+
- 48
|
| 79 |
+
- 48
|
| 80 |
+
- 128
|
| 81 |
+
- 129
|
| 82 |
+
dim_head: 64
|
| 83 |
+
heads: 8
|
| 84 |
+
attn_dropout: 0.1
|
| 85 |
+
ff_dropout: 0.1
|
| 86 |
+
flash_attn: true
|
| 87 |
+
dim_freqs_in: 1025
|
| 88 |
+
stft_n_fft: 2048
|
| 89 |
+
stft_hop_length: 512
|
| 90 |
+
stft_win_length: 2048
|
| 91 |
+
stft_normalized: false
|
| 92 |
+
mask_estimator_depth: 2
|
| 93 |
+
multi_stft_resolution_loss_weight: 1.0
|
| 94 |
+
multi_stft_resolutions_window_sizes: !!python/tuple
|
| 95 |
+
- 4096
|
| 96 |
+
- 2048
|
| 97 |
+
- 1024
|
| 98 |
+
- 512
|
| 99 |
+
- 256
|
| 100 |
+
multi_stft_hop_size: 147
|
| 101 |
+
multi_stft_normalized: False
|
| 102 |
+
mlp_expansion_factor: 4
|
| 103 |
+
use_torch_checkpoint: False # greatly reduces GPU memory consumption during training (not fully tested)
|
| 104 |
+
skip_connection: False # Enable skip connections between transformer blocks - can solve gradient problems and probably speeds up training
|
| 105 |
+
|
| 106 |
+
training:
|
| 107 |
+
batch_size: 2
|
| 108 |
+
gradient_accumulation_steps: 1
|
| 109 |
+
grad_clip: 0
|
| 110 |
+
instruments: ['bass', 'drums', 'other', 'vocals', 'guitar', 'piano']
|
| 111 |
+
patience: 3
|
| 112 |
+
reduce_factor: 0.95
|
| 113 |
+
target_instrument: null
|
| 114 |
+
num_epochs: 1000
|
| 115 |
+
num_steps: 1000
|
| 116 |
+
augmentation: false # enable augmentations by audiomentations and pedalboard
|
| 117 |
+
augmentation_type: simple1
|
| 118 |
+
use_mp3_compress: false # Deprecated
|
| 119 |
+
augmentation_mix: true # Mix several stems of the same type with some probability
|
| 120 |
+
augmentation_loudness: true # randomly change loudness of each stem
|
| 121 |
+
augmentation_loudness_type: 1 # Type 1 or 2
|
| 122 |
+
augmentation_loudness_min: 0.5
|
| 123 |
+
augmentation_loudness_max: 1.5
|
| 124 |
+
q: 0.95
|
| 125 |
+
coarse_loss_clip: true
|
| 126 |
+
ema_momentum: 0.999
|
| 127 |
+
# optimizer: prodigy
|
| 128 |
+
optimizer: adam
|
| 129 |
+
# lr: 1.0
|
| 130 |
+
lr: 1.0e-5
|
| 131 |
+
other_fix: false # needed when checking on the multisong dataset whether other is actually instrumental
|
| 132 |
+
use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
|
| 133 |
+
|
| 134 |
+
augmentations:
|
| 135 |
+
enable: true # enable or disable all augmentations (for quickly disabling them if needed)
|
| 136 |
+
loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
|
| 137 |
+
loudness_min: 0.5
|
| 138 |
+
loudness_max: 1.5
|
| 139 |
+
mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
|
| 140 |
+
mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
|
| 141 |
+
- 0.2
|
| 142 |
+
- 0.02
|
| 143 |
+
mixup_loudness_min: 0.5
|
| 144 |
+
mixup_loudness_max: 1.5
|
| 145 |
+
|
| 146 |
+
all:
|
| 147 |
+
channel_shuffle: 0.5 # Set 0 or lower to disable
|
| 148 |
+
random_inverse: 0.1 # inverse track (better lower probability)
|
| 149 |
+
random_polarity: 0.5 # polarity change (multiply waveform by -1)
|
| 150 |
+
|
| 151 |
+
vocals:
|
| 152 |
+
pitch_shift: 0.1
|
| 153 |
+
pitch_shift_min_semitones: -5
|
| 154 |
+
pitch_shift_max_semitones: 5
|
| 155 |
+
seven_band_parametric_eq: 0.1
|
| 156 |
+
seven_band_parametric_eq_min_gain_db: -9
|
| 157 |
+
seven_band_parametric_eq_max_gain_db: 9
|
| 158 |
+
tanh_distortion: 0.1
|
| 159 |
+
tanh_distortion_min: 0.1
|
| 160 |
+
tanh_distortion_max: 0.7
|
| 161 |
+
bass:
|
| 162 |
+
pitch_shift: 0.1
|
| 163 |
+
pitch_shift_min_semitones: -2
|
| 164 |
+
pitch_shift_max_semitones: 2
|
| 165 |
+
seven_band_parametric_eq: 0.1
|
| 166 |
+
seven_band_parametric_eq_min_gain_db: -3
|
| 167 |
+
seven_band_parametric_eq_max_gain_db: 6
|
| 168 |
+
tanh_distortion: 0.1
|
| 169 |
+
tanh_distortion_min: 0.1
|
| 170 |
+
tanh_distortion_max: 0.5
|
| 171 |
+
drums:
|
| 172 |
+
pitch_shift: 0.1
|
| 173 |
+
pitch_shift_min_semitones: -5
|
| 174 |
+
pitch_shift_max_semitones: 5
|
| 175 |
+
seven_band_parametric_eq: 0.1
|
| 176 |
+
seven_band_parametric_eq_min_gain_db: -9
|
| 177 |
+
seven_band_parametric_eq_max_gain_db: 9
|
| 178 |
+
tanh_distortion: 0.1
|
| 179 |
+
tanh_distortion_min: 0.1
|
| 180 |
+
tanh_distortion_max: 0.6
|
| 181 |
+
other:
|
| 182 |
+
pitch_shift: 0.1
|
| 183 |
+
pitch_shift_min_semitones: -4
|
| 184 |
+
pitch_shift_max_semitones: 4
|
| 185 |
+
gaussian_noise: 0.1
|
| 186 |
+
gaussian_noise_min_amplitude: 0.001
|
| 187 |
+
gaussian_noise_max_amplitude: 0.015
|
| 188 |
+
time_stretch: 0.1
|
| 189 |
+
time_stretch_min_rate: 0.8
|
| 190 |
+
time_stretch_max_rate: 1.25
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
inference:
|
| 194 |
+
batch_size: 1
|
| 195 |
+
dim_t: 801 # Changed from 1101 to match training
|
| 196 |
+
num_overlap: 2
|
| 197 |
+
normalize: false
|
| 198 |
+
|
UVR-MDX-NET_Crowd_HQ_1.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:313b7bf869c411fdafe005cf0d5a635c405cb3d0df137178a64091952d75225c
|
| 3 |
+
size 59074342
|
UVR_Demucs_Model_1.yaml
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
models: ['ebf34a2db']
|
| 2 |
+
segment: 44
|
UVR_MDXNET_1_9703.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:229ad3bb96a037e89d8ed86732d6d3675856e6a07c3e3f02896eac01ec7ee4be
|
| 3 |
+
size 29704436
|
UVR_MDXNET_3_9662.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e02220e80d8253f4c2209f8924298b2b686bbdf2868b788ff5500fb9bd94aadc
|
| 3 |
+
size 29704436
|
assets/__pycache__/model_tools.cpython-310.pyc
ADDED
|
Binary file (10.2 kB). View file
|
|
|
assets/__pycache__/model_tools.cpython-313.pyc
ADDED
|
Binary file (13.9 kB). View file
|
|
|
assets/calculate-model-hashes.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import sys
|
| 5 |
+
import json
|
| 6 |
+
import hashlib
|
| 7 |
+
import requests
|
| 8 |
+
import model_tools as mt
|
| 9 |
+
|
| 10 |
+
# Every local path below is derived from MODEL_CACHE_PATH.
MODEL_CACHE_PATH = "/tmp/audio-separator-models"
VR_MODEL_DATA_LOCAL_PATH = f"{MODEL_CACHE_PATH}/vr_model_data.json"
MDX_MODEL_DATA_LOCAL_PATH = f"{MODEL_CACHE_PATH}/mdx_model_data.json"

# Remote locations of the VR and MDX model data JSON files.
MODEL_DATA_URL_PREFIX = "https://raw.githubusercontent.com/TRvlvr/application_data/main"
VR_MODEL_DATA_URL = f"{MODEL_DATA_URL_PREFIX}/vr_model_data/model_data_new.json"
MDX_MODEL_DATA_URL = f"{MODEL_DATA_URL_PREFIX}/mdx_model_data/model_data_new.json"

OUTPUT_PATH = f"{MODEL_CACHE_PATH}/model_hashes.json"


def main():
    """Run ``model_tools.iterate_and_hash`` on the model cache directory."""
    mt.iterate_and_hash(MODEL_CACHE_PATH)


if __name__ == "__main__":
    main()
|
assets/delete_duplicate_models.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import hashlib
|
| 3 |
+
from collections import defaultdict
|
| 4 |
+
import model_tools as mt
|
| 5 |
+
|
| 6 |
+
def main():
    """Print the current working directory, then remove duplicates via model_tools."""
    scan_dir = os.getcwd()
    print(f"Scanning directory: {scan_dir}")
    # Called without arguments, exactly as the script always has.
    mt.find_and_remove_duplicates()


if __name__ == "__main__":
    main()
|
assets/list_duplicate_remove.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from model_tools import remove_duplicate_lines
|
| 2 |
+
|
| 3 |
+
# File names handed to remove_duplicate_lines as its input and output.
input_filename = "file.txt"
output_filename = "processed_links.txt"


def main():
    """Call ``remove_duplicate_lines`` with the module-level file names."""
    remove_duplicate_lines(input_filename, output_filename)


if __name__ == "__main__":
    main()
|
assets/model_data/mdx_model_data.json
ADDED
|
@@ -0,0 +1,384 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"0ddfc0eb5792638ad5dc27850236c246": {
|
| 3 |
+
"compensate": 1.035,
|
| 4 |
+
"mdx_dim_f_set": 2048,
|
| 5 |
+
"mdx_dim_t_set": 8,
|
| 6 |
+
"mdx_n_fft_scale_set": 6144,
|
| 7 |
+
"primary_stem": "Vocals"
|
| 8 |
+
},
|
| 9 |
+
"26d308f91f3423a67dc69a6d12a8793d": {
|
| 10 |
+
"compensate": 1.035,
|
| 11 |
+
"mdx_dim_f_set": 2048,
|
| 12 |
+
"mdx_dim_t_set": 9,
|
| 13 |
+
"mdx_n_fft_scale_set": 8192,
|
| 14 |
+
"primary_stem": "Other"
|
| 15 |
+
},
|
| 16 |
+
"2cdd429caac38f0194b133884160f2c6": {
|
| 17 |
+
"compensate": 1.045,
|
| 18 |
+
"mdx_dim_f_set": 3072,
|
| 19 |
+
"mdx_dim_t_set": 8,
|
| 20 |
+
"mdx_n_fft_scale_set": 7680,
|
| 21 |
+
"primary_stem": "Instrumental"
|
| 22 |
+
},
|
| 23 |
+
"2f5501189a2f6db6349916fabe8c90de": {
|
| 24 |
+
"compensate": 1.035,
|
| 25 |
+
"mdx_dim_f_set": 2048,
|
| 26 |
+
"mdx_dim_t_set": 8,
|
| 27 |
+
"mdx_n_fft_scale_set": 6144,
|
| 28 |
+
"primary_stem": "Vocals",
|
| 29 |
+
"is_karaoke": true
|
| 30 |
+
},
|
| 31 |
+
"398580b6d5d973af3120df54cee6759d": {
|
| 32 |
+
"compensate": 1.75,
|
| 33 |
+
"mdx_dim_f_set": 3072,
|
| 34 |
+
"mdx_dim_t_set": 8,
|
| 35 |
+
"mdx_n_fft_scale_set": 7680,
|
| 36 |
+
"primary_stem": "Vocals"
|
| 37 |
+
},
|
| 38 |
+
"488b3e6f8bd3717d9d7c428476be2d75": {
|
| 39 |
+
"compensate": 1.035,
|
| 40 |
+
"mdx_dim_f_set": 3072,
|
| 41 |
+
"mdx_dim_t_set": 8,
|
| 42 |
+
"mdx_n_fft_scale_set": 7680,
|
| 43 |
+
"primary_stem": "Instrumental"
|
| 44 |
+
},
|
| 45 |
+
"4910e7827f335048bdac11fa967772f9": {
|
| 46 |
+
"compensate": 1.035,
|
| 47 |
+
"mdx_dim_f_set": 2048,
|
| 48 |
+
"mdx_dim_t_set": 7,
|
| 49 |
+
"mdx_n_fft_scale_set": 4096,
|
| 50 |
+
"primary_stem": "Drums"
|
| 51 |
+
},
|
| 52 |
+
"53c4baf4d12c3e6c3831bb8f5b532b93": {
|
| 53 |
+
"compensate": 1.043,
|
| 54 |
+
"mdx_dim_f_set": 3072,
|
| 55 |
+
"mdx_dim_t_set": 8,
|
| 56 |
+
"mdx_n_fft_scale_set": 7680,
|
| 57 |
+
"primary_stem": "Vocals"
|
| 58 |
+
},
|
| 59 |
+
"5d343409ef0df48c7d78cce9f0106781": {
|
| 60 |
+
"compensate": 1.075,
|
| 61 |
+
"mdx_dim_f_set": 3072,
|
| 62 |
+
"mdx_dim_t_set": 8,
|
| 63 |
+
"mdx_n_fft_scale_set": 7680,
|
| 64 |
+
"primary_stem": "Vocals"
|
| 65 |
+
},
|
| 66 |
+
"5f6483271e1efb9bfb59e4a3e6d4d098": {
|
| 67 |
+
"compensate": 1.035,
|
| 68 |
+
"mdx_dim_f_set": 2048,
|
| 69 |
+
"mdx_dim_t_set": 9,
|
| 70 |
+
"mdx_n_fft_scale_set": 6144,
|
| 71 |
+
"primary_stem": "Vocals"
|
| 72 |
+
},
|
| 73 |
+
"65ab5919372a128e4167f5e01a8fda85": {
|
| 74 |
+
"compensate": 1.035,
|
| 75 |
+
"mdx_dim_f_set": 2048,
|
| 76 |
+
"mdx_dim_t_set": 8,
|
| 77 |
+
"mdx_n_fft_scale_set": 8192,
|
| 78 |
+
"primary_stem": "Other"
|
| 79 |
+
},
|
| 80 |
+
"6703e39f36f18aa7855ee1047765621d": {
|
| 81 |
+
"compensate": 1.035,
|
| 82 |
+
"mdx_dim_f_set": 2048,
|
| 83 |
+
"mdx_dim_t_set": 9,
|
| 84 |
+
"mdx_n_fft_scale_set": 16384,
|
| 85 |
+
"primary_stem": "Bass"
|
| 86 |
+
},
|
| 87 |
+
"6b31de20e84392859a3d09d43f089515": {
|
| 88 |
+
"compensate": 1.035,
|
| 89 |
+
"mdx_dim_f_set": 2048,
|
| 90 |
+
"mdx_dim_t_set": 8,
|
| 91 |
+
"mdx_n_fft_scale_set": 6144,
|
| 92 |
+
"primary_stem": "Vocals"
|
| 93 |
+
},
|
| 94 |
+
"867595e9de46f6ab699008295df62798": {
|
| 95 |
+
"compensate": 1.03,
|
| 96 |
+
"mdx_dim_f_set": 3072,
|
| 97 |
+
"mdx_dim_t_set": 8,
|
| 98 |
+
"mdx_n_fft_scale_set": 7680,
|
| 99 |
+
"primary_stem": "Vocals"
|
| 100 |
+
},
|
| 101 |
+
"a3cd63058945e777505c01d2507daf37": {
|
| 102 |
+
"compensate": 1.03,
|
| 103 |
+
"mdx_dim_f_set": 2048,
|
| 104 |
+
"mdx_dim_t_set": 8,
|
| 105 |
+
"mdx_n_fft_scale_set": 6144,
|
| 106 |
+
"primary_stem": "Vocals"
|
| 107 |
+
},
|
| 108 |
+
"b33d9b3950b6cbf5fe90a32608924700": {
|
| 109 |
+
"compensate": 1.03,
|
| 110 |
+
"mdx_dim_f_set": 3072,
|
| 111 |
+
"mdx_dim_t_set": 8,
|
| 112 |
+
"mdx_n_fft_scale_set": 7680,
|
| 113 |
+
"primary_stem": "Vocals"
|
| 114 |
+
},
|
| 115 |
+
"c3b29bdce8c4fa17ec609e16220330ab": {
|
| 116 |
+
"compensate": 1.035,
|
| 117 |
+
"mdx_dim_f_set": 2048,
|
| 118 |
+
"mdx_dim_t_set": 8,
|
| 119 |
+
"mdx_n_fft_scale_set": 16384,
|
| 120 |
+
"primary_stem": "Bass"
|
| 121 |
+
},
|
| 122 |
+
"ceed671467c1f64ebdfac8a2490d0d52": {
|
| 123 |
+
"compensate": 1.035,
|
| 124 |
+
"mdx_dim_f_set": 3072,
|
| 125 |
+
"mdx_dim_t_set": 8,
|
| 126 |
+
"mdx_n_fft_scale_set": 7680,
|
| 127 |
+
"primary_stem": "Instrumental"
|
| 128 |
+
},
|
| 129 |
+
"d2a1376f310e4f7fa37fb9b5774eb701": {
|
| 130 |
+
"compensate": 1.035,
|
| 131 |
+
"mdx_dim_f_set": 3072,
|
| 132 |
+
"mdx_dim_t_set": 8,
|
| 133 |
+
"mdx_n_fft_scale_set": 7680,
|
| 134 |
+
"primary_stem": "Instrumental"
|
| 135 |
+
},
|
| 136 |
+
"d7bff498db9324db933d913388cba6be": {
|
| 137 |
+
"compensate": 1.035,
|
| 138 |
+
"mdx_dim_f_set": 2048,
|
| 139 |
+
"mdx_dim_t_set": 8,
|
| 140 |
+
"mdx_n_fft_scale_set": 6144,
|
| 141 |
+
"primary_stem": "Vocals"
|
| 142 |
+
},
|
| 143 |
+
"d94058f8c7f1fae4164868ae8ae66b20": {
|
| 144 |
+
"compensate": 1.035,
|
| 145 |
+
"mdx_dim_f_set": 2048,
|
| 146 |
+
"mdx_dim_t_set": 8,
|
| 147 |
+
"mdx_n_fft_scale_set": 6144,
|
| 148 |
+
"primary_stem": "Vocals"
|
| 149 |
+
},
|
| 150 |
+
"dc41ede5961d50f277eb846db17f5319": {
|
| 151 |
+
"compensate": 1.035,
|
| 152 |
+
"mdx_dim_f_set": 2048,
|
| 153 |
+
"mdx_dim_t_set": 9,
|
| 154 |
+
"mdx_n_fft_scale_set": 4096,
|
| 155 |
+
"primary_stem": "Drums"
|
| 156 |
+
},
|
| 157 |
+
"e5572e58abf111f80d8241d2e44e7fa4": {
|
| 158 |
+
"compensate": 1.028,
|
| 159 |
+
"mdx_dim_f_set": 3072,
|
| 160 |
+
"mdx_dim_t_set": 8,
|
| 161 |
+
"mdx_n_fft_scale_set": 7680,
|
| 162 |
+
"primary_stem": "Instrumental"
|
| 163 |
+
},
|
| 164 |
+
"e7324c873b1f615c35c1967f912db92a": {
|
| 165 |
+
"compensate": 1.03,
|
| 166 |
+
"mdx_dim_f_set": 3072,
|
| 167 |
+
"mdx_dim_t_set": 8,
|
| 168 |
+
"mdx_n_fft_scale_set": 7680,
|
| 169 |
+
"primary_stem": "Vocals"
|
| 170 |
+
},
|
| 171 |
+
"1c56ec0224f1d559c42fd6fd2a67b154": {
|
| 172 |
+
"compensate": 1.025,
|
| 173 |
+
"mdx_dim_f_set": 2048,
|
| 174 |
+
"mdx_dim_t_set": 8,
|
| 175 |
+
"mdx_n_fft_scale_set": 5120,
|
| 176 |
+
"primary_stem": "Instrumental"
|
| 177 |
+
},
|
| 178 |
+
"f2df6d6863d8f435436d8b561594ff49": {
|
| 179 |
+
"compensate": 1.035,
|
| 180 |
+
"mdx_dim_f_set": 3072,
|
| 181 |
+
"mdx_dim_t_set": 8,
|
| 182 |
+
"mdx_n_fft_scale_set": 7680,
|
| 183 |
+
"primary_stem": "Instrumental"
|
| 184 |
+
},
|
| 185 |
+
"b06327a00d5e5fbc7d96e1781bbdb596": {
|
| 186 |
+
"compensate": 1.035,
|
| 187 |
+
"mdx_dim_f_set": 3072,
|
| 188 |
+
"mdx_dim_t_set": 8,
|
| 189 |
+
"mdx_n_fft_scale_set": 6144,
|
| 190 |
+
"primary_stem": "Instrumental"
|
| 191 |
+
},
|
| 192 |
+
"94ff780b977d3ca07c7a343dab2e25dd": {
|
| 193 |
+
"compensate": 1.039,
|
| 194 |
+
"mdx_dim_f_set": 3072,
|
| 195 |
+
"mdx_dim_t_set": 8,
|
| 196 |
+
"mdx_n_fft_scale_set": 6144,
|
| 197 |
+
"primary_stem": "Instrumental"
|
| 198 |
+
},
|
| 199 |
+
"73492b58195c3b52d34590d5474452f6": {
|
| 200 |
+
"compensate": 1.043,
|
| 201 |
+
"mdx_dim_f_set": 3072,
|
| 202 |
+
"mdx_dim_t_set": 8,
|
| 203 |
+
"mdx_n_fft_scale_set": 7680,
|
| 204 |
+
"primary_stem": "Vocals"
|
| 205 |
+
},
|
| 206 |
+
"970b3f9492014d18fefeedfe4773cb42": {
|
| 207 |
+
"compensate": 1.009,
|
| 208 |
+
"mdx_dim_f_set": 3072,
|
| 209 |
+
"mdx_dim_t_set": 8,
|
| 210 |
+
"mdx_n_fft_scale_set": 7680,
|
| 211 |
+
"primary_stem": "Vocals"
|
| 212 |
+
},
|
| 213 |
+
"1d64a6d2c30f709b8c9b4ce1366d96ee": {
|
| 214 |
+
"compensate": 1.065,
|
| 215 |
+
"mdx_dim_f_set": 2048,
|
| 216 |
+
"mdx_dim_t_set": 8,
|
| 217 |
+
"mdx_n_fft_scale_set": 5120,
|
| 218 |
+
"primary_stem": "Instrumental",
|
| 219 |
+
"is_karaoke": true
|
| 220 |
+
},
|
| 221 |
+
"203f2a3955221b64df85a41af87cf8f0": {
|
| 222 |
+
"compensate": 1.035,
|
| 223 |
+
"mdx_dim_f_set": 3072,
|
| 224 |
+
"mdx_dim_t_set": 8,
|
| 225 |
+
"mdx_n_fft_scale_set": 6144,
|
| 226 |
+
"primary_stem": "Instrumental"
|
| 227 |
+
},
|
| 228 |
+
"291c2049608edb52648b96e27eb80e95": {
|
| 229 |
+
"compensate": 1.035,
|
| 230 |
+
"mdx_dim_f_set": 3072,
|
| 231 |
+
"mdx_dim_t_set": 8,
|
| 232 |
+
"mdx_n_fft_scale_set": 6144,
|
| 233 |
+
"primary_stem": "Instrumental"
|
| 234 |
+
},
|
| 235 |
+
"ead8d05dab12ec571d67549b3aab03fc": {
|
| 236 |
+
"compensate": 1.035,
|
| 237 |
+
"mdx_dim_f_set": 3072,
|
| 238 |
+
"mdx_dim_t_set": 8,
|
| 239 |
+
"mdx_n_fft_scale_set": 6144,
|
| 240 |
+
"primary_stem": "Instrumental"
|
| 241 |
+
},
|
| 242 |
+
"cc63408db3d80b4d85b0287d1d7c9632": {
|
| 243 |
+
"compensate": 1.033,
|
| 244 |
+
"mdx_dim_f_set": 3072,
|
| 245 |
+
"mdx_dim_t_set": 8,
|
| 246 |
+
"mdx_n_fft_scale_set": 6144,
|
| 247 |
+
"primary_stem": "Instrumental"
|
| 248 |
+
},
|
| 249 |
+
"cd5b2989ad863f116c855db1dfe24e39": {
|
| 250 |
+
"compensate": 1.035,
|
| 251 |
+
"mdx_dim_f_set": 3072,
|
| 252 |
+
"mdx_dim_t_set": 9,
|
| 253 |
+
"mdx_n_fft_scale_set": 6144,
|
| 254 |
+
"primary_stem": "Reverb"
|
| 255 |
+
},
|
| 256 |
+
"55657dd70583b0fedfba5f67df11d711": {
|
| 257 |
+
"compensate": 1.022,
|
| 258 |
+
"mdx_dim_f_set": 3072,
|
| 259 |
+
"mdx_dim_t_set": 8,
|
| 260 |
+
"mdx_n_fft_scale_set": 6144,
|
| 261 |
+
"primary_stem": "Instrumental"
|
| 262 |
+
},
|
| 263 |
+
"b6bccda408a436db8500083ef3491e8b": {
|
| 264 |
+
"compensate": 1.02,
|
| 265 |
+
"mdx_dim_f_set": 3072,
|
| 266 |
+
"mdx_dim_t_set": 8,
|
| 267 |
+
"mdx_n_fft_scale_set": 7680,
|
| 268 |
+
"primary_stem": "Instrumental"
|
| 269 |
+
},
|
| 270 |
+
"8a88db95c7fb5dbe6a095ff2ffb428b1": {
|
| 271 |
+
"compensate": 1.026,
|
| 272 |
+
"mdx_dim_f_set": 2048,
|
| 273 |
+
"mdx_dim_t_set": 8,
|
| 274 |
+
"mdx_n_fft_scale_set": 5120,
|
| 275 |
+
"primary_stem": "Instrumental"
|
| 276 |
+
},
|
| 277 |
+
"b78da4afc6512f98e4756f5977f5c6b9": {
|
| 278 |
+
"compensate": 1.021,
|
| 279 |
+
"mdx_dim_f_set": 3072,
|
| 280 |
+
"mdx_dim_t_set": 8,
|
| 281 |
+
"mdx_n_fft_scale_set": 7680,
|
| 282 |
+
"primary_stem": "Instrumental"
|
| 283 |
+
},
|
| 284 |
+
"77d07b2667ddf05b9e3175941b4454a0": {
|
| 285 |
+
"compensate": 1.021,
|
| 286 |
+
"mdx_dim_f_set": 3072,
|
| 287 |
+
"mdx_dim_t_set": 8,
|
| 288 |
+
"mdx_n_fft_scale_set": 7680,
|
| 289 |
+
"primary_stem": "Vocals"
|
| 290 |
+
},
|
| 291 |
+
"0f2a6bc5b49d87d64728ee40e23bceb1": {
|
| 292 |
+
"compensate": 1.019,
|
| 293 |
+
"mdx_dim_f_set": 2560,
|
| 294 |
+
"mdx_dim_t_set": 8,
|
| 295 |
+
"mdx_n_fft_scale_set": 5120,
|
| 296 |
+
"primary_stem": "Instrumental"
|
| 297 |
+
},
|
| 298 |
+
"b02be2d198d4968a121030cf8950b492": {
|
| 299 |
+
"compensate": 1.020,
|
| 300 |
+
"mdx_dim_f_set": 2560,
|
| 301 |
+
"mdx_dim_t_set": 8,
|
| 302 |
+
"mdx_n_fft_scale_set": 5120,
|
| 303 |
+
"primary_stem": "No Crowd"
|
| 304 |
+
},
|
| 305 |
+
"2154254ee89b2945b97a7efed6e88820": {
|
| 306 |
+
"config_yaml": "model_2_stem_061321.yaml"
|
| 307 |
+
},
|
| 308 |
+
"063aadd735d58150722926dcbf5852a9": {
|
| 309 |
+
"config_yaml": "model_2_stem_061321.yaml"
|
| 310 |
+
},
|
| 311 |
+
"c09f714d978b41d718facfe3427e6001": {
|
| 312 |
+
"config_yaml": "model_2_stem_061321.yaml"
|
| 313 |
+
},
|
| 314 |
+
"fe96801369f6a148df2720f5ced88c19": {
|
| 315 |
+
"config_yaml": "model3.yaml"
|
| 316 |
+
},
|
| 317 |
+
"02e8b226f85fb566e5db894b9931c640": {
|
| 318 |
+
"config_yaml": "model2.yaml"
|
| 319 |
+
},
|
| 320 |
+
"e3de6d861635ab9c1d766149edd680d6": {
|
| 321 |
+
"config_yaml": "model1.yaml"
|
| 322 |
+
},
|
| 323 |
+
"3f2936c554ab73ce2e396d54636bd373": {
|
| 324 |
+
"config_yaml": "modelB.yaml"
|
| 325 |
+
},
|
| 326 |
+
"890d0f6f82d7574bca741a9e8bcb8168": {
|
| 327 |
+
"config_yaml": "modelB.yaml"
|
| 328 |
+
},
|
| 329 |
+
"63a3cb8c37c474681049be4ad1ba8815": {
|
| 330 |
+
"config_yaml": "modelB.yaml"
|
| 331 |
+
},
|
| 332 |
+
"a7fc5d719743c7fd6b61bd2b4d48b9f0": {
|
| 333 |
+
"config_yaml": "modelA.yaml"
|
| 334 |
+
},
|
| 335 |
+
"3567f3dee6e77bf366fcb1c7b8bc3745": {
|
| 336 |
+
"config_yaml": "modelA.yaml"
|
| 337 |
+
},
|
| 338 |
+
"a28f4d717bd0d34cd2ff7a3b0a3d065e": {
|
| 339 |
+
"config_yaml": "modelA.yaml"
|
| 340 |
+
},
|
| 341 |
+
"c9971a18da20911822593dc81caa8be9": {
|
| 342 |
+
"config_yaml": "sndfx.yaml"
|
| 343 |
+
},
|
| 344 |
+
"57d94d5ed705460d21c75a5ac829a605": {
|
| 345 |
+
"config_yaml": "sndfx.yaml"
|
| 346 |
+
},
|
| 347 |
+
"e7a25f8764f25a52c1b96c4946e66ba2": {
|
| 348 |
+
"config_yaml": "sndfx.yaml"
|
| 349 |
+
},
|
| 350 |
+
"104081d24e37217086ce5fde09147ee1": {
|
| 351 |
+
"config_yaml": "model_2_stem_061321.yaml"
|
| 352 |
+
},
|
| 353 |
+
"1e6165b601539f38d0a9330f3facffeb": {
|
| 354 |
+
"config_yaml": "model_2_stem_061321.yaml"
|
| 355 |
+
},
|
| 356 |
+
"fe0108464ce0d8271be5ab810891bd7c": {
|
| 357 |
+
"config_yaml": "model_2_stem_full_band.yaml"
|
| 358 |
+
},
|
| 359 |
+
"e9b82ec90ee56c507a3a982f1555714c": {
|
| 360 |
+
"config_yaml": "model_2_stem_full_band_2.yaml"
|
| 361 |
+
},
|
| 362 |
+
"99b6ceaae542265a3b6d657bf9fde79f": {
|
| 363 |
+
"config_yaml": "model_2_stem_full_band_8k.yaml"
|
| 364 |
+
},
|
| 365 |
+
"116f6f9dabb907b53d847ed9f7a9475f": {
|
| 366 |
+
"config_yaml": "model_2_stem_full_band_8k.yaml"
|
| 367 |
+
},
|
| 368 |
+
"53f707017bfcbb56f5e1bfac420d6732": {
|
| 369 |
+
"config_yaml": "model_bs_roformer_ep_317_sdr_12.9755.yaml",
|
| 370 |
+
"is_roformer": true
|
| 371 |
+
},
|
| 372 |
+
"63e41acc264bf681a73aa9f7e5f606cc": {
|
| 373 |
+
"config_yaml": "model_mel_band_roformer_ep_3005_sdr_11.4360.yaml",
|
| 374 |
+
"is_roformer": true
|
| 375 |
+
},
|
| 376 |
+
"e733736763234047587931fc35322fd9": {
|
| 377 |
+
"config_yaml": "model_bs_roformer_ep_937_sdr_10.5309.yaml",
|
| 378 |
+
"is_roformer": true
|
| 379 |
+
},
|
| 380 |
+
"d789065adfd747d6f585b27b495bcdae": {
|
| 381 |
+
"config_yaml": "model_bs_roformer_ep_368_sdr_12.9628.yaml",
|
| 382 |
+
"is_roformer": true
|
| 383 |
+
}
|
| 384 |
+
}
|
assets/model_tools.py
ADDED
|
@@ -0,0 +1,380 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import hashlib
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
+
import sys
|
| 5 |
+
import subprocess
|
| 6 |
+
import requests
|
| 7 |
+
from huggingface_hub import HfApi, snapshot_download
|
| 8 |
+
import hashlib
|
| 9 |
+
from collections import defaultdict
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
from concurrent.futures import ThreadPoolExecutor
|
| 13 |
+
|
| 14 |
+
def calculate_file_hash(filepath, block_size=65536):
    """Return the SHA-256 hex digest of the file at *filepath*.

    The file is consumed in reads of ``block_size`` bytes, so it is never
    loaded into memory as a whole.

    Returns:
        The hexadecimal digest string, or ``None`` when the file does not
        exist (for example, it was deleted while a directory scan was running).
    """
    digest = hashlib.sha256()
    try:
        with open(filepath, "rb") as stream:
            # iter() with a sentinel keeps calling read() until it returns b"".
            for block in iter(lambda: stream.read(block_size), b""):
                digest.update(block)
    except FileNotFoundError:
        return None
    return digest.hexdigest()
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def find_and_remove_duplicates(directory="."):
    """Delete duplicate files in *directory*, keeping the shortest-named copy.

    Only regular files directly inside *directory* are considered
    (sub-directories are not descended into).  Files are grouped by the
    digest returned from ``calculate_file_hash``; within each group the file
    with the shortest path is kept and every other file is removed.

    Args:
        directory: Directory to scan. Defaults to the current directory.

    Returns:
        None. Progress and per-file deletion errors are printed.
    """
    hashes_to_files = defaultdict(list)

    # Step 1: Hash all files in the directory
    for filename in os.listdir(directory):
        filepath = os.path.join(directory, filename)
        if not os.path.isfile(filepath):
            continue
        file_hash = calculate_file_hash(filepath)
        # A None hash means the file vanished between listing and hashing.
        if file_hash:
            hashes_to_files[file_hash].append(filepath)

    # Step 2: Identify duplicate groups (more than one file per hash)
    duplicates = {h: files for h, files in hashes_to_files.items() if len(files) > 1}

    if not duplicates:
        print("No duplicate files found.")
        return

    # Step 3: Keep the shortest name in every group and delete the rest
    for file_hash, file_list in duplicates.items():
        # All paths share the same directory prefix, so comparing path length
        # compares filename length.  The path itself is the tie-breaker so
        # the kept file no longer depends on os.listdir() ordering.
        file_to_keep, *files_to_delete = sorted(
            file_list, key=lambda path: (len(path), path)
        )

        print(f"\nDuplicate group (Hash: {file_hash[:10]}...):")
        print(f"  Keeping: {file_to_keep}")
        for file_to_delete in files_to_delete:
            try:
                os.remove(file_to_delete)
                print(f"  Deleted: {file_to_delete} (longer filename)")
            except OSError as e:
                # Best effort: report and carry on with the remaining files.
                print(f"  Error deleting {file_to_delete}: {e}")
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def download_file(url, local_dir):
    """Download *url* into *local_dir* and report the outcome as a string.

    The local filename is the last path segment of the URL with any query
    string removed; ``"downloaded_file"`` is used when that segment is empty.

    Args:
        url: Address of the file to fetch.
        local_dir: Existing directory the file is written into.

    Returns:
        ``"Successfully downloaded: <filename>"`` on success, otherwise
        ``"Failed to download <url>: <error>"``.  The function never raises,
        so it can be used directly as a thread-pool task.
    """
    try:
        # Strip the query string *before* taking the last path segment, so a
        # "/" inside the query (e.g. "?next=/a/b") cannot be mistaken for the
        # filename.
        filename = url.split("?", 1)[0].split("/")[-1] or "downloaded_file"
        save_path = os.path.join(local_dir, filename)

        # The context manager releases the streamed connection even when
        # raise_for_status() or the file write fails.
        with requests.get(url, stream=True, timeout=10) as response:
            response.raise_for_status()

            with open(save_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        return f"Successfully downloaded: {filename}"
    except Exception as e:
        # Deliberately broad: any failure is reported as a result string
        # instead of propagating to the caller.
        return f"Failed to download {url}: {e}"
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def download_files_from_txt(filename, local_dir, max_workers=20):
    """Download every URL listed in a text file using a pool of threads.

    Args:
        filename: Text file with one URL per line; blank lines are ignored.
        local_dir: Destination directory, created (with parents) if missing.
        max_workers: Maximum number of simultaneous downloads. Defaults to 20.

    Returns:
        None. One status line per URL is printed, in the order the URLs
        appear in the file.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(local_dir, exist_ok=True)

    # Read URLs from the text file
    with open(filename, "r") as f:
        urls = [line.strip() for line in f if line.strip()]

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit all download tasks to the pool
        futures = [executor.submit(download_file, url, local_dir) for url in urls]

        # Results are reported in submission order; result() blocks until
        # that particular download has finished.
        for future in futures:
            print(future.result())
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def download_files_from_txt_aria(filename, local_dir, max_concurrent=30, max_connections=16):
    """Download every URL listed in *filename* by running ``aria2c``.

    Args:
        filename: Text file handed to aria2c through ``--input-file``.
        local_dir: Directory handed to aria2c through ``--dir``.
        max_concurrent: Value for aria2c's ``-j`` (max concurrent downloads).
        max_connections: Value for aria2c's ``-x`` (max connections per server).

    Returns:
        None. Success or failure is reported with ``print``; no exception is
        propagated to the caller.
    """
    command = [
        "aria2c",
        "--input-file",
        filename,
        "--dir",
        local_dir,
        "-c",  # Continue downloading a partially downloaded file
        "-j",
        str(max_concurrent),
        "-x",
        str(max_connections),
    ]
    print(f"Starting downloads with aria2c in directory: {os.path.abspath(local_dir)}")
    try:
        # Output is captured so that stderr can be shown if aria2c fails.
        subprocess.run(
            command,
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
    except subprocess.CalledProcessError as e:
        print(f"An error occurred during aria2c execution: {e.stderr}")
    except Exception as e:
        # Includes the case where the aria2c executable cannot be started.
        print(f"An unexpected error occurred: {e}")
    else:
        # Only claim completion when aria2c actually exited successfully;
        # previously the second message was printed even after a failure.
        print("All downloads finished successfully.")
        print(f"Downloaded all files: {filename}")
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
def download_hf_repo(repo_id, local_dir, repo_type=None, token=None):
    """Download an entire Hugging Face repository to a local directory.

    Args:
        repo_id: Repository identifier passed to ``snapshot_download``.
        local_dir: Target directory; created (with parents) if missing.
        repo_type: Repository type forwarded unchanged to
            ``snapshot_download``.
        token: Access token. When falsy, the ``HF_TOKEN`` environment
            variable is used instead (which may itself be unset).

    Returns:
        The path returned by ``snapshot_download``.
    """
    if not token:
        token = os.getenv("HF_TOKEN")

    print(f"Downloading {repo_id} to {local_dir}...")

    # Ensure the target directory exists
    os.makedirs(local_dir, exist_ok=True)

    # Download the snapshot
    downloaded_path = snapshot_download(
        repo_id=repo_id,
        local_dir=local_dir,
        token=token,
        # Set to False to ensure actual files are placed in local_dir.
        # NOTE(review): this argument appears to be deprecated in recent
        # huggingface_hub releases - confirm against the pinned version.
        local_dir_use_symlinks=False,
        repo_type=repo_type,
    )

    print(f"Download complete! Files are located in: {downloaded_path}")
    return downloaded_path
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
def remove_duplicate_lines(input_file_path, output_file_path):
    """Copy a text file, dropping repeated lines and keeping first-seen order.

    Lines are compared without their trailing newline, so a final line that
    lacks a newline is still recognised as a duplicate of an earlier identical
    line.  Every line written to the output ends with a newline.

    Args:
        input_file_path: File to read.
        output_file_path: File to (over)write with the unique lines.  It may
            be the same path as the input, because the input is read
            completely before the output is opened.

    Returns:
        None. The outcome (success, missing input, or other error) is printed.
    """
    try:
        try:
            with open(input_file_path, "r") as input_file:
                # dict keys preserve insertion order, giving an "ordered set".
                unique_lines = dict.fromkeys(
                    line.rstrip("\n") for line in input_file
                )
        except FileNotFoundError:
            # Reported separately so a missing *output* directory is not
            # blamed on the input file.
            print(f"Error: The file '{input_file_path}' was not found.")
            return

        with open(output_file_path, "w") as output_file:
            output_file.writelines(f"{line}\n" for line in unique_lines)

        print(f"Duplicates removed. Unique lines saved to '{output_file_path}'")
    except Exception as e:
        # Best effort: report the problem instead of raising.
        print(f"An error occurred: {e}")
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
def push_to_hf(repo_id, repo_type, folder_path=".", commit_message="Initial model upload"):
    """Upload a local folder to the root of a Hugging Face repository.

    Args:
        repo_id: Destination repository identifier.
        repo_type: Repository type forwarded to ``HfApi.upload_folder``.
        folder_path: Folder to upload. Defaults to the current directory.
        commit_message: Commit message recorded for the upload.

    Returns:
        None.
    """
    api = HfApi()

    # The default message is kept verbatim for the default folder.
    source = "current directory" if folder_path == "." else folder_path
    print(f"Uploading {source} to: {repo_id}")

    # Upload everything in the folder to the repo root
    api.upload_folder(
        folder_path=folder_path,
        repo_id=repo_id,
        repo_type=repo_type,
        commit_message=commit_message,
    )
    print("Upload complete!")
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
def push_large_folder_to_hf(repo_id, repo_type, folder_path="."):
    """Upload a (potentially very large) local folder to a Hugging Face repo.

    Delegates to ``HfApi.upload_large_folder`` and excludes common junk files
    (VCS data, Python caches, temporary files) from the upload.

    Args:
        repo_id: Destination repository identifier.
        repo_type: Repository type forwarded to ``upload_large_folder``.
        folder_path: Folder to upload. Defaults to the current directory.

    Returns:
        None.
    """
    api = HfApi()
    print(f"Starting large folder upload to: {repo_id}")

    # upload_large_folder is used for resilience and speed.
    # NOTE(review): per the original author's comment it handles
    # multi-threading and local caching for resuming - confirm against the
    # huggingface_hub documentation.
    api.upload_large_folder(
        folder_path=folder_path,
        repo_id=repo_id,
        repo_type=repo_type,
        # Optional: ignore large junk files to save time
        ignore_patterns=[
            ".git/",
            "__pycache__/",
            "*.tmp",
            ".DS_Store",
            "*.cache",
            "*.trash",
        ],
    )

    print(
        "\nUpload complete! Progress was cached locally; if it failed, just run again to resume."
    )
|
| 229 |
+
|
| 230 |
+
|
| 231 |
+
def get_model_hash(model_path):
    """Return the MD5 hex digest identifying a model file.

    Only the last ``10000 * 1024`` bytes (10,240,000 bytes, a little under
    10 MiB) of the file are hashed; a file that is not larger than that is
    hashed in full.  This yields the same digests as the earlier
    seek-and-fall-back implementation, but without relying on a failed
    ``seek`` to detect small files and without retrying after unrelated I/O
    errors.

    Args:
        model_path: Path of the model file.

    Returns:
        The hexadecimal MD5 digest string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    tail_size = 10000 * 1024
    with open(model_path, "rb") as f:
        # Skip ahead only when the file is longer than the hashed tail;
        # otherwise reading from the start covers the whole file.
        if os.fstat(f.fileno()).st_size > tail_size:
            f.seek(-tail_size, os.SEEK_END)
        return hashlib.md5(f.read()).hexdigest()
|
| 249 |
+
|
| 250 |
+
|
| 251 |
+
def download_file_if_missing(url, local_path):
    """Download *url* to *local_path* unless that path already exists.

    The data is first written to ``<local_path>.part`` and moved into place
    only after the transfer completed, so an interrupted download cannot
    leave a truncated file that later runs would mistake for a finished one.

    Args:
        url: Address of the file to fetch.
        local_path: Destination path; missing parent directories are created.

    Returns:
        None.

    Raises:
        Exception: Whatever ``requests`` or the file operations raise when
            the download fails (for example on a non-success HTTP status).
    """
    print(f"Checking if {local_path} needs to be downloaded from {url}")
    if os.path.exists(local_path):
        print(f"{local_path} already exists. Skipping download.")
        return

    print(f"Downloading {url} to {local_path}")
    parent_dir = os.path.dirname(local_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    partial_path = f"{local_path}.part"
    try:
        with requests.get(url, stream=True, timeout=10) as r:
            r.raise_for_status()
            with open(partial_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        # Move into place only once the whole body has been written.
        os.replace(partial_path, local_path)
    finally:
        # After a successful replace the partial file is gone; after a
        # failure this removes the incomplete data.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print(f"Downloaded {url} to {local_path}")
|
| 266 |
+
|
| 267 |
+
|
| 268 |
+
def load_json_data(file_path):
    """Parse the UTF-8 JSON file at *file_path* and return its content.

    A missing file is treated as fatal: a message is printed and the process
    exits with status 1.  Any other error (such as malformed JSON) propagates
    to the caller.
    """
    print(f"Loading JSON data from {file_path}")
    try:
        handle = open(file_path, "r", encoding="utf-8")
    except FileNotFoundError:
        print(f"{file_path} not found.")
        sys.exit(1)

    with handle:
        parsed = json.load(handle)
        print(f"Loaded JSON data successfully from {file_path}")
        return parsed
|
| 281 |
+
|
| 282 |
+
|
| 283 |
+
def iterate_and_hash(
    directory,
    vr_model_data_url,
    mdx_model_data_url,
    vr_model_data_local_path,
    mdx_model_data_local_path,
    output_path=None,
):
    """Hash every model file under *directory* and write a JSON report.

    The two model-data JSON files are downloaded if they are not present
    locally, loaded, and merged into one lookup table keyed by hash (on a key
    collision the MDX entry wins, because it is merged last).  Every
    ``.pth``/``.onnx`` file found under *directory* is hashed with
    ``get_model_hash`` and paired with its parameters from that table.

    Args:
        directory: Root directory that is walked recursively.
        vr_model_data_url: Download URL of the VR model-data JSON.
        mdx_model_data_url: Download URL of the MDX model-data JSON.
        vr_model_data_local_path: Local path of the VR model-data JSON.
        mdx_model_data_local_path: Local path of the MDX model-data JSON.
        output_path: Where the report is written.  When omitted, a
            module-level ``OUTPUT_PATH`` is used if one exists, otherwise
            ``"model_hashes.json"``.

    Returns:
        None. The report is a JSON list of ``{"file", "hash", "params"}``
        objects sorted by file name.
    """
    if output_path is None:
        # The function used to read a global OUTPUT_PATH that this module
        # does not define (NameError).  Honour it when a caller has set it,
        # otherwise fall back to a file in the working directory.
        output_path = globals().get("OUTPUT_PATH", "model_hashes.json")

    print(f"Iterating through directory {directory} to hash model files")
    model_files = [
        (file, os.path.join(root, file))
        for root, _, files in os.walk(directory)
        for file in files
        if file.endswith((".pth", ".onnx"))
    ]

    download_file_if_missing(vr_model_data_url, vr_model_data_local_path)
    download_file_if_missing(mdx_model_data_url, mdx_model_data_local_path)

    vr_model_data = load_json_data(vr_model_data_local_path)
    mdx_model_data = load_json_data(mdx_model_data_local_path)

    combined_model_params = {
        **vr_model_data,
        **mdx_model_data,
    }

    model_info_list = []
    for file, file_path in sorted(model_files):
        file_hash = get_model_hash(file_path)
        model_info_list.append(
            {
                "file": file,
                "hash": file_hash,
                "params": combined_model_params.get(file_hash, "Parameters not found"),
            }
        )

    print(f"Writing model info list to {output_path}")
    with open(output_path, "w", encoding="utf-8") as json_file:
        json.dump(model_info_list, json_file, indent=4)
    print(f"Successfully wrote model info list to {output_path}")
|
| 326 |
+
|
| 327 |
+
|
| 328 |
+
def sort_links_by_extension(input_file, output_file):
    """Sort the links in a text file by file extension and write the result.

    Links are ordered by a fixed extension priority
    (``.json, .yaml, .th, .pth, .ckpt, .onnx``); links with any other
    extension go last.  Links with the same priority are ordered
    alphabetically.  The extension is matched case-insensitively and any
    ``?query`` or ``#fragment`` suffix is ignored, so
    ``model.ckpt?download=true`` counts as ``.ckpt``.

    Args:
        input_file: Text file with one link per line; blank lines are skipped.
        output_file: File the sorted links are written to, one per line.

    Returns:
        None. A message is printed on success or when *input_file* is missing.
    """
    # Custom priority order: lower numbers sort first.
    priority = {
        ".json": 0,
        ".yaml": 1,
        ".th": 2,
        ".pth": 3,
        ".ckpt": 4,
        ".onnx": 5,
    }

    try:
        with open(input_file, "r") as f:
            # Read lines and strip whitespace/newlines
            links = [line.strip() for line in f if line.strip()]

        def sort_key(link):
            # Drop the query string and fragment so they cannot hide the
            # real extension.
            path = link.split("?", 1)[0].split("#", 1)[0]
            _, ext = os.path.splitext(path.lower())
            # Unknown extensions are placed at the end (index 100); the link
            # itself is the tie-breaker.
            return priority.get(ext, 100), link

        with open(output_file, "w") as f:
            f.writelines(f"{link}\n" for link in sorted(links, key=sort_key))

        print(f"Successfully sorted links into: {output_file}")

    except FileNotFoundError:
        print(f"Error: The file '{input_file}' was not found.")
|
| 365 |
+
|
| 366 |
+
|
| 367 |
+
# 1. Load the JSON data
|
| 368 |
+
# Ensure 'models.json' is in your current directory
|
| 369 |
+
def get_links_from_json(file_input):
    """Load a JSON mapping of model names to link lists and walk its entries.

    Args:
        file_input: Path of the JSON file.  Its top-level value is iterated
            with ``.items()``, so it is expected to be an object.

    Returns:
        None.  NOTE(review): entries whose value is not a non-empty list are
        skipped, but nothing is done yet with the valid ones - the processing
        step appears to be unfinished.
    """
    try:
        with open(file_input, "r") as file:
            data = json.load(file)
    except FileNotFoundError:
        # Report the path that was actually requested rather than a
        # hard-coded file name.
        print(f"Error: '{file_input}' not found.")
        data = {}

    # Walk the entries, skipping anything that is not a non-empty list.
    for model_name, links in data.items():
        if not isinstance(links, list) or not links:
            continue
|
config_aspiration_mel_band_roformer.yaml
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
audio:
|
| 2 |
+
chunk_size: 352800
|
| 3 |
+
dim_f: 1024
|
| 4 |
+
dim_t: 801 # don't work (use in model)
|
| 5 |
+
hop_length: 441 # don't work (use in model)
|
| 6 |
+
n_fft: 2048
|
| 7 |
+
num_channels: 2
|
| 8 |
+
sample_rate: 44100
|
| 9 |
+
min_mean_abs: 0.000
|
| 10 |
+
|
| 11 |
+
model:
|
| 12 |
+
dim: 256
|
| 13 |
+
depth: 8
|
| 14 |
+
stereo: true
|
| 15 |
+
num_stems: 2
|
| 16 |
+
time_transformer_depth: 1
|
| 17 |
+
freq_transformer_depth: 1
|
| 18 |
+
num_bands: 60
|
| 19 |
+
dim_head: 64
|
| 20 |
+
heads: 8
|
| 21 |
+
attn_dropout: 0.1
|
| 22 |
+
ff_dropout: 0.1
|
| 23 |
+
flash_attn: True
|
| 24 |
+
dim_freqs_in: 1025
|
| 25 |
+
sample_rate: 44100 # needed for mel filter bank from librosa
|
| 26 |
+
stft_n_fft: 2048
|
| 27 |
+
stft_hop_length: 441
|
| 28 |
+
stft_win_length: 2048
|
| 29 |
+
stft_normalized: False
|
| 30 |
+
mask_estimator_depth: 2
|
| 31 |
+
multi_stft_resolution_loss_weight: 1.0
|
| 32 |
+
multi_stft_resolutions_window_sizes: !!python/tuple
|
| 33 |
+
- 4096
|
| 34 |
+
- 2048
|
| 35 |
+
- 1024
|
| 36 |
+
- 512
|
| 37 |
+
- 256
|
| 38 |
+
multi_stft_hop_size: 147
|
| 39 |
+
multi_stft_normalized: False
|
| 40 |
+
|
| 41 |
+
training:
|
| 42 |
+
batch_size: 1
|
| 43 |
+
gradient_accumulation_steps: 8
|
| 44 |
+
grad_clip: 0
|
| 45 |
+
instruments:
|
| 46 |
+
- aspiration
|
| 47 |
+
- other
|
| 48 |
+
lr: 4.0e-05
|
| 49 |
+
patience: 2
|
| 50 |
+
reduce_factor: 0.95
|
| 51 |
+
target_instrument: null
|
| 52 |
+
num_epochs: 1000
|
| 53 |
+
num_steps: 1000
|
| 54 |
+
q: 0.95
|
| 55 |
+
coarse_loss_clip: true
|
| 56 |
+
ema_momentum: 0.999
|
| 57 |
+
optimizer: adam
|
| 58 |
+
other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
|
| 59 |
+
use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
|
| 60 |
+
|
| 61 |
+
augmentations:
|
| 62 |
+
enable: true # enable or disable all augmentations (a quick way to switch them all off if needed)
|
| 63 |
+
loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
|
| 64 |
+
loudness_min: 0.5
|
| 65 |
+
loudness_max: 1.5
|
| 66 |
+
mixup: false # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
|
| 67 |
+
mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
|
| 68 |
+
- 0.2
|
| 69 |
+
- 0.02
|
| 70 |
+
mixup_loudness_min: 0.5
|
| 71 |
+
mixup_loudness_max: 1.5
|
| 72 |
+
|
| 73 |
+
inference:
|
| 74 |
+
batch_size: 4
|
| 75 |
+
dim_t: 801
|
| 76 |
+
num_overlap: 2
|
config_bs_roformer_instrumental_resurrection_unwa.yaml
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
audio:
|
| 2 |
+
chunk_size: 749259
|
| 3 |
+
dim_f: 1024
|
| 4 |
+
dim_t: 1700 # don't work (use in model)
|
| 5 |
+
hop_length: 441 # don't work (use in model)
|
| 6 |
+
n_fft: 2048
|
| 7 |
+
num_channels: 2
|
| 8 |
+
sample_rate: 44100
|
| 9 |
+
min_mean_abs: 0.000
|
| 10 |
+
|
| 11 |
+
model:
|
| 12 |
+
dim: 256
|
| 13 |
+
depth: 12
|
| 14 |
+
stereo: true
|
| 15 |
+
num_stems: 1
|
| 16 |
+
time_transformer_depth: 1
|
| 17 |
+
freq_transformer_depth: 1
|
| 18 |
+
linear_transformer_depth: 0
|
| 19 |
+
freqs_per_bands: !!python/tuple
|
| 20 |
+
- 2
|
| 21 |
+
- 2
|
| 22 |
+
- 2
|
| 23 |
+
- 2
|
| 24 |
+
- 2
|
| 25 |
+
- 2
|
| 26 |
+
- 2
|
| 27 |
+
- 2
|
| 28 |
+
- 2
|
| 29 |
+
- 2
|
| 30 |
+
- 2
|
| 31 |
+
- 2
|
| 32 |
+
- 2
|
| 33 |
+
- 2
|
| 34 |
+
- 2
|
| 35 |
+
- 2
|
| 36 |
+
- 2
|
| 37 |
+
- 2
|
| 38 |
+
- 2
|
| 39 |
+
- 2
|
| 40 |
+
- 2
|
| 41 |
+
- 2
|
| 42 |
+
- 2
|
| 43 |
+
- 2
|
| 44 |
+
- 4
|
| 45 |
+
- 4
|
| 46 |
+
- 4
|
| 47 |
+
- 4
|
| 48 |
+
- 4
|
| 49 |
+
- 4
|
| 50 |
+
- 4
|
| 51 |
+
- 4
|
| 52 |
+
- 4
|
| 53 |
+
- 4
|
| 54 |
+
- 4
|
| 55 |
+
- 4
|
| 56 |
+
- 12
|
| 57 |
+
- 12
|
| 58 |
+
- 12
|
| 59 |
+
- 12
|
| 60 |
+
- 12
|
| 61 |
+
- 12
|
| 62 |
+
- 12
|
| 63 |
+
- 12
|
| 64 |
+
- 24
|
| 65 |
+
- 24
|
| 66 |
+
- 24
|
| 67 |
+
- 24
|
| 68 |
+
- 24
|
| 69 |
+
- 24
|
| 70 |
+
- 24
|
| 71 |
+
- 24
|
| 72 |
+
- 48
|
| 73 |
+
- 48
|
| 74 |
+
- 48
|
| 75 |
+
- 48
|
| 76 |
+
- 48
|
| 77 |
+
- 48
|
| 78 |
+
- 48
|
| 79 |
+
- 48
|
| 80 |
+
- 128
|
| 81 |
+
- 129
|
| 82 |
+
dim_head: 64
|
| 83 |
+
heads: 8
|
| 84 |
+
attn_dropout: 0.
|
| 85 |
+
ff_dropout: 0.
|
| 86 |
+
flash_attn: true
|
| 87 |
+
dim_freqs_in: 1025
|
| 88 |
+
stft_n_fft: 2048
|
| 89 |
+
stft_hop_length: 441
|
| 90 |
+
stft_win_length: 2048
|
| 91 |
+
stft_normalized: false
|
| 92 |
+
mask_estimator_depth: 2
|
| 93 |
+
multi_stft_resolution_loss_weight: 1.0
|
| 94 |
+
multi_stft_resolutions_window_sizes: !!python/tuple
|
| 95 |
+
- 4096
|
| 96 |
+
- 2048
|
| 97 |
+
- 1024
|
| 98 |
+
- 512
|
| 99 |
+
- 256
|
| 100 |
+
multi_stft_hop_size: 147
|
| 101 |
+
multi_stft_normalized: False
|
| 102 |
+
|
| 103 |
+
training:
|
| 104 |
+
batch_size: 2
|
| 105 |
+
gradient_accumulation_steps: 1
|
| 106 |
+
grad_clip: 0
|
| 107 |
+
instruments: ['vocals', 'other']
|
| 108 |
+
patience: 3
|
| 109 |
+
reduce_factor: 0.95
|
| 110 |
+
target_instrument: other
|
| 111 |
+
num_epochs: 1000
|
| 112 |
+
num_steps: 1000
|
| 113 |
+
augmentation: false # enable augmentations by audiomentations and pedalboard
|
| 114 |
+
augmentation_type: simple1
|
| 115 |
+
use_mp3_compress: false # Deprecated
|
| 116 |
+
augmentation_mix: true # Mix several stems of the same type with some probability
|
| 117 |
+
augmentation_loudness: true # randomly change loudness of each stem
|
| 118 |
+
augmentation_loudness_type: 1 # Type 1 or 2
|
| 119 |
+
augmentation_loudness_min: 0.5
|
| 120 |
+
augmentation_loudness_max: 1.5
|
| 121 |
+
q: 0.95
|
| 122 |
+
coarse_loss_clip: true
|
| 123 |
+
ema_momentum: 0.999
|
| 124 |
+
# optimizer: prodigy
|
| 125 |
+
optimizer: adam
|
| 126 |
+
# lr: 1.0
|
| 127 |
+
lr: 1.0e-5
|
| 128 |
+
other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
|
| 129 |
+
use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
|
| 130 |
+
|
| 131 |
+
inference:
|
| 132 |
+
batch_size: 2
|
| 133 |
+
dim_t: 1700
|
| 134 |
+
num_overlap: 2
|
| 135 |
+
normalize: false
|
config_bs_roformer_karaoke_frazer_becruily.yaml
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
audio:
|
| 2 |
+
chunk_size: 882000
|
| 3 |
+
dim_f: 1024
|
| 4 |
+
dim_t: 801
|
| 5 |
+
hop_length: 441
|
| 6 |
+
n_fft: 2048
|
| 7 |
+
num_channels: 2
|
| 8 |
+
sample_rate: 44100
|
| 9 |
+
min_mean_abs: 0.000
|
| 10 |
+
|
| 11 |
+
model:
|
| 12 |
+
dim: 256
|
| 13 |
+
depth: 12
|
| 14 |
+
stereo: true
|
| 15 |
+
num_stems: 1
|
| 16 |
+
time_transformer_depth: 1
|
| 17 |
+
freq_transformer_depth: 1
|
| 18 |
+
linear_transformer_depth: 0
|
| 19 |
+
freqs_per_bands: !!python/tuple
|
| 20 |
+
- 2
|
| 21 |
+
- 2
|
| 22 |
+
- 2
|
| 23 |
+
- 2
|
| 24 |
+
- 2
|
| 25 |
+
- 2
|
| 26 |
+
- 2
|
| 27 |
+
- 2
|
| 28 |
+
- 2
|
| 29 |
+
- 2
|
| 30 |
+
- 2
|
| 31 |
+
- 2
|
| 32 |
+
- 2
|
| 33 |
+
- 2
|
| 34 |
+
- 2
|
| 35 |
+
- 2
|
| 36 |
+
- 2
|
| 37 |
+
- 2
|
| 38 |
+
- 2
|
| 39 |
+
- 2
|
| 40 |
+
- 2
|
| 41 |
+
- 2
|
| 42 |
+
- 2
|
| 43 |
+
- 2
|
| 44 |
+
- 4
|
| 45 |
+
- 4
|
| 46 |
+
- 4
|
| 47 |
+
- 4
|
| 48 |
+
- 4
|
| 49 |
+
- 4
|
| 50 |
+
- 4
|
| 51 |
+
- 4
|
| 52 |
+
- 4
|
| 53 |
+
- 4
|
| 54 |
+
- 4
|
| 55 |
+
- 4
|
| 56 |
+
- 12
|
| 57 |
+
- 12
|
| 58 |
+
- 12
|
| 59 |
+
- 12
|
| 60 |
+
- 12
|
| 61 |
+
- 12
|
| 62 |
+
- 12
|
| 63 |
+
- 12
|
| 64 |
+
- 24
|
| 65 |
+
- 24
|
| 66 |
+
- 24
|
| 67 |
+
- 24
|
| 68 |
+
- 24
|
| 69 |
+
- 24
|
| 70 |
+
- 24
|
| 71 |
+
- 24
|
| 72 |
+
- 48
|
| 73 |
+
- 48
|
| 74 |
+
- 48
|
| 75 |
+
- 48
|
| 76 |
+
- 48
|
| 77 |
+
- 48
|
| 78 |
+
- 48
|
| 79 |
+
- 48
|
| 80 |
+
- 128
|
| 81 |
+
- 129
|
| 82 |
+
dim_head: 64
|
| 83 |
+
heads: 8
|
| 84 |
+
attn_dropout: 0
|
| 85 |
+
ff_dropout: 0
|
| 86 |
+
flash_attn: true
|
| 87 |
+
dim_freqs_in: 1025
|
| 88 |
+
stft_n_fft: 2048
|
| 89 |
+
stft_hop_length: 512
|
| 90 |
+
stft_win_length: 2048
|
| 91 |
+
stft_normalized: false
|
| 92 |
+
mask_estimator_depth: 2
|
| 93 |
+
multi_stft_resolution_loss_weight: 1.0
|
| 94 |
+
multi_stft_resolutions_window_sizes: !!python/tuple
|
| 95 |
+
- 4096
|
| 96 |
+
- 2048
|
| 97 |
+
- 1024
|
| 98 |
+
- 512
|
| 99 |
+
- 256
|
| 100 |
+
multi_stft_hop_size: 147
|
| 101 |
+
multi_stft_normalized: false
|
| 102 |
+
mlp_expansion_factor: 4
|
| 103 |
+
|
| 104 |
+
training:
|
| 105 |
+
batch_size: 1
|
| 106 |
+
gradient_accumulation_steps: 1
|
| 107 |
+
grad_clip: 0
|
| 108 |
+
instruments:
|
| 109 |
+
- Vocals
|
| 110 |
+
- Instrumental
|
| 111 |
+
patience: 2
|
| 112 |
+
reduce_factor: 0.95
|
| 113 |
+
target_instrument: Vocals
|
| 114 |
+
num_epochs: 1000
|
| 115 |
+
num_steps: 1000
|
| 116 |
+
q: 0.95
|
| 117 |
+
coarse_loss_clip: true
|
| 118 |
+
ema_momentum: 0.999
|
| 119 |
+
# optimizer: prodigy
|
| 120 |
+
optimizer: adam
|
| 121 |
+
lr: 1.0e-5
|
| 122 |
+
other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
|
| 123 |
+
use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
|
| 124 |
+
|
| 125 |
+
inference:
|
| 126 |
+
batch_size: 2
|
| 127 |
+
dim_t: 2001
|
| 128 |
+
num_overlap: 4
|
| 129 |
+
normalize: false
|
config_bs_roformer_vocals_gabox.yaml
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
audio:
|
| 2 |
+
chunk_size: 352800
|
| 3 |
+
dim_f: 1024
|
| 4 |
+
dim_t: 801 # don't work (use in model)
|
| 5 |
+
hop_length: 441 # don't work (use in model)
|
| 6 |
+
n_fft: 2048
|
| 7 |
+
num_channels: 2
|
| 8 |
+
sample_rate: 44100
|
| 9 |
+
min_mean_abs: 0.001
|
| 10 |
+
|
| 11 |
+
model:
|
| 12 |
+
dim: 512
|
| 13 |
+
depth: 12
|
| 14 |
+
stereo: true
|
| 15 |
+
num_stems: 1
|
| 16 |
+
time_transformer_depth: 1
|
| 17 |
+
freq_transformer_depth: 1
|
| 18 |
+
freqs_per_bands: !!python/tuple
|
| 19 |
+
- 2
|
| 20 |
+
- 2
|
| 21 |
+
- 2
|
| 22 |
+
- 2
|
| 23 |
+
- 2
|
| 24 |
+
- 2
|
| 25 |
+
- 2
|
| 26 |
+
- 2
|
| 27 |
+
- 2
|
| 28 |
+
- 2
|
| 29 |
+
- 2
|
| 30 |
+
- 2
|
| 31 |
+
- 2
|
| 32 |
+
- 2
|
| 33 |
+
- 2
|
| 34 |
+
- 2
|
| 35 |
+
- 2
|
| 36 |
+
- 2
|
| 37 |
+
- 2
|
| 38 |
+
- 2
|
| 39 |
+
- 2
|
| 40 |
+
- 2
|
| 41 |
+
- 2
|
| 42 |
+
- 2
|
| 43 |
+
- 4
|
| 44 |
+
- 4
|
| 45 |
+
- 4
|
| 46 |
+
- 4
|
| 47 |
+
- 4
|
| 48 |
+
- 4
|
| 49 |
+
- 4
|
| 50 |
+
- 4
|
| 51 |
+
- 4
|
| 52 |
+
- 4
|
| 53 |
+
- 4
|
| 54 |
+
- 4
|
| 55 |
+
- 12
|
| 56 |
+
- 12
|
| 57 |
+
- 12
|
| 58 |
+
- 12
|
| 59 |
+
- 12
|
| 60 |
+
- 12
|
| 61 |
+
- 12
|
| 62 |
+
- 12
|
| 63 |
+
- 24
|
| 64 |
+
- 24
|
| 65 |
+
- 24
|
| 66 |
+
- 24
|
| 67 |
+
- 24
|
| 68 |
+
- 24
|
| 69 |
+
- 24
|
| 70 |
+
- 24
|
| 71 |
+
- 48
|
| 72 |
+
- 48
|
| 73 |
+
- 48
|
| 74 |
+
- 48
|
| 75 |
+
- 48
|
| 76 |
+
- 48
|
| 77 |
+
- 48
|
| 78 |
+
- 48
|
| 79 |
+
- 128
|
| 80 |
+
- 129
|
| 81 |
+
dim_head: 64
|
| 82 |
+
heads: 8
|
| 83 |
+
attn_dropout: 0.1
|
| 84 |
+
ff_dropout: 0.1
|
| 85 |
+
flash_attn: true
|
| 86 |
+
dim_freqs_in: 1025
|
| 87 |
+
stft_n_fft: 2048
|
| 88 |
+
stft_hop_length: 441
|
| 89 |
+
stft_win_length: 2048
|
| 90 |
+
stft_normalized: false
|
| 91 |
+
mask_estimator_depth: 2
|
| 92 |
+
multi_stft_resolution_loss_weight: 1.0
|
| 93 |
+
multi_stft_resolutions_window_sizes: !!python/tuple
|
| 94 |
+
- 4096
|
| 95 |
+
- 2048
|
| 96 |
+
- 1024
|
| 97 |
+
- 512
|
| 98 |
+
- 256
|
| 99 |
+
multi_stft_hop_size: 147
|
| 100 |
+
multi_stft_normalized: False
|
| 101 |
+
|
| 102 |
+
training:
|
| 103 |
+
batch_size: 16
|
| 104 |
+
gradient_accumulation_steps: 1
|
| 105 |
+
grad_clip: 0
|
| 106 |
+
instruments:
|
| 107 |
+
- Vocals
|
| 108 |
+
- Instrumental
|
| 109 |
+
lr: 5.0e-05
|
| 110 |
+
patience: 2
|
| 111 |
+
reduce_factor: 0.95
|
| 112 |
+
target_instrument: Vocals
|
| 113 |
+
num_epochs: 1000
|
| 114 |
+
num_steps: 1000
|
| 115 |
+
augmentation: false # enable augmentations by audiomentations and pedalboard
|
| 116 |
+
augmentation_type: simple1
|
| 117 |
+
use_mp3_compress: false # Deprecated
|
| 118 |
+
augmentation_mix: true # Mix several stems of the same type with some probability
|
| 119 |
+
augmentation_loudness: true # randomly change loudness of each stem
|
| 120 |
+
augmentation_loudness_type: 1 # Type 1 or 2
|
| 121 |
+
augmentation_loudness_min: 0.5
|
| 122 |
+
augmentation_loudness_max: 1.5
|
| 123 |
+
q: 0.95
|
| 124 |
+
coarse_loss_clip: true
|
| 125 |
+
ema_momentum: 0.999
|
| 126 |
+
optimizer: adam
|
| 127 |
+
other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
|
| 128 |
+
use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
|
| 129 |
+
|
| 130 |
+
inference:
|
| 131 |
+
batch_size: 1
|
| 132 |
+
dim_t: 801
|
| 133 |
+
num_overlap: 4
|
config_bs_roformer_vocals_resurrection_unwa.yaml
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
audio:
|
| 2 |
+
chunk_size: 785920
|
| 3 |
+
dim_f: 1024
|
| 4 |
+
dim_t: 1536 # don't work (use in model)
|
| 5 |
+
hop_length: 441 # don't work (use in model)
|
| 6 |
+
n_fft: 2048
|
| 7 |
+
num_channels: 2
|
| 8 |
+
sample_rate: 44100
|
| 9 |
+
min_mean_abs: 0.000
|
| 10 |
+
|
| 11 |
+
model:
|
| 12 |
+
dim: 256
|
| 13 |
+
depth: 12
|
| 14 |
+
stereo: true
|
| 15 |
+
num_stems: 1
|
| 16 |
+
time_transformer_depth: 1
|
| 17 |
+
freq_transformer_depth: 1
|
| 18 |
+
linear_transformer_depth: 0
|
| 19 |
+
freqs_per_bands: !!python/tuple
|
| 20 |
+
- 2
|
| 21 |
+
- 2
|
| 22 |
+
- 2
|
| 23 |
+
- 2
|
| 24 |
+
- 2
|
| 25 |
+
- 2
|
| 26 |
+
- 2
|
| 27 |
+
- 2
|
| 28 |
+
- 2
|
| 29 |
+
- 2
|
| 30 |
+
- 2
|
| 31 |
+
- 2
|
| 32 |
+
- 2
|
| 33 |
+
- 2
|
| 34 |
+
- 2
|
| 35 |
+
- 2
|
| 36 |
+
- 2
|
| 37 |
+
- 2
|
| 38 |
+
- 2
|
| 39 |
+
- 2
|
| 40 |
+
- 2
|
| 41 |
+
- 2
|
| 42 |
+
- 2
|
| 43 |
+
- 2
|
| 44 |
+
- 4
|
| 45 |
+
- 4
|
| 46 |
+
- 4
|
| 47 |
+
- 4
|
| 48 |
+
- 4
|
| 49 |
+
- 4
|
| 50 |
+
- 4
|
| 51 |
+
- 4
|
| 52 |
+
- 4
|
| 53 |
+
- 4
|
| 54 |
+
- 4
|
| 55 |
+
- 4
|
| 56 |
+
- 12
|
| 57 |
+
- 12
|
| 58 |
+
- 12
|
| 59 |
+
- 12
|
| 60 |
+
- 12
|
| 61 |
+
- 12
|
| 62 |
+
- 12
|
| 63 |
+
- 12
|
| 64 |
+
- 24
|
| 65 |
+
- 24
|
| 66 |
+
- 24
|
| 67 |
+
- 24
|
| 68 |
+
- 24
|
| 69 |
+
- 24
|
| 70 |
+
- 24
|
| 71 |
+
- 24
|
| 72 |
+
- 48
|
| 73 |
+
- 48
|
| 74 |
+
- 48
|
| 75 |
+
- 48
|
| 76 |
+
- 48
|
| 77 |
+
- 48
|
| 78 |
+
- 48
|
| 79 |
+
- 48
|
| 80 |
+
- 128
|
| 81 |
+
- 129
|
| 82 |
+
dim_head: 64
|
| 83 |
+
heads: 8
|
| 84 |
+
attn_dropout: 0.
|
| 85 |
+
ff_dropout: 0.
|
| 86 |
+
flash_attn: true
|
| 87 |
+
dim_freqs_in: 1025
|
| 88 |
+
stft_n_fft: 2048
|
| 89 |
+
stft_hop_length: 441
|
| 90 |
+
stft_win_length: 2048
|
| 91 |
+
stft_normalized: false
|
| 92 |
+
mask_estimator_depth: 2
|
| 93 |
+
multi_stft_resolution_loss_weight: 1.0
|
| 94 |
+
multi_stft_resolutions_window_sizes: !!python/tuple
|
| 95 |
+
- 4096
|
| 96 |
+
- 2048
|
| 97 |
+
- 1024
|
| 98 |
+
- 512
|
| 99 |
+
- 256
|
| 100 |
+
multi_stft_hop_size: 147
|
| 101 |
+
multi_stft_normalized: False
|
| 102 |
+
|
| 103 |
+
training:
|
| 104 |
+
batch_size: 2
|
| 105 |
+
gradient_accumulation_steps: 1
|
| 106 |
+
grad_clip: 0
|
| 107 |
+
instruments: ['vocals', 'other']
|
| 108 |
+
patience: 3
|
| 109 |
+
reduce_factor: 0.95
|
| 110 |
+
target_instrument: vocals
|
| 111 |
+
num_epochs: 1000
|
| 112 |
+
num_steps: 1000
|
| 113 |
+
augmentation: false # enable augmentations by audiomentations and pedalboard
|
| 114 |
+
augmentation_type: simple1
|
| 115 |
+
use_mp3_compress: false # Deprecated
|
| 116 |
+
augmentation_mix: true # Mix several stems of the same type with some probability
|
| 117 |
+
augmentation_loudness: true # randomly change loudness of each stem
|
| 118 |
+
augmentation_loudness_type: 1 # Type 1 or 2
|
| 119 |
+
augmentation_loudness_min: 0.5
|
| 120 |
+
augmentation_loudness_max: 1.5
|
| 121 |
+
q: 0.95
|
| 122 |
+
coarse_loss_clip: true
|
| 123 |
+
ema_momentum: 0.999
|
| 124 |
+
# optimizer: prodigy
|
| 125 |
+
optimizer: adam
|
| 126 |
+
# lr: 1.0
|
| 127 |
+
lr: 1.0e-5
|
| 128 |
+
other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
|
| 129 |
+
use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
|
| 130 |
+
|
| 131 |
+
inference:
|
| 132 |
+
batch_size: 2
|
| 133 |
+
dim_t: 1536
|
| 134 |
+
num_overlap: 2
|
| 135 |
+
normalize: false
|
config_bs_roformer_vocals_revive_unwa.yaml
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
audio:
|
| 2 |
+
chunk_size: 485100 #352800 #485100
|
| 3 |
+
dim_f: 1024
|
| 4 |
+
dim_t: 1101
|
| 5 |
+
hop_length: 441
|
| 6 |
+
n_fft: 2048
|
| 7 |
+
num_channels: 2
|
| 8 |
+
sample_rate: 44100
|
| 9 |
+
min_mean_abs: 0.
|
| 10 |
+
|
| 11 |
+
model:
|
| 12 |
+
dim: 512
|
| 13 |
+
depth: 12
|
| 14 |
+
stereo: true
|
| 15 |
+
num_stems: 1
|
| 16 |
+
time_transformer_depth: 1
|
| 17 |
+
freq_transformer_depth: 1
|
| 18 |
+
linear_transformer_depth: 0
|
| 19 |
+
freqs_per_bands: !!python/tuple
|
| 20 |
+
- 2
|
| 21 |
+
- 2
|
| 22 |
+
- 2
|
| 23 |
+
- 2
|
| 24 |
+
- 2
|
| 25 |
+
- 2
|
| 26 |
+
- 2
|
| 27 |
+
- 2
|
| 28 |
+
- 2
|
| 29 |
+
- 2
|
| 30 |
+
- 2
|
| 31 |
+
- 2
|
| 32 |
+
- 2
|
| 33 |
+
- 2
|
| 34 |
+
- 2
|
| 35 |
+
- 2
|
| 36 |
+
- 2
|
| 37 |
+
- 2
|
| 38 |
+
- 2
|
| 39 |
+
- 2
|
| 40 |
+
- 2
|
| 41 |
+
- 2
|
| 42 |
+
- 2
|
| 43 |
+
- 2
|
| 44 |
+
- 4
|
| 45 |
+
- 4
|
| 46 |
+
- 4
|
| 47 |
+
- 4
|
| 48 |
+
- 4
|
| 49 |
+
- 4
|
| 50 |
+
- 4
|
| 51 |
+
- 4
|
| 52 |
+
- 4
|
| 53 |
+
- 4
|
| 54 |
+
- 4
|
| 55 |
+
- 4
|
| 56 |
+
- 12
|
| 57 |
+
- 12
|
| 58 |
+
- 12
|
| 59 |
+
- 12
|
| 60 |
+
- 12
|
| 61 |
+
- 12
|
| 62 |
+
- 12
|
| 63 |
+
- 12
|
| 64 |
+
- 24
|
| 65 |
+
- 24
|
| 66 |
+
- 24
|
| 67 |
+
- 24
|
| 68 |
+
- 24
|
| 69 |
+
- 24
|
| 70 |
+
- 24
|
| 71 |
+
- 24
|
| 72 |
+
- 48
|
| 73 |
+
- 48
|
| 74 |
+
- 48
|
| 75 |
+
- 48
|
| 76 |
+
- 48
|
| 77 |
+
- 48
|
| 78 |
+
- 48
|
| 79 |
+
- 48
|
| 80 |
+
- 128
|
| 81 |
+
- 129
|
| 82 |
+
dim_head: 64
|
| 83 |
+
heads: 8
|
| 84 |
+
attn_dropout: 0.
|
| 85 |
+
ff_dropout: 0.
|
| 86 |
+
flash_attn: true
|
| 87 |
+
dim_freqs_in: 1025
|
| 88 |
+
stft_n_fft: 2048
|
| 89 |
+
stft_hop_length: 441
|
| 90 |
+
stft_win_length: 2048
|
| 91 |
+
stft_normalized: false
|
| 92 |
+
mask_estimator_depth: 2
|
| 93 |
+
multi_stft_resolution_loss_weight: 1.0
|
| 94 |
+
multi_stft_resolutions_window_sizes: !!python/tuple
|
| 95 |
+
- 4096
|
| 96 |
+
- 2048
|
| 97 |
+
- 1024
|
| 98 |
+
- 512
|
| 99 |
+
- 256
|
| 100 |
+
multi_stft_hop_size: 147
|
| 101 |
+
multi_stft_normalized: False
|
| 102 |
+
|
| 103 |
+
training:
|
| 104 |
+
batch_size: 1
|
| 105 |
+
gradient_accumulation_steps: 1
|
| 106 |
+
grad_clip: 0
|
| 107 |
+
instruments:
|
| 108 |
+
- vocals
|
| 109 |
+
- other
|
| 110 |
+
lr: 1.0e-05
|
| 111 |
+
patience: 2
|
| 112 |
+
reduce_factor: 0.95
|
| 113 |
+
target_instrument: vocals
|
| 114 |
+
num_epochs: 1000
|
| 115 |
+
num_steps: 1000
|
| 116 |
+
augmentation: false # enable augmentations by audiomentations and pedalboard
|
| 117 |
+
augmentation_type: null
|
| 118 |
+
use_mp3_compress: false # Deprecated
|
| 119 |
+
augmentation_mix: false # Mix several stems of the same type with some probability
|
| 120 |
+
augmentation_loudness: false # randomly change loudness of each stem
|
| 121 |
+
augmentation_loudness_type: 1 # Type 1 or 2
|
| 122 |
+
augmentation_loudness_min: 0
|
| 123 |
+
augmentation_loudness_max: 0
|
| 124 |
+
q: 0.95
|
| 125 |
+
coarse_loss_clip: false
|
| 126 |
+
ema_momentum: 0.999
|
| 127 |
+
optimizer: adam
|
| 128 |
+
other_fix: true # needed when validating on the multisong dataset, to check whether "other" is actually the instrumental
|
| 129 |
+
use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
|
| 130 |
+
|
| 131 |
+
inference:
|
| 132 |
+
batch_size: 2
|
| 133 |
+
dim_t: 1101
|
| 134 |
+
num_overlap: 2
|
config_dereverb-echo_mel_band_roformer.yaml
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
audio:
|
| 2 |
+
chunk_size: 352800
|
| 3 |
+
dim_f: 1024
|
| 4 |
+
dim_t: 801 # has no effect here (the model's own settings are used instead)
|
| 5 |
+
hop_length: 441 # has no effect here (the model's own settings are used instead)
|
| 6 |
+
n_fft: 2048
|
| 7 |
+
num_channels: 2
|
| 8 |
+
sample_rate: 44100
|
| 9 |
+
min_mean_abs: 0.000
|
| 10 |
+
|
| 11 |
+
model:
|
| 12 |
+
dim: 256
|
| 13 |
+
depth: 8
|
| 14 |
+
stereo: true
|
| 15 |
+
num_stems: 2
|
| 16 |
+
time_transformer_depth: 1
|
| 17 |
+
freq_transformer_depth: 1
|
| 18 |
+
num_bands: 60
|
| 19 |
+
dim_head: 64
|
| 20 |
+
heads: 8
|
| 21 |
+
attn_dropout: 0.1
|
| 22 |
+
ff_dropout: 0.1
|
| 23 |
+
flash_attn: True
|
| 24 |
+
dim_freqs_in: 1025
|
| 25 |
+
sample_rate: 44100 # needed for mel filter bank from librosa
|
| 26 |
+
stft_n_fft: 2048
|
| 27 |
+
stft_hop_length: 441
|
| 28 |
+
stft_win_length: 2048
|
| 29 |
+
stft_normalized: False
|
| 30 |
+
mask_estimator_depth: 2
|
| 31 |
+
multi_stft_resolution_loss_weight: 1.0
|
| 32 |
+
multi_stft_resolutions_window_sizes: !!python/tuple
|
| 33 |
+
- 4096
|
| 34 |
+
- 2048
|
| 35 |
+
- 1024
|
| 36 |
+
- 512
|
| 37 |
+
- 256
|
| 38 |
+
multi_stft_hop_size: 147
|
| 39 |
+
multi_stft_normalized: False
|
| 40 |
+
|
| 41 |
+
training:
|
| 42 |
+
batch_size: 1
|
| 43 |
+
gradient_accumulation_steps: 8
|
| 44 |
+
grad_clip: 0
|
| 45 |
+
instruments:
|
| 46 |
+
- dry
|
| 47 |
+
- No dry
|
| 48 |
+
lr: 4.0e-05
|
| 49 |
+
patience: 2
|
| 50 |
+
reduce_factor: 0.95
|
| 51 |
+
target_instrument: null
|
| 52 |
+
num_epochs: 1000
|
| 53 |
+
num_steps: 1000
|
| 54 |
+
q: 0.95
|
| 55 |
+
coarse_loss_clip: true
|
| 56 |
+
ema_momentum: 0.999
|
| 57 |
+
optimizer: adam
|
| 58 |
+
other_fix: false # needed when validating on the multisong dataset, to check whether "other" is actually the instrumental
|
| 59 |
+
use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
|
| 60 |
+
|
| 61 |
+
augmentations:
|
| 62 |
+
enable: true # enable or disable all augmentations (a quick way to turn them all off if needed)
|
| 63 |
+
loudness: true # randomly change the loudness of each stem within the range (loudness_min; loudness_max)
|
| 64 |
+
loudness_min: 0.5
|
| 65 |
+
loudness_max: 1.5
|
| 66 |
+
mixup: false # mix several stems of the same type with some probability (only works for dataset types: 1, 2, 3)
|
| 67 |
+
mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
|
| 68 |
+
- 0.2
|
| 69 |
+
- 0.02
|
| 70 |
+
mixup_loudness_min: 0.5
|
| 71 |
+
mixup_loudness_max: 1.5
|
| 72 |
+
|
| 73 |
+
inference:
|
| 74 |
+
batch_size: 4
|
| 75 |
+
dim_t: 801
|
| 76 |
+
num_overlap: 4
|
config_mdx23c_similarity.yaml
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
audio:
|
| 2 |
+
chunk_size: 130560
|
| 3 |
+
dim_f: 1024
|
| 4 |
+
dim_t: 256
|
| 5 |
+
hop_length: 512
|
| 6 |
+
n_fft: 2048
|
| 7 |
+
num_channels: 2
|
| 8 |
+
sample_rate: 44100
|
| 9 |
+
min_mean_abs: 0.001
|
| 10 |
+
|
| 11 |
+
model:
|
| 12 |
+
act: gelu
|
| 13 |
+
bottleneck_factor: 4
|
| 14 |
+
growth: 128
|
| 15 |
+
norm: InstanceNorm
|
| 16 |
+
num_blocks_per_scale: 2
|
| 17 |
+
num_channels: 128
|
| 18 |
+
num_scales: 5
|
| 19 |
+
num_subbands: 4
|
| 20 |
+
scale:
|
| 21 |
+
- 2
|
| 22 |
+
- 2
|
| 23 |
+
|
| 24 |
+
training:
|
| 25 |
+
batch_size: 2
|
| 26 |
+
gradient_accumulation_steps: 3
|
| 27 |
+
grad_clip: 0
|
| 28 |
+
instruments:
|
| 29 |
+
- Similarity
|
| 30 |
+
- Difference
|
| 31 |
+
lr: 1.0
|
| 32 |
+
patience: 15
|
| 33 |
+
reduce_factor: 0.95
|
| 34 |
+
target_instrument: Similarity
|
| 35 |
+
num_epochs: 1000
|
| 36 |
+
num_steps: 2235
|
| 37 |
+
q: 0.95
|
| 38 |
+
coarse_loss_clip: true
|
| 39 |
+
ema_momentum: 0.999
|
| 40 |
+
optimizer: prodigy
|
| 41 |
+
other_fix: false # needed when validating on the multisong dataset, to check whether "other" is actually the instrumental
|
| 42 |
+
use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
|
| 43 |
+
|
| 44 |
+
inference:
|
| 45 |
+
batch_size: 8
|
| 46 |
+
dim_t: 256
|
| 47 |
+
num_overlap: 8
|
config_mel_band_roformer_instrumental_becruily.yaml
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
audio:
|
| 2 |
+
chunk_size: 352800
|
| 3 |
+
dim_f: 1024
|
| 4 |
+
dim_t: 256
|
| 5 |
+
hop_length: 441
|
| 6 |
+
n_fft: 2048
|
| 7 |
+
num_channels: 2
|
| 8 |
+
sample_rate: 44100
|
| 9 |
+
min_mean_abs: 0.000
|
| 10 |
+
|
| 11 |
+
model:
|
| 12 |
+
dim: 384
|
| 13 |
+
depth: 6
|
| 14 |
+
stereo: true
|
| 15 |
+
num_stems: 1
|
| 16 |
+
time_transformer_depth: 1
|
| 17 |
+
freq_transformer_depth: 1
|
| 18 |
+
num_bands: 60
|
| 19 |
+
dim_head: 64
|
| 20 |
+
heads: 8
|
| 21 |
+
attn_dropout: 0
|
| 22 |
+
ff_dropout: 0
|
| 23 |
+
flash_attn: True
|
| 24 |
+
dim_freqs_in: 1025
|
| 25 |
+
sample_rate: 44100 # needed for mel filter bank from librosa
|
| 26 |
+
stft_n_fft: 2048
|
| 27 |
+
stft_hop_length: 441
|
| 28 |
+
stft_win_length: 2048
|
| 29 |
+
stft_normalized: False
|
| 30 |
+
mask_estimator_depth: 2
|
| 31 |
+
multi_stft_resolution_loss_weight: 1.0
|
| 32 |
+
multi_stft_resolutions_window_sizes: !!python/tuple
|
| 33 |
+
- 4096
|
| 34 |
+
- 2048
|
| 35 |
+
- 1024
|
| 36 |
+
- 512
|
| 37 |
+
- 256
|
| 38 |
+
multi_stft_hop_size: 147
|
| 39 |
+
multi_stft_normalized: False
|
| 40 |
+
|
| 41 |
+
training:
|
| 42 |
+
batch_size: 1
|
| 43 |
+
gradient_accumulation_steps: 1
|
| 44 |
+
grad_clip: 0
|
| 45 |
+
instruments:
|
| 46 |
+
- Instrumental
|
| 47 |
+
- Vocals
|
| 48 |
+
lr: 0.0005
|
| 49 |
+
patience: 2
|
| 50 |
+
reduce_factor: 0.95
|
| 51 |
+
target_instrument: Instrumental
|
| 52 |
+
num_epochs: 1000
|
| 53 |
+
num_steps: 1000
|
| 54 |
+
augmentation: false # enable augmentations by audiomentations and pedalboard
|
| 55 |
+
augmentation_type: null
|
| 56 |
+
use_mp3_compress: false # Deprecated
|
| 57 |
+
augmentation_mix: false # Mix several stems of the same type with some probability
|
| 58 |
+
augmentation_loudness: false # randomly change loudness of each stem
|
| 59 |
+
augmentation_loudness_type: 1 # Type 1 or 2
|
| 60 |
+
augmentation_loudness_min: 0
|
| 61 |
+
augmentation_loudness_max: 0
|
| 62 |
+
q: 0.95
|
| 63 |
+
coarse_loss_clip: false
|
| 64 |
+
ema_momentum: 0.999
|
| 65 |
+
optimizer: adamw
|
| 66 |
+
other_fix: false # needed when validating on the multisong dataset, to check whether "other" is actually the instrumental
|
| 67 |
+
use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
|
| 68 |
+
|
| 69 |
+
inference:
|
| 70 |
+
batch_size: 1
|
| 71 |
+
dim_t: 1101
|
| 72 |
+
num_overlap: 2
|
config_mel_band_roformer_instrumental_gabox.yaml
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
audio:
|
| 2 |
+
chunk_size: 485100
|
| 3 |
+
dim_f: 1024
|
| 4 |
+
dim_t: 1101
|
| 5 |
+
hop_length: 441
|
| 6 |
+
n_fft: 2048
|
| 7 |
+
num_channels: 2
|
| 8 |
+
sample_rate: 44100
|
| 9 |
+
min_mean_abs: 0.000
|
| 10 |
+
|
| 11 |
+
model:
|
| 12 |
+
dim: 384
|
| 13 |
+
depth: 6
|
| 14 |
+
stereo: true
|
| 15 |
+
num_stems: 1
|
| 16 |
+
time_transformer_depth: 1
|
| 17 |
+
freq_transformer_depth: 1
|
| 18 |
+
num_bands: 60
|
| 19 |
+
dim_head: 64
|
| 20 |
+
heads: 8
|
| 21 |
+
attn_dropout: 0
|
| 22 |
+
ff_dropout: 0
|
| 23 |
+
flash_attn: True
|
| 24 |
+
dim_freqs_in: 1025
|
| 25 |
+
sample_rate: 44100 # needed for mel filter bank from librosa
|
| 26 |
+
stft_n_fft: 2048
|
| 27 |
+
stft_hop_length: 441
|
| 28 |
+
stft_win_length: 2048
|
| 29 |
+
stft_normalized: False
|
| 30 |
+
mask_estimator_depth: 2
|
| 31 |
+
multi_stft_resolution_loss_weight: 1.0
|
| 32 |
+
multi_stft_resolutions_window_sizes: !!python/tuple
|
| 33 |
+
- 4096
|
| 34 |
+
- 2048
|
| 35 |
+
- 1024
|
| 36 |
+
- 512
|
| 37 |
+
- 256
|
| 38 |
+
multi_stft_hop_size: 147
|
| 39 |
+
multi_stft_normalized: False
|
| 40 |
+
|
| 41 |
+
training:
|
| 42 |
+
instruments:
|
| 43 |
+
- Instrumental
|
| 44 |
+
- Vocals
|
| 45 |
+
target_instrument: Instrumental
|
| 46 |
+
use_amp: True
|
| 47 |
+
|
| 48 |
+
inference:
|
| 49 |
+
batch_size: 1
|
| 50 |
+
dim_t: 1101
|
| 51 |
+
num_overlap: 2
|
config_mel_band_roformer_karaoke_becruily.yaml
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
audio:
|
| 2 |
+
chunk_size: 485100
|
| 3 |
+
dim_f: 1024
|
| 4 |
+
dim_t: 256
|
| 5 |
+
hop_length: 441
|
| 6 |
+
n_fft: 2048
|
| 7 |
+
num_channels: 2
|
| 8 |
+
sample_rate: 44100
|
| 9 |
+
min_mean_abs: 0.000
|
| 10 |
+
|
| 11 |
+
model:
|
| 12 |
+
dim: 384
|
| 13 |
+
depth: 6
|
| 14 |
+
stereo: true
|
| 15 |
+
num_stems: 2
|
| 16 |
+
time_transformer_depth: 1
|
| 17 |
+
freq_transformer_depth: 1
|
| 18 |
+
num_bands: 60
|
| 19 |
+
dim_head: 64
|
| 20 |
+
heads: 8
|
| 21 |
+
attn_dropout: 0
|
| 22 |
+
ff_dropout: 0
|
| 23 |
+
flash_attn: true
|
| 24 |
+
dim_freqs_in: 1025
|
| 25 |
+
sample_rate: 44100 # needed for mel filter bank from librosa
|
| 26 |
+
stft_n_fft: 2048
|
| 27 |
+
stft_hop_length: 441
|
| 28 |
+
stft_win_length: 2048
|
| 29 |
+
stft_normalized: false
|
| 30 |
+
mask_estimator_depth: 2
|
| 31 |
+
multi_stft_resolution_loss_weight: 1.0
|
| 32 |
+
multi_stft_resolutions_window_sizes: !!python/tuple
|
| 33 |
+
- 4096
|
| 34 |
+
- 2048
|
| 35 |
+
- 1024
|
| 36 |
+
- 512
|
| 37 |
+
- 256
|
| 38 |
+
multi_stft_hop_size: 147
|
| 39 |
+
multi_stft_normalized: false
|
| 40 |
+
|
| 41 |
+
training:
|
| 42 |
+
batch_size: 1
|
| 43 |
+
gradient_accumulation_steps: 1
|
| 44 |
+
grad_clip: 0
|
| 45 |
+
instruments:
|
| 46 |
+
- Vocals
|
| 47 |
+
- Instrumental
|
| 48 |
+
lr: 0.0005
|
| 49 |
+
patience: 2
|
| 50 |
+
reduce_factor: 0.95
|
| 51 |
+
target_instrument: null
|
| 52 |
+
num_epochs: 1000
|
| 53 |
+
num_steps: 1000
|
| 54 |
+
augmentation: false # enable augmentations by audiomentations and pedalboard
|
| 55 |
+
augmentation_type:
|
| 56 |
+
use_mp3_compress: false # Deprecated
|
| 57 |
+
augmentation_mix: false # Mix several stems of the same type with some probability
|
| 58 |
+
augmentation_loudness: false # randomly change loudness of each stem
|
| 59 |
+
augmentation_loudness_type: 1 # Type 1 or 2
|
| 60 |
+
augmentation_loudness_min: 0
|
| 61 |
+
augmentation_loudness_max: 0
|
| 62 |
+
q: 0.95
|
| 63 |
+
coarse_loss_clip: false
|
| 64 |
+
ema_momentum: 0.999
|
| 65 |
+
optimizer: adamw
|
| 66 |
+
other_fix: false # needed when validating on the multisong dataset, to check whether "other" is actually the instrumental
|
| 67 |
+
use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
|
| 68 |
+
|
| 69 |
+
inference:
|
| 70 |
+
batch_size: 1
|
| 71 |
+
dim_t: 1101
|
| 72 |
+
num_overlap: 8
|
config_mel_band_roformer_kim_ft_unwa.yaml
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
audio:
|
| 2 |
+
chunk_size: 485100
|
| 3 |
+
dim_f: 1024
|
| 4 |
+
dim_t: 801
|
| 5 |
+
hop_length: 441
|
| 6 |
+
n_fft: 2048
|
| 7 |
+
num_channels: 2
|
| 8 |
+
sample_rate: 44100
|
| 9 |
+
min_mean_abs: 0.000
|
| 10 |
+
|
| 11 |
+
model:
|
| 12 |
+
dim: 384
|
| 13 |
+
depth: 6
|
| 14 |
+
stereo: true
|
| 15 |
+
num_stems: 1
|
| 16 |
+
time_transformer_depth: 1
|
| 17 |
+
freq_transformer_depth: 1
|
| 18 |
+
num_bands: 60
|
| 19 |
+
dim_head: 64
|
| 20 |
+
heads: 8
|
| 21 |
+
attn_dropout: 0
|
| 22 |
+
ff_dropout: 0
|
| 23 |
+
flash_attn: True
|
| 24 |
+
dim_freqs_in: 1025
|
| 25 |
+
sample_rate: 44100 # needed for mel filter bank from librosa
|
| 26 |
+
stft_n_fft: 2048
|
| 27 |
+
stft_hop_length: 441
|
| 28 |
+
stft_win_length: 2048
|
| 29 |
+
stft_normalized: False
|
| 30 |
+
mask_estimator_depth: 2
|
| 31 |
+
multi_stft_resolution_loss_weight: 1.0
|
| 32 |
+
multi_stft_resolutions_window_sizes: !!python/tuple
|
| 33 |
+
- 4096
|
| 34 |
+
- 2048
|
| 35 |
+
- 1024
|
| 36 |
+
- 512
|
| 37 |
+
- 256
|
| 38 |
+
multi_stft_hop_size: 147
|
| 39 |
+
multi_stft_normalized: False
|
| 40 |
+
|
| 41 |
+
training:
|
| 42 |
+
batch_size: 1
|
| 43 |
+
gradient_accumulation_steps: 1
|
| 44 |
+
grad_clip: 0
|
| 45 |
+
instruments:
|
| 46 |
+
- vocals
|
| 47 |
+
- other
|
| 48 |
+
lr: 1.0e-05
|
| 49 |
+
patience: 2
|
| 50 |
+
reduce_factor: 0.95
|
| 51 |
+
target_instrument: vocals
|
| 52 |
+
num_epochs: 1000
|
| 53 |
+
num_steps: 1000
|
| 54 |
+
augmentation: false # enable augmentations by audiomentations and pedalboard
|
| 55 |
+
augmentation_type: null
|
| 56 |
+
use_mp3_compress: false # Deprecated
|
| 57 |
+
augmentation_mix: false # Mix several stems of the same type with some probability
|
| 58 |
+
augmentation_loudness: false # randomly change loudness of each stem
|
| 59 |
+
augmentation_loudness_type: 1 # Type 1 or 2
|
| 60 |
+
augmentation_loudness_min: 0
|
| 61 |
+
augmentation_loudness_max: 0
|
| 62 |
+
q: 0.95
|
| 63 |
+
coarse_loss_clip: false
|
| 64 |
+
ema_momentum: 0.999
|
| 65 |
+
optimizer: adam
|
| 66 |
+
other_fix: true # needed when validating on the multisong dataset, to check whether "other" is actually the instrumental
|
| 67 |
+
use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
|
| 68 |
+
|
| 69 |
+
inference:
|
| 70 |
+
batch_size: 1
|
| 71 |
+
dim_t: 801
|
| 72 |
+
num_overlap: 8
|
config_mel_band_roformer_vocal_fullness_aname.yaml
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
audio:
|
| 2 |
+
chunk_size: 661500
|
| 3 |
+
dim_f: 1024
|
| 4 |
+
dim_t: 256
|
| 5 |
+
hop_length: 441
|
| 6 |
+
n_fft: 2048
|
| 7 |
+
num_channels: 2
|
| 8 |
+
sample_rate: 44100
|
| 9 |
+
min_mean_abs: 0.001
|
| 10 |
+
|
| 11 |
+
model:
|
| 12 |
+
dim: 384
|
| 13 |
+
depth: 6
|
| 14 |
+
stereo: true
|
| 15 |
+
num_stems: 1
|
| 16 |
+
time_transformer_depth: 1
|
| 17 |
+
freq_transformer_depth: 1
|
| 18 |
+
num_bands: 60
|
| 19 |
+
dim_head: 64
|
| 20 |
+
heads: 8
|
| 21 |
+
attn_dropout: 0
|
| 22 |
+
ff_dropout: 0
|
| 23 |
+
flash_attn: True
|
| 24 |
+
dim_freqs_in: 1025
|
| 25 |
+
sample_rate: 44100
|
| 26 |
+
stft_n_fft: 2048
|
| 27 |
+
stft_hop_length: 441
|
| 28 |
+
stft_win_length: 2048
|
| 29 |
+
stft_normalized: False
|
| 30 |
+
mask_estimator_depth: 2
|
| 31 |
+
multi_stft_resolution_loss_weight: 1.0
|
| 32 |
+
multi_stft_resolutions_window_sizes: !!python/tuple
|
| 33 |
+
- 4096
|
| 34 |
+
- 2048
|
| 35 |
+
- 1024
|
| 36 |
+
- 512
|
| 37 |
+
- 256
|
| 38 |
+
multi_stft_hop_size: 147
|
| 39 |
+
multi_stft_normalized: False
|
| 40 |
+
|
| 41 |
+
training:
|
| 42 |
+
batch_size: 1
|
| 43 |
+
gradient_accumulation_steps: 1
|
| 44 |
+
grad_clip: 0
|
| 45 |
+
instruments:
|
| 46 |
+
- vocals
|
| 47 |
+
- other
|
| 48 |
+
target_instrument: vocals
|
| 49 |
+
use_amp: true
|
| 50 |
+
|
| 51 |
+
inference:
|
| 52 |
+
batch_size: 4
|
| 53 |
+
dim_t: 1101
|
| 54 |
+
num_overlap: 4
|
config_mel_band_roformer_vocals_becruily.yaml
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
audio:
|
| 2 |
+
chunk_size: 352800
|
| 3 |
+
dim_f: 1024
|
| 4 |
+
dim_t: 256
|
| 5 |
+
hop_length: 441
|
| 6 |
+
n_fft: 2048
|
| 7 |
+
num_channels: 2
|
| 8 |
+
sample_rate: 44100
|
| 9 |
+
min_mean_abs: 0.000
|
| 10 |
+
|
| 11 |
+
model:
|
| 12 |
+
dim: 384
|
| 13 |
+
depth: 6
|
| 14 |
+
stereo: true
|
| 15 |
+
num_stems: 1
|
| 16 |
+
time_transformer_depth: 1
|
| 17 |
+
freq_transformer_depth: 1
|
| 18 |
+
num_bands: 60
|
| 19 |
+
dim_head: 64
|
| 20 |
+
heads: 8
|
| 21 |
+
attn_dropout: 0
|
| 22 |
+
ff_dropout: 0
|
| 23 |
+
flash_attn: True
|
| 24 |
+
dim_freqs_in: 1025
|
| 25 |
+
sample_rate: 44100 # needed for mel filter bank from librosa
|
| 26 |
+
stft_n_fft: 2048
|
| 27 |
+
stft_hop_length: 441
|
| 28 |
+
stft_win_length: 2048
|
| 29 |
+
stft_normalized: False
|
| 30 |
+
mask_estimator_depth: 2
|
| 31 |
+
multi_stft_resolution_loss_weight: 1.0
|
| 32 |
+
multi_stft_resolutions_window_sizes: !!python/tuple
|
| 33 |
+
- 4096
|
| 34 |
+
- 2048
|
| 35 |
+
- 1024
|
| 36 |
+
- 512
|
| 37 |
+
- 256
|
| 38 |
+
multi_stft_hop_size: 147
|
| 39 |
+
multi_stft_normalized: False
|
| 40 |
+
|
| 41 |
+
training:
|
| 42 |
+
batch_size: 1
|
| 43 |
+
gradient_accumulation_steps: 1
|
| 44 |
+
grad_clip: 0
|
| 45 |
+
instruments:
|
| 46 |
+
- vocals
|
| 47 |
+
- other
|
| 48 |
+
lr: 0.0005
|
| 49 |
+
patience: 2
|
| 50 |
+
reduce_factor: 0.95
|
| 51 |
+
target_instrument: vocals
|
| 52 |
+
num_epochs: 1000
|
| 53 |
+
num_steps: 1000
|
| 54 |
+
augmentation: false # enable augmentations by audiomentations and pedalboard
|
| 55 |
+
augmentation_type: null
|
| 56 |
+
use_mp3_compress: false # Deprecated
|
| 57 |
+
augmentation_mix: false # Mix several stems of the same type with some probability
|
| 58 |
+
augmentation_loudness: false # randomly change loudness of each stem
|
| 59 |
+
augmentation_loudness_type: 1 # Type 1 or 2
|
| 60 |
+
augmentation_loudness_min: 0
|
| 61 |
+
augmentation_loudness_max: 0
|
| 62 |
+
q: 0.95
|
| 63 |
+
coarse_loss_clip: false
|
| 64 |
+
ema_momentum: 0.999
|
| 65 |
+
optimizer: adamw
|
| 66 |
+
other_fix: false # needed when validating on the multisong dataset, to check whether "other" is actually the instrumental
|
| 67 |
+
use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
|
| 68 |
+
|
| 69 |
+
inference:
|
| 70 |
+
batch_size: 1
|
| 71 |
+
dim_t: 1101
|
| 72 |
+
num_overlap: 2
|
config_mel_band_roformer_vocals_gabox.yaml
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
audio:
|
| 2 |
+
chunk_size: 352800
|
| 3 |
+
dim_f: 1024
|
| 4 |
+
dim_t: 256
|
| 5 |
+
hop_length: 441
|
| 6 |
+
n_fft: 2048
|
| 7 |
+
num_channels: 2
|
| 8 |
+
sample_rate: 44100
|
| 9 |
+
min_mean_abs: 0.001
|
| 10 |
+
|
| 11 |
+
model:
|
| 12 |
+
dim: 384
|
| 13 |
+
depth: 6
|
| 14 |
+
stereo: true
|
| 15 |
+
num_stems: 1
|
| 16 |
+
time_transformer_depth: 1
|
| 17 |
+
freq_transformer_depth: 1
|
| 18 |
+
num_bands: 60
|
| 19 |
+
dim_head: 64
|
| 20 |
+
heads: 8
|
| 21 |
+
attn_dropout: 0
|
| 22 |
+
ff_dropout: 0
|
| 23 |
+
flash_attn: True
|
| 24 |
+
dim_freqs_in: 1025
|
| 25 |
+
sample_rate: 44100 # needed for mel filter bank from librosa
|
| 26 |
+
stft_n_fft: 2048
|
| 27 |
+
stft_hop_length: 441
|
| 28 |
+
stft_win_length: 2048
|
| 29 |
+
stft_normalized: False
|
| 30 |
+
mask_estimator_depth: 2
|
| 31 |
+
multi_stft_resolution_loss_weight: 1.0
|
| 32 |
+
multi_stft_resolutions_window_sizes: !!python/tuple
|
| 33 |
+
- 4096
|
| 34 |
+
- 2048
|
| 35 |
+
- 1024
|
| 36 |
+
- 512
|
| 37 |
+
- 256
|
| 38 |
+
multi_stft_hop_size: 147
|
| 39 |
+
multi_stft_normalized: False
|
| 40 |
+
|
| 41 |
+
training:
|
| 42 |
+
instruments:
|
| 43 |
+
- Vocals
|
| 44 |
+
- Instrumental
|
| 45 |
+
target_instrument: Vocals
|
| 46 |
+
|
| 47 |
+
inference:
|
| 48 |
+
batch_size: 1
|
| 49 |
+
dim_t: 1101
|
| 50 |
+
num_overlap: 1
|
| 51 |
+
chunk_size: 352800
|
config_melbandroformer_inst.yaml
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
audio:
|
| 2 |
+
chunk_size: 485100
|
| 3 |
+
dim_f: 1024
|
| 4 |
+
dim_t: 1101
|
| 5 |
+
hop_length: 441
|
| 6 |
+
n_fft: 2048
|
| 7 |
+
num_channels: 2
|
| 8 |
+
sample_rate: 44100
|
| 9 |
+
min_mean_abs: 0.000
|
| 10 |
+
|
| 11 |
+
model:
|
| 12 |
+
dim: 384
|
| 13 |
+
depth: 6
|
| 14 |
+
stereo: true
|
| 15 |
+
num_stems: 1
|
| 16 |
+
time_transformer_depth: 1
|
| 17 |
+
freq_transformer_depth: 1
|
| 18 |
+
num_bands: 60
|
| 19 |
+
dim_head: 64
|
| 20 |
+
heads: 8
|
| 21 |
+
attn_dropout: 0
|
| 22 |
+
ff_dropout: 0
|
| 23 |
+
flash_attn: True
|
| 24 |
+
dim_freqs_in: 1025
|
| 25 |
+
sample_rate: 44100 # needed for mel filter bank from librosa
|
| 26 |
+
stft_n_fft: 2048
|
| 27 |
+
stft_hop_length: 441
|
| 28 |
+
stft_win_length: 2048
|
| 29 |
+
stft_normalized: False
|
| 30 |
+
mask_estimator_depth: 2
|
| 31 |
+
multi_stft_resolution_loss_weight: 1.0
|
| 32 |
+
multi_stft_resolutions_window_sizes: !!python/tuple
|
| 33 |
+
- 4096
|
| 34 |
+
- 2048
|
| 35 |
+
- 1024
|
| 36 |
+
- 512
|
| 37 |
+
- 256
|
| 38 |
+
multi_stft_hop_size: 147
|
| 39 |
+
multi_stft_normalized: False
|
| 40 |
+
|
| 41 |
+
training:
|
| 42 |
+
instruments:
|
| 43 |
+
- other
|
| 44 |
+
- vocals
|
| 45 |
+
target_instrument: other
|
| 46 |
+
use_amp: True
|
| 47 |
+
|
| 48 |
+
inference:
|
| 49 |
+
batch_size: 1
|
| 50 |
+
dim_t: 1101
|
| 51 |
+
num_overlap: 2
|
config_melbandroformer_inst_v2.yaml
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
audio:
|
| 2 |
+
chunk_size: 485100
|
| 3 |
+
dim_f: 1024
|
| 4 |
+
dim_t: 1101
|
| 5 |
+
hop_length: 441
|
| 6 |
+
n_fft: 2048
|
| 7 |
+
num_channels: 2
|
| 8 |
+
sample_rate: 44100
|
| 9 |
+
min_mean_abs: 0.000
|
| 10 |
+
|
| 11 |
+
model:
|
| 12 |
+
dim: 384
|
| 13 |
+
depth: 12
|
| 14 |
+
stereo: true
|
| 15 |
+
num_stems: 1
|
| 16 |
+
time_transformer_depth: 1
|
| 17 |
+
freq_transformer_depth: 1
|
| 18 |
+
num_bands: 60
|
| 19 |
+
dim_head: 64
|
| 20 |
+
heads: 8
|
| 21 |
+
attn_dropout: 0
|
| 22 |
+
ff_dropout: 0
|
| 23 |
+
flash_attn: True
|
| 24 |
+
dim_freqs_in: 1025
|
| 25 |
+
sample_rate: 44100 # needed for mel filter bank from librosa
|
| 26 |
+
stft_n_fft: 2048
|
| 27 |
+
stft_hop_length: 441
|
| 28 |
+
stft_win_length: 2048
|
| 29 |
+
stft_normalized: False
|
| 30 |
+
mask_estimator_depth: 3
|
| 31 |
+
multi_stft_resolution_loss_weight: 1.0
|
| 32 |
+
multi_stft_resolutions_window_sizes: !!python/tuple
|
| 33 |
+
- 4096
|
| 34 |
+
- 2048
|
| 35 |
+
- 1024
|
| 36 |
+
- 512
|
| 37 |
+
- 256
|
| 38 |
+
multi_stft_hop_size: 147
|
| 39 |
+
multi_stft_normalized: False
|
| 40 |
+
|
| 41 |
+
training:
|
| 42 |
+
instruments:
|
| 43 |
+
- Instrumental
|
| 44 |
+
- Vocals
|
| 45 |
+
target_instrument: Instrumental
|
| 46 |
+
use_amp: True
|
| 47 |
+
|
| 48 |
+
inference:
|
| 49 |
+
batch_size: 1
|
| 50 |
+
dim_t: 1101
|
| 51 |
+
num_overlap: 2
|
config_melbandroformer_instvoc_duality.yaml
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
audio:
|
| 2 |
+
chunk_size: 485100
|
| 3 |
+
dim_f: 1024
|
| 4 |
+
dim_t: 256
|
| 5 |
+
hop_length: 441
|
| 6 |
+
n_fft: 2048
|
| 7 |
+
num_channels: 2
|
| 8 |
+
sample_rate: 44100
|
| 9 |
+
min_mean_abs: 0.000
|
| 10 |
+
|
| 11 |
+
model:
|
| 12 |
+
dim: 384
|
| 13 |
+
depth: 6
|
| 14 |
+
stereo: true
|
| 15 |
+
num_stems: 2
|
| 16 |
+
time_transformer_depth: 1
|
| 17 |
+
freq_transformer_depth: 1
|
| 18 |
+
num_bands: 60
|
| 19 |
+
dim_head: 64
|
| 20 |
+
heads: 8
|
| 21 |
+
attn_dropout: 0
|
| 22 |
+
ff_dropout: 0
|
| 23 |
+
flash_attn: True
|
| 24 |
+
dim_freqs_in: 1025
|
| 25 |
+
sample_rate: 44100 # needed for mel filter bank from librosa
|
| 26 |
+
stft_n_fft: 2048
|
| 27 |
+
stft_hop_length: 441
|
| 28 |
+
stft_win_length: 2048
|
| 29 |
+
stft_normalized: False
|
| 30 |
+
mask_estimator_depth: 2
|
| 31 |
+
multi_stft_resolution_loss_weight: 1.0
|
| 32 |
+
multi_stft_resolutions_window_sizes: !!python/tuple
|
| 33 |
+
- 4096
|
| 34 |
+
- 2048
|
| 35 |
+
- 1024
|
| 36 |
+
- 512
|
| 37 |
+
- 256
|
| 38 |
+
multi_stft_hop_size: 147
|
| 39 |
+
multi_stft_normalized: False
|
| 40 |
+
|
| 41 |
+
training:
|
| 42 |
+
instruments:
|
| 43 |
+
- Vocals
|
| 44 |
+
- Instrumental
|
| 45 |
+
target_instrument: null
|
| 46 |
+
use_amp: True
|
| 47 |
+
|
| 48 |
+
inference:
|
| 49 |
+
batch_size: 1
|
| 50 |
+
dim_t: 1101
|
| 51 |
+
num_overlap: 2
|
denoise_mel_band_roformer_aufr33_aggr_sdr_27.9768_config.yaml
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
audio:
|
| 2 |
+
chunk_size: 352800
|
| 3 |
+
dim_f: 1024
|
| 4 |
+
dim_t: 801
|
| 5 |
+
hop_length: 441
|
| 6 |
+
n_fft: 2048
|
| 7 |
+
num_channels: 2
|
| 8 |
+
sample_rate: 44100
|
| 9 |
+
min_mean_abs: 000
|
| 10 |
+
|
| 11 |
+
model:
|
| 12 |
+
dim: 384
|
| 13 |
+
depth: 6
|
| 14 |
+
stereo: true
|
| 15 |
+
num_stems: 1
|
| 16 |
+
time_transformer_depth: 1
|
| 17 |
+
freq_transformer_depth: 1
|
| 18 |
+
num_bands: 60
|
| 19 |
+
dim_head: 64
|
| 20 |
+
heads: 8
|
| 21 |
+
attn_dropout: 0
|
| 22 |
+
ff_dropout: 0
|
| 23 |
+
flash_attn: True
|
| 24 |
+
dim_freqs_in: 1025
|
| 25 |
+
sample_rate: 44100 # needed for mel filter bank from librosa
|
| 26 |
+
stft_n_fft: 2048
|
| 27 |
+
stft_hop_length: 441
|
| 28 |
+
stft_win_length: 2048
|
| 29 |
+
stft_normalized: False
|
| 30 |
+
mask_estimator_depth: 2
|
| 31 |
+
multi_stft_resolution_loss_weight: 1.0
|
| 32 |
+
multi_stft_resolutions_window_sizes: !!python/tuple
|
| 33 |
+
- 4096
|
| 34 |
+
- 2048
|
| 35 |
+
- 1024
|
| 36 |
+
- 512
|
| 37 |
+
- 256
|
| 38 |
+
multi_stft_hop_size: 147
|
| 39 |
+
multi_stft_normalized: False
|
| 40 |
+
|
| 41 |
+
training:
|
| 42 |
+
batch_size: 2
|
| 43 |
+
gradient_accumulation_steps: 1
|
| 44 |
+
grad_clip: 0
|
| 45 |
+
instruments:
|
| 46 |
+
- dry
|
| 47 |
+
- other
|
| 48 |
+
lr: 1.0e-05
|
| 49 |
+
patience: 8
|
| 50 |
+
reduce_factor: 0.95
|
| 51 |
+
target_instrument: dry
|
| 52 |
+
num_epochs: 1000
|
| 53 |
+
num_steps: 4032
|
| 54 |
+
augmentation: false # enable augmentations by audiomentations and pedalboard
|
| 55 |
+
augmentation_type: null
|
| 56 |
+
use_mp3_compress: false # Deprecated
|
| 57 |
+
augmentation_mix: false # Mix several stems of the same type with some probability
|
| 58 |
+
augmentation_loudness: false # randomly change loudness of each stem
|
| 59 |
+
augmentation_loudness_type: 1 # Type 1 or 2
|
| 60 |
+
augmentation_loudness_min: 0
|
| 61 |
+
augmentation_loudness_max: 0
|
| 62 |
+
q: 0.95
|
| 63 |
+
coarse_loss_clip: false
|
| 64 |
+
ema_momentum: 0.999
|
| 65 |
+
optimizer: adam
|
| 66 |
+
other_fix: false # needed when validating on the multisong dataset, to check whether "other" is actually the instrumental
|
| 67 |
+
|
| 68 |
+
inference:
|
| 69 |
+
batch_size: 2
|
| 70 |
+
dim_t: 801
|
| 71 |
+
num_overlap: 4
|
denoise_mel_band_roformer_aufr33_sdr_27.9959_config.yaml
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
audio:
|
| 2 |
+
chunk_size: 352800
|
| 3 |
+
dim_f: 1024
|
| 4 |
+
dim_t: 801
|
| 5 |
+
hop_length: 441
|
| 6 |
+
n_fft: 2048
|
| 7 |
+
num_channels: 2
|
| 8 |
+
sample_rate: 44100
|
| 9 |
+
min_mean_abs: 000
|
| 10 |
+
|
| 11 |
+
model:
|
| 12 |
+
dim: 384
|
| 13 |
+
depth: 6
|
| 14 |
+
stereo: true
|
| 15 |
+
num_stems: 1
|
| 16 |
+
time_transformer_depth: 1
|
| 17 |
+
freq_transformer_depth: 1
|
| 18 |
+
num_bands: 60
|
| 19 |
+
dim_head: 64
|
| 20 |
+
heads: 8
|
| 21 |
+
attn_dropout: 0
|
| 22 |
+
ff_dropout: 0
|
| 23 |
+
flash_attn: True
|
| 24 |
+
dim_freqs_in: 1025
|
| 25 |
+
sample_rate: 44100 # needed for mel filter bank from librosa
|
| 26 |
+
stft_n_fft: 2048
|
| 27 |
+
stft_hop_length: 441
|
| 28 |
+
stft_win_length: 2048
|
| 29 |
+
stft_normalized: False
|
| 30 |
+
mask_estimator_depth: 2
|
| 31 |
+
multi_stft_resolution_loss_weight: 1.0
|
| 32 |
+
multi_stft_resolutions_window_sizes: !!python/tuple
|
| 33 |
+
- 4096
|
| 34 |
+
- 2048
|
| 35 |
+
- 1024
|
| 36 |
+
- 512
|
| 37 |
+
- 256
|
| 38 |
+
multi_stft_hop_size: 147
|
| 39 |
+
multi_stft_normalized: False
|
| 40 |
+
|
| 41 |
+
training:
|
| 42 |
+
batch_size: 2
|
| 43 |
+
gradient_accumulation_steps: 1
|
| 44 |
+
grad_clip: 0
|
| 45 |
+
instruments:
|
| 46 |
+
- dry
|
| 47 |
+
- other
|
| 48 |
+
lr: 1.0e-05
|
| 49 |
+
patience: 8
|
| 50 |
+
reduce_factor: 0.95
|
| 51 |
+
target_instrument: dry
|
| 52 |
+
num_epochs: 1000
|
| 53 |
+
num_steps: 4032
|
| 54 |
+
augmentation: false # enable augmentations by audiomentations and pedalboard
|
| 55 |
+
augmentation_type: null
|
| 56 |
+
use_mp3_compress: false # Deprecated
|
| 57 |
+
augmentation_mix: false # Mix several stems of the same type with some probability
|
| 58 |
+
augmentation_loudness: false # randomly change loudness of each stem
|
| 59 |
+
augmentation_loudness_type: 1 # Type 1 or 2
|
| 60 |
+
augmentation_loudness_min: 0
|
| 61 |
+
augmentation_loudness_max: 0
|
| 62 |
+
q: 0.95
|
| 63 |
+
coarse_loss_clip: false
|
| 64 |
+
ema_momentum: 0.999
|
| 65 |
+
optimizer: adam
|
| 66 |
+
other_fix: false # needed for checking on the multisong dataset if "other" is actually the instrumental
|
| 67 |
+
|
| 68 |
+
inference:
|
| 69 |
+
batch_size: 2
|
| 70 |
+
dim_t: 801
|
| 71 |
+
num_overlap: 4
|
deverb_bs_roformer_8_384dim_10depth_config.yaml
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
audio:
|
| 2 |
+
chunk_size: 352768
|
| 3 |
+
dim_f: 1024
|
| 4 |
+
dim_t: 801
|
| 5 |
+
hop_length: 441
|
| 6 |
+
n_fft: 2048
|
| 7 |
+
num_channels: 2
|
| 8 |
+
sample_rate: 44100
|
| 9 |
+
min_mean_abs: 0.001
|
| 10 |
+
|
| 11 |
+
model:
|
| 12 |
+
dim: 384
|
| 13 |
+
depth: 10
|
| 14 |
+
stereo: true
|
| 15 |
+
num_stems: 1
|
| 16 |
+
time_transformer_depth: 1
|
| 17 |
+
freq_transformer_depth: 1
|
| 18 |
+
freqs_per_bands: !!python/tuple
|
| 19 |
+
- 2
|
| 20 |
+
- 2
|
| 21 |
+
- 2
|
| 22 |
+
- 2
|
| 23 |
+
- 2
|
| 24 |
+
- 2
|
| 25 |
+
- 2
|
| 26 |
+
- 2
|
| 27 |
+
- 2
|
| 28 |
+
- 2
|
| 29 |
+
- 2
|
| 30 |
+
- 2
|
| 31 |
+
- 2
|
| 32 |
+
- 2
|
| 33 |
+
- 2
|
| 34 |
+
- 2
|
| 35 |
+
- 2
|
| 36 |
+
- 2
|
| 37 |
+
- 2
|
| 38 |
+
- 2
|
| 39 |
+
- 2
|
| 40 |
+
- 2
|
| 41 |
+
- 2
|
| 42 |
+
- 2
|
| 43 |
+
- 4
|
| 44 |
+
- 4
|
| 45 |
+
- 4
|
| 46 |
+
- 4
|
| 47 |
+
- 4
|
| 48 |
+
- 4
|
| 49 |
+
- 4
|
| 50 |
+
- 4
|
| 51 |
+
- 4
|
| 52 |
+
- 4
|
| 53 |
+
- 4
|
| 54 |
+
- 4
|
| 55 |
+
- 12
|
| 56 |
+
- 12
|
| 57 |
+
- 12
|
| 58 |
+
- 12
|
| 59 |
+
- 12
|
| 60 |
+
- 12
|
| 61 |
+
- 12
|
| 62 |
+
- 12
|
| 63 |
+
- 24
|
| 64 |
+
- 24
|
| 65 |
+
- 24
|
| 66 |
+
- 24
|
| 67 |
+
- 24
|
| 68 |
+
- 24
|
| 69 |
+
- 24
|
| 70 |
+
- 24
|
| 71 |
+
- 48
|
| 72 |
+
- 48
|
| 73 |
+
- 48
|
| 74 |
+
- 48
|
| 75 |
+
- 48
|
| 76 |
+
- 48
|
| 77 |
+
- 48
|
| 78 |
+
- 48
|
| 79 |
+
- 128
|
| 80 |
+
- 129
|
| 81 |
+
dim_head: 64
|
| 82 |
+
heads: 8
|
| 83 |
+
attn_dropout: 0.1
|
| 84 |
+
ff_dropout: 0.1
|
| 85 |
+
flash_attn: true
|
| 86 |
+
dim_freqs_in: 1025
|
| 87 |
+
stft_n_fft: 2048
|
| 88 |
+
stft_hop_length: 441
|
| 89 |
+
stft_win_length: 2048
|
| 90 |
+
stft_normalized: false
|
| 91 |
+
mask_estimator_depth: 2
|
| 92 |
+
multi_stft_resolution_loss_weight: 1.0
|
| 93 |
+
multi_stft_resolutions_window_sizes: !!python/tuple
|
| 94 |
+
- 4096
|
| 95 |
+
- 2048
|
| 96 |
+
- 1024
|
| 97 |
+
- 512
|
| 98 |
+
- 256
|
| 99 |
+
multi_stft_hop_size: 147
|
| 100 |
+
multi_stft_normalized: False
|
| 101 |
+
|
| 102 |
+
training:
|
| 103 |
+
batch_size: 1
|
| 104 |
+
gradient_accumulation_steps: 1
|
| 105 |
+
grad_clip: 0
|
| 106 |
+
instruments:
|
| 107 |
+
- noreverb
|
| 108 |
+
- reverb
|
| 109 |
+
lr: 5.0e-05
|
| 110 |
+
patience: 2
|
| 111 |
+
reduce_factor: 0.95
|
| 112 |
+
target_instrument: noreverb
|
| 113 |
+
num_epochs: 1000
|
| 114 |
+
num_steps: 1000
|
| 115 |
+
q: 0.95
|
| 116 |
+
coarse_loss_clip: true
|
| 117 |
+
ema_momentum: 0.999
|
| 118 |
+
optimizer: adam
|
| 119 |
+
other_fix: false # needed for checking on the multisong dataset if "other" is actually the instrumental
|
| 120 |
+
use_amp: true # enable or disable mixed precision (float16) - usually this should be true
|
| 121 |
+
|
| 122 |
+
augmentations:
|
| 123 |
+
enable: true # enable or disable all augmentations (a quick way to turn them all off if needed)
|
| 124 |
+
loudness: true # randomly change the loudness of each stem within the range (loudness_min; loudness_max)
|
| 125 |
+
loudness_min: 0.5
|
| 126 |
+
loudness_max: 1.5
|
| 127 |
+
mixup: false # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
|
| 128 |
+
mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
|
| 129 |
+
- 0.2
|
| 130 |
+
- 0.02
|
| 131 |
+
mixup_loudness_min: 0.5
|
| 132 |
+
mixup_loudness_max: 1.5
|
| 133 |
+
|
| 134 |
+
inference:
|
| 135 |
+
batch_size: 4
|
| 136 |
+
dim_t: 801
|
| 137 |
+
num_overlap: 4
|
hdemucs_mmi.yaml
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
models: ['75fc33f5']
|
| 2 |
+
segment: 44
|
htdemucs.yaml
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
models: ['955717e8']
|
htdemucs_6s.yaml
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
models: ['5c90dfd2']
|
htdemucs_ft.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
models: ['f7e0c4bc', 'd12395a8', '92cfc3b6', '04573f0d']
|
| 2 |
+
weights: [
|
| 3 |
+
[1., 0., 0., 0.],
|
| 4 |
+
[0., 1., 0., 0.],
|
| 5 |
+
[0., 0., 1., 0.],
|
| 6 |
+
[0., 0., 0., 1.],
|
| 7 |
+
]
|
kuielab_b_other.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b0d0b63950ac332333fea2d58f68c92fd3ab0aae071398c2a8beeae1ad15b655
|
| 3 |
+
size 29703204
|
kuielab_b_vocals.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9b7dcb9d878acb0f3e64ff3fd27750faae96577013f6d50f5996875bf4250713
|
| 3 |
+
size 29703204
|
mdx_model_data.json
ADDED
|
@@ -0,0 +1,384 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"0ddfc0eb5792638ad5dc27850236c246": {
|
| 3 |
+
"compensate": 1.035,
|
| 4 |
+
"mdx_dim_f_set": 2048,
|
| 5 |
+
"mdx_dim_t_set": 8,
|
| 6 |
+
"mdx_n_fft_scale_set": 6144,
|
| 7 |
+
"primary_stem": "Vocals"
|
| 8 |
+
},
|
| 9 |
+
"26d308f91f3423a67dc69a6d12a8793d": {
|
| 10 |
+
"compensate": 1.035,
|
| 11 |
+
"mdx_dim_f_set": 2048,
|
| 12 |
+
"mdx_dim_t_set": 9,
|
| 13 |
+
"mdx_n_fft_scale_set": 8192,
|
| 14 |
+
"primary_stem": "Other"
|
| 15 |
+
},
|
| 16 |
+
"2cdd429caac38f0194b133884160f2c6": {
|
| 17 |
+
"compensate": 1.045,
|
| 18 |
+
"mdx_dim_f_set": 3072,
|
| 19 |
+
"mdx_dim_t_set": 8,
|
| 20 |
+
"mdx_n_fft_scale_set": 7680,
|
| 21 |
+
"primary_stem": "Instrumental"
|
| 22 |
+
},
|
| 23 |
+
"2f5501189a2f6db6349916fabe8c90de": {
|
| 24 |
+
"compensate": 1.035,
|
| 25 |
+
"mdx_dim_f_set": 2048,
|
| 26 |
+
"mdx_dim_t_set": 8,
|
| 27 |
+
"mdx_n_fft_scale_set": 6144,
|
| 28 |
+
"primary_stem": "Vocals",
|
| 29 |
+
"is_karaoke": true
|
| 30 |
+
},
|
| 31 |
+
"398580b6d5d973af3120df54cee6759d": {
|
| 32 |
+
"compensate": 1.75,
|
| 33 |
+
"mdx_dim_f_set": 3072,
|
| 34 |
+
"mdx_dim_t_set": 8,
|
| 35 |
+
"mdx_n_fft_scale_set": 7680,
|
| 36 |
+
"primary_stem": "Vocals"
|
| 37 |
+
},
|
| 38 |
+
"488b3e6f8bd3717d9d7c428476be2d75": {
|
| 39 |
+
"compensate": 1.035,
|
| 40 |
+
"mdx_dim_f_set": 3072,
|
| 41 |
+
"mdx_dim_t_set": 8,
|
| 42 |
+
"mdx_n_fft_scale_set": 7680,
|
| 43 |
+
"primary_stem": "Instrumental"
|
| 44 |
+
},
|
| 45 |
+
"4910e7827f335048bdac11fa967772f9": {
|
| 46 |
+
"compensate": 1.035,
|
| 47 |
+
"mdx_dim_f_set": 2048,
|
| 48 |
+
"mdx_dim_t_set": 7,
|
| 49 |
+
"mdx_n_fft_scale_set": 4096,
|
| 50 |
+
"primary_stem": "Drums"
|
| 51 |
+
},
|
| 52 |
+
"53c4baf4d12c3e6c3831bb8f5b532b93": {
|
| 53 |
+
"compensate": 1.043,
|
| 54 |
+
"mdx_dim_f_set": 3072,
|
| 55 |
+
"mdx_dim_t_set": 8,
|
| 56 |
+
"mdx_n_fft_scale_set": 7680,
|
| 57 |
+
"primary_stem": "Vocals"
|
| 58 |
+
},
|
| 59 |
+
"5d343409ef0df48c7d78cce9f0106781": {
|
| 60 |
+
"compensate": 1.075,
|
| 61 |
+
"mdx_dim_f_set": 3072,
|
| 62 |
+
"mdx_dim_t_set": 8,
|
| 63 |
+
"mdx_n_fft_scale_set": 7680,
|
| 64 |
+
"primary_stem": "Vocals"
|
| 65 |
+
},
|
| 66 |
+
"5f6483271e1efb9bfb59e4a3e6d4d098": {
|
| 67 |
+
"compensate": 1.035,
|
| 68 |
+
"mdx_dim_f_set": 2048,
|
| 69 |
+
"mdx_dim_t_set": 9,
|
| 70 |
+
"mdx_n_fft_scale_set": 6144,
|
| 71 |
+
"primary_stem": "Vocals"
|
| 72 |
+
},
|
| 73 |
+
"65ab5919372a128e4167f5e01a8fda85": {
|
| 74 |
+
"compensate": 1.035,
|
| 75 |
+
"mdx_dim_f_set": 2048,
|
| 76 |
+
"mdx_dim_t_set": 8,
|
| 77 |
+
"mdx_n_fft_scale_set": 8192,
|
| 78 |
+
"primary_stem": "Other"
|
| 79 |
+
},
|
| 80 |
+
"6703e39f36f18aa7855ee1047765621d": {
|
| 81 |
+
"compensate": 1.035,
|
| 82 |
+
"mdx_dim_f_set": 2048,
|
| 83 |
+
"mdx_dim_t_set": 9,
|
| 84 |
+
"mdx_n_fft_scale_set": 16384,
|
| 85 |
+
"primary_stem": "Bass"
|
| 86 |
+
},
|
| 87 |
+
"6b31de20e84392859a3d09d43f089515": {
|
| 88 |
+
"compensate": 1.035,
|
| 89 |
+
"mdx_dim_f_set": 2048,
|
| 90 |
+
"mdx_dim_t_set": 8,
|
| 91 |
+
"mdx_n_fft_scale_set": 6144,
|
| 92 |
+
"primary_stem": "Vocals"
|
| 93 |
+
},
|
| 94 |
+
"867595e9de46f6ab699008295df62798": {
|
| 95 |
+
"compensate": 1.03,
|
| 96 |
+
"mdx_dim_f_set": 3072,
|
| 97 |
+
"mdx_dim_t_set": 8,
|
| 98 |
+
"mdx_n_fft_scale_set": 7680,
|
| 99 |
+
"primary_stem": "Vocals"
|
| 100 |
+
},
|
| 101 |
+
"a3cd63058945e777505c01d2507daf37": {
|
| 102 |
+
"compensate": 1.03,
|
| 103 |
+
"mdx_dim_f_set": 2048,
|
| 104 |
+
"mdx_dim_t_set": 8,
|
| 105 |
+
"mdx_n_fft_scale_set": 6144,
|
| 106 |
+
"primary_stem": "Vocals"
|
| 107 |
+
},
|
| 108 |
+
"b33d9b3950b6cbf5fe90a32608924700": {
|
| 109 |
+
"compensate": 1.03,
|
| 110 |
+
"mdx_dim_f_set": 3072,
|
| 111 |
+
"mdx_dim_t_set": 8,
|
| 112 |
+
"mdx_n_fft_scale_set": 7680,
|
| 113 |
+
"primary_stem": "Vocals"
|
| 114 |
+
},
|
| 115 |
+
"c3b29bdce8c4fa17ec609e16220330ab": {
|
| 116 |
+
"compensate": 1.035,
|
| 117 |
+
"mdx_dim_f_set": 2048,
|
| 118 |
+
"mdx_dim_t_set": 8,
|
| 119 |
+
"mdx_n_fft_scale_set": 16384,
|
| 120 |
+
"primary_stem": "Bass"
|
| 121 |
+
},
|
| 122 |
+
"ceed671467c1f64ebdfac8a2490d0d52": {
|
| 123 |
+
"compensate": 1.035,
|
| 124 |
+
"mdx_dim_f_set": 3072,
|
| 125 |
+
"mdx_dim_t_set": 8,
|
| 126 |
+
"mdx_n_fft_scale_set": 7680,
|
| 127 |
+
"primary_stem": "Instrumental"
|
| 128 |
+
},
|
| 129 |
+
"d2a1376f310e4f7fa37fb9b5774eb701": {
|
| 130 |
+
"compensate": 1.035,
|
| 131 |
+
"mdx_dim_f_set": 3072,
|
| 132 |
+
"mdx_dim_t_set": 8,
|
| 133 |
+
"mdx_n_fft_scale_set": 7680,
|
| 134 |
+
"primary_stem": "Instrumental"
|
| 135 |
+
},
|
| 136 |
+
"d7bff498db9324db933d913388cba6be": {
|
| 137 |
+
"compensate": 1.035,
|
| 138 |
+
"mdx_dim_f_set": 2048,
|
| 139 |
+
"mdx_dim_t_set": 8,
|
| 140 |
+
"mdx_n_fft_scale_set": 6144,
|
| 141 |
+
"primary_stem": "Vocals"
|
| 142 |
+
},
|
| 143 |
+
"d94058f8c7f1fae4164868ae8ae66b20": {
|
| 144 |
+
"compensate": 1.035,
|
| 145 |
+
"mdx_dim_f_set": 2048,
|
| 146 |
+
"mdx_dim_t_set": 8,
|
| 147 |
+
"mdx_n_fft_scale_set": 6144,
|
| 148 |
+
"primary_stem": "Vocals"
|
| 149 |
+
},
|
| 150 |
+
"dc41ede5961d50f277eb846db17f5319": {
|
| 151 |
+
"compensate": 1.035,
|
| 152 |
+
"mdx_dim_f_set": 2048,
|
| 153 |
+
"mdx_dim_t_set": 9,
|
| 154 |
+
"mdx_n_fft_scale_set": 4096,
|
| 155 |
+
"primary_stem": "Drums"
|
| 156 |
+
},
|
| 157 |
+
"e5572e58abf111f80d8241d2e44e7fa4": {
|
| 158 |
+
"compensate": 1.028,
|
| 159 |
+
"mdx_dim_f_set": 3072,
|
| 160 |
+
"mdx_dim_t_set": 8,
|
| 161 |
+
"mdx_n_fft_scale_set": 7680,
|
| 162 |
+
"primary_stem": "Instrumental"
|
| 163 |
+
},
|
| 164 |
+
"e7324c873b1f615c35c1967f912db92a": {
|
| 165 |
+
"compensate": 1.03,
|
| 166 |
+
"mdx_dim_f_set": 3072,
|
| 167 |
+
"mdx_dim_t_set": 8,
|
| 168 |
+
"mdx_n_fft_scale_set": 7680,
|
| 169 |
+
"primary_stem": "Vocals"
|
| 170 |
+
},
|
| 171 |
+
"1c56ec0224f1d559c42fd6fd2a67b154": {
|
| 172 |
+
"compensate": 1.025,
|
| 173 |
+
"mdx_dim_f_set": 2048,
|
| 174 |
+
"mdx_dim_t_set": 8,
|
| 175 |
+
"mdx_n_fft_scale_set": 5120,
|
| 176 |
+
"primary_stem": "Instrumental"
|
| 177 |
+
},
|
| 178 |
+
"f2df6d6863d8f435436d8b561594ff49": {
|
| 179 |
+
"compensate": 1.035,
|
| 180 |
+
"mdx_dim_f_set": 3072,
|
| 181 |
+
"mdx_dim_t_set": 8,
|
| 182 |
+
"mdx_n_fft_scale_set": 7680,
|
| 183 |
+
"primary_stem": "Instrumental"
|
| 184 |
+
},
|
| 185 |
+
"b06327a00d5e5fbc7d96e1781bbdb596": {
|
| 186 |
+
"compensate": 1.035,
|
| 187 |
+
"mdx_dim_f_set": 3072,
|
| 188 |
+
"mdx_dim_t_set": 8,
|
| 189 |
+
"mdx_n_fft_scale_set": 6144,
|
| 190 |
+
"primary_stem": "Instrumental"
|
| 191 |
+
},
|
| 192 |
+
"94ff780b977d3ca07c7a343dab2e25dd": {
|
| 193 |
+
"compensate": 1.039,
|
| 194 |
+
"mdx_dim_f_set": 3072,
|
| 195 |
+
"mdx_dim_t_set": 8,
|
| 196 |
+
"mdx_n_fft_scale_set": 6144,
|
| 197 |
+
"primary_stem": "Instrumental"
|
| 198 |
+
},
|
| 199 |
+
"73492b58195c3b52d34590d5474452f6": {
|
| 200 |
+
"compensate": 1.043,
|
| 201 |
+
"mdx_dim_f_set": 3072,
|
| 202 |
+
"mdx_dim_t_set": 8,
|
| 203 |
+
"mdx_n_fft_scale_set": 7680,
|
| 204 |
+
"primary_stem": "Vocals"
|
| 205 |
+
},
|
| 206 |
+
"970b3f9492014d18fefeedfe4773cb42": {
|
| 207 |
+
"compensate": 1.009,
|
| 208 |
+
"mdx_dim_f_set": 3072,
|
| 209 |
+
"mdx_dim_t_set": 8,
|
| 210 |
+
"mdx_n_fft_scale_set": 7680,
|
| 211 |
+
"primary_stem": "Vocals"
|
| 212 |
+
},
|
| 213 |
+
"1d64a6d2c30f709b8c9b4ce1366d96ee": {
|
| 214 |
+
"compensate": 1.065,
|
| 215 |
+
"mdx_dim_f_set": 2048,
|
| 216 |
+
"mdx_dim_t_set": 8,
|
| 217 |
+
"mdx_n_fft_scale_set": 5120,
|
| 218 |
+
"primary_stem": "Instrumental",
|
| 219 |
+
"is_karaoke": true
|
| 220 |
+
},
|
| 221 |
+
"203f2a3955221b64df85a41af87cf8f0": {
|
| 222 |
+
"compensate": 1.035,
|
| 223 |
+
"mdx_dim_f_set": 3072,
|
| 224 |
+
"mdx_dim_t_set": 8,
|
| 225 |
+
"mdx_n_fft_scale_set": 6144,
|
| 226 |
+
"primary_stem": "Instrumental"
|
| 227 |
+
},
|
| 228 |
+
"291c2049608edb52648b96e27eb80e95": {
|
| 229 |
+
"compensate": 1.035,
|
| 230 |
+
"mdx_dim_f_set": 3072,
|
| 231 |
+
"mdx_dim_t_set": 8,
|
| 232 |
+
"mdx_n_fft_scale_set": 6144,
|
| 233 |
+
"primary_stem": "Instrumental"
|
| 234 |
+
},
|
| 235 |
+
"ead8d05dab12ec571d67549b3aab03fc": {
|
| 236 |
+
"compensate": 1.035,
|
| 237 |
+
"mdx_dim_f_set": 3072,
|
| 238 |
+
"mdx_dim_t_set": 8,
|
| 239 |
+
"mdx_n_fft_scale_set": 6144,
|
| 240 |
+
"primary_stem": "Instrumental"
|
| 241 |
+
},
|
| 242 |
+
"cc63408db3d80b4d85b0287d1d7c9632": {
|
| 243 |
+
"compensate": 1.033,
|
| 244 |
+
"mdx_dim_f_set": 3072,
|
| 245 |
+
"mdx_dim_t_set": 8,
|
| 246 |
+
"mdx_n_fft_scale_set": 6144,
|
| 247 |
+
"primary_stem": "Instrumental"
|
| 248 |
+
},
|
| 249 |
+
"cd5b2989ad863f116c855db1dfe24e39": {
|
| 250 |
+
"compensate": 1.035,
|
| 251 |
+
"mdx_dim_f_set": 3072,
|
| 252 |
+
"mdx_dim_t_set": 9,
|
| 253 |
+
"mdx_n_fft_scale_set": 6144,
|
| 254 |
+
"primary_stem": "Reverb"
|
| 255 |
+
},
|
| 256 |
+
"55657dd70583b0fedfba5f67df11d711": {
|
| 257 |
+
"compensate": 1.022,
|
| 258 |
+
"mdx_dim_f_set": 3072,
|
| 259 |
+
"mdx_dim_t_set": 8,
|
| 260 |
+
"mdx_n_fft_scale_set": 6144,
|
| 261 |
+
"primary_stem": "Instrumental"
|
| 262 |
+
},
|
| 263 |
+
"b6bccda408a436db8500083ef3491e8b": {
|
| 264 |
+
"compensate": 1.02,
|
| 265 |
+
"mdx_dim_f_set": 3072,
|
| 266 |
+
"mdx_dim_t_set": 8,
|
| 267 |
+
"mdx_n_fft_scale_set": 7680,
|
| 268 |
+
"primary_stem": "Instrumental"
|
| 269 |
+
},
|
| 270 |
+
"8a88db95c7fb5dbe6a095ff2ffb428b1": {
|
| 271 |
+
"compensate": 1.026,
|
| 272 |
+
"mdx_dim_f_set": 2048,
|
| 273 |
+
"mdx_dim_t_set": 8,
|
| 274 |
+
"mdx_n_fft_scale_set": 5120,
|
| 275 |
+
"primary_stem": "Instrumental"
|
| 276 |
+
},
|
| 277 |
+
"b78da4afc6512f98e4756f5977f5c6b9": {
|
| 278 |
+
"compensate": 1.021,
|
| 279 |
+
"mdx_dim_f_set": 3072,
|
| 280 |
+
"mdx_dim_t_set": 8,
|
| 281 |
+
"mdx_n_fft_scale_set": 7680,
|
| 282 |
+
"primary_stem": "Instrumental"
|
| 283 |
+
},
|
| 284 |
+
"77d07b2667ddf05b9e3175941b4454a0": {
|
| 285 |
+
"compensate": 1.021,
|
| 286 |
+
"mdx_dim_f_set": 3072,
|
| 287 |
+
"mdx_dim_t_set": 8,
|
| 288 |
+
"mdx_n_fft_scale_set": 7680,
|
| 289 |
+
"primary_stem": "Vocals"
|
| 290 |
+
},
|
| 291 |
+
"0f2a6bc5b49d87d64728ee40e23bceb1": {
|
| 292 |
+
"compensate": 1.019,
|
| 293 |
+
"mdx_dim_f_set": 2560,
|
| 294 |
+
"mdx_dim_t_set": 8,
|
| 295 |
+
"mdx_n_fft_scale_set": 5120,
|
| 296 |
+
"primary_stem": "Instrumental"
|
| 297 |
+
},
|
| 298 |
+
"b02be2d198d4968a121030cf8950b492": {
|
| 299 |
+
"compensate": 1.020,
|
| 300 |
+
"mdx_dim_f_set": 2560,
|
| 301 |
+
"mdx_dim_t_set": 8,
|
| 302 |
+
"mdx_n_fft_scale_set": 5120,
|
| 303 |
+
"primary_stem": "No Crowd"
|
| 304 |
+
},
|
| 305 |
+
"2154254ee89b2945b97a7efed6e88820": {
|
| 306 |
+
"config_yaml": "model_2_stem_061321.yaml"
|
| 307 |
+
},
|
| 308 |
+
"063aadd735d58150722926dcbf5852a9": {
|
| 309 |
+
"config_yaml": "model_2_stem_061321.yaml"
|
| 310 |
+
},
|
| 311 |
+
"c09f714d978b41d718facfe3427e6001": {
|
| 312 |
+
"config_yaml": "model_2_stem_061321.yaml"
|
| 313 |
+
},
|
| 314 |
+
"fe96801369f6a148df2720f5ced88c19": {
|
| 315 |
+
"config_yaml": "model3.yaml"
|
| 316 |
+
},
|
| 317 |
+
"02e8b226f85fb566e5db894b9931c640": {
|
| 318 |
+
"config_yaml": "model2.yaml"
|
| 319 |
+
},
|
| 320 |
+
"e3de6d861635ab9c1d766149edd680d6": {
|
| 321 |
+
"config_yaml": "model1.yaml"
|
| 322 |
+
},
|
| 323 |
+
"3f2936c554ab73ce2e396d54636bd373": {
|
| 324 |
+
"config_yaml": "modelB.yaml"
|
| 325 |
+
},
|
| 326 |
+
"890d0f6f82d7574bca741a9e8bcb8168": {
|
| 327 |
+
"config_yaml": "modelB.yaml"
|
| 328 |
+
},
|
| 329 |
+
"63a3cb8c37c474681049be4ad1ba8815": {
|
| 330 |
+
"config_yaml": "modelB.yaml"
|
| 331 |
+
},
|
| 332 |
+
"a7fc5d719743c7fd6b61bd2b4d48b9f0": {
|
| 333 |
+
"config_yaml": "modelA.yaml"
|
| 334 |
+
},
|
| 335 |
+
"3567f3dee6e77bf366fcb1c7b8bc3745": {
|
| 336 |
+
"config_yaml": "modelA.yaml"
|
| 337 |
+
},
|
| 338 |
+
"a28f4d717bd0d34cd2ff7a3b0a3d065e": {
|
| 339 |
+
"config_yaml": "modelA.yaml"
|
| 340 |
+
},
|
| 341 |
+
"c9971a18da20911822593dc81caa8be9": {
|
| 342 |
+
"config_yaml": "sndfx.yaml"
|
| 343 |
+
},
|
| 344 |
+
"57d94d5ed705460d21c75a5ac829a605": {
|
| 345 |
+
"config_yaml": "sndfx.yaml"
|
| 346 |
+
},
|
| 347 |
+
"e7a25f8764f25a52c1b96c4946e66ba2": {
|
| 348 |
+
"config_yaml": "sndfx.yaml"
|
| 349 |
+
},
|
| 350 |
+
"104081d24e37217086ce5fde09147ee1": {
|
| 351 |
+
"config_yaml": "model_2_stem_061321.yaml"
|
| 352 |
+
},
|
| 353 |
+
"1e6165b601539f38d0a9330f3facffeb": {
|
| 354 |
+
"config_yaml": "model_2_stem_061321.yaml"
|
| 355 |
+
},
|
| 356 |
+
"fe0108464ce0d8271be5ab810891bd7c": {
|
| 357 |
+
"config_yaml": "model_2_stem_full_band.yaml"
|
| 358 |
+
},
|
| 359 |
+
"e9b82ec90ee56c507a3a982f1555714c": {
|
| 360 |
+
"config_yaml": "model_2_stem_full_band_2.yaml"
|
| 361 |
+
},
|
| 362 |
+
"99b6ceaae542265a3b6d657bf9fde79f": {
|
| 363 |
+
"config_yaml": "model_2_stem_full_band_8k.yaml"
|
| 364 |
+
},
|
| 365 |
+
"116f6f9dabb907b53d847ed9f7a9475f": {
|
| 366 |
+
"config_yaml": "model_2_stem_full_band_8k.yaml"
|
| 367 |
+
},
|
| 368 |
+
"53f707017bfcbb56f5e1bfac420d6732": {
|
| 369 |
+
"config_yaml": "model_bs_roformer_ep_317_sdr_12.9755.yaml",
|
| 370 |
+
"is_roformer": true
|
| 371 |
+
},
|
| 372 |
+
"63e41acc264bf681a73aa9f7e5f606cc": {
|
| 373 |
+
"config_yaml": "model_mel_band_roformer_ep_3005_sdr_11.4360.yaml",
|
| 374 |
+
"is_roformer": true
|
| 375 |
+
},
|
| 376 |
+
"e733736763234047587931fc35322fd9": {
|
| 377 |
+
"config_yaml": "model_bs_roformer_ep_937_sdr_10.5309.yaml",
|
| 378 |
+
"is_roformer": true
|
| 379 |
+
},
|
| 380 |
+
"d789065adfd747d6f585b27b495bcdae": {
|
| 381 |
+
"config_yaml": "model_bs_roformer_ep_368_sdr_12.9628.yaml",
|
| 382 |
+
"is_roformer": true
|
| 383 |
+
}
|
| 384 |
+
}
|
mel_band_roformer_crowd_aufr33_viperx_sdr_8.7144_config.yaml
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
audio:
|
| 2 |
+
chunk_size: 352800
|
| 3 |
+
dim_f: 1024
|
| 4 |
+
dim_t: 801
|
| 5 |
+
hop_length: 441
|
| 6 |
+
n_fft: 2048
|
| 7 |
+
num_channels: 2
|
| 8 |
+
sample_rate: 44100
|
| 9 |
+
min_mean_abs: 000
|
| 10 |
+
|
| 11 |
+
model:
|
| 12 |
+
dim: 384
|
| 13 |
+
depth: 6
|
| 14 |
+
stereo: true
|
| 15 |
+
num_stems: 1
|
| 16 |
+
time_transformer_depth: 1
|
| 17 |
+
freq_transformer_depth: 1
|
| 18 |
+
num_bands: 60
|
| 19 |
+
dim_head: 64
|
| 20 |
+
heads: 8
|
| 21 |
+
attn_dropout: 0
|
| 22 |
+
ff_dropout: 0
|
| 23 |
+
flash_attn: True
|
| 24 |
+
dim_freqs_in: 1025
|
| 25 |
+
sample_rate: 44100 # needed for mel filter bank from librosa
|
| 26 |
+
stft_n_fft: 2048
|
| 27 |
+
stft_hop_length: 441
|
| 28 |
+
stft_win_length: 2048
|
| 29 |
+
stft_normalized: False
|
| 30 |
+
mask_estimator_depth: 2
|
| 31 |
+
multi_stft_resolution_loss_weight: 1.0
|
| 32 |
+
multi_stft_resolutions_window_sizes: !!python/tuple
|
| 33 |
+
- 4096
|
| 34 |
+
- 2048
|
| 35 |
+
- 1024
|
| 36 |
+
- 512
|
| 37 |
+
- 256
|
| 38 |
+
multi_stft_hop_size: 147
|
| 39 |
+
multi_stft_normalized: False
|
| 40 |
+
|
| 41 |
+
training:
|
| 42 |
+
batch_size: 2
|
| 43 |
+
gradient_accumulation_steps: 1
|
| 44 |
+
grad_clip: 0
|
| 45 |
+
instruments:
|
| 46 |
+
- crowd
|
| 47 |
+
- other
|
| 48 |
+
lr: 1.0e-05
|
| 49 |
+
patience: 8
|
| 50 |
+
reduce_factor: 0.95
|
| 51 |
+
target_instrument: crowd
|
| 52 |
+
num_epochs: 1000
|
| 53 |
+
num_steps: 4032
|
| 54 |
+
augmentation: false # enable augmentations by audiomentations and pedalboard
|
| 55 |
+
augmentation_type: null
|
| 56 |
+
use_mp3_compress: false # Deprecated
|
| 57 |
+
augmentation_mix: false # Mix several stems of the same type with some probability
|
| 58 |
+
augmentation_loudness: false # randomly change loudness of each stem
|
| 59 |
+
augmentation_loudness_type: 1 # Type 1 or 2
|
| 60 |
+
augmentation_loudness_min: 0
|
| 61 |
+
augmentation_loudness_max: 0
|
| 62 |
+
q: 0.95
|
| 63 |
+
coarse_loss_clip: false
|
| 64 |
+
ema_momentum: 0.999
|
| 65 |
+
optimizer: adam
|
| 66 |
+
other_fix: false # needed for checking on the multisong dataset if "other" is actually the instrumental
|
| 67 |
+
|
| 68 |
+
inference:
|
| 69 |
+
batch_size: 1
|
| 70 |
+
dim_t: 801
|
| 71 |
+
num_overlap: 4
|
mel_band_roformer_karaoke_aufr33_viperx_sdr_10.1956_config.yaml
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
audio:
|
| 2 |
+
chunk_size: 352800
|
| 3 |
+
dim_f: 1024
|
| 4 |
+
dim_t: 801
|
| 5 |
+
hop_length: 441
|
| 6 |
+
n_fft: 2048
|
| 7 |
+
num_channels: 2
|
| 8 |
+
sample_rate: 44100
|
| 9 |
+
min_mean_abs: 000
|
| 10 |
+
|
| 11 |
+
model:
|
| 12 |
+
dim: 384
|
| 13 |
+
depth: 6
|
| 14 |
+
stereo: true
|
| 15 |
+
num_stems: 1
|
| 16 |
+
time_transformer_depth: 1
|
| 17 |
+
freq_transformer_depth: 1
|
| 18 |
+
num_bands: 60
|
| 19 |
+
dim_head: 64
|
| 20 |
+
heads: 8
|
| 21 |
+
attn_dropout: 0
|
| 22 |
+
ff_dropout: 0
|
| 23 |
+
flash_attn: True
|
| 24 |
+
dim_freqs_in: 1025
|
| 25 |
+
sample_rate: 44100 # needed for mel filter bank from librosa
|
| 26 |
+
stft_n_fft: 2048
|
| 27 |
+
stft_hop_length: 441
|
| 28 |
+
stft_win_length: 2048
|
| 29 |
+
stft_normalized: False
|
| 30 |
+
mask_estimator_depth: 2
|
| 31 |
+
multi_stft_resolution_loss_weight: 1.0
|
| 32 |
+
multi_stft_resolutions_window_sizes: !!python/tuple
|
| 33 |
+
- 4096
|
| 34 |
+
- 2048
|
| 35 |
+
- 1024
|
| 36 |
+
- 512
|
| 37 |
+
- 256
|
| 38 |
+
multi_stft_hop_size: 147
|
| 39 |
+
multi_stft_normalized: False
|
| 40 |
+
|
| 41 |
+
training:
|
| 42 |
+
batch_size: 4
|
| 43 |
+
gradient_accumulation_steps: 1
|
| 44 |
+
grad_clip: 0
|
| 45 |
+
instruments:
|
| 46 |
+
- Vocals
|
| 47 |
+
- Instrumental
|
| 48 |
+
lr: 1.0e-05
|
| 49 |
+
patience: 2
|
| 50 |
+
reduce_factor: 0.95
|
| 51 |
+
target_instrument: Vocals
|
| 52 |
+
num_epochs: 1000
|
| 53 |
+
num_steps: 2000
|
| 54 |
+
augmentation: false # enable augmentations by audiomentations and pedalboard
|
| 55 |
+
augmentation_type: null
|
| 56 |
+
use_mp3_compress: false # Deprecated
|
| 57 |
+
augmentation_mix: false # Mix several stems of the same type with some probability
|
| 58 |
+
augmentation_loudness: false # randomly change loudness of each stem
|
| 59 |
+
augmentation_loudness_type: 1 # Type 1 or 2
|
| 60 |
+
augmentation_loudness_min: 0
|
| 61 |
+
augmentation_loudness_max: 0
|
| 62 |
+
q: 0.95
|
| 63 |
+
coarse_loss_clip: false
|
| 64 |
+
ema_momentum: 0.999
|
| 65 |
+
optimizer: adam
|
| 66 |
+
other_fix: false # needed for checking on the multisong dataset if "other" is actually the instrumental
|
| 67 |
+
use_amp: true # enable or disable mixed precision (float16) - usually this should be true
|
| 68 |
+
inference:
|
| 69 |
+
batch_size: 1
|
| 70 |
+
dim_t: 801
|
| 71 |
+
num_overlap: 4
|
model_bs_roformer_ep_317_sdr_12.9755.yaml
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
audio:
|
| 2 |
+
chunk_size: 352800
|
| 3 |
+
dim_f: 1024
|
| 4 |
+
dim_t: 801 # doesn't work (use in model)
|
| 5 |
+
hop_length: 441 # doesn't work (use in model)
|
| 6 |
+
n_fft: 2048
|
| 7 |
+
num_channels: 2
|
| 8 |
+
sample_rate: 44100
|
| 9 |
+
min_mean_abs: 0.001
|
| 10 |
+
|
| 11 |
+
model:
|
| 12 |
+
dim: 512
|
| 13 |
+
depth: 12
|
| 14 |
+
stereo: true
|
| 15 |
+
num_stems: 1
|
| 16 |
+
time_transformer_depth: 1
|
| 17 |
+
freq_transformer_depth: 1
|
| 18 |
+
freqs_per_bands: !!python/tuple
|
| 19 |
+
- 2
|
| 20 |
+
- 2
|
| 21 |
+
- 2
|
| 22 |
+
- 2
|
| 23 |
+
- 2
|
| 24 |
+
- 2
|
| 25 |
+
- 2
|
| 26 |
+
- 2
|
| 27 |
+
- 2
|
| 28 |
+
- 2
|
| 29 |
+
- 2
|
| 30 |
+
- 2
|
| 31 |
+
- 2
|
| 32 |
+
- 2
|
| 33 |
+
- 2
|
| 34 |
+
- 2
|
| 35 |
+
- 2
|
| 36 |
+
- 2
|
| 37 |
+
- 2
|
| 38 |
+
- 2
|
| 39 |
+
- 2
|
| 40 |
+
- 2
|
| 41 |
+
- 2
|
| 42 |
+
- 2
|
| 43 |
+
- 4
|
| 44 |
+
- 4
|
| 45 |
+
- 4
|
| 46 |
+
- 4
|
| 47 |
+
- 4
|
| 48 |
+
- 4
|
| 49 |
+
- 4
|
| 50 |
+
- 4
|
| 51 |
+
- 4
|
| 52 |
+
- 4
|
| 53 |
+
- 4
|
| 54 |
+
- 4
|
| 55 |
+
- 12
|
| 56 |
+
- 12
|
| 57 |
+
- 12
|
| 58 |
+
- 12
|
| 59 |
+
- 12
|
| 60 |
+
- 12
|
| 61 |
+
- 12
|
| 62 |
+
- 12
|
| 63 |
+
- 24
|
| 64 |
+
- 24
|
| 65 |
+
- 24
|
| 66 |
+
- 24
|
| 67 |
+
- 24
|
| 68 |
+
- 24
|
| 69 |
+
- 24
|
| 70 |
+
- 24
|
| 71 |
+
- 48
|
| 72 |
+
- 48
|
| 73 |
+
- 48
|
| 74 |
+
- 48
|
| 75 |
+
- 48
|
| 76 |
+
- 48
|
| 77 |
+
- 48
|
| 78 |
+
- 48
|
| 79 |
+
- 128
|
| 80 |
+
- 129
|
| 81 |
+
dim_head: 64
|
| 82 |
+
heads: 8
|
| 83 |
+
attn_dropout: 0.1
|
| 84 |
+
ff_dropout: 0.1
|
| 85 |
+
flash_attn: true
|
| 86 |
+
dim_freqs_in: 1025
|
| 87 |
+
stft_n_fft: 2048
|
| 88 |
+
stft_hop_length: 441
|
| 89 |
+
stft_win_length: 2048
|
| 90 |
+
stft_normalized: false
|
| 91 |
+
mask_estimator_depth: 2
|
| 92 |
+
multi_stft_resolution_loss_weight: 1.0
|
| 93 |
+
multi_stft_resolutions_window_sizes: !!python/tuple
|
| 94 |
+
- 4096
|
| 95 |
+
- 2048
|
| 96 |
+
- 1024
|
| 97 |
+
- 512
|
| 98 |
+
- 256
|
| 99 |
+
multi_stft_hop_size: 147
|
| 100 |
+
multi_stft_normalized: False
|
| 101 |
+
|
| 102 |
+
training:
|
| 103 |
+
batch_size: 16
|
| 104 |
+
gradient_accumulation_steps: 1
|
| 105 |
+
grad_clip: 0
|
| 106 |
+
instruments:
|
| 107 |
+
- Vocals
|
| 108 |
+
- Instrumental
|
| 109 |
+
lr: 5.0e-05
|
| 110 |
+
patience: 2
|
| 111 |
+
reduce_factor: 0.95
|
| 112 |
+
target_instrument: Vocals
|
| 113 |
+
num_epochs: 1000
|
| 114 |
+
num_steps: 1000
|
| 115 |
+
augmentation: false # enable augmentations by audiomentations and pedalboard
|
| 116 |
+
augmentation_type: simple1
|
| 117 |
+
use_mp3_compress: false # Deprecated
|
| 118 |
+
augmentation_mix: true # Mix several stems of the same type with some probability
|
| 119 |
+
augmentation_loudness: true # randomly change loudness of each stem
|
| 120 |
+
augmentation_loudness_type: 1 # Type 1 or 2
|
| 121 |
+
augmentation_loudness_min: 0.5
|
| 122 |
+
augmentation_loudness_max: 1.5
|
| 123 |
+
q: 0.95
|
| 124 |
+
coarse_loss_clip: true
|
| 125 |
+
ema_momentum: 0.999
|
| 126 |
+
optimizer: adam
|
| 127 |
+
other_fix: false # needed for checking on the multisong dataset if "other" is actually the instrumental
|
| 128 |
+
use_amp: true # enable or disable mixed precision (float16) - usually this should be true
|
| 129 |
+
|
| 130 |
+
inference:
|
| 131 |
+
batch_size: 1
|
| 132 |
+
dim_t: 801
|
| 133 |
+
num_overlap: 4
|
model_bs_roformer_ep_368_sdr_12.9628.yaml
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
audio:
|
| 2 |
+
chunk_size: 352800
|
| 3 |
+
dim_f: 1024
|
| 4 |
+
dim_t: 801 # don't work (use in model)
|
| 5 |
+
hop_length: 441 # don't work (use in model)
|
| 6 |
+
n_fft: 2048
|
| 7 |
+
num_channels: 2
|
| 8 |
+
sample_rate: 44100
|
| 9 |
+
min_mean_abs: 0.001
|
| 10 |
+
|
| 11 |
+
model:
|
| 12 |
+
dim: 512
|
| 13 |
+
depth: 12
|
| 14 |
+
stereo: true
|
| 15 |
+
num_stems: 1
|
| 16 |
+
time_transformer_depth: 1
|
| 17 |
+
freq_transformer_depth: 1
|
| 18 |
+
freqs_per_bands: !!python/tuple
|
| 19 |
+
- 2
|
| 20 |
+
- 2
|
| 21 |
+
- 2
|
| 22 |
+
- 2
|
| 23 |
+
- 2
|
| 24 |
+
- 2
|
| 25 |
+
- 2
|
| 26 |
+
- 2
|
| 27 |
+
- 2
|
| 28 |
+
- 2
|
| 29 |
+
- 2
|
| 30 |
+
- 2
|
| 31 |
+
- 2
|
| 32 |
+
- 2
|
| 33 |
+
- 2
|
| 34 |
+
- 2
|
| 35 |
+
- 2
|
| 36 |
+
- 2
|
| 37 |
+
- 2
|
| 38 |
+
- 2
|
| 39 |
+
- 2
|
| 40 |
+
- 2
|
| 41 |
+
- 2
|
| 42 |
+
- 2
|
| 43 |
+
- 4
|
| 44 |
+
- 4
|
| 45 |
+
- 4
|
| 46 |
+
- 4
|
| 47 |
+
- 4
|
| 48 |
+
- 4
|
| 49 |
+
- 4
|
| 50 |
+
- 4
|
| 51 |
+
- 4
|
| 52 |
+
- 4
|
| 53 |
+
- 4
|
| 54 |
+
- 4
|
| 55 |
+
- 12
|
| 56 |
+
- 12
|
| 57 |
+
- 12
|
| 58 |
+
- 12
|
| 59 |
+
- 12
|
| 60 |
+
- 12
|
| 61 |
+
- 12
|
| 62 |
+
- 12
|
| 63 |
+
- 24
|
| 64 |
+
- 24
|
| 65 |
+
- 24
|
| 66 |
+
- 24
|
| 67 |
+
- 24
|
| 68 |
+
- 24
|
| 69 |
+
- 24
|
| 70 |
+
- 24
|
| 71 |
+
- 48
|
| 72 |
+
- 48
|
| 73 |
+
- 48
|
| 74 |
+
- 48
|
| 75 |
+
- 48
|
| 76 |
+
- 48
|
| 77 |
+
- 48
|
| 78 |
+
- 48
|
| 79 |
+
- 128
|
| 80 |
+
- 129
|
| 81 |
+
dim_head: 64
|
| 82 |
+
heads: 8
|
| 83 |
+
attn_dropout: 0.1
|
| 84 |
+
ff_dropout: 0.1
|
| 85 |
+
flash_attn: true
|
| 86 |
+
dim_freqs_in: 1025
|
| 87 |
+
stft_n_fft: 2048
|
| 88 |
+
stft_hop_length: 441
|
| 89 |
+
stft_win_length: 2048
|
| 90 |
+
stft_normalized: false
|
| 91 |
+
mask_estimator_depth: 2
|
| 92 |
+
multi_stft_resolution_loss_weight: 1.0
|
| 93 |
+
multi_stft_resolutions_window_sizes: !!python/tuple
|
| 94 |
+
- 4096
|
| 95 |
+
- 2048
|
| 96 |
+
- 1024
|
| 97 |
+
- 512
|
| 98 |
+
- 256
|
| 99 |
+
multi_stft_hop_size: 147
|
| 100 |
+
multi_stft_normalized: False
|
| 101 |
+
|
| 102 |
+
training:
|
| 103 |
+
batch_size: 16
|
| 104 |
+
gradient_accumulation_steps: 1
|
| 105 |
+
grad_clip: 0
|
| 106 |
+
instruments:
|
| 107 |
+
- Vocals
|
| 108 |
+
- Instrumental
|
| 109 |
+
lr: 5.0e-05
|
| 110 |
+
patience: 2
|
| 111 |
+
reduce_factor: 0.95
|
| 112 |
+
target_instrument: Vocals
|
| 113 |
+
num_epochs: 1000
|
| 114 |
+
num_steps: 1000
|
| 115 |
+
augmentation: false # enable augmentations by audiomentations and pedalboard
|
| 116 |
+
augmentation_type: simple1
|
| 117 |
+
use_mp3_compress: false # Deprecated
|
| 118 |
+
augmentation_mix: true # Mix several stems of the same type with some probability
|
| 119 |
+
augmentation_loudness: true # randomly change loudness of each stem
|
| 120 |
+
augmentation_loudness_type: 1 # Type 1 or 2
|
| 121 |
+
augmentation_loudness_min: 0.5
|
| 122 |
+
augmentation_loudness_max: 1.5
|
| 123 |
+
q: 0.95
|
| 124 |
+
coarse_loss_clip: true
|
| 125 |
+
ema_momentum: 0.999
|
| 126 |
+
optimizer: adam
|
| 127 |
+
other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
|
| 128 |
+
use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
|
| 129 |
+
|
| 130 |
+
inference:
|
| 131 |
+
batch_size: 1
|
| 132 |
+
dim_t: 801
|
| 133 |
+
num_overlap: 4
|
model_bs_roformer_ep_937_sdr_10.5309.yaml
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
audio:
|
| 2 |
+
chunk_size: 131584
|
| 3 |
+
dim_f: 1024
|
| 4 |
+
dim_t: 256
|
| 5 |
+
hop_length: 512
|
| 6 |
+
n_fft: 2048
|
| 7 |
+
num_channels: 2
|
| 8 |
+
sample_rate: 44100
|
| 9 |
+
min_mean_abs: 0.001
|
| 10 |
+
|
| 11 |
+
model:
|
| 12 |
+
dim: 384
|
| 13 |
+
depth: 12
|
| 14 |
+
stereo: true
|
| 15 |
+
num_stems: 1
|
| 16 |
+
time_transformer_depth: 1
|
| 17 |
+
freq_transformer_depth: 1
|
| 18 |
+
linear_transformer_depth: 0
|
| 19 |
+
freqs_per_bands: !!python/tuple
|
| 20 |
+
- 2
|
| 21 |
+
- 2
|
| 22 |
+
- 2
|
| 23 |
+
- 2
|
| 24 |
+
- 2
|
| 25 |
+
- 2
|
| 26 |
+
- 2
|
| 27 |
+
- 2
|
| 28 |
+
- 2
|
| 29 |
+
- 2
|
| 30 |
+
- 2
|
| 31 |
+
- 2
|
| 32 |
+
- 2
|
| 33 |
+
- 2
|
| 34 |
+
- 2
|
| 35 |
+
- 2
|
| 36 |
+
- 2
|
| 37 |
+
- 2
|
| 38 |
+
- 2
|
| 39 |
+
- 2
|
| 40 |
+
- 2
|
| 41 |
+
- 2
|
| 42 |
+
- 2
|
| 43 |
+
- 2
|
| 44 |
+
- 4
|
| 45 |
+
- 4
|
| 46 |
+
- 4
|
| 47 |
+
- 4
|
| 48 |
+
- 4
|
| 49 |
+
- 4
|
| 50 |
+
- 4
|
| 51 |
+
- 4
|
| 52 |
+
- 4
|
| 53 |
+
- 4
|
| 54 |
+
- 4
|
| 55 |
+
- 4
|
| 56 |
+
- 12
|
| 57 |
+
- 12
|
| 58 |
+
- 12
|
| 59 |
+
- 12
|
| 60 |
+
- 12
|
| 61 |
+
- 12
|
| 62 |
+
- 12
|
| 63 |
+
- 12
|
| 64 |
+
- 24
|
| 65 |
+
- 24
|
| 66 |
+
- 24
|
| 67 |
+
- 24
|
| 68 |
+
- 24
|
| 69 |
+
- 24
|
| 70 |
+
- 24
|
| 71 |
+
- 24
|
| 72 |
+
- 48
|
| 73 |
+
- 48
|
| 74 |
+
- 48
|
| 75 |
+
- 48
|
| 76 |
+
- 48
|
| 77 |
+
- 48
|
| 78 |
+
- 48
|
| 79 |
+
- 48
|
| 80 |
+
- 128
|
| 81 |
+
- 129
|
| 82 |
+
dim_head: 64
|
| 83 |
+
heads: 8
|
| 84 |
+
attn_dropout: 0.1
|
| 85 |
+
ff_dropout: 0.1
|
| 86 |
+
flash_attn: true
|
| 87 |
+
dim_freqs_in: 1025
|
| 88 |
+
stft_n_fft: 2048
|
| 89 |
+
stft_hop_length: 512
|
| 90 |
+
stft_win_length: 2048
|
| 91 |
+
stft_normalized: false
|
| 92 |
+
mask_estimator_depth: 2
|
| 93 |
+
multi_stft_resolution_loss_weight: 1.0
|
| 94 |
+
multi_stft_resolutions_window_sizes: !!python/tuple
|
| 95 |
+
- 4096
|
| 96 |
+
- 2048
|
| 97 |
+
- 1024
|
| 98 |
+
- 512
|
| 99 |
+
- 256
|
| 100 |
+
multi_stft_hop_size: 147
|
| 101 |
+
multi_stft_normalized: False
|
| 102 |
+
|
| 103 |
+
training:
|
| 104 |
+
batch_size: 4
|
| 105 |
+
gradient_accumulation_steps: 1
|
| 106 |
+
grad_clip: 0
|
| 107 |
+
instruments:
|
| 108 |
+
- No Drum-Bass
|
| 109 |
+
- Drum-Bass
|
| 110 |
+
lr: 5.0e-05
|
| 111 |
+
patience: 2
|
| 112 |
+
reduce_factor: 0.95
|
| 113 |
+
target_instrument: No Drum-Bass
|
| 114 |
+
num_epochs: 1000
|
| 115 |
+
num_steps: 1000
|
| 116 |
+
q: 0.95
|
| 117 |
+
coarse_loss_clip: true
|
| 118 |
+
ema_momentum: 0.999
|
| 119 |
+
optimizer: adam
|
| 120 |
+
other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
|
| 121 |
+
use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
|
| 122 |
+
|
| 123 |
+
augmentations:
|
| 124 |
+
enable: true # enable or disable all augmentations (to fast disable if needed)
|
| 125 |
+
loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
|
| 126 |
+
loudness_min: 0.5
|
| 127 |
+
loudness_max: 1.5
|
| 128 |
+
mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
|
| 129 |
+
mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
|
| 130 |
+
- 0.2
|
| 131 |
+
- 0.02
|
| 132 |
+
mixup_loudness_min: 0.5
|
| 133 |
+
mixup_loudness_max: 1.5
|
| 134 |
+
|
| 135 |
+
inference:
|
| 136 |
+
batch_size: 1
|
| 137 |
+
dim_t: 512
|
| 138 |
+
num_overlap: 4
|
model_mel_band_roformer_ep_3005_sdr_11.4360.yaml
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
audio:
|
| 2 |
+
chunk_size: 352800
|
| 3 |
+
dim_f: 1024
|
| 4 |
+
dim_t: 801 # don't work (use in model)
|
| 5 |
+
hop_length: 441 # don't work (use in model)
|
| 6 |
+
n_fft: 2048
|
| 7 |
+
num_channels: 2
|
| 8 |
+
sample_rate: 44100
|
| 9 |
+
min_mean_abs: 0.001
|
| 10 |
+
|
| 11 |
+
model:
|
| 12 |
+
dim: 384
|
| 13 |
+
depth: 12
|
| 14 |
+
stereo: true
|
| 15 |
+
num_stems: 1
|
| 16 |
+
time_transformer_depth: 1
|
| 17 |
+
freq_transformer_depth: 1
|
| 18 |
+
num_bands: 60
|
| 19 |
+
dim_head: 64
|
| 20 |
+
heads: 8
|
| 21 |
+
attn_dropout: 0.1
|
| 22 |
+
ff_dropout: 0.1
|
| 23 |
+
flash_attn: True
|
| 24 |
+
dim_freqs_in: 1025
|
| 25 |
+
sample_rate: 44100 # needed for mel filter bank from librosa
|
| 26 |
+
stft_n_fft: 2048
|
| 27 |
+
stft_hop_length: 441
|
| 28 |
+
stft_win_length: 2048
|
| 29 |
+
stft_normalized: False
|
| 30 |
+
mask_estimator_depth: 2
|
| 31 |
+
multi_stft_resolution_loss_weight: 1.0
|
| 32 |
+
multi_stft_resolutions_window_sizes: !!python/tuple
|
| 33 |
+
- 4096
|
| 34 |
+
- 2048
|
| 35 |
+
- 1024
|
| 36 |
+
- 512
|
| 37 |
+
- 256
|
| 38 |
+
multi_stft_hop_size: 147
|
| 39 |
+
multi_stft_normalized: False
|
| 40 |
+
|
| 41 |
+
training:
|
| 42 |
+
batch_size: 9
|
| 43 |
+
gradient_accumulation_steps: 8
|
| 44 |
+
grad_clip: 0
|
| 45 |
+
instruments:
|
| 46 |
+
- Vocals
|
| 47 |
+
- Instrumental
|
| 48 |
+
lr: 4.0e-05
|
| 49 |
+
patience: 2
|
| 50 |
+
reduce_factor: 0.95
|
| 51 |
+
target_instrument: Vocals
|
| 52 |
+
num_epochs: 1000
|
| 53 |
+
num_steps: 1000
|
| 54 |
+
augmentation: false # enable augmentations by audiomentations and pedalboard
|
| 55 |
+
augmentation_type: simple1
|
| 56 |
+
use_mp3_compress: false # Deprecated
|
| 57 |
+
augmentation_mix: true # Mix several stems of the same type with some probability
|
| 58 |
+
augmentation_loudness: true # randomly change loudness of each stem
|
| 59 |
+
augmentation_loudness_type: 1 # Type 1 or 2
|
| 60 |
+
augmentation_loudness_min: 0.5
|
| 61 |
+
augmentation_loudness_max: 1.5
|
| 62 |
+
q: 0.95
|
| 63 |
+
coarse_loss_clip: true
|
| 64 |
+
ema_momentum: 0.999
|
| 65 |
+
optimizer: adam
|
| 66 |
+
other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
|
| 67 |
+
use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
|
| 68 |
+
|
| 69 |
+
inference:
|
| 70 |
+
batch_size: 1
|
| 71 |
+
dim_t: 801
|
| 72 |
+
num_overlap: 4
|
repro_mdx_a_time_only.yaml
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
models: ['9a6b4851', '9a6b4851', '1ef250f1', '1ef250f1']
|
| 2 |
+
segment: 44
|
scnet_checkpoint_musdb18.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1bc0d1abb20bfdf966dcd07637bafd03e4bc13653d09ef18bc9b3e342eafe2aa
|
| 3 |
+
size 42434986
|
vocals_mel_band_roformer.yaml
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
audio:
|
| 2 |
+
chunk_size: 352800
|
| 3 |
+
dim_f: 1024
|
| 4 |
+
dim_t: 256
|
| 5 |
+
hop_length: 441
|
| 6 |
+
n_fft: 2048
|
| 7 |
+
num_channels: 2
|
| 8 |
+
sample_rate: 44100
|
| 9 |
+
min_mean_abs: 0.001
|
| 10 |
+
|
| 11 |
+
model:
|
| 12 |
+
dim: 384
|
| 13 |
+
depth: 6
|
| 14 |
+
stereo: true
|
| 15 |
+
num_stems: 1
|
| 16 |
+
time_transformer_depth: 1
|
| 17 |
+
freq_transformer_depth: 1
|
| 18 |
+
num_bands: 60
|
| 19 |
+
dim_head: 64
|
| 20 |
+
heads: 8
|
| 21 |
+
attn_dropout: 0
|
| 22 |
+
ff_dropout: 0
|
| 23 |
+
flash_attn: True
|
| 24 |
+
dim_freqs_in: 1025
|
| 25 |
+
sample_rate: 44100 # needed for mel filter bank from librosa
|
| 26 |
+
stft_n_fft: 2048
|
| 27 |
+
stft_hop_length: 441
|
| 28 |
+
stft_win_length: 2048
|
| 29 |
+
stft_normalized: False
|
| 30 |
+
mask_estimator_depth: 2
|
| 31 |
+
multi_stft_resolution_loss_weight: 1.0
|
| 32 |
+
multi_stft_resolutions_window_sizes: !!python/tuple
|
| 33 |
+
- 4096
|
| 34 |
+
- 2048
|
| 35 |
+
- 1024
|
| 36 |
+
- 512
|
| 37 |
+
- 256
|
| 38 |
+
multi_stft_hop_size: 147
|
| 39 |
+
multi_stft_normalized: False
|
| 40 |
+
|
| 41 |
+
training:
|
| 42 |
+
instruments:
|
| 43 |
+
- vocals
|
| 44 |
+
- other
|
| 45 |
+
target_instrument: vocals
|
| 46 |
+
|
| 47 |
+
inference:
|
| 48 |
+
dim_t: 1101
|
| 49 |
+
num_overlap: 1
|
| 50 |
+
chunk_size: 352800
|