diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..8a82ee018b5ae32aaa494efdf6f6e0fce967138c --- /dev/null +++ b/.gitattributes @@ -0,0 +1,6 @@ +kuielab_b_other.onnx filter=lfs diff=lfs merge=lfs -text +kuielab_b_vocals.onnx filter=lfs diff=lfs merge=lfs -text +UVR_MDXNET_3_9662.onnx filter=lfs diff=lfs merge=lfs -text +UVR_MDXNET_1_9703.onnx filter=lfs diff=lfs merge=lfs -text +scnet_checkpoint_musdb18.ckpt filter=lfs diff=lfs merge=lfs -text +UVR-MDX-NET_Crowd_HQ_1.onnx filter=lfs diff=lfs merge=lfs -text diff --git a/BS-Roformer-SW.yaml b/BS-Roformer-SW.yaml new file mode 100644 index 0000000000000000000000000000000000000000..49760ce0b48d341ee70760803dd59f4cb0982728 --- /dev/null +++ b/BS-Roformer-SW.yaml @@ -0,0 +1,198 @@ +audio: + chunk_size: 588800 #882000 + dim_f: 1024 + dim_t: 801 # don't work (use in model) + hop_length: 441 # don't work (use in model) + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.000 + +model: + dim: 256 + depth: 12 + stereo: true + num_stems: 6 + time_transformer_depth: 1 + freq_transformer_depth: 1 + linear_transformer_depth: 0 + freqs_per_bands: !!python/tuple + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 128 + - 129 + dim_head: 64 + heads: 8 + attn_dropout: 0.1 + ff_dropout: 0.1 + flash_attn: true + dim_freqs_in: 1025 + stft_n_fft: 2048 + stft_hop_length: 512 + stft_win_length: 2048 + stft_normalized: false + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + 
multi_stft_normalized: False + mlp_expansion_factor: 4 + use_torch_checkpoint: False # it allows to greatly reduce GPU memory consumption during training (not fully tested) + skip_connection: False # Enable skip connection between transformer blocks - can solve problem with gradients and probably faster training + +training: + batch_size: 2 + gradient_accumulation_steps: 1 + grad_clip: 0 + instruments: ['bass', 'drums', 'other', 'vocals', 'guitar', 'piano'] + patience: 3 + reduce_factor: 0.95 + target_instrument: null + num_epochs: 1000 + num_steps: 1000 + augmentation: false # enable augmentations by audiomentations and pedalboard + augmentation_type: simple1 + use_mp3_compress: false # Deprecated + augmentation_mix: true # Mix several stems of the same type with some probability + augmentation_loudness: true # randomly change loudness of each stem + augmentation_loudness_type: 1 # Type 1 or 2 + augmentation_loudness_min: 0.5 + augmentation_loudness_max: 1.5 + q: 0.95 + coarse_loss_clip: true + ema_momentum: 0.999 + # optimizer: prodigy + optimizer: adam + # lr: 1.0 + lr: 1.0e-5 + other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental + use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true + +augmentations: + enable: true # enable or disable all augmentations (to fast disable if needed) + loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max) + loudness_min: 0.5 + loudness_max: 1.5 + mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3) + mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02) + - 0.2 + - 0.02 + mixup_loudness_min: 0.5 + mixup_loudness_max: 1.5 + + all: + channel_shuffle: 0.5 # Set 0 or lower to disable + random_inverse: 0.1 # inverse track (better lower probability) + random_polarity: 0.5 # polarity change (multiply 
waveform to -1) + + vocals: + pitch_shift: 0.1 + pitch_shift_min_semitones: -5 + pitch_shift_max_semitones: 5 + seven_band_parametric_eq: 0.1 + seven_band_parametric_eq_min_gain_db: -9 + seven_band_parametric_eq_max_gain_db: 9 + tanh_distortion: 0.1 + tanh_distortion_min: 0.1 + tanh_distortion_max: 0.7 + bass: + pitch_shift: 0.1 + pitch_shift_min_semitones: -2 + pitch_shift_max_semitones: 2 + seven_band_parametric_eq: 0.1 + seven_band_parametric_eq_min_gain_db: -3 + seven_band_parametric_eq_max_gain_db: 6 + tanh_distortion: 0.1 + tanh_distortion_min: 0.1 + tanh_distortion_max: 0.5 + drums: + pitch_shift: 0.1 + pitch_shift_min_semitones: -5 + pitch_shift_max_semitones: 5 + seven_band_parametric_eq: 0.1 + seven_band_parametric_eq_min_gain_db: -9 + seven_band_parametric_eq_max_gain_db: 9 + tanh_distortion: 0.1 + tanh_distortion_min: 0.1 + tanh_distortion_max: 0.6 + other: + pitch_shift: 0.1 + pitch_shift_min_semitones: -4 + pitch_shift_max_semitones: 4 + gaussian_noise: 0.1 + gaussian_noise_min_amplitude: 0.001 + gaussian_noise_max_amplitude: 0.015 + time_stretch: 0.1 + time_stretch_min_rate: 0.8 + time_stretch_max_rate: 1.25 + + +inference: + batch_size: 1 + dim_t: 801 # Changed from 1101 to match training + num_overlap: 2 + normalize: false + diff --git a/UVR-MDX-NET_Crowd_HQ_1.onnx b/UVR-MDX-NET_Crowd_HQ_1.onnx new file mode 100644 index 0000000000000000000000000000000000000000..466c3fa69b05f5b27c19cc11eb23c99909d2a4d0 --- /dev/null +++ b/UVR-MDX-NET_Crowd_HQ_1.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:313b7bf869c411fdafe005cf0d5a635c405cb3d0df137178a64091952d75225c +size 59074342 diff --git a/UVR_Demucs_Model_1.yaml b/UVR_Demucs_Model_1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0995b75dd7149595388255185ab68a1a81ea9477 --- /dev/null +++ b/UVR_Demucs_Model_1.yaml @@ -0,0 +1,2 @@ +models: ['ebf34a2db'] +segment: 44 \ No newline at end of file diff --git a/UVR_MDXNET_1_9703.onnx 
#!/usr/bin/env python3
"""Hash every model file in the local cache and write the result as JSON.

The heavy lifting is done by ``model_tools.iterate_and_hash``; this script only
supplies the cache location, the URLs of the VR/MDX parameter tables, and the
output path.
"""

import os
import sys
import json
import hashlib
import requests
import model_tools as mt

# Directory that holds the downloaded model files and the cached parameter tables.
MODEL_CACHE_PATH = "/tmp/audio-separator-models"
VR_MODEL_DATA_LOCAL_PATH = f"{MODEL_CACHE_PATH}/vr_model_data.json"
MDX_MODEL_DATA_LOCAL_PATH = f"{MODEL_CACHE_PATH}/mdx_model_data.json"

# Remote copies of the parameter tables, fetched when the local copy is missing.
MODEL_DATA_URL_PREFIX = "https://raw.githubusercontent.com/TRvlvr/application_data/main"
VR_MODEL_DATA_URL = f"{MODEL_DATA_URL_PREFIX}/vr_model_data/model_data_new.json"
MDX_MODEL_DATA_URL = f"{MODEL_DATA_URL_PREFIX}/mdx_model_data/model_data_new.json"

# Where the list of {file, hash, params} records is written.
OUTPUT_PATH = f"{MODEL_CACHE_PATH}/model_hashes.json"

if __name__ == "__main__":
    # The parameter tables are written into the cache directory, so it must exist
    # before anything is downloaded.
    os.makedirs(MODEL_CACHE_PATH, exist_ok=True)

    # iterate_and_hash reads OUTPUT_PATH as a global of the model_tools module;
    # publish this script's value there so the lookup succeeds.
    mt.OUTPUT_PATH = OUTPUT_PATH

    # iterate_and_hash takes five positional arguments; calling it with only the
    # directory raised TypeError.
    mt.iterate_and_hash(
        MODEL_CACHE_PATH,
        VR_MODEL_DATA_URL,
        MDX_MODEL_DATA_URL,
        VR_MODEL_DATA_LOCAL_PATH,
        MDX_MODEL_DATA_LOCAL_PATH,
    )
+ }, + "2f5501189a2f6db6349916fabe8c90de": { + "compensate": 1.035, + "mdx_dim_f_set": 2048, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 6144, + "primary_stem": "Vocals", + "is_karaoke": true + }, + "398580b6d5d973af3120df54cee6759d": { + "compensate": 1.75, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 7680, + "primary_stem": "Vocals" + }, + "488b3e6f8bd3717d9d7c428476be2d75": { + "compensate": 1.035, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 7680, + "primary_stem": "Instrumental" + }, + "4910e7827f335048bdac11fa967772f9": { + "compensate": 1.035, + "mdx_dim_f_set": 2048, + "mdx_dim_t_set": 7, + "mdx_n_fft_scale_set": 4096, + "primary_stem": "Drums" + }, + "53c4baf4d12c3e6c3831bb8f5b532b93": { + "compensate": 1.043, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 7680, + "primary_stem": "Vocals" + }, + "5d343409ef0df48c7d78cce9f0106781": { + "compensate": 1.075, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 7680, + "primary_stem": "Vocals" + }, + "5f6483271e1efb9bfb59e4a3e6d4d098": { + "compensate": 1.035, + "mdx_dim_f_set": 2048, + "mdx_dim_t_set": 9, + "mdx_n_fft_scale_set": 6144, + "primary_stem": "Vocals" + }, + "65ab5919372a128e4167f5e01a8fda85": { + "compensate": 1.035, + "mdx_dim_f_set": 2048, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 8192, + "primary_stem": "Other" + }, + "6703e39f36f18aa7855ee1047765621d": { + "compensate": 1.035, + "mdx_dim_f_set": 2048, + "mdx_dim_t_set": 9, + "mdx_n_fft_scale_set": 16384, + "primary_stem": "Bass" + }, + "6b31de20e84392859a3d09d43f089515": { + "compensate": 1.035, + "mdx_dim_f_set": 2048, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 6144, + "primary_stem": "Vocals" + }, + "867595e9de46f6ab699008295df62798": { + "compensate": 1.03, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 7680, + "primary_stem": "Vocals" + }, + "a3cd63058945e777505c01d2507daf37": { + "compensate": 1.03, + 
"mdx_dim_f_set": 2048, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 6144, + "primary_stem": "Vocals" + }, + "b33d9b3950b6cbf5fe90a32608924700": { + "compensate": 1.03, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 7680, + "primary_stem": "Vocals" + }, + "c3b29bdce8c4fa17ec609e16220330ab": { + "compensate": 1.035, + "mdx_dim_f_set": 2048, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 16384, + "primary_stem": "Bass" + }, + "ceed671467c1f64ebdfac8a2490d0d52": { + "compensate": 1.035, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 7680, + "primary_stem": "Instrumental" + }, + "d2a1376f310e4f7fa37fb9b5774eb701": { + "compensate": 1.035, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 7680, + "primary_stem": "Instrumental" + }, + "d7bff498db9324db933d913388cba6be": { + "compensate": 1.035, + "mdx_dim_f_set": 2048, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 6144, + "primary_stem": "Vocals" + }, + "d94058f8c7f1fae4164868ae8ae66b20": { + "compensate": 1.035, + "mdx_dim_f_set": 2048, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 6144, + "primary_stem": "Vocals" + }, + "dc41ede5961d50f277eb846db17f5319": { + "compensate": 1.035, + "mdx_dim_f_set": 2048, + "mdx_dim_t_set": 9, + "mdx_n_fft_scale_set": 4096, + "primary_stem": "Drums" + }, + "e5572e58abf111f80d8241d2e44e7fa4": { + "compensate": 1.028, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 7680, + "primary_stem": "Instrumental" + }, + "e7324c873b1f615c35c1967f912db92a": { + "compensate": 1.03, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 7680, + "primary_stem": "Vocals" + }, + "1c56ec0224f1d559c42fd6fd2a67b154": { + "compensate": 1.025, + "mdx_dim_f_set": 2048, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 5120, + "primary_stem": "Instrumental" + }, + "f2df6d6863d8f435436d8b561594ff49": { + "compensate": 1.035, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 7680, 
+ "primary_stem": "Instrumental" + }, + "b06327a00d5e5fbc7d96e1781bbdb596": { + "compensate": 1.035, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 6144, + "primary_stem": "Instrumental" + }, + "94ff780b977d3ca07c7a343dab2e25dd": { + "compensate": 1.039, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 6144, + "primary_stem": "Instrumental" + }, + "73492b58195c3b52d34590d5474452f6": { + "compensate": 1.043, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 7680, + "primary_stem": "Vocals" + }, + "970b3f9492014d18fefeedfe4773cb42": { + "compensate": 1.009, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 7680, + "primary_stem": "Vocals" + }, + "1d64a6d2c30f709b8c9b4ce1366d96ee": { + "compensate": 1.065, + "mdx_dim_f_set": 2048, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 5120, + "primary_stem": "Instrumental", + "is_karaoke": true + }, + "203f2a3955221b64df85a41af87cf8f0": { + "compensate": 1.035, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 6144, + "primary_stem": "Instrumental" + }, + "291c2049608edb52648b96e27eb80e95": { + "compensate": 1.035, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 6144, + "primary_stem": "Instrumental" + }, + "ead8d05dab12ec571d67549b3aab03fc": { + "compensate": 1.035, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 6144, + "primary_stem": "Instrumental" + }, + "cc63408db3d80b4d85b0287d1d7c9632": { + "compensate": 1.033, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 6144, + "primary_stem": "Instrumental" + }, + "cd5b2989ad863f116c855db1dfe24e39": { + "compensate": 1.035, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 9, + "mdx_n_fft_scale_set": 6144, + "primary_stem": "Reverb" + }, + "55657dd70583b0fedfba5f67df11d711": { + "compensate": 1.022, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 6144, + "primary_stem": 
"Instrumental" + }, + "b6bccda408a436db8500083ef3491e8b": { + "compensate": 1.02, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 7680, + "primary_stem": "Instrumental" + }, + "8a88db95c7fb5dbe6a095ff2ffb428b1": { + "compensate": 1.026, + "mdx_dim_f_set": 2048, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 5120, + "primary_stem": "Instrumental" + }, + "b78da4afc6512f98e4756f5977f5c6b9": { + "compensate": 1.021, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 7680, + "primary_stem": "Instrumental" + }, + "77d07b2667ddf05b9e3175941b4454a0": { + "compensate": 1.021, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 7680, + "primary_stem": "Vocals" + }, + "0f2a6bc5b49d87d64728ee40e23bceb1": { + "compensate": 1.019, + "mdx_dim_f_set": 2560, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 5120, + "primary_stem": "Instrumental" + }, + "b02be2d198d4968a121030cf8950b492": { + "compensate": 1.020, + "mdx_dim_f_set": 2560, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 5120, + "primary_stem": "No Crowd" + }, + "2154254ee89b2945b97a7efed6e88820": { + "config_yaml": "model_2_stem_061321.yaml" + }, + "063aadd735d58150722926dcbf5852a9": { + "config_yaml": "model_2_stem_061321.yaml" + }, + "c09f714d978b41d718facfe3427e6001": { + "config_yaml": "model_2_stem_061321.yaml" + }, + "fe96801369f6a148df2720f5ced88c19": { + "config_yaml": "model3.yaml" + }, + "02e8b226f85fb566e5db894b9931c640": { + "config_yaml": "model2.yaml" + }, + "e3de6d861635ab9c1d766149edd680d6": { + "config_yaml": "model1.yaml" + }, + "3f2936c554ab73ce2e396d54636bd373": { + "config_yaml": "modelB.yaml" + }, + "890d0f6f82d7574bca741a9e8bcb8168": { + "config_yaml": "modelB.yaml" + }, + "63a3cb8c37c474681049be4ad1ba8815": { + "config_yaml": "modelB.yaml" + }, + "a7fc5d719743c7fd6b61bd2b4d48b9f0": { + "config_yaml": "modelA.yaml" + }, + "3567f3dee6e77bf366fcb1c7b8bc3745": { + "config_yaml": "modelA.yaml" + }, + "a28f4d717bd0d34cd2ff7a3b0a3d065e": { + 
"config_yaml": "modelA.yaml" + }, + "c9971a18da20911822593dc81caa8be9": { + "config_yaml": "sndfx.yaml" + }, + "57d94d5ed705460d21c75a5ac829a605": { + "config_yaml": "sndfx.yaml" + }, + "e7a25f8764f25a52c1b96c4946e66ba2": { + "config_yaml": "sndfx.yaml" + }, + "104081d24e37217086ce5fde09147ee1": { + "config_yaml": "model_2_stem_061321.yaml" + }, + "1e6165b601539f38d0a9330f3facffeb": { + "config_yaml": "model_2_stem_061321.yaml" + }, + "fe0108464ce0d8271be5ab810891bd7c": { + "config_yaml": "model_2_stem_full_band.yaml" + }, + "e9b82ec90ee56c507a3a982f1555714c": { + "config_yaml": "model_2_stem_full_band_2.yaml" + }, + "99b6ceaae542265a3b6d657bf9fde79f": { + "config_yaml": "model_2_stem_full_band_8k.yaml" + }, + "116f6f9dabb907b53d847ed9f7a9475f": { + "config_yaml": "model_2_stem_full_band_8k.yaml" + }, + "53f707017bfcbb56f5e1bfac420d6732": { + "config_yaml": "model_bs_roformer_ep_317_sdr_12.9755.yaml", + "is_roformer": true + }, + "63e41acc264bf681a73aa9f7e5f606cc": { + "config_yaml": "model_mel_band_roformer_ep_3005_sdr_11.4360.yaml", + "is_roformer": true + }, + "e733736763234047587931fc35322fd9": { + "config_yaml": "model_bs_roformer_ep_937_sdr_10.5309.yaml", + "is_roformer": true + }, + "d789065adfd747d6f585b27b495bcdae": { + "config_yaml": "model_bs_roformer_ep_368_sdr_12.9628.yaml", + "is_roformer": true + } +} diff --git a/assets/model_tools.py b/assets/model_tools.py new file mode 100644 index 0000000000000000000000000000000000000000..90a382edc3e171a9b656e657a90551666b64ca52 --- /dev/null +++ b/assets/model_tools.py @@ -0,0 +1,380 @@ +import hashlib +import json +import os +import sys +import subprocess +import requests +from huggingface_hub import HfApi, snapshot_download +import hashlib +from collections import defaultdict + + +from concurrent.futures import ThreadPoolExecutor + +def calculate_file_hash(filepath, block_size=65536): + """Calculates the SHA256 hash of a file's content.""" + sha256 = hashlib.sha256() + try: + with open(filepath, "rb") as f: + 
def find_and_remove_duplicates(directory="."):
    """Delete duplicate files in ``directory``, keeping the shortest-named copy.

    Only regular files directly inside ``directory`` are examined; nested
    directories are not walked. Files are grouped by the digest returned by
    ``calculate_file_hash``. Within each group of identical files the copy with
    the shortest filename is kept and all other copies are removed from disk.

    Args:
        directory: Path of the directory to scan. Defaults to the current
            working directory.

    Returns:
        None. Progress, the kept file, and any deletion errors are printed.
    """
    hashes_to_files = defaultdict(list)

    # Step 1: hash all regular files in the directory.
    for filename in os.listdir(directory):
        filepath = os.path.join(directory, filename)
        if os.path.isfile(filepath):
            file_hash = calculate_file_hash(filepath)
            # calculate_file_hash returns None when the file disappeared
            # between listing and hashing; such files are skipped.
            if file_hash:
                hashes_to_files[file_hash].append(filepath)

    # Step 2: keep only digests shared by more than one file.
    duplicates = {h: files for h, files in hashes_to_files.items() if len(files) > 1}

    if not duplicates:
        print("No duplicate files found.")
        return

    def _keep_priority(path):
        # Shortest filename wins. Ties are broken alphabetically so that the
        # surviving file does not depend on the order os.listdir() happened
        # to return entries in.
        name = os.path.basename(path)
        return (len(name), name)

    # Step 3: in every group keep the first file by priority, delete the rest.
    for file_hash, file_list in duplicates.items():
        files_sorted_by_length = sorted(file_list, key=_keep_priority)
        file_to_keep = files_sorted_by_length[0]
        files_to_delete = files_sorted_by_length[1:]

        print(f"\nDuplicate group (Hash: {file_hash[:10]}...):")
        print(f"  Keeping: {file_to_keep}")
        for file_to_delete in files_to_delete:
            try:
                os.remove(file_to_delete)
                print(f"  Deleted: {file_to_delete} (longer filename)")
            except OSError as e:
                # Best effort: report and continue with the remaining files.
                print(f"  Error deleting {file_to_delete}: {e}")
def remove_duplicate_lines(input_file_path, output_file_path):
    """Copy a text file, dropping every line that already appeared earlier.

    The first occurrence of each line is kept and the original order is
    preserved. Lines are compared without their trailing newline, so a final
    line that lacks a newline is still recognised as a duplicate of an earlier
    identical line.

    Args:
        input_file_path: Path of the text file to read.
        output_file_path: Path of the file to write the unique lines to. It is
            only created once the input has been read successfully.

    Returns:
        None. Success or failure is reported with ``print``; exceptions are
        not propagated to the caller.
    """
    try:
        # A dict keeps insertion order, so it doubles as an ordered set:
        # key = line content without terminator, value = line as first seen.
        first_seen = {}
        with open(input_file_path, "r") as input_file:
            for line in input_file:
                content = line[:-1] if line.endswith("\n") else line
                # setdefault keeps the earliest spelling and ignores repeats.
                first_seen.setdefault(content, line)

        with open(output_file_path, "w") as output_file:
            output_file.writelines(first_seen.values())

        print(f"Duplicates removed. Unique lines saved to '{output_file_path}'")

    except FileNotFoundError:
        print(f"Error: The file '{input_file_path}' was not found.")
    except Exception as e:
        print(f"An error occurred: {e}")
def get_model_hash(model_path, tail_bytes=10000 * 1024):
    """Return the MD5 hex digest used to identify a model file.

    Only the last ``tail_bytes`` bytes of the file are hashed. If the file is
    shorter than that, seeking before its start fails and the whole file is
    hashed instead.

    Args:
        model_path: Path of the model file.
        tail_bytes: Positive number of trailing bytes to hash. The default,
            10000 * 1024 (10,240,000 bytes, i.e. 10000 KiB), must be kept when
            the digest is compared with previously computed hashes.

    Returns:
        The hexadecimal MD5 digest as a string.

    Raises:
        OSError: If the file cannot be opened or read at all.
    """
    try:
        with open(model_path, "rb") as f:
            # Position the file pointer ``tail_bytes`` before the end of the file.
            f.seek(-tail_bytes, os.SEEK_END)
            return hashlib.md5(f.read()).hexdigest()
    except IOError:
        # The file is shorter than ``tail_bytes`` (the seek was rejected):
        # fall back to hashing the complete content.
        with open(model_path, "rb") as f:
            return hashlib.md5(f.read()).hexdigest()
def iterate_and_hash(
    directory,
    vr_model_data_url,
    mdx_model_data_url,
    vr_model_data_local_path,
    mdx_model_data_local_path,
    output_path=None,
):
    """Hash every model file under ``directory`` and write a JSON report.

    Walks ``directory`` recursively, collects all ``.pth`` and ``.onnx`` files,
    computes each file's hash with ``get_model_hash`` and looks the hash up in
    the merged VR + MDX parameter tables. The tables are downloaded first if
    their local copies are missing.

    Args:
        directory: Root directory to search for model files.
        vr_model_data_url: URL of the VR parameter table (JSON).
        mdx_model_data_url: URL of the MDX parameter table (JSON).
        vr_model_data_local_path: Local cache path of the VR table.
        mdx_model_data_local_path: Local cache path of the MDX table.
        output_path: File to write the report to. When omitted, a module-level
            ``OUTPUT_PATH`` is used if one has been set; otherwise the report
            goes to ``<directory>/model_hashes.json``.

    Returns:
        None. The report is a JSON list of ``{"file", "hash", "params"}``
        objects; ``params`` is the string ``"Parameters not found"`` for
        hashes missing from both tables.
    """
    if output_path is None:
        # This module never defines OUTPUT_PATH itself, so reading it as a
        # plain global raised NameError after all hashing was done. Honour
        # a value injected by a caller, else derive one from ``directory``.
        output_path = globals().get("OUTPUT_PATH") or os.path.join(
            directory, "model_hashes.json"
        )

    print(f"Iterating through directory {directory} to hash model files")
    model_files = [
        (file, os.path.join(root, file))
        for root, _, files in os.walk(directory)
        for file in files
        if file.endswith((".pth", ".onnx"))
    ]

    download_file_if_missing(vr_model_data_url, vr_model_data_local_path)
    download_file_if_missing(mdx_model_data_url, mdx_model_data_local_path)

    vr_model_data = load_json_data(vr_model_data_local_path)
    mdx_model_data = load_json_data(mdx_model_data_local_path)

    # On a key collision the MDX entry wins because it is merged last.
    combined_model_params = {
        **vr_model_data,
        **mdx_model_data,
    }

    model_info_list = []
    for file, file_path in sorted(model_files):
        file_hash = get_model_hash(file_path)
        model_info = {
            "file": file,
            "hash": file_hash,
            "params": combined_model_params.get(file_hash, "Parameters not found"),
        }
        model_info_list.append(model_info)

    print(f"Writing model info list to {output_path}")
    with open(output_path, "w", encoding="utf-8") as json_file:
        json.dump(model_info_list, json_file, indent=4)
    print(f"Successfully wrote model info list to {output_path}")
def get_links_from_json(file_input):
    """Collect all links stored in a JSON file of ``{model_name: [link, ...]}``.

    Entries whose value is not a list, or is an empty list, are skipped. The
    links of the remaining entries are concatenated in file order.

    NOTE(review): the original loop validated each entry but then discarded
    it, so the function returned nothing. Returning the flattened links is
    inferred from the function name; confirm against the intended caller.

    Args:
        file_input: Path of the JSON file to read. Its top-level value is
            expected to be an object.

    Returns:
        A list with every link found. Empty when the file does not exist or
        holds no usable entries.
    """
    try:
        with open(file_input, "r") as file:
            data = json.load(file)
    except FileNotFoundError:
        # Report the path that was actually requested rather than a fixed name.
        print(f"Error: '{file_input}' not found.")
        data = {}

    collected_links = []
    for links in data.values():
        if not isinstance(links, list) or len(links) == 0:
            continue
        collected_links.extend(links)
    return collected_links
mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + batch_size: 1 + gradient_accumulation_steps: 8 + grad_clip: 0 + instruments: + - aspiration + - other + lr: 4.0e-05 + patience: 2 + reduce_factor: 0.95 + target_instrument: null + num_epochs: 1000 + num_steps: 1000 + q: 0.95 + coarse_loss_clip: true + ema_momentum: 0.999 + optimizer: adam + other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental + use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true + +augmentations: + enable: true # enable or disable all augmentations (to fast disable if needed) + loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max) + loudness_min: 0.5 + loudness_max: 1.5 + mixup: false # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3) + mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02) + - 0.2 + - 0.02 + mixup_loudness_min: 0.5 + mixup_loudness_max: 1.5 + +inference: + batch_size: 4 + dim_t: 801 + num_overlap: 2 \ No newline at end of file diff --git a/config_bs_roformer_instrumental_resurrection_unwa.yaml b/config_bs_roformer_instrumental_resurrection_unwa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1b73093ad41459a7995340d2d8b81ecbc70dd05f --- /dev/null +++ b/config_bs_roformer_instrumental_resurrection_unwa.yaml @@ -0,0 +1,135 @@ +audio: + chunk_size: 749259 + dim_f: 1024 + dim_t: 1700 # don't work (use in model) + hop_length: 441 # don't work (use in model) + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.000 + +model: + dim: 256 + depth: 12 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + 
linear_transformer_depth: 0 + freqs_per_bands: !!python/tuple + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 128 + - 129 + dim_head: 64 + heads: 8 + attn_dropout: 0. + ff_dropout: 0. + flash_attn: true + dim_freqs_in: 1025 + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: false + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + batch_size: 2 + gradient_accumulation_steps: 1 + grad_clip: 0 + instruments: ['vocals', 'other'] + patience: 3 + reduce_factor: 0.95 + target_instrument: other + num_epochs: 1000 + num_steps: 1000 + augmentation: false # enable augmentations by audiomentations and pedalboard + augmentation_type: simple1 + use_mp3_compress: false # Deprecated + augmentation_mix: true # Mix several stems of the same type with some probability + augmentation_loudness: true # randomly change loudness of each stem + augmentation_loudness_type: 1 # Type 1 or 2 + augmentation_loudness_min: 0.5 + augmentation_loudness_max: 1.5 + q: 0.95 + coarse_loss_clip: true + ema_momentum: 0.999 + # optimizer: prodigy + optimizer: adam + # lr: 1.0 + lr: 1.0e-5 + other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental + use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true + +inference: + batch_size: 2 + dim_t: 1700 + num_overlap: 2 + normalize: false diff --git a/config_bs_roformer_karaoke_frazer_becruily.yaml 
b/config_bs_roformer_karaoke_frazer_becruily.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a0170aeacb3d3a55d9c598ed7faacbdc8799071f --- /dev/null +++ b/config_bs_roformer_karaoke_frazer_becruily.yaml @@ -0,0 +1,129 @@ +audio: + chunk_size: 882000 + dim_f: 1024 + dim_t: 801 + hop_length: 441 + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.000 + +model: + dim: 256 + depth: 12 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + linear_transformer_depth: 0 + freqs_per_bands: !!python/tuple + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 128 + - 129 + dim_head: 64 + heads: 8 + attn_dropout: 0 + ff_dropout: 0 + flash_attn: true + dim_freqs_in: 1025 + stft_n_fft: 2048 + stft_hop_length: 512 + stft_win_length: 2048 + stft_normalized: false + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: false + mlp_expansion_factor: 4 + +training: + batch_size: 1 + gradient_accumulation_steps: 1 + grad_clip: 0 + instruments: + - Vocals + - Instrumental + patience: 2 + reduce_factor: 0.95 + target_instrument: Vocals + num_epochs: 1000 + num_steps: 1000 + q: 0.95 + coarse_loss_clip: true + ema_momentum: 0.999 + # optimizer: prodigy + optimizer: adam + lr: 1.0e-5 + other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental + use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true + +inference: + batch_size: 2 + dim_t: 2001 + 
num_overlap: 4 + normalize: false diff --git a/config_bs_roformer_vocals_gabox.yaml b/config_bs_roformer_vocals_gabox.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c4a3d323322d75af7d981e9de2ef3fa29e786812 --- /dev/null +++ b/config_bs_roformer_vocals_gabox.yaml @@ -0,0 +1,133 @@ +audio: + chunk_size: 352800 + dim_f: 1024 + dim_t: 801 # don't work (use in model) + hop_length: 441 # don't work (use in model) + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.001 + +model: + dim: 512 + depth: 12 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + freqs_per_bands: !!python/tuple + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 128 + - 129 + dim_head: 64 + heads: 8 + attn_dropout: 0.1 + ff_dropout: 0.1 + flash_attn: true + dim_freqs_in: 1025 + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: false + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + batch_size: 16 + gradient_accumulation_steps: 1 + grad_clip: 0 + instruments: + - Vocals + - Instrumental + lr: 5.0e-05 + patience: 2 + reduce_factor: 0.95 + target_instrument: Vocals + num_epochs: 1000 + num_steps: 1000 + augmentation: false # enable augmentations by audiomentations and pedalboard + augmentation_type: simple1 + use_mp3_compress: false # Deprecated + augmentation_mix: true # Mix several stems of the same type with some probability + augmentation_loudness: true # randomly change 
loudness of each stem + augmentation_loudness_type: 1 # Type 1 or 2 + augmentation_loudness_min: 0.5 + augmentation_loudness_max: 1.5 + q: 0.95 + coarse_loss_clip: true + ema_momentum: 0.999 + optimizer: adam + other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental + use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true + +inference: + batch_size: 1 + dim_t: 801 + num_overlap: 4 \ No newline at end of file diff --git a/config_bs_roformer_vocals_resurrection_unwa.yaml b/config_bs_roformer_vocals_resurrection_unwa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6d3c18aa4ac9b224fc549fa6df1bbeac36118016 --- /dev/null +++ b/config_bs_roformer_vocals_resurrection_unwa.yaml @@ -0,0 +1,135 @@ +audio: + chunk_size: 785920 + dim_f: 1024 + dim_t: 1536 # don't work (use in model) + hop_length: 441 # don't work (use in model) + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.000 + +model: + dim: 256 + depth: 12 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + linear_transformer_depth: 0 + freqs_per_bands: !!python/tuple + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 128 + - 129 + dim_head: 64 + heads: 8 + attn_dropout: 0. + ff_dropout: 0. 
+ flash_attn: true + dim_freqs_in: 1025 + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: false + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + batch_size: 2 + gradient_accumulation_steps: 1 + grad_clip: 0 + instruments: ['vocals', 'other'] + patience: 3 + reduce_factor: 0.95 + target_instrument: vocals + num_epochs: 1000 + num_steps: 1000 + augmentation: false # enable augmentations by audiomentations and pedalboard + augmentation_type: simple1 + use_mp3_compress: false # Deprecated + augmentation_mix: true # Mix several stems of the same type with some probability + augmentation_loudness: true # randomly change loudness of each stem + augmentation_loudness_type: 1 # Type 1 or 2 + augmentation_loudness_min: 0.5 + augmentation_loudness_max: 1.5 + q: 0.95 + coarse_loss_clip: true + ema_momentum: 0.999 + # optimizer: prodigy + optimizer: adam + # lr: 1.0 + lr: 1.0e-5 + other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental + use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true + +inference: + batch_size: 2 + dim_t: 1536 + num_overlap: 2 + normalize: false diff --git a/config_bs_roformer_vocals_revive_unwa.yaml b/config_bs_roformer_vocals_revive_unwa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b2d60c5d37a6c92afea42ef88d8958b7f01b64db --- /dev/null +++ b/config_bs_roformer_vocals_revive_unwa.yaml @@ -0,0 +1,134 @@ +audio: + chunk_size: 485100 #352800 #485100 + dim_f: 1024 + dim_t: 1101 + hop_length: 441 + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0. 
+ +model: + dim: 512 + depth: 12 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + linear_transformer_depth: 0 + freqs_per_bands: !!python/tuple + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 128 + - 129 + dim_head: 64 + heads: 8 + attn_dropout: 0. + ff_dropout: 0. + flash_attn: true + dim_freqs_in: 1025 + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: false + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + batch_size: 1 + gradient_accumulation_steps: 1 + grad_clip: 0 + instruments: + - vocals + - other + lr: 1.0e-05 + patience: 2 + reduce_factor: 0.95 + target_instrument: vocals + num_epochs: 1000 + num_steps: 1000 + augmentation: false # enable augmentations by audiomentations and pedalboard + augmentation_type: null + use_mp3_compress: false # Deprecated + augmentation_mix: false # Mix several stems of the same type with some probability + augmentation_loudness: false # randomly change loudness of each stem + augmentation_loudness_type: 1 # Type 1 or 2 + augmentation_loudness_min: 0 + augmentation_loudness_max: 0 + q: 0.95 + coarse_loss_clip: false + ema_momentum: 0.999 + optimizer: adam + other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental + use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true + +inference: + batch_size: 2 + dim_t: 1101 + num_overlap: 2 diff --git 
a/config_dereverb-echo_mel_band_roformer.yaml b/config_dereverb-echo_mel_band_roformer.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bf766a04152c42bb2f16e6b2929a1024c6d550f5 --- /dev/null +++ b/config_dereverb-echo_mel_band_roformer.yaml @@ -0,0 +1,76 @@ +audio: + chunk_size: 352800 + dim_f: 1024 + dim_t: 801 # don't work (use in model) + hop_length: 441 # don't work (use in model) + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.000 + +model: + dim: 256 + depth: 8 + stereo: true + num_stems: 2 + time_transformer_depth: 1 + freq_transformer_depth: 1 + num_bands: 60 + dim_head: 64 + heads: 8 + attn_dropout: 0.1 + ff_dropout: 0.1 + flash_attn: True + dim_freqs_in: 1025 + sample_rate: 44100 # needed for mel filter bank from librosa + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: False + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + batch_size: 1 + gradient_accumulation_steps: 8 + grad_clip: 0 + instruments: + - dry + - No dry + lr: 4.0e-05 + patience: 2 + reduce_factor: 0.95 + target_instrument: null + num_epochs: 1000 + num_steps: 1000 + q: 0.95 + coarse_loss_clip: true + ema_momentum: 0.999 + optimizer: adam + other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental + use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true + +augmentations: + enable: true # enable or disable all augmentations (to fast disable if needed) + loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max) + loudness_min: 0.5 + loudness_max: 1.5 + mixup: false # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3) + mixup_probs: !!python/tuple # 2 additional stems 
of the same type (1st with prob 0.2, 2nd with prob 0.02) + - 0.2 + - 0.02 + mixup_loudness_min: 0.5 + mixup_loudness_max: 1.5 + +inference: + batch_size: 4 + dim_t: 801 + num_overlap: 4 \ No newline at end of file diff --git a/config_mdx23c_similarity.yaml b/config_mdx23c_similarity.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9ce8239fc926820db231cb1a240d20a1ff3eca0e --- /dev/null +++ b/config_mdx23c_similarity.yaml @@ -0,0 +1,47 @@ +audio: + chunk_size: 130560 + dim_f: 1024 + dim_t: 256 + hop_length: 512 + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.001 + +model: + act: gelu + bottleneck_factor: 4 + growth: 128 + norm: InstanceNorm + num_blocks_per_scale: 2 + num_channels: 128 + num_scales: 5 + num_subbands: 4 + scale: + - 2 + - 2 + +training: + batch_size: 2 + gradient_accumulation_steps: 3 + grad_clip: 0 + instruments: + - Similarity + - Difference + lr: 1.0 + patience: 15 + reduce_factor: 0.95 + target_instrument: Similarity + num_epochs: 1000 + num_steps: 2235 + q: 0.95 + coarse_loss_clip: true + ema_momentum: 0.999 + optimizer: prodigy + other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental + use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true + +inference: + batch_size: 8 + dim_t: 256 + num_overlap: 8 diff --git a/config_mel_band_roformer_instrumental_becruily.yaml b/config_mel_band_roformer_instrumental_becruily.yaml new file mode 100644 index 0000000000000000000000000000000000000000..862010f34a3765fa1ac9f22c04ba74042b2fd086 --- /dev/null +++ b/config_mel_band_roformer_instrumental_becruily.yaml @@ -0,0 +1,72 @@ +audio: + chunk_size: 352800 + dim_f: 1024 + dim_t: 256 + hop_length: 441 + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.000 + +model: + dim: 384 + depth: 6 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + num_bands: 60 + dim_head: 64 + heads: 8 + 
attn_dropout: 0 + ff_dropout: 0 + flash_attn: True + dim_freqs_in: 1025 + sample_rate: 44100 # needed for mel filter bank from librosa + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: False + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + batch_size: 1 + gradient_accumulation_steps: 1 + grad_clip: 0 + instruments: + - Instrumental + - Vocals + lr: 0.0005 + patience: 2 + reduce_factor: 0.95 + target_instrument: Instrumental + num_epochs: 1000 + num_steps: 1000 + augmentation: false # enable augmentations by audiomentations and pedalboard + augmentation_type: null + use_mp3_compress: false # Deprecated + augmentation_mix: false # Mix several stems of the same type with some probability + augmentation_loudness: false # randomly change loudness of each stem + augmentation_loudness_type: 1 # Type 1 or 2 + augmentation_loudness_min: 0 + augmentation_loudness_max: 0 + q: 0.95 + coarse_loss_clip: false + ema_momentum: 0.999 + optimizer: adamw + other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental + use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true + +inference: + batch_size: 1 + dim_t: 1101 + num_overlap: 2 \ No newline at end of file diff --git a/config_mel_band_roformer_instrumental_gabox.yaml b/config_mel_band_roformer_instrumental_gabox.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b1395e978d64cb1c37d3015adc2feeb0805e3b94 --- /dev/null +++ b/config_mel_band_roformer_instrumental_gabox.yaml @@ -0,0 +1,51 @@ +audio: + chunk_size: 485100 + dim_f: 1024 + dim_t: 1101 + hop_length: 441 + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.000 + +model: + dim: 384 + depth: 6 + stereo: true + num_stems: 1 + 
time_transformer_depth: 1 + freq_transformer_depth: 1 + num_bands: 60 + dim_head: 64 + heads: 8 + attn_dropout: 0 + ff_dropout: 0 + flash_attn: True + dim_freqs_in: 1025 + sample_rate: 44100 # needed for mel filter bank from librosa + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: False + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + instruments: + - Instrumental + - Vocals + target_instrument: Instrumental + use_amp: True + +inference: + batch_size: 1 + dim_t: 1101 + num_overlap: 2 \ No newline at end of file diff --git a/config_mel_band_roformer_karaoke_becruily.yaml b/config_mel_band_roformer_karaoke_becruily.yaml new file mode 100644 index 0000000000000000000000000000000000000000..58cd1747a53d1695128e732aa7aa6802cb77db70 --- /dev/null +++ b/config_mel_band_roformer_karaoke_becruily.yaml @@ -0,0 +1,72 @@ +audio: + chunk_size: 485100 + dim_f: 1024 + dim_t: 256 + hop_length: 441 + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.000 + +model: + dim: 384 + depth: 6 + stereo: true + num_stems: 2 + time_transformer_depth: 1 + freq_transformer_depth: 1 + num_bands: 60 + dim_head: 64 + heads: 8 + attn_dropout: 0 + ff_dropout: 0 + flash_attn: true + dim_freqs_in: 1025 + sample_rate: 44100 # needed for mel filter bank from librosa + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: false + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: false + +training: + batch_size: 1 + gradient_accumulation_steps: 1 + grad_clip: 0 + instruments: + - Vocals + - Instrumental + lr: 0.0005 + patience: 2 + reduce_factor: 0.95 + target_instrument: null + 
num_epochs: 1000 + num_steps: 1000 + augmentation: false # enable augmentations by audiomentations and pedalboard + augmentation_type: + use_mp3_compress: false # Deprecated + augmentation_mix: false # Mix several stems of the same type with some probability + augmentation_loudness: false # randomly change loudness of each stem + augmentation_loudness_type: 1 # Type 1 or 2 + augmentation_loudness_min: 0 + augmentation_loudness_max: 0 + q: 0.95 + coarse_loss_clip: false + ema_momentum: 0.999 + optimizer: adamw + other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental + use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true + +inference: + batch_size: 1 + dim_t: 1101 + num_overlap: 8 diff --git a/config_mel_band_roformer_kim_ft_unwa.yaml b/config_mel_band_roformer_kim_ft_unwa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d0527f99399af7f504ead83ce75e6715cd190e56 --- /dev/null +++ b/config_mel_band_roformer_kim_ft_unwa.yaml @@ -0,0 +1,72 @@ +audio: + chunk_size: 485100 + dim_f: 1024 + dim_t: 801 + hop_length: 441 + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.000 + +model: + dim: 384 + depth: 6 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + num_bands: 60 + dim_head: 64 + heads: 8 + attn_dropout: 0 + ff_dropout: 0 + flash_attn: True + dim_freqs_in: 1025 + sample_rate: 44100 # needed for mel filter bank from librosa + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: False + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + batch_size: 1 + gradient_accumulation_steps: 1 + grad_clip: 0 + instruments: + - vocals + - other + lr: 1.0e-05 + patience: 2 + reduce_factor: 0.95 + 
target_instrument: vocals + num_epochs: 1000 + num_steps: 1000 + augmentation: false # enable augmentations by audiomentations and pedalboard + augmentation_type: null + use_mp3_compress: false # Deprecated + augmentation_mix: false # Mix several stems of the same type with some probability + augmentation_loudness: false # randomly change loudness of each stem + augmentation_loudness_type: 1 # Type 1 or 2 + augmentation_loudness_min: 0 + augmentation_loudness_max: 0 + q: 0.95 + coarse_loss_clip: false + ema_momentum: 0.999 + optimizer: adam + other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental + use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true + +inference: + batch_size: 1 + dim_t: 801 + num_overlap: 8 \ No newline at end of file diff --git a/config_mel_band_roformer_vocal_fullness_aname.yaml b/config_mel_band_roformer_vocal_fullness_aname.yaml new file mode 100644 index 0000000000000000000000000000000000000000..32fee9c05a82b72931cbb9a8af7d948a538532cb --- /dev/null +++ b/config_mel_band_roformer_vocal_fullness_aname.yaml @@ -0,0 +1,54 @@ +audio: + chunk_size: 661500 + dim_f: 1024 + dim_t: 256 + hop_length: 441 + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.001 + +model: + dim: 384 + depth: 6 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + num_bands: 60 + dim_head: 64 + heads: 8 + attn_dropout: 0 + ff_dropout: 0 + flash_attn: True + dim_freqs_in: 1025 + sample_rate: 44100 + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: False + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + batch_size: 1 + gradient_accumulation_steps: 1 + grad_clip: 0 + instruments: + - vocals + - other + target_instrument: 
vocals + use_amp: true + +inference: + batch_size: 4 + dim_t: 1101 + num_overlap: 4 \ No newline at end of file diff --git a/config_mel_band_roformer_vocals_becruily.yaml b/config_mel_band_roformer_vocals_becruily.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2d42333851e31c9e5747d818efec365921358a7c --- /dev/null +++ b/config_mel_band_roformer_vocals_becruily.yaml @@ -0,0 +1,72 @@ +audio: + chunk_size: 352800 + dim_f: 1024 + dim_t: 256 + hop_length: 441 + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.000 + +model: + dim: 384 + depth: 6 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + num_bands: 60 + dim_head: 64 + heads: 8 + attn_dropout: 0 + ff_dropout: 0 + flash_attn: True + dim_freqs_in: 1025 + sample_rate: 44100 # needed for mel filter bank from librosa + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: False + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + batch_size: 1 + gradient_accumulation_steps: 1 + grad_clip: 0 + instruments: + - vocals + - other + lr: 0.0005 + patience: 2 + reduce_factor: 0.95 + target_instrument: vocals + num_epochs: 1000 + num_steps: 1000 + augmentation: false # enable augmentations by audiomentations and pedalboard + augmentation_type: null + use_mp3_compress: false # Deprecated + augmentation_mix: false # Mix several stems of the same type with some probability + augmentation_loudness: false # randomly change loudness of each stem + augmentation_loudness_type: 1 # Type 1 or 2 + augmentation_loudness_min: 0 + augmentation_loudness_max: 0 + q: 0.95 + coarse_loss_clip: false + ema_momentum: 0.999 + optimizer: adamw + other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental + use_amp: true 
# enable or disable usage of mixed precision (float16) - usually it must be true + +inference: + batch_size: 1 + dim_t: 1101 + num_overlap: 2 \ No newline at end of file diff --git a/config_mel_band_roformer_vocals_gabox.yaml b/config_mel_band_roformer_vocals_gabox.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8130c9958eead0d2efd27f27f4f39ea5ca051a26 --- /dev/null +++ b/config_mel_band_roformer_vocals_gabox.yaml @@ -0,0 +1,51 @@ +audio: + chunk_size: 352800 + dim_f: 1024 + dim_t: 256 + hop_length: 441 + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.001 + +model: + dim: 384 + depth: 6 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + num_bands: 60 + dim_head: 64 + heads: 8 + attn_dropout: 0 + ff_dropout: 0 + flash_attn: True + dim_freqs_in: 1025 + sample_rate: 44100 # needed for mel filter bank from librosa + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: False + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + instruments: + - Vocals + - Instrumental + target_instrument: Vocals + +inference: + batch_size: 1 + dim_t: 1101 + num_overlap: 1 + chunk_size: 352800 \ No newline at end of file diff --git a/config_melbandroformer_inst.yaml b/config_melbandroformer_inst.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d6bdca342644a1194427fe505e2044c5006a1213 --- /dev/null +++ b/config_melbandroformer_inst.yaml @@ -0,0 +1,51 @@ +audio: + chunk_size: 485100 + dim_f: 1024 + dim_t: 1101 + hop_length: 441 + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.000 + +model: + dim: 384 + depth: 6 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + num_bands: 60 + dim_head: 64 + heads: 8 + 
attn_dropout: 0 + ff_dropout: 0 + flash_attn: True + dim_freqs_in: 1025 + sample_rate: 44100 # needed for mel filter bank from librosa + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: False + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + instruments: + - other + - vocals + target_instrument: other + use_amp: True + +inference: + batch_size: 1 + dim_t: 1101 + num_overlap: 2 \ No newline at end of file diff --git a/config_melbandroformer_inst_v2.yaml b/config_melbandroformer_inst_v2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d4297c088f7b8bd2f28308d8a8d1e0694cdec967 --- /dev/null +++ b/config_melbandroformer_inst_v2.yaml @@ -0,0 +1,51 @@ +audio: + chunk_size: 485100 + dim_f: 1024 + dim_t: 1101 + hop_length: 441 + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.000 + +model: + dim: 384 + depth: 12 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + num_bands: 60 + dim_head: 64 + heads: 8 + attn_dropout: 0 + ff_dropout: 0 + flash_attn: True + dim_freqs_in: 1025 + sample_rate: 44100 # needed for mel filter bank from librosa + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: False + mask_estimator_depth: 3 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + instruments: + - Instrumental + - Vocals + target_instrument: Instrumental + use_amp: True + +inference: + batch_size: 1 + dim_t: 1101 + num_overlap: 2 \ No newline at end of file diff --git a/config_melbandroformer_instvoc_duality.yaml b/config_melbandroformer_instvoc_duality.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..b93e721853f4d90efa7f0bead82f6a1b791fc19f --- /dev/null +++ b/config_melbandroformer_instvoc_duality.yaml @@ -0,0 +1,51 @@ +audio: + chunk_size: 485100 + dim_f: 1024 + dim_t: 256 + hop_length: 441 + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.000 + +model: + dim: 384 + depth: 6 + stereo: true + num_stems: 2 + time_transformer_depth: 1 + freq_transformer_depth: 1 + num_bands: 60 + dim_head: 64 + heads: 8 + attn_dropout: 0 + ff_dropout: 0 + flash_attn: True + dim_freqs_in: 1025 + sample_rate: 44100 # needed for mel filter bank from librosa + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: False + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + instruments: + - Vocals + - Instrumental + target_instrument: null + use_amp: True + +inference: + batch_size: 1 + dim_t: 1101 + num_overlap: 2 \ No newline at end of file diff --git a/denoise_mel_band_roformer_aufr33_aggr_sdr_27.9768_config.yaml b/denoise_mel_band_roformer_aufr33_aggr_sdr_27.9768_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..265e19c806778d7b2d5ffdaef9e3d503a6dba3f1 --- /dev/null +++ b/denoise_mel_band_roformer_aufr33_aggr_sdr_27.9768_config.yaml @@ -0,0 +1,71 @@ +audio: + chunk_size: 352800 + dim_f: 1024 + dim_t: 801 + hop_length: 441 + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 000 + +model: + dim: 384 + depth: 6 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + num_bands: 60 + dim_head: 64 + heads: 8 + attn_dropout: 0 + ff_dropout: 0 + flash_attn: True + dim_freqs_in: 1025 + sample_rate: 44100 # needed for mel filter bank from librosa + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: False + 
mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + batch_size: 2 + gradient_accumulation_steps: 1 + grad_clip: 0 + instruments: + - dry + - other + lr: 1.0e-05 + patience: 8 + reduce_factor: 0.95 + target_instrument: dry + num_epochs: 1000 + num_steps: 4032 + augmentation: false # enable augmentations by audiomentations and pedalboard + augmentation_type: null + use_mp3_compress: false # Deprecated + augmentation_mix: false # Mix several stems of the same type with some probability + augmentation_loudness: false # randomly change loudness of each stem + augmentation_loudness_type: 1 # Type 1 or 2 + augmentation_loudness_min: 0 + augmentation_loudness_max: 0 + q: 0.95 + coarse_loss_clip: false + ema_momentum: 0.999 + optimizer: adam + other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental + +inference: + batch_size: 2 + dim_t: 801 + num_overlap: 4 \ No newline at end of file diff --git a/denoise_mel_band_roformer_aufr33_sdr_27.9959_config.yaml b/denoise_mel_band_roformer_aufr33_sdr_27.9959_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..265e19c806778d7b2d5ffdaef9e3d503a6dba3f1 --- /dev/null +++ b/denoise_mel_band_roformer_aufr33_sdr_27.9959_config.yaml @@ -0,0 +1,71 @@ +audio: + chunk_size: 352800 + dim_f: 1024 + dim_t: 801 + hop_length: 441 + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 000 + +model: + dim: 384 + depth: 6 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + num_bands: 60 + dim_head: 64 + heads: 8 + attn_dropout: 0 + ff_dropout: 0 + flash_attn: True + dim_freqs_in: 1025 + sample_rate: 44100 # needed for mel filter bank from librosa + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: False + 
mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + batch_size: 2 + gradient_accumulation_steps: 1 + grad_clip: 0 + instruments: + - dry + - other + lr: 1.0e-05 + patience: 8 + reduce_factor: 0.95 + target_instrument: dry + num_epochs: 1000 + num_steps: 4032 + augmentation: false # enable augmentations by audiomentations and pedalboard + augmentation_type: null + use_mp3_compress: false # Deprecated + augmentation_mix: false # Mix several stems of the same type with some probability + augmentation_loudness: false # randomly change loudness of each stem + augmentation_loudness_type: 1 # Type 1 or 2 + augmentation_loudness_min: 0 + augmentation_loudness_max: 0 + q: 0.95 + coarse_loss_clip: false + ema_momentum: 0.999 + optimizer: adam + other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental + +inference: + batch_size: 2 + dim_t: 801 + num_overlap: 4 \ No newline at end of file diff --git a/deverb_bs_roformer_8_384dim_10depth_config.yaml b/deverb_bs_roformer_8_384dim_10depth_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d2c7ce0c4b424baa7731495c432102672b68cfa6 --- /dev/null +++ b/deverb_bs_roformer_8_384dim_10depth_config.yaml @@ -0,0 +1,137 @@ +audio: + chunk_size: 352768 + dim_f: 1024 + dim_t: 801 + hop_length: 441 + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.001 + +model: + dim: 384 + depth: 10 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + freqs_per_bands: !!python/tuple + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 
24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 128 + - 129 + dim_head: 64 + heads: 8 + attn_dropout: 0.1 + ff_dropout: 0.1 + flash_attn: true + dim_freqs_in: 1025 + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: false + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + batch_size: 1 + gradient_accumulation_steps: 1 + grad_clip: 0 + instruments: + - noreverb + - reverb + lr: 5.0e-05 + patience: 2 + reduce_factor: 0.95 + target_instrument: noreverb + num_epochs: 1000 + num_steps: 1000 + q: 0.95 + coarse_loss_clip: true + ema_momentum: 0.999 + optimizer: adam + other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental + use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true + +augmentations: + enable: true # enable or disable all augmentations (to fast disable if needed) + loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max) + loudness_min: 0.5 + loudness_max: 1.5 + mixup: false # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3) + mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02) + - 0.2 + - 0.02 + mixup_loudness_min: 0.5 + mixup_loudness_max: 1.5 + +inference: + batch_size: 4 + dim_t: 801 + num_overlap: 4 diff --git a/hdemucs_mmi.yaml b/hdemucs_mmi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0ea089139bfbef4a1126ab25e93c3dc380a90b46 --- /dev/null +++ b/hdemucs_mmi.yaml @@ -0,0 +1,2 @@ +models: ['75fc33f5'] +segment: 44 diff --git a/htdemucs.yaml b/htdemucs.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..0d5f2089fa3e1a0335d93de070f6802598cd4a4d --- /dev/null +++ b/htdemucs.yaml @@ -0,0 +1 @@ +models: ['955717e8'] diff --git a/htdemucs_6s.yaml b/htdemucs_6s.yaml new file mode 100644 index 0000000000000000000000000000000000000000..651a0fa536038a3e6d650f7b2bcc0b50ff7a4be9 --- /dev/null +++ b/htdemucs_6s.yaml @@ -0,0 +1 @@ +models: ['5c90dfd2'] diff --git a/htdemucs_ft.yaml b/htdemucs_ft.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ba5c69c272770f5e5db3dd5fcda75b94ba523250 --- /dev/null +++ b/htdemucs_ft.yaml @@ -0,0 +1,7 @@ +models: ['f7e0c4bc', 'd12395a8', '92cfc3b6', '04573f0d'] +weights: [ + [1., 0., 0., 0.], + [0., 1., 0., 0.], + [0., 0., 1., 0.], + [0., 0., 0., 1.], +] \ No newline at end of file diff --git a/kuielab_b_other.onnx b/kuielab_b_other.onnx new file mode 100644 index 0000000000000000000000000000000000000000..e17baa7f40b3dbd1299ee7ba7e6db461682e8320 --- /dev/null +++ b/kuielab_b_other.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0d0b63950ac332333fea2d58f68c92fd3ab0aae071398c2a8beeae1ad15b655 +size 29703204 diff --git a/kuielab_b_vocals.onnx b/kuielab_b_vocals.onnx new file mode 100644 index 0000000000000000000000000000000000000000..683e555b8a91e3035cc24bee5b47b7347390eb8a --- /dev/null +++ b/kuielab_b_vocals.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b7dcb9d878acb0f3e64ff3fd27750faae96577013f6d50f5996875bf4250713 +size 29703204 diff --git a/mdx_model_data.json b/mdx_model_data.json new file mode 100644 index 0000000000000000000000000000000000000000..ba79b189d787ecd9b3035fca87421377f79cd67f --- /dev/null +++ b/mdx_model_data.json @@ -0,0 +1,384 @@ +{ + "0ddfc0eb5792638ad5dc27850236c246": { + "compensate": 1.035, + "mdx_dim_f_set": 2048, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 6144, + "primary_stem": "Vocals" + }, + "26d308f91f3423a67dc69a6d12a8793d": { + "compensate": 1.035, + "mdx_dim_f_set": 2048, + 
"mdx_dim_t_set": 9, + "mdx_n_fft_scale_set": 8192, + "primary_stem": "Other" + }, + "2cdd429caac38f0194b133884160f2c6": { + "compensate": 1.045, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 7680, + "primary_stem": "Instrumental" + }, + "2f5501189a2f6db6349916fabe8c90de": { + "compensate": 1.035, + "mdx_dim_f_set": 2048, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 6144, + "primary_stem": "Vocals", + "is_karaoke": true + }, + "398580b6d5d973af3120df54cee6759d": { + "compensate": 1.75, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 7680, + "primary_stem": "Vocals" + }, + "488b3e6f8bd3717d9d7c428476be2d75": { + "compensate": 1.035, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 7680, + "primary_stem": "Instrumental" + }, + "4910e7827f335048bdac11fa967772f9": { + "compensate": 1.035, + "mdx_dim_f_set": 2048, + "mdx_dim_t_set": 7, + "mdx_n_fft_scale_set": 4096, + "primary_stem": "Drums" + }, + "53c4baf4d12c3e6c3831bb8f5b532b93": { + "compensate": 1.043, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 7680, + "primary_stem": "Vocals" + }, + "5d343409ef0df48c7d78cce9f0106781": { + "compensate": 1.075, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 7680, + "primary_stem": "Vocals" + }, + "5f6483271e1efb9bfb59e4a3e6d4d098": { + "compensate": 1.035, + "mdx_dim_f_set": 2048, + "mdx_dim_t_set": 9, + "mdx_n_fft_scale_set": 6144, + "primary_stem": "Vocals" + }, + "65ab5919372a128e4167f5e01a8fda85": { + "compensate": 1.035, + "mdx_dim_f_set": 2048, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 8192, + "primary_stem": "Other" + }, + "6703e39f36f18aa7855ee1047765621d": { + "compensate": 1.035, + "mdx_dim_f_set": 2048, + "mdx_dim_t_set": 9, + "mdx_n_fft_scale_set": 16384, + "primary_stem": "Bass" + }, + "6b31de20e84392859a3d09d43f089515": { + "compensate": 1.035, + "mdx_dim_f_set": 2048, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 6144, + 
"primary_stem": "Vocals" + }, + "867595e9de46f6ab699008295df62798": { + "compensate": 1.03, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 7680, + "primary_stem": "Vocals" + }, + "a3cd63058945e777505c01d2507daf37": { + "compensate": 1.03, + "mdx_dim_f_set": 2048, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 6144, + "primary_stem": "Vocals" + }, + "b33d9b3950b6cbf5fe90a32608924700": { + "compensate": 1.03, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 7680, + "primary_stem": "Vocals" + }, + "c3b29bdce8c4fa17ec609e16220330ab": { + "compensate": 1.035, + "mdx_dim_f_set": 2048, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 16384, + "primary_stem": "Bass" + }, + "ceed671467c1f64ebdfac8a2490d0d52": { + "compensate": 1.035, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 7680, + "primary_stem": "Instrumental" + }, + "d2a1376f310e4f7fa37fb9b5774eb701": { + "compensate": 1.035, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 7680, + "primary_stem": "Instrumental" + }, + "d7bff498db9324db933d913388cba6be": { + "compensate": 1.035, + "mdx_dim_f_set": 2048, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 6144, + "primary_stem": "Vocals" + }, + "d94058f8c7f1fae4164868ae8ae66b20": { + "compensate": 1.035, + "mdx_dim_f_set": 2048, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 6144, + "primary_stem": "Vocals" + }, + "dc41ede5961d50f277eb846db17f5319": { + "compensate": 1.035, + "mdx_dim_f_set": 2048, + "mdx_dim_t_set": 9, + "mdx_n_fft_scale_set": 4096, + "primary_stem": "Drums" + }, + "e5572e58abf111f80d8241d2e44e7fa4": { + "compensate": 1.028, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 7680, + "primary_stem": "Instrumental" + }, + "e7324c873b1f615c35c1967f912db92a": { + "compensate": 1.03, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 7680, + "primary_stem": "Vocals" + }, + "1c56ec0224f1d559c42fd6fd2a67b154": { + 
"compensate": 1.025, + "mdx_dim_f_set": 2048, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 5120, + "primary_stem": "Instrumental" + }, + "f2df6d6863d8f435436d8b561594ff49": { + "compensate": 1.035, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 7680, + "primary_stem": "Instrumental" + }, + "b06327a00d5e5fbc7d96e1781bbdb596": { + "compensate": 1.035, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 6144, + "primary_stem": "Instrumental" + }, + "94ff780b977d3ca07c7a343dab2e25dd": { + "compensate": 1.039, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 6144, + "primary_stem": "Instrumental" + }, + "73492b58195c3b52d34590d5474452f6": { + "compensate": 1.043, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 7680, + "primary_stem": "Vocals" + }, + "970b3f9492014d18fefeedfe4773cb42": { + "compensate": 1.009, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 7680, + "primary_stem": "Vocals" + }, + "1d64a6d2c30f709b8c9b4ce1366d96ee": { + "compensate": 1.065, + "mdx_dim_f_set": 2048, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 5120, + "primary_stem": "Instrumental", + "is_karaoke": true + }, + "203f2a3955221b64df85a41af87cf8f0": { + "compensate": 1.035, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 6144, + "primary_stem": "Instrumental" + }, + "291c2049608edb52648b96e27eb80e95": { + "compensate": 1.035, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 6144, + "primary_stem": "Instrumental" + }, + "ead8d05dab12ec571d67549b3aab03fc": { + "compensate": 1.035, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 6144, + "primary_stem": "Instrumental" + }, + "cc63408db3d80b4d85b0287d1d7c9632": { + "compensate": 1.033, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 6144, + "primary_stem": "Instrumental" + }, + "cd5b2989ad863f116c855db1dfe24e39": { + "compensate": 1.035, 
+ "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 9, + "mdx_n_fft_scale_set": 6144, + "primary_stem": "Reverb" + }, + "55657dd70583b0fedfba5f67df11d711": { + "compensate": 1.022, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 6144, + "primary_stem": "Instrumental" + }, + "b6bccda408a436db8500083ef3491e8b": { + "compensate": 1.02, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 7680, + "primary_stem": "Instrumental" + }, + "8a88db95c7fb5dbe6a095ff2ffb428b1": { + "compensate": 1.026, + "mdx_dim_f_set": 2048, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 5120, + "primary_stem": "Instrumental" + }, + "b78da4afc6512f98e4756f5977f5c6b9": { + "compensate": 1.021, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 7680, + "primary_stem": "Instrumental" + }, + "77d07b2667ddf05b9e3175941b4454a0": { + "compensate": 1.021, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 7680, + "primary_stem": "Vocals" + }, + "0f2a6bc5b49d87d64728ee40e23bceb1": { + "compensate": 1.019, + "mdx_dim_f_set": 2560, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 5120, + "primary_stem": "Instrumental" + }, + "b02be2d198d4968a121030cf8950b492": { + "compensate": 1.020, + "mdx_dim_f_set": 2560, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 5120, + "primary_stem": "No Crowd" + }, + "2154254ee89b2945b97a7efed6e88820": { + "config_yaml": "model_2_stem_061321.yaml" + }, + "063aadd735d58150722926dcbf5852a9": { + "config_yaml": "model_2_stem_061321.yaml" + }, + "c09f714d978b41d718facfe3427e6001": { + "config_yaml": "model_2_stem_061321.yaml" + }, + "fe96801369f6a148df2720f5ced88c19": { + "config_yaml": "model3.yaml" + }, + "02e8b226f85fb566e5db894b9931c640": { + "config_yaml": "model2.yaml" + }, + "e3de6d861635ab9c1d766149edd680d6": { + "config_yaml": "model1.yaml" + }, + "3f2936c554ab73ce2e396d54636bd373": { + "config_yaml": "modelB.yaml" + }, + "890d0f6f82d7574bca741a9e8bcb8168": { + "config_yaml": "modelB.yaml" + }, 
+ "63a3cb8c37c474681049be4ad1ba8815": { + "config_yaml": "modelB.yaml" + }, + "a7fc5d719743c7fd6b61bd2b4d48b9f0": { + "config_yaml": "modelA.yaml" + }, + "3567f3dee6e77bf366fcb1c7b8bc3745": { + "config_yaml": "modelA.yaml" + }, + "a28f4d717bd0d34cd2ff7a3b0a3d065e": { + "config_yaml": "modelA.yaml" + }, + "c9971a18da20911822593dc81caa8be9": { + "config_yaml": "sndfx.yaml" + }, + "57d94d5ed705460d21c75a5ac829a605": { + "config_yaml": "sndfx.yaml" + }, + "e7a25f8764f25a52c1b96c4946e66ba2": { + "config_yaml": "sndfx.yaml" + }, + "104081d24e37217086ce5fde09147ee1": { + "config_yaml": "model_2_stem_061321.yaml" + }, + "1e6165b601539f38d0a9330f3facffeb": { + "config_yaml": "model_2_stem_061321.yaml" + }, + "fe0108464ce0d8271be5ab810891bd7c": { + "config_yaml": "model_2_stem_full_band.yaml" + }, + "e9b82ec90ee56c507a3a982f1555714c": { + "config_yaml": "model_2_stem_full_band_2.yaml" + }, + "99b6ceaae542265a3b6d657bf9fde79f": { + "config_yaml": "model_2_stem_full_band_8k.yaml" + }, + "116f6f9dabb907b53d847ed9f7a9475f": { + "config_yaml": "model_2_stem_full_band_8k.yaml" + }, + "53f707017bfcbb56f5e1bfac420d6732": { + "config_yaml": "model_bs_roformer_ep_317_sdr_12.9755.yaml", + "is_roformer": true + }, + "63e41acc264bf681a73aa9f7e5f606cc": { + "config_yaml": "model_mel_band_roformer_ep_3005_sdr_11.4360.yaml", + "is_roformer": true + }, + "e733736763234047587931fc35322fd9": { + "config_yaml": "model_bs_roformer_ep_937_sdr_10.5309.yaml", + "is_roformer": true + }, + "d789065adfd747d6f585b27b495bcdae": { + "config_yaml": "model_bs_roformer_ep_368_sdr_12.9628.yaml", + "is_roformer": true + } +} diff --git a/mel_band_roformer_crowd_aufr33_viperx_sdr_8.7144_config.yaml b/mel_band_roformer_crowd_aufr33_viperx_sdr_8.7144_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7e44ef94c71082af3a619c9b439f808ae8eb3e1c --- /dev/null +++ b/mel_band_roformer_crowd_aufr33_viperx_sdr_8.7144_config.yaml @@ -0,0 +1,71 @@ +audio: + chunk_size: 352800 + dim_f: 1024 + 
dim_t: 801 + hop_length: 441 + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 000 + +model: + dim: 384 + depth: 6 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + num_bands: 60 + dim_head: 64 + heads: 8 + attn_dropout: 0 + ff_dropout: 0 + flash_attn: True + dim_freqs_in: 1025 + sample_rate: 44100 # needed for mel filter bank from librosa + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: False + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + batch_size: 2 + gradient_accumulation_steps: 1 + grad_clip: 0 + instruments: + - crowd + - other + lr: 1.0e-05 + patience: 8 + reduce_factor: 0.95 + target_instrument: crowd + num_epochs: 1000 + num_steps: 4032 + augmentation: false # enable augmentations by audiomentations and pedalboard + augmentation_type: null + use_mp3_compress: false # Deprecated + augmentation_mix: false # Mix several stems of the same type with some probability + augmentation_loudness: false # randomly change loudness of each stem + augmentation_loudness_type: 1 # Type 1 or 2 + augmentation_loudness_min: 0 + augmentation_loudness_max: 0 + q: 0.95 + coarse_loss_clip: false + ema_momentum: 0.999 + optimizer: adam + other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental + +inference: + batch_size: 1 + dim_t: 801 + num_overlap: 4 \ No newline at end of file diff --git a/mel_band_roformer_karaoke_aufr33_viperx_sdr_10.1956_config.yaml b/mel_band_roformer_karaoke_aufr33_viperx_sdr_10.1956_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b88403c926bc5957a54ba90271f0cced47c8366f --- /dev/null +++ b/mel_band_roformer_karaoke_aufr33_viperx_sdr_10.1956_config.yaml @@ -0,0 +1,71 @@ +audio: + chunk_size: 352800 + 
dim_f: 1024 + dim_t: 801 + hop_length: 441 + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 000 + +model: + dim: 384 + depth: 6 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + num_bands: 60 + dim_head: 64 + heads: 8 + attn_dropout: 0 + ff_dropout: 0 + flash_attn: True + dim_freqs_in: 1025 + sample_rate: 44100 # needed for mel filter bank from librosa + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: False + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + batch_size: 4 + gradient_accumulation_steps: 1 + grad_clip: 0 + instruments: + - Vocals + - Instrumental + lr: 1.0e-05 + patience: 2 + reduce_factor: 0.95 + target_instrument: Vocals + num_epochs: 1000 + num_steps: 2000 + augmentation: false # enable augmentations by audiomentations and pedalboard + augmentation_type: null + use_mp3_compress: false # Deprecated + augmentation_mix: false # Mix several stems of the same type with some probability + augmentation_loudness: false # randomly change loudness of each stem + augmentation_loudness_type: 1 # Type 1 or 2 + augmentation_loudness_min: 0 + augmentation_loudness_max: 0 + q: 0.95 + coarse_loss_clip: false + ema_momentum: 0.999 + optimizer: adam + other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental + use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true +inference: + batch_size: 1 + dim_t: 801 + num_overlap: 4 \ No newline at end of file diff --git a/model_bs_roformer_ep_317_sdr_12.9755.yaml b/model_bs_roformer_ep_317_sdr_12.9755.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c4a3d323322d75af7d981e9de2ef3fa29e786812 --- /dev/null +++ b/model_bs_roformer_ep_317_sdr_12.9755.yaml 
@@ -0,0 +1,133 @@ +audio: + chunk_size: 352800 + dim_f: 1024 + dim_t: 801 # don't work (use in model) + hop_length: 441 # don't work (use in model) + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.001 + +model: + dim: 512 + depth: 12 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + freqs_per_bands: !!python/tuple + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 128 + - 129 + dim_head: 64 + heads: 8 + attn_dropout: 0.1 + ff_dropout: 0.1 + flash_attn: true + dim_freqs_in: 1025 + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: false + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + batch_size: 16 + gradient_accumulation_steps: 1 + grad_clip: 0 + instruments: + - Vocals + - Instrumental + lr: 5.0e-05 + patience: 2 + reduce_factor: 0.95 + target_instrument: Vocals + num_epochs: 1000 + num_steps: 1000 + augmentation: false # enable augmentations by audiomentations and pedalboard + augmentation_type: simple1 + use_mp3_compress: false # Deprecated + augmentation_mix: true # Mix several stems of the same type with some probability + augmentation_loudness: true # randomly change loudness of each stem + augmentation_loudness_type: 1 # Type 1 or 2 + augmentation_loudness_min: 0.5 + augmentation_loudness_max: 1.5 + q: 0.95 + coarse_loss_clip: true + ema_momentum: 0.999 + optimizer: adam + other_fix: false # it's needed for checking on multisong dataset if other is 
actually instrumental + use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true + +inference: + batch_size: 1 + dim_t: 801 + num_overlap: 4 \ No newline at end of file diff --git a/model_bs_roformer_ep_368_sdr_12.9628.yaml b/model_bs_roformer_ep_368_sdr_12.9628.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fe893b1a68b8ae8ea8bb5a7ac2b7f12e0c53a826 --- /dev/null +++ b/model_bs_roformer_ep_368_sdr_12.9628.yaml @@ -0,0 +1,133 @@ +audio: + chunk_size: 352800 + dim_f: 1024 + dim_t: 801 # don't work (use in model) + hop_length: 441 # don't work (use in model) + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.001 + +model: + dim: 512 + depth: 12 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + freqs_per_bands: !!python/tuple + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 128 + - 129 + dim_head: 64 + heads: 8 + attn_dropout: 0.1 + ff_dropout: 0.1 + flash_attn: true + dim_freqs_in: 1025 + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: false + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + batch_size: 16 + gradient_accumulation_steps: 1 + grad_clip: 0 + instruments: + - Vocals + - Instrumental + lr: 5.0e-05 + patience: 2 + reduce_factor: 0.95 + target_instrument: Vocals + num_epochs: 1000 + num_steps: 1000 + augmentation: false # enable augmentations by audiomentations and pedalboard + 
augmentation_type: simple1 + use_mp3_compress: false # Deprecated + augmentation_mix: true # Mix several stems of the same type with some probability + augmentation_loudness: true # randomly change loudness of each stem + augmentation_loudness_type: 1 # Type 1 or 2 + augmentation_loudness_min: 0.5 + augmentation_loudness_max: 1.5 + q: 0.95 + coarse_loss_clip: true + ema_momentum: 0.999 + optimizer: adam + other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental + use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true + +inference: + batch_size: 1 + dim_t: 801 + num_overlap: 4 diff --git a/model_bs_roformer_ep_937_sdr_10.5309.yaml b/model_bs_roformer_ep_937_sdr_10.5309.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f623832cc06ebc5fa8a049fad6b1319c6038336d --- /dev/null +++ b/model_bs_roformer_ep_937_sdr_10.5309.yaml @@ -0,0 +1,138 @@ +audio: + chunk_size: 131584 + dim_f: 1024 + dim_t: 256 + hop_length: 512 + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.001 + +model: + dim: 384 + depth: 12 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + linear_transformer_depth: 0 + freqs_per_bands: !!python/tuple + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 128 + - 129 + dim_head: 64 + heads: 8 + attn_dropout: 0.1 + ff_dropout: 0.1 + flash_attn: true + dim_freqs_in: 1025 + stft_n_fft: 2048 + stft_hop_length: 512 + stft_win_length: 2048 + stft_normalized: false + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: 
!!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + batch_size: 4 + gradient_accumulation_steps: 1 + grad_clip: 0 + instruments: + - No Drum-Bass + - Drum-Bass + lr: 5.0e-05 + patience: 2 + reduce_factor: 0.95 + target_instrument: No Drum-Bass + num_epochs: 1000 + num_steps: 1000 + q: 0.95 + coarse_loss_clip: true + ema_momentum: 0.999 + optimizer: adam + other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental + use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true + +augmentations: + enable: true # enable or disable all augmentations (to fast disable if needed) + loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max) + loudness_min: 0.5 + loudness_max: 1.5 + mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3) + mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02) + - 0.2 + - 0.02 + mixup_loudness_min: 0.5 + mixup_loudness_max: 1.5 + +inference: + batch_size: 1 + dim_t: 512 + num_overlap: 4 \ No newline at end of file diff --git a/model_mel_band_roformer_ep_3005_sdr_11.4360.yaml b/model_mel_band_roformer_ep_3005_sdr_11.4360.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7c906f2931cbae3cf64551c231e285ca10097fe5 --- /dev/null +++ b/model_mel_band_roformer_ep_3005_sdr_11.4360.yaml @@ -0,0 +1,72 @@ +audio: + chunk_size: 352800 + dim_f: 1024 + dim_t: 801 # don't work (use in model) + hop_length: 441 # don't work (use in model) + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.001 + +model: + dim: 384 + depth: 12 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + num_bands: 60 + dim_head: 64 + heads: 8 + attn_dropout: 0.1 + ff_dropout: 0.1 + flash_attn: True + dim_freqs_in: 1025 + 
sample_rate: 44100 # needed for mel filter bank from librosa + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: False + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + batch_size: 9 + gradient_accumulation_steps: 8 + grad_clip: 0 + instruments: + - Vocals + - Instrumental + lr: 4.0e-05 + patience: 2 + reduce_factor: 0.95 + target_instrument: Vocals + num_epochs: 1000 + num_steps: 1000 + augmentation: false # enable augmentations by audiomentations and pedalboard + augmentation_type: simple1 + use_mp3_compress: false # Deprecated + augmentation_mix: true # Mix several stems of the same type with some probability + augmentation_loudness: true # randomly change loudness of each stem + augmentation_loudness_type: 1 # Type 1 or 2 + augmentation_loudness_min: 0.5 + augmentation_loudness_max: 1.5 + q: 0.95 + coarse_loss_clip: true + ema_momentum: 0.999 + optimizer: adam + other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental + use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true + +inference: + batch_size: 1 + dim_t: 801 + num_overlap: 4 \ No newline at end of file diff --git a/repro_mdx_a_time_only.yaml b/repro_mdx_a_time_only.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d5d16ea8bc419198692fc993b560c1bd3f8eb8c9 --- /dev/null +++ b/repro_mdx_a_time_only.yaml @@ -0,0 +1,2 @@ +models: ['9a6b4851', '9a6b4851', '1ef250f1', '1ef250f1'] +segment: 44 diff --git a/scnet_checkpoint_musdb18.ckpt b/scnet_checkpoint_musdb18.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..671b56f708c41055e2fd1ad71391254a8f097aac --- /dev/null +++ b/scnet_checkpoint_musdb18.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:1bc0d1abb20bfdf966dcd07637bafd03e4bc13653d09ef18bc9b3e342eafe2aa +size 42434986 diff --git a/vocals_mel_band_roformer.yaml b/vocals_mel_band_roformer.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9cb005e7a97c66d5fb23bba8bb36bec9619cdd8f --- /dev/null +++ b/vocals_mel_band_roformer.yaml @@ -0,0 +1,50 @@ +audio: + chunk_size: 352800 + dim_f: 1024 + dim_t: 256 + hop_length: 441 + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.001 + +model: + dim: 384 + depth: 6 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + num_bands: 60 + dim_head: 64 + heads: 8 + attn_dropout: 0 + ff_dropout: 0 + flash_attn: True + dim_freqs_in: 1025 + sample_rate: 44100 # needed for mel filter bank from librosa + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: False + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + instruments: + - vocals + - other + target_instrument: vocals + +inference: + dim_t: 1101 + num_overlap: 1 + chunk_size: 352800 \ No newline at end of file diff --git a/vr_model_data.json b/vr_model_data.json new file mode 100644 index 0000000000000000000000000000000000000000..025e093664e798716b177706322649be048dca24 --- /dev/null +++ b/vr_model_data.json @@ -0,0 +1,137 @@ +{ + "0d0e6d143046b0eecc41a22e60224582": { + "vr_model_param": "3band_44100_mid", + "primary_stem": "Instrumental" + }, + "18b52f873021a0af556fb4ecd552bb8e": { + "vr_model_param": "2band_32000", + "primary_stem": "Instrumental" + }, + "1fc66027c82b499c7d8f55f79e64cadc": { + "vr_model_param": "2band_32000", + "primary_stem": "Instrumental" + }, + "2aa34fbc01f8e6d2bf509726481e7142": { + "vr_model_param": "4band_44100", + "primary_stem": "No Piano" + }, + "3e18f639b11abea7361db1a4a91c2559": { + 
"vr_model_param": "4band_44100", + "primary_stem": "Instrumental" + }, + "570b5f50054609a17741369a35007ddd": { + "vr_model_param": "4band_v3", + "primary_stem": "Instrumental" + }, + "5a6e24c1b530f2dab045a522ef89b751": { + "vr_model_param": "1band_sr44100_hl512", + "primary_stem": "Instrumental" + }, + "6b5916069a49be3fe29d4397ecfd73fa": { + "vr_model_param": "3band_44100_msb2", + "primary_stem": "Instrumental", + "is_karaoke": true + }, + "74b3bc5fa2b69f29baf7839b858bc679": { + "vr_model_param": "4band_44100", + "primary_stem": "Instrumental" + }, + "827213b316df36b52a1f3d04fec89369": { + "vr_model_param": "4band_44100", + "primary_stem": "Instrumental" + }, + "911d4048eee7223eca4ee0efb7d29256": { + "vr_model_param": "4band_44100", + "primary_stem": "Vocals" + }, + "941f3f7f0b0341f12087aacdfef644b1": { + "vr_model_param": "4band_v2", + "primary_stem": "Instrumental" + }, + "a02827cf69d75781a35c0e8a327f3195": { + "vr_model_param": "1band_sr33075_hl384", + "primary_stem": "Instrumental" + }, + "b165fbff113c959dba5303b74c6484bc": { + "vr_model_param": "3band_44100", + "primary_stem": "Instrumental" + }, + "b5f988cd3e891dca7253bf5f0f3427c7": { + "vr_model_param": "4band_44100", + "primary_stem": "Instrumental" + }, + "b99c35723bc35cb11ed14a4780006a80": { + "vr_model_param": "1band_sr44100_hl1024", + "primary_stem": "Instrumental" + }, + "ba02fd25b71d620eebbdb49e18e4c336": { + "vr_model_param": "3band_44100_mid", + "primary_stem": "Instrumental" + }, + "c4476ef424d8cba65f38d8d04e8514e2": { + "vr_model_param": "3band_44100_msb2", + "primary_stem": "Instrumental" + }, + "da2d37b8be2972e550a409bae08335aa": { + "vr_model_param": "4band_44100", + "primary_stem": "Vocals" + }, + "db57205d3133e39df8e050b435a78c80": { + "vr_model_param": "4band_44100", + "primary_stem": "Instrumental" + }, + "ea83b08e32ec2303456fe50659035f69": { + "vr_model_param": "4band_v3", + "primary_stem": "Instrumental" + }, + "f6ea8473ff86017b5ebd586ccacf156b": { + "vr_model_param": "4band_v2_sn", + 
"primary_stem": "Instrumental", + "is_karaoke": true + }, + "fd297a61eafc9d829033f8b987c39a3d": { + "vr_model_param": "1band_sr32000_hl512", + "primary_stem": "Instrumental" + }, + "0ec76fd9e65f81d8b4fbd13af4826ed8": { + "vr_model_param": "4band_v3", + "primary_stem": "No Woodwinds" + }, + "0fb9249ffe4ffc38d7b16243f394c0ff": { + "vr_model_param": "4band_v3", + "primary_stem": "No Reverb" + }, + "6857b2972e1754913aad0c9a1678c753": { + "vr_model_param": "4band_v3", + "primary_stem": "No Echo", + "nout": 48, + "nout_lstm": 128 + }, + "f200a145434efc7dcf0cd093f517ed52": { + "vr_model_param": "4band_v3", + "primary_stem": "No Echo", + "nout": 48, + "nout_lstm": 128 + }, + "44c55d8b5d2e3edea98c2b2bf93071c7": { + "vr_model_param": "4band_v3", + "primary_stem": "Noise", + "nout": 48, + "nout_lstm": 128 + }, + "51ea8c43a6928ed3c10ef5cb2707d57b": { + "vr_model_param": "1band_sr44100_hl1024", + "primary_stem": "Noise", + "nout": 16, + "nout_lstm": 128 + }, + "944950a9c5963a5eb70b445d67b7068a": { + "vr_model_param": "4band_v3_sn", + "primary_stem": "Vocals", + "nout": 64, + "nout_lstm": 128, + "is_karaoke": false, + "is_bv_model": true, + "is_bv_model_rebalanced": 0.9 + } +}