diff --git a/models/audio-separator-models/.gitattributes b/models/audio-separator-models/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..dab9a4e17afd2ef39d90ccb0b40ef2786fe77422 --- /dev/null +++ b/models/audio-separator-models/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/models/audio-separator-models/README.md b/models/audio-separator-models/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7be5fc7f47d5db027d120b8024982df93db95b74 --- /dev/null +++ b/models/audio-separator-models/README.md @@ -0,0 +1,3 @@ +--- +license: mit +--- diff --git a/models/audio-separator-models/roformers/MelBandRoformerBigSYHFTV1.ckpt b/models/audio-separator-models/roformers/MelBandRoformerBigSYHFTV1.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..3b07b85890712d20ad115ead701bf7799c3b9928 --- /dev/null +++ b/models/audio-separator-models/roformers/MelBandRoformerBigSYHFTV1.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2327e3e81f19e67c307f8c830c54267c09ecb0e9c6ad2b40a80c310899c955f +size 1479738496 diff --git a/models/audio-separator-models/roformers/MelBandRoformerSYHFT.ckpt b/models/audio-separator-models/roformers/MelBandRoformerSYHFT.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..fc8e114879a3761ce9e6f901cab7b0adbc7035b5 --- /dev/null +++ b/models/audio-separator-models/roformers/MelBandRoformerSYHFT.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f319dfcde4396ea3106658f457f5eb0bc577e113491f61ae8bab216fe84b0c0c +size 913096702 diff --git a/models/audio-separator-models/roformers/MelBandRoformerSYHFTV2.5.ckpt b/models/audio-separator-models/roformers/MelBandRoformerSYHFTV2.5.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..232404ba4a0d4e6d32b6f683711c58cd73b10c18 --- /dev/null +++ b/models/audio-separator-models/roformers/MelBandRoformerSYHFTV2.5.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:916e3a2c1e63b1457bcad823b98ca705e4933deffd2a5ab3a370e10f68bf47e2 +size 913090472 diff --git a/models/audio-separator-models/roformers/MelBandRoformerSYHFTV2.ckpt b/models/audio-separator-models/roformers/MelBandRoformerSYHFTV2.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..6b34c66f158e43e0f2c11b6df91a040c5a11a23c --- /dev/null +++ b/models/audio-separator-models/roformers/MelBandRoformerSYHFTV2.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e99f8efa5315300c197295592bd7e56c21c1d77e1884c904b5128c54a2a4632 +size 913095346 diff --git a/models/audio-separator-models/roformers/MelBandRoformerSYHFTV3Epsilon.ckpt b/models/audio-separator-models/roformers/MelBandRoformerSYHFTV3Epsilon.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..650b62bfd05945493a6529898ca0d0023ee7637d --- /dev/null +++ b/models/audio-separator-models/roformers/MelBandRoformerSYHFTV3Epsilon.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c886092e4aae13aa089263a0d54d483643f58c16ec221aed37268e2c1031397 +size 913090472 diff --git a/models/audio-separator-models/roformers/aspiration_mel_band_roformer_less_aggr_sdr_18.1201.ckpt b/models/audio-separator-models/roformers/aspiration_mel_band_roformer_less_aggr_sdr_18.1201.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..0dd326fa1c5c65e52583803582f154a0359a55bd --- /dev/null +++ b/models/audio-separator-models/roformers/aspiration_mel_band_roformer_less_aggr_sdr_18.1201.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83bfe991cec4fbadde9f30d1f79cd5293ad0b1f936256be327bba5cbb4883374 +size 835982664 diff --git a/models/audio-separator-models/roformers/aspiration_mel_band_roformer_sdr_18.9845.ckpt b/models/audio-separator-models/roformers/aspiration_mel_band_roformer_sdr_18.9845.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..1282613860a16200f301a12c7b8ec67d050c63be --- /dev/null +++ b/models/audio-separator-models/roformers/aspiration_mel_band_roformer_sdr_18.9845.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e791258c866c6c8da66052693d8cc3b64f1f42c01e052dbdc570cd278380cc5 +size 835983746 diff --git a/models/audio-separator-models/roformers/bs_roformer_instrumental_resurrection_unwa.ckpt b/models/audio-separator-models/roformers/bs_roformer_instrumental_resurrection_unwa.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..23b7fc62506629fc0ac29c1551ee7045b805dfa5 --- /dev/null +++ b/models/audio-separator-models/roformers/bs_roformer_instrumental_resurrection_unwa.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16311025a5133ae6411760ccfe9e3e66b31a01d9d8bec0a03fa7ec4bedac7a15 +size 204483033 diff --git a/models/audio-separator-models/roformers/bs_roformer_male_female_by_aufr33_sdr_7.2889.ckpt b/models/audio-separator-models/roformers/bs_roformer_male_female_by_aufr33_sdr_7.2889.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..201059f3d93490e5cc91f20ea0bb74ae7c0dd20a --- /dev/null +++ b/models/audio-separator-models/roformers/bs_roformer_male_female_by_aufr33_sdr_7.2889.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3cf11736d1b42a11ae55d8299316585921477dd2a671b24b663660846ca9861b +size 527119779 diff --git a/models/audio-separator-models/roformers/bs_roformer_vocals_gabox.ckpt b/models/audio-separator-models/roformers/bs_roformer_vocals_gabox.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..cb4a0c8ef7ec4378b27e79a01eb491a2d699a535 --- /dev/null +++ b/models/audio-separator-models/roformers/bs_roformer_vocals_gabox.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18d58efe5e949e70fab11b875329af6d06ef11ccc29574bfe943fb57cc827f38 +size 639254584 diff --git a/models/audio-separator-models/roformers/bs_roformer_vocals_resurrection_unwa.ckpt b/models/audio-separator-models/roformers/bs_roformer_vocals_resurrection_unwa.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..14e223aece4474ca86627c065d73e725c4466902 --- /dev/null +++ b/models/audio-separator-models/roformers/bs_roformer_vocals_resurrection_unwa.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9dbfe5cb572e4ed32a15ec727d7bd06c8d7aba97509e6fda5bc008bb1e0b2dd5 +size 204510749 diff --git a/models/audio-separator-models/roformers/bs_roformer_vocals_revive_unwa.ckpt b/models/audio-separator-models/roformers/bs_roformer_vocals_revive_unwa.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..5df342a5d9eb8dcdc04f5d04fbedef415a7835d3 --- /dev/null +++ b/models/audio-separator-models/roformers/bs_roformer_vocals_revive_unwa.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1d7e4bfdfef07c6b2bc1d65283a7d03c3c38f8c7dbc8d729b785f93c8b8699a +size 639326600 diff --git a/models/audio-separator-models/roformers/bs_roformer_vocals_revive_v2_unwa.ckpt b/models/audio-separator-models/roformers/bs_roformer_vocals_revive_v2_unwa.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..6826b2658d30a6b6aa5f3896311f7cd4c3160bb9 --- /dev/null +++ b/models/audio-separator-models/roformers/bs_roformer_vocals_revive_v2_unwa.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58098850c882a7472dad39f99fb8040ce6eaafe671cfe9881d89aea276bbb5f5 +size 639326600 diff --git a/models/audio-separator-models/roformers/bs_roformer_vocals_revive_v3e_unwa.ckpt b/models/audio-separator-models/roformers/bs_roformer_vocals_revive_v3e_unwa.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..56be9dd0e47d16ef81bb1be38566cf0a35c8ad43 --- /dev/null +++ b/models/audio-separator-models/roformers/bs_roformer_vocals_revive_v3e_unwa.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b0751b9a15c591407c3b77f08eb4ad3005e42e96051f3f2b39760f1130c467b +size 639326600 diff --git a/models/audio-separator-models/roformers/config_aspiration_mel_band_roformer.yaml b/models/audio-separator-models/roformers/config_aspiration_mel_band_roformer.yaml new file mode 100644 index 0000000000000000000000000000000000000000..882cf4f8aacb59f5445d0621aeb8ace0ffddec5b --- /dev/null +++ b/models/audio-separator-models/roformers/config_aspiration_mel_band_roformer.yaml @@ -0,0 +1,76 @@ +audio: + chunk_size: 352800 + dim_f: 1024 + dim_t: 801 # don't work (use in model) + hop_length: 441 # don't work (use in model) + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.000 + +model: + dim: 256 + depth: 8 + stereo: true + num_stems: 2 + time_transformer_depth: 1 + freq_transformer_depth: 1 + num_bands: 60 + dim_head: 64 + heads: 8 + attn_dropout: 0.1 + ff_dropout: 0.1 + flash_attn: True + dim_freqs_in: 1025 + sample_rate: 44100 # needed for mel filter bank from librosa + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: False + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + batch_size: 1 + gradient_accumulation_steps: 8 + grad_clip: 0 + instruments: + - aspiration + - other + lr: 4.0e-05 + patience: 2 + reduce_factor: 0.95 + target_instrument: null + num_epochs: 1000 + num_steps: 1000 + q: 0.95 + coarse_loss_clip: true + ema_momentum: 0.999 + optimizer: adam + other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental + use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true + +augmentations: + enable: true # enable or disable all augmentations (to fast disable if needed) + loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max) + loudness_min: 0.5 + loudness_max: 1.5 + mixup: false # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3) + mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02) + - 0.2 + - 0.02 + mixup_loudness_min: 0.5 + mixup_loudness_max: 1.5 + +inference: + batch_size: 4 + dim_t: 801 + num_overlap: 2 \ No newline at end of file diff --git a/models/audio-separator-models/roformers/config_bs_roformer_instrumental_resurrection_unwa.yaml b/models/audio-separator-models/roformers/config_bs_roformer_instrumental_resurrection_unwa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c089f84015c08719dd280dbef5147ad13296430a --- /dev/null +++ b/models/audio-separator-models/roformers/config_bs_roformer_instrumental_resurrection_unwa.yaml @@ -0,0 +1,135 @@ +audio: + chunk_size: 749259 + dim_f: 1024 + dim_t: 1700 # don't work (use in model) + hop_length: 441 # don't work (use in model) + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.000 + +model: + dim: 256 + depth: 12 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + linear_transformer_depth: 0 + freqs_per_bands: !!python/tuple + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 128 + - 129 + dim_head: 64 + heads: 8 + attn_dropout: 0. + ff_dropout: 0. + flash_attn: true + dim_freqs_in: 1025 + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: false + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + batch_size: 2 + gradient_accumulation_steps: 1 + grad_clip: 0 + instruments: ['vocals', 'other'] + patience: 3 + reduce_factor: 0.95 + target_instrument: other + num_epochs: 1000 + num_steps: 1000 + augmentation: false # enable augmentations by audiomentations and pedalboard + augmentation_type: simple1 + use_mp3_compress: false # Deprecated + augmentation_mix: true # Mix several stems of the same type with some probability + augmentation_loudness: true # randomly change loudness of each stem + augmentation_loudness_type: 1 # Type 1 or 2 + augmentation_loudness_min: 0.5 + augmentation_loudness_max: 1.5 + q: 0.95 + coarse_loss_clip: true + ema_momentum: 0.999 + # optimizer: prodigy + optimizer: adam + # lr: 1.0 + lr: 1.0e-5 + other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental + use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true + +inference: + batch_size: 2 + dim_t: 1700 + num_overlap: 2 + normalize: false diff --git a/models/audio-separator-models/roformers/config_bs_roformer_vocals_gabox.yaml b/models/audio-separator-models/roformers/config_bs_roformer_vocals_gabox.yaml new file mode 100644 index 0000000000000000000000000000000000000000..527a3ad8a10366aaa562d007714e74d271a18cbf --- /dev/null +++ b/models/audio-separator-models/roformers/config_bs_roformer_vocals_gabox.yaml @@ -0,0 +1,133 @@ +audio: + chunk_size: 352800 + dim_f: 1024 + dim_t: 801 # don't work (use in model) + hop_length: 441 # don't work (use in model) + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.001 + +model: + dim: 512 + depth: 12 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + freqs_per_bands: !!python/tuple + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 128 + - 129 + dim_head: 64 + heads: 8 + attn_dropout: 0.1 + ff_dropout: 0.1 + flash_attn: true + dim_freqs_in: 1025 + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: false + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + batch_size: 16 + gradient_accumulation_steps: 1 + grad_clip: 0 + instruments: + - Vocals + - Instrumental + lr: 5.0e-05 + patience: 2 + reduce_factor: 0.95 + target_instrument: Vocals + num_epochs: 1000 + num_steps: 1000 + augmentation: false # enable augmentations by audiomentations and pedalboard + augmentation_type: simple1 + use_mp3_compress: false # Deprecated + augmentation_mix: true # Mix several stems of the same type with some probability + augmentation_loudness: true # randomly change loudness of each stem + augmentation_loudness_type: 1 # Type 1 or 2 + augmentation_loudness_min: 0.5 + augmentation_loudness_max: 1.5 + q: 0.95 + coarse_loss_clip: true + ema_momentum: 0.999 + optimizer: adam + other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental + use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true + +inference: + batch_size: 1 + dim_t: 801 + num_overlap: 4 \ No newline at end of file diff --git a/models/audio-separator-models/roformers/config_bs_roformer_vocals_resurrection_unwa.yaml b/models/audio-separator-models/roformers/config_bs_roformer_vocals_resurrection_unwa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d291dea6ba145335a3e4acd09c8df9181a8a11eb --- /dev/null +++ b/models/audio-separator-models/roformers/config_bs_roformer_vocals_resurrection_unwa.yaml @@ -0,0 +1,135 @@ +audio: + chunk_size: 785920 + dim_f: 1024 + dim_t: 1536 # don't work (use in model) + hop_length: 441 # don't work (use in model) + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.000 + +model: + dim: 256 + depth: 12 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + linear_transformer_depth: 0 + freqs_per_bands: !!python/tuple + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 128 + - 129 + dim_head: 64 + heads: 8 + attn_dropout: 0. + ff_dropout: 0. + flash_attn: true + dim_freqs_in: 1025 + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: false + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + batch_size: 2 + gradient_accumulation_steps: 1 + grad_clip: 0 + instruments: ['vocals', 'other'] + patience: 3 + reduce_factor: 0.95 + target_instrument: vocals + num_epochs: 1000 + num_steps: 1000 + augmentation: false # enable augmentations by audiomentations and pedalboard + augmentation_type: simple1 + use_mp3_compress: false # Deprecated + augmentation_mix: true # Mix several stems of the same type with some probability + augmentation_loudness: true # randomly change loudness of each stem + augmentation_loudness_type: 1 # Type 1 or 2 + augmentation_loudness_min: 0.5 + augmentation_loudness_max: 1.5 + q: 0.95 + coarse_loss_clip: true + ema_momentum: 0.999 + # optimizer: prodigy + optimizer: adam + # lr: 1.0 + lr: 1.0e-5 + other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental + use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true + +inference: + batch_size: 2 + dim_t: 1536 + num_overlap: 2 + normalize: false diff --git a/models/audio-separator-models/roformers/config_bs_roformer_vocals_revive_unwa.yaml b/models/audio-separator-models/roformers/config_bs_roformer_vocals_revive_unwa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b2d60c5d37a6c92afea42ef88d8958b7f01b64db --- /dev/null +++ b/models/audio-separator-models/roformers/config_bs_roformer_vocals_revive_unwa.yaml @@ -0,0 +1,134 @@ +audio: + chunk_size: 485100 #352800 #485100 + dim_f: 1024 + dim_t: 1101 + hop_length: 441 + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0. + +model: + dim: 512 + depth: 12 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + linear_transformer_depth: 0 + freqs_per_bands: !!python/tuple + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 128 + - 129 + dim_head: 64 + heads: 8 + attn_dropout: 0. + ff_dropout: 0. + flash_attn: true + dim_freqs_in: 1025 + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: false + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + batch_size: 1 + gradient_accumulation_steps: 1 + grad_clip: 0 + instruments: + - vocals + - other + lr: 1.0e-05 + patience: 2 + reduce_factor: 0.95 + target_instrument: vocals + num_epochs: 1000 + num_steps: 1000 + augmentation: false # enable augmentations by audiomentations and pedalboard + augmentation_type: null + use_mp3_compress: false # Deprecated + augmentation_mix: false # Mix several stems of the same type with some probability + augmentation_loudness: false # randomly change loudness of each stem + augmentation_loudness_type: 1 # Type 1 or 2 + augmentation_loudness_min: 0 + augmentation_loudness_max: 0 + q: 0.95 + coarse_loss_clip: false + ema_momentum: 0.999 + optimizer: adam + other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental + use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true + +inference: + batch_size: 2 + dim_t: 1101 + num_overlap: 2 diff --git a/models/audio-separator-models/roformers/config_chorus_male_female_bs_roformer.yaml b/models/audio-separator-models/roformers/config_chorus_male_female_bs_roformer.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8090c50d3bf92b4a6f6fd7cc03f08ef834c00a64 --- /dev/null +++ b/models/audio-separator-models/roformers/config_chorus_male_female_bs_roformer.yaml @@ -0,0 +1,125 @@ +audio: + chunk_size: 352800 + dim_f: 1024 + dim_t: 801 # don't work (use in model) + hop_length: 441 # don't work (use in model) + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.000 + +model: + dim: 384 + depth: 8 + stereo: true + num_stems: 2 + time_transformer_depth: 1 + freq_transformer_depth: 1 + freqs_per_bands: !!python/tuple + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 128 + - 129 + dim_head: 64 + heads: 8 + attn_dropout: 0.0 + ff_dropout: 0.0 + flash_attn: true + dim_freqs_in: 1025 + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: false + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + batch_size: 1 + gradient_accumulation_steps: 1 + grad_clip: 0 + instruments: + - male + - female + lr: 1.0e-05 + patience: 2 + reduce_factor: 0.95 + target_instrument: null + num_epochs: 1000 + num_steps: 1000 + q: 0.95 + coarse_loss_clip: true + ema_momentum: 0.999 + optimizer: adam + other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental + use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true + +inference: + batch_size: 1 + dim_t: 801 + num_overlap: 2 \ No newline at end of file diff --git a/models/audio-separator-models/roformers/config_dereverb-echo_mel_band_roformer.yaml b/models/audio-separator-models/roformers/config_dereverb-echo_mel_band_roformer.yaml new file mode 100644 index 0000000000000000000000000000000000000000..14a76e789312f98216a73f2be18d76a3398080c2 --- /dev/null +++ b/models/audio-separator-models/roformers/config_dereverb-echo_mel_band_roformer.yaml @@ -0,0 +1,76 @@ +audio: + chunk_size: 352800 + dim_f: 1024 + dim_t: 801 # don't work (use in model) + hop_length: 441 # don't work (use in model) + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.000 + +model: + dim: 256 + depth: 8 + stereo: true + num_stems: 2 + time_transformer_depth: 1 + freq_transformer_depth: 1 + num_bands: 60 + dim_head: 64 + heads: 8 + attn_dropout: 0.1 + ff_dropout: 0.1 + flash_attn: True + dim_freqs_in: 1025 + sample_rate: 44100 # needed for mel filter bank from librosa + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: False + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + batch_size: 1 + gradient_accumulation_steps: 8 + grad_clip: 0 + instruments: + - dry + - No dry + lr: 4.0e-05 + patience: 2 + reduce_factor: 0.95 + target_instrument: null + num_epochs: 1000 + num_steps: 1000 + q: 0.95 + coarse_loss_clip: true + ema_momentum: 0.999 + optimizer: adam + other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental + use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true + +augmentations: + enable: true # enable or disable all augmentations (to fast disable if needed) + loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max) + loudness_min: 0.5 + loudness_max: 1.5 + mixup: false # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3) + mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02) + - 0.2 + - 0.02 + mixup_loudness_min: 0.5 + mixup_loudness_max: 1.5 + +inference: + batch_size: 4 + dim_t: 801 + num_overlap: 4 \ No newline at end of file diff --git a/models/audio-separator-models/roformers/config_dereverb-echo_mel_band_roformer_sdr_13.4843_v2.yaml b/models/audio-separator-models/roformers/config_dereverb-echo_mel_band_roformer_sdr_13.4843_v2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..83c93db69f6f13e770905cb71a1827465b55af1c --- /dev/null +++ b/models/audio-separator-models/roformers/config_dereverb-echo_mel_band_roformer_sdr_13.4843_v2.yaml @@ -0,0 +1,64 @@ +audio: + chunk_size: 352800 + dim_f: 1024 + dim_t: 801 + hop_length: 441 + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.000 + +model: + dim: 256 + depth: 8 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + num_bands: 60 + dim_head: 64 + heads: 8 + attn_dropout: 0.1 + ff_dropout: 0.1 + flash_attn: True + dim_freqs_in: 1025 + sample_rate: 44100 + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: False + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + batch_size: 1 + gradient_accumulation_steps: 8 + grad_clip: 0 + instruments: + - dry + - No dry + lr: 1.0e-05 + patience: 2 + reduce_factor: 0.95 + target_instrument: dry + num_epochs: 1000 + num_steps: 1000 + q: 0.95 + coarse_loss_clip: true + ema_momentum: 0.999 + optimizer: adam + other_fix: false + use_amp: true + +inference: + batch_size: 1 + dim_t: 801 + num_overlap: 4 \ No newline at end of file diff --git a/models/audio-separator-models/roformers/config_dereverb_echo_mel_band_roformer_v2.yaml b/models/audio-separator-models/roformers/config_dereverb_echo_mel_band_roformer_v2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f263f8b61fb37b757892d64d7cbeccd1db2512ca --- /dev/null +++ b/models/audio-separator-models/roformers/config_dereverb_echo_mel_band_roformer_v2.yaml @@ -0,0 +1,64 @@ +audio: + chunk_size: 352800 + dim_f: 1024 + dim_t: 801 + hop_length: 441 + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.000 + +model: + dim: 256 + depth: 8 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + num_bands: 60 + dim_head: 64 + heads: 8 + attn_dropout: 0.1 + ff_dropout: 0.1 + flash_attn: True + dim_freqs_in: 1025 + sample_rate: 44100 + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: False + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + batch_size: 1 + gradient_accumulation_steps: 8 + grad_clip: 0 + instruments: + - dry + - other + lr: 1.0e-05 + patience: 2 + reduce_factor: 0.95 + target_instrument: dry + num_epochs: 1000 + num_steps: 1000 + q: 0.95 + coarse_loss_clip: true + ema_momentum: 0.999 + optimizer: adam + other_fix: false + use_amp: true + +inference: + batch_size: 1 + dim_t: 801 + num_overlap: 4 \ No newline at end of file diff --git a/models/audio-separator-models/roformers/config_mel_band_roformer_bleed_suppressor_v1.yaml b/models/audio-separator-models/roformers/config_mel_band_roformer_bleed_suppressor_v1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e9d173f70a8e4cc32a2a76322d4c4c28ebff00af --- /dev/null +++ b/models/audio-separator-models/roformers/config_mel_band_roformer_bleed_suppressor_v1.yaml @@ -0,0 +1,51 @@ +audio: + chunk_size: 485100 + dim_f: 1024 + dim_t: 801 + hop_length: 441 + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.000 + +model: + dim: 384 + depth: 6 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + num_bands: 60 + dim_head: 64 + heads: 8 + attn_dropout: 0 + ff_dropout: 0 + flash_attn: True + dim_freqs_in: 1025 + sample_rate: 44100 # needed for mel filter bank from librosa + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: False + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + instruments: + - Instrumental + - Bleed + target_instrument: Instrumental + use_amp: True + +inference: + batch_size: 1 + dim_t: 801 + num_overlap: 2 \ No newline at end of file diff --git a/models/audio-separator-models/roformers/config_mel_band_roformer_instrumental_becruily.yaml b/models/audio-separator-models/roformers/config_mel_band_roformer_instrumental_becruily.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d157b9524c596edd7ff277e8914b55348520ad51 --- /dev/null +++ b/models/audio-separator-models/roformers/config_mel_band_roformer_instrumental_becruily.yaml @@ -0,0 +1,72 @@ +audio: + chunk_size: 352800 + dim_f: 1024 + dim_t: 256 + hop_length: 441 + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.000 + +model: + dim: 384 + depth: 6 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + num_bands: 60 + dim_head: 64 + heads: 8 + attn_dropout: 0 + ff_dropout: 0 + flash_attn: True + dim_freqs_in: 1025 + sample_rate: 44100 # needed for mel filter bank from librosa + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: False + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + batch_size: 1 + gradient_accumulation_steps: 1 + grad_clip: 0 + instruments: + - Instrumental + - Vocals + lr: 0.0005 + patience: 2 + reduce_factor: 0.95 + target_instrument: Instrumental + num_epochs: 1000 + num_steps: 1000 + augmentation: false # enable augmentations by audiomentations and pedalboard + augmentation_type: null + use_mp3_compress: false # Deprecated + augmentation_mix: false # Mix several stems of the same type with some probability + augmentation_loudness: false # randomly change loudness of each stem + augmentation_loudness_type: 1 # Type 1 or 2 + augmentation_loudness_min: 0 + augmentation_loudness_max: 0 + q: 0.95 + coarse_loss_clip: false + ema_momentum: 0.999 + optimizer: adamw + other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental + use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true + +inference: + batch_size: 1 + dim_t: 1101 + num_overlap: 2 \ No newline at end of file diff --git a/models/audio-separator-models/roformers/config_mel_band_roformer_instrumental_gabox.yaml b/models/audio-separator-models/roformers/config_mel_band_roformer_instrumental_gabox.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9796944c69fe5fa28937eb7670e8397343ffa5ca --- /dev/null +++ b/models/audio-separator-models/roformers/config_mel_band_roformer_instrumental_gabox.yaml @@ -0,0 +1,51 @@ +audio: + chunk_size: 485100 + dim_f: 1024 + dim_t: 1101 + hop_length: 441 + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.000 + +model: + dim: 384 + depth: 6 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + num_bands: 60 + dim_head: 64 + heads: 8 + attn_dropout: 0 + ff_dropout: 0 + flash_attn: True + dim_freqs_in: 1025 + sample_rate: 44100 # needed for mel filter bank from librosa + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: False + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + instruments: + - Instrumental + - Vocals + target_instrument: Instrumental + use_amp: True + +inference: + batch_size: 1 + dim_t: 1101 + num_overlap: 2 \ No newline at end of file diff --git a/models/audio-separator-models/roformers/config_mel_band_roformer_karaoke_becruily.yaml b/models/audio-separator-models/roformers/config_mel_band_roformer_karaoke_becruily.yaml new file mode 100644 index 0000000000000000000000000000000000000000..afd6907645e89d36d78e933ea036850dad66fb8f --- /dev/null +++ b/models/audio-separator-models/roformers/config_mel_band_roformer_karaoke_becruily.yaml @@ -0,0 +1,72 @@ +audio: + chunk_size: 485100 + dim_f: 1024 + dim_t: 256 + hop_length: 441 + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.000 + +model: + dim: 384 + depth: 6 + stereo: true + num_stems: 2 + time_transformer_depth: 1 + freq_transformer_depth: 1 + num_bands: 60 + dim_head: 64 + heads: 8 + attn_dropout: 0 + ff_dropout: 0 + flash_attn: true + dim_freqs_in: 1025 + sample_rate: 44100 # needed for mel filter bank from librosa + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: false + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: false + +training: + batch_size: 1 + gradient_accumulation_steps: 1 + grad_clip: 0 + instruments: + - Vocals + - Instrumental + lr: 0.0005 + patience: 2 + reduce_factor: 0.95 + target_instrument: null + num_epochs: 1000 + num_steps: 1000 + augmentation: false # enable augmentations by audiomentations and pedalboard + augmentation_type: + use_mp3_compress: false # Deprecated + augmentation_mix: false # Mix several stems of the same type with some probability + augmentation_loudness: false # randomly change loudness of each stem + augmentation_loudness_type: 1 # Type 1 or 2 + augmentation_loudness_min: 0 + augmentation_loudness_max: 0 + q: 0.95 + coarse_loss_clip: false + ema_momentum: 0.999 + optimizer: adamw + other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental + use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true + +inference: + batch_size: 1 + dim_t: 1101 + num_overlap: 8 diff --git a/models/audio-separator-models/roformers/config_mel_band_roformer_karaoke_gabox.yaml b/models/audio-separator-models/roformers/config_mel_band_roformer_karaoke_gabox.yaml new file mode 100644 index 0000000000000000000000000000000000000000..16cb193e3651303c63d76da329b677fd5b16dae8 --- /dev/null +++ b/models/audio-separator-models/roformers/config_mel_band_roformer_karaoke_gabox.yaml @@ -0,0 +1,72 @@ +audio: + chunk_size: 485100 + dim_f: 1024 + dim_t: 256 + hop_length: 441 + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.000 + +model: + dim: 384 + depth: 6 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + num_bands: 60 + dim_head: 64 + heads: 8 + attn_dropout: 0 + ff_dropout: 0 + flash_attn: true + dim_freqs_in: 1025 + sample_rate: 44100 # needed for mel filter bank from librosa + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: false + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: true + +training: + batch_size: 1 + gradient_accumulation_steps: 1 + grad_clip: 0 + instruments: + - Vocals + - Instrumental + lr: 0.0005 + patience: 2 + reduce_factor: 0.95 + target_instrument: Vocals + num_epochs: 1000 + num_steps: 1000 + augmentation: false # enable augmentations by audiomentations and pedalboard + augmentation_type: + use_mp3_compress: false # Deprecated + augmentation_mix: false # Mix several stems of the same type with some probability + augmentation_loudness: false # randomly change loudness of each stem + augmentation_loudness_type: 1 # Type 1 or 2 + augmentation_loudness_min: 0 + augmentation_loudness_max: 0 + q: 0.95 + coarse_loss_clip: false + ema_momentum: 0.999 + optimizer: adamw + other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental + use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true + +inference: + batch_size: 1 + dim_t: 1101 + num_overlap: 8 diff --git a/models/audio-separator-models/roformers/config_mel_band_roformer_kim_ft_unwa.yaml b/models/audio-separator-models/roformers/config_mel_band_roformer_kim_ft_unwa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4a8a6a9040c92b78828aa987ce4daa91923953af --- /dev/null +++ b/models/audio-separator-models/roformers/config_mel_band_roformer_kim_ft_unwa.yaml @@ -0,0 +1,72 @@ +audio: + chunk_size: 485100 + dim_f: 1024 + dim_t: 801 + hop_length: 441 + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.000 + +model: + dim: 384 + depth: 6 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + num_bands: 60 + dim_head: 64 + heads: 8 + attn_dropout: 0 + ff_dropout: 0 + flash_attn: True + dim_freqs_in: 1025 + sample_rate: 44100 # needed for mel filter bank from librosa + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: False + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + batch_size: 1 + gradient_accumulation_steps: 1 + grad_clip: 0 + instruments: + - vocals + - other + lr: 1.0e-05 + patience: 2 + reduce_factor: 0.95 + target_instrument: vocals + num_epochs: 1000 + num_steps: 1000 + augmentation: false # enable augmentations by audiomentations and pedalboard + augmentation_type: null + use_mp3_compress: false # Deprecated + augmentation_mix: false # Mix several stems of the same type with some probability + augmentation_loudness: false # randomly change loudness of each stem + augmentation_loudness_type: 1 # Type 1 or 2 + augmentation_loudness_min: 0 + augmentation_loudness_max: 0 + q: 0.95 + coarse_loss_clip: false + ema_momentum: 0.999 + optimizer: adam + other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental + use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true + +inference: + batch_size: 1 + dim_t: 801 + num_overlap: 8 \ No newline at end of file diff --git a/models/audio-separator-models/roformers/config_mel_band_roformer_vocal_fullness_aname.yaml b/models/audio-separator-models/roformers/config_mel_band_roformer_vocal_fullness_aname.yaml new file mode 100644 index 0000000000000000000000000000000000000000..723d742c848aec8c67cee58d0904f419f15841cc --- /dev/null +++ b/models/audio-separator-models/roformers/config_mel_band_roformer_vocal_fullness_aname.yaml @@ -0,0 +1,54 @@ +audio: + chunk_size: 661500 + dim_f: 1024 + dim_t: 256 + hop_length: 441 + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.001 + +model: + dim: 384 + depth: 6 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + num_bands: 60 + dim_head: 64 + heads: 8 + attn_dropout: 0 + ff_dropout: 0 + flash_attn: True + dim_freqs_in: 1025 + sample_rate: 44100 + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: False + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + batch_size: 1 + gradient_accumulation_steps: 1 + grad_clip: 0 + instruments: + - vocals + - other + target_instrument: vocals + use_amp: true + +inference: + batch_size: 4 + dim_t: 1101 + num_overlap: 4 \ No newline at end of file diff --git a/models/audio-separator-models/roformers/config_mel_band_roformer_vocals_becruily.yaml b/models/audio-separator-models/roformers/config_mel_band_roformer_vocals_becruily.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b314fc4e078e4fdc8df60de26862d2bbea194834 --- /dev/null +++ b/models/audio-separator-models/roformers/config_mel_band_roformer_vocals_becruily.yaml @@ -0,0 +1,72 @@ +audio: + chunk_size: 352800 + dim_f: 1024 + dim_t: 256 + hop_length: 441 + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.000 + +model: + dim: 384 + depth: 6 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + num_bands: 60 + dim_head: 64 + heads: 8 + attn_dropout: 0 + ff_dropout: 0 + flash_attn: True + dim_freqs_in: 1025 + sample_rate: 44100 # needed for mel filter bank from librosa + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: False + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + batch_size: 1 + gradient_accumulation_steps: 1 + grad_clip: 0 + instruments: + - vocals + - other + lr: 0.0005 + patience: 2 + reduce_factor: 0.95 + target_instrument: vocals + num_epochs: 1000 + num_steps: 1000 + augmentation: false # enable augmentations by audiomentations and pedalboard + augmentation_type: null + use_mp3_compress: false # Deprecated + augmentation_mix: false # Mix several stems of the same type with some probability + augmentation_loudness: false # randomly change loudness of each stem + augmentation_loudness_type: 1 # Type 1 or 2 + augmentation_loudness_min: 0 + augmentation_loudness_max: 0 + q: 0.95 + coarse_loss_clip: false + ema_momentum: 0.999 + optimizer: adamw + other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental + use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true + +inference: + batch_size: 1 + dim_t: 1101 + num_overlap: 2 \ No newline at end of file diff --git a/models/audio-separator-models/roformers/config_mel_band_roformer_vocals_gabox.yaml b/models/audio-separator-models/roformers/config_mel_band_roformer_vocals_gabox.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8130c9958eead0d2efd27f27f4f39ea5ca051a26 --- /dev/null +++ b/models/audio-separator-models/roformers/config_mel_band_roformer_vocals_gabox.yaml @@ -0,0 +1,51 @@ +audio: + chunk_size: 352800 + dim_f: 1024 + dim_t: 256 + hop_length: 441 + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.001 + +model: + dim: 384 + depth: 6 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + num_bands: 60 + dim_head: 64 + heads: 8 + attn_dropout: 0 + ff_dropout: 0 + flash_attn: True + dim_freqs_in: 1025 + sample_rate: 44100 # needed for mel filter bank from librosa + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: False + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + instruments: + - Vocals + - Instrumental + target_instrument: Vocals + +inference: + batch_size: 1 + dim_t: 1101 + num_overlap: 1 + chunk_size: 352800 \ No newline at end of file diff --git a/models/audio-separator-models/roformers/config_melband_roformer_big_beta5e.yaml b/models/audio-separator-models/roformers/config_melband_roformer_big_beta5e.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f19339dd96cdbbd843f1ec3a25bd64cd5d00bbce --- /dev/null +++ b/models/audio-separator-models/roformers/config_melband_roformer_big_beta5e.yaml @@ -0,0 +1,51 @@ +audio: + chunk_size: 485100 + dim_f: 1024 + dim_t: 801 + hop_length: 441 + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.000 + +model: + dim: 384 + depth: 6 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + num_bands: 60 + dim_head: 64 + heads: 8 + attn_dropout: 0 + ff_dropout: 0 + flash_attn: True + dim_freqs_in: 1025 + sample_rate: 44100 # needed for mel filter bank from librosa + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: False + mask_estimator_depth: 3 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + instruments: + - vocals + - other + target_instrument: vocals + use_amp: True + +inference: + batch_size: 1 + dim_t: 801 + num_overlap: 2 \ No newline at end of file diff --git a/models/audio-separator-models/roformers/config_melbandroformer_big_beta4.yaml b/models/audio-separator-models/roformers/config_melbandroformer_big_beta4.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b187cdf1c454b56531ae02c047df2f8f721b7da2 --- /dev/null +++ b/models/audio-separator-models/roformers/config_melbandroformer_big_beta4.yaml @@ -0,0 +1,51 @@ +audio: + chunk_size: 485100 + dim_f: 1024 + dim_t: 1101 + hop_length: 441 + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.000 + +model: + dim: 384 + depth: 12 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + num_bands: 60 + dim_head: 64 + heads: 8 + attn_dropout: 0 + ff_dropout: 0 + flash_attn: True + dim_freqs_in: 1025 + sample_rate: 44100 # needed for mel filter bank from librosa + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: False + mask_estimator_depth: 3 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + instruments: + - vocals + - other + target_instrument: vocals + use_amp: True + +inference: + batch_size: 1 + dim_t: 1101 + num_overlap: 2 \ No newline at end of file diff --git a/models/audio-separator-models/roformers/config_melbandroformer_big_beta6.yaml b/models/audio-separator-models/roformers/config_melbandroformer_big_beta6.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f45cd6af81f2a1d0eb684a280d13fac043653f8c --- /dev/null +++ b/models/audio-separator-models/roformers/config_melbandroformer_big_beta6.yaml @@ -0,0 +1,72 @@ +audio: + chunk_size: 529200 + dim_f: 1024 + dim_t: 256 + hop_length: 441 + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.000 + +model: + dim: 512 + depth: 6 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + num_bands: 60 + dim_head: 64 + heads: 8 + attn_dropout: 0 + ff_dropout: 0 + flash_attn: True + dim_freqs_in: 1025 + sample_rate: 44100 # needed for mel filter bank from librosa + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: False + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + batch_size: 1 + gradient_accumulation_steps: 1 + grad_clip: 0 + instruments: + - vocals + - other + lr: 1.0e-05 + patience: 2 + reduce_factor: 0.95 + target_instrument: vocals + num_epochs: 1000 + num_steps: 1000 + augmentation: false # enable augmentations by audiomentations and pedalboard + augmentation_type: null + use_mp3_compress: false # Deprecated + augmentation_mix: false # Mix several stems of the same type with some probability + augmentation_loudness: false # randomly change loudness of each stem + augmentation_loudness_type: 1 # Type 1 or 2 + augmentation_loudness_min: 0 + augmentation_loudness_max: 0 + q: 0.95 + coarse_loss_clip: false + ema_momentum: 0.999 + optimizer: adam + other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental + use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true + +inference: + batch_size: 2 + dim_t: 1201 + num_overlap: 2 \ No newline at end of file diff --git a/models/audio-separator-models/roformers/config_melbandroformer_big_beta6x.yaml b/models/audio-separator-models/roformers/config_melbandroformer_big_beta6x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2f959f226e9e4b160b483208512bc457411ac736 --- /dev/null +++ b/models/audio-separator-models/roformers/config_melbandroformer_big_beta6x.yaml @@ -0,0 +1,72 @@ +audio: + chunk_size: 529200 + dim_f: 1024 + dim_t: 256 + hop_length: 441 + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.000 + +model: + dim: 512 + depth: 12 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + num_bands: 60 + dim_head: 64 + heads: 8 + attn_dropout: 0 + ff_dropout: 0 + flash_attn: True + dim_freqs_in: 1025 + sample_rate: 44100 # needed for mel filter bank from librosa + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: False + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + batch_size: 1 + gradient_accumulation_steps: 1 + grad_clip: 0 + instruments: + - vocals + - other + lr: 1.0e-05 + patience: 2 + reduce_factor: 0.95 + target_instrument: vocals + num_epochs: 1000 + num_steps: 1000 + augmentation: false # enable augmentations by audiomentations and pedalboard + augmentation_type: null + use_mp3_compress: false # Deprecated + augmentation_mix: false # Mix several stems of the same type with some probability + augmentation_loudness: false # randomly change loudness of each stem + augmentation_loudness_type: 1 # Type 1 or 2 + augmentation_loudness_min: 0 + augmentation_loudness_max: 0 + q: 0.95 + coarse_loss_clip: false + ema_momentum: 0.999 + optimizer: adam + other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental + use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true + +inference: + batch_size: 2 + dim_t: 1201 + num_overlap: 2 diff --git a/models/audio-separator-models/roformers/config_melbandroformer_inst.yaml b/models/audio-separator-models/roformers/config_melbandroformer_inst.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a236341b3f8607d534b191e9092690a52b8015af --- /dev/null +++ b/models/audio-separator-models/roformers/config_melbandroformer_inst.yaml @@ -0,0 +1,51 @@ +audio: + chunk_size: 485100 + dim_f: 1024 + dim_t: 1101 + hop_length: 441 + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.000 + +model: + dim: 384 + depth: 6 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + num_bands: 60 + dim_head: 64 + heads: 8 + attn_dropout: 0 + ff_dropout: 0 + flash_attn: True + dim_freqs_in: 1025 + sample_rate: 44100 # needed for mel filter bank from librosa + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: False + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + instruments: + - other + - vocals + target_instrument: other + use_amp: True + +inference: + batch_size: 1 + dim_t: 1101 + num_overlap: 2 \ No newline at end of file diff --git a/models/audio-separator-models/roformers/config_melbandroformer_inst_v2.yaml b/models/audio-separator-models/roformers/config_melbandroformer_inst_v2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9c13008065433e4eb4522c62c0ae0a4294f595f4 --- /dev/null +++ b/models/audio-separator-models/roformers/config_melbandroformer_inst_v2.yaml @@ -0,0 +1,51 @@ +audio: + chunk_size: 485100 + dim_f: 1024 + dim_t: 1101 + hop_length: 441 + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.000 + +model: + dim: 384 + depth: 12 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + num_bands: 60 + dim_head: 64 + heads: 8 + attn_dropout: 0 + ff_dropout: 0 + flash_attn: True + dim_freqs_in: 1025 + sample_rate: 44100 # needed for mel filter bank from librosa + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: False + mask_estimator_depth: 3 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + instruments: + - Instrumental + - Vocals + target_instrument: Instrumental + use_amp: True + +inference: + batch_size: 1 + dim_t: 1101 + num_overlap: 2 \ No newline at end of file diff --git a/models/audio-separator-models/roformers/config_melbandroformer_instvoc_duality.yaml b/models/audio-separator-models/roformers/config_melbandroformer_instvoc_duality.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e30df21030b0986fe33c66a29f8ee59478e49357 --- /dev/null +++ b/models/audio-separator-models/roformers/config_melbandroformer_instvoc_duality.yaml @@ -0,0 +1,51 @@ +audio: + chunk_size: 485100 + dim_f: 1024 + dim_t: 256 + hop_length: 441 + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.000 + +model: + dim: 384 + depth: 6 + stereo: true + num_stems: 2 + time_transformer_depth: 1 + freq_transformer_depth: 1 + num_bands: 60 + dim_head: 64 + heads: 8 + attn_dropout: 0 + ff_dropout: 0 + flash_attn: True + dim_freqs_in: 1025 + sample_rate: 44100 # needed for mel filter bank from librosa + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: False + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + instruments: + - Vocals + - Instrumental + target_instrument: null + use_amp: True + +inference: + batch_size: 1 + dim_t: 1101 + num_overlap: 2 \ No newline at end of file diff --git a/models/audio-separator-models/roformers/config_vocals_mel_band_roformer_big_v1_ft.yaml b/models/audio-separator-models/roformers/config_vocals_mel_band_roformer_big_v1_ft.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f19339dd96cdbbd843f1ec3a25bd64cd5d00bbce --- /dev/null +++ b/models/audio-separator-models/roformers/config_vocals_mel_band_roformer_big_v1_ft.yaml @@ -0,0 +1,51 @@ +audio: + chunk_size: 485100 + dim_f: 1024 + dim_t: 801 + hop_length: 441 + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.000 + +model: + dim: 384 + depth: 6 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + num_bands: 60 + dim_head: 64 + heads: 8 + attn_dropout: 0 + ff_dropout: 0 + flash_attn: True + dim_freqs_in: 1025 + sample_rate: 44100 # needed for mel filter bank from librosa + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: False + mask_estimator_depth: 3 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + instruments: + - vocals + - other + target_instrument: vocals + use_amp: True + +inference: + batch_size: 1 + dim_t: 801 + num_overlap: 2 \ No newline at end of file diff --git a/models/audio-separator-models/roformers/config_vocals_mel_band_roformer_ft.yaml b/models/audio-separator-models/roformers/config_vocals_mel_band_roformer_ft.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c4fe03d72610e09e3c1cc479c4dc8f423c601c22 --- /dev/null +++ b/models/audio-separator-models/roformers/config_vocals_mel_band_roformer_ft.yaml @@ -0,0 +1,72 @@ +audio: + chunk_size: 352800 + dim_f: 1024 + dim_t: 256 + hop_length: 441 + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.000 + +model: + dim: 384 + depth: 6 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + num_bands: 60 + dim_head: 64 + heads: 8 + attn_dropout: 0 + ff_dropout: 0 + flash_attn: True + dim_freqs_in: 1025 + sample_rate: 44100 # needed for mel filter bank from librosa + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: False + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + batch_size: 1 + gradient_accumulation_steps: 8 + grad_clip: 0 + instruments: + - vocals + - other + lr: 1.0e-04 + patience: 2 + reduce_factor: 0.95 + target_instrument: vocals + num_epochs: 1000 + num_steps: 100 + augmentation: true # enable augmentations by audiomentations and pedalboard + augmentation_type: null + use_mp3_compress: false # Deprecated + augmentation_mix: true # Mix several stems of the same type with some probability + augmentation_loudness: true # randomly change loudness of each stem + augmentation_loudness_type: 1 # Type 1 or 2 + augmentation_loudness_min: 0 + augmentation_loudness_max: 0 + q: 0.95 + coarse_loss_clip: false + ema_momentum: 0.999 + optimizer: adamw8bit + other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental + use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true + +inference: + batch_size: 4 + dim_t: 256 + num_overlap: 2 \ No newline at end of file diff --git a/models/audio-separator-models/roformers/denoise_mel_band_roformer_aufr33_aggr_sdr_27.9768.ckpt b/models/audio-separator-models/roformers/denoise_mel_band_roformer_aufr33_aggr_sdr_27.9768.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..3e232e1d5ebb6e2c74cf9803ca9fb4a3fb860d17 --- /dev/null +++ b/models/audio-separator-models/roformers/denoise_mel_band_roformer_aufr33_aggr_sdr_27.9768.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a25e3b233722cd81e2de7b8e798a3fef29d4b9799ccacda60b0dc958a1e2a5bb +size 913097300 diff --git a/models/audio-separator-models/roformers/denoise_mel_band_roformer_aufr33_aggr_sdr_27.9768_config.yaml b/models/audio-separator-models/roformers/denoise_mel_band_roformer_aufr33_aggr_sdr_27.9768_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6a20587dc9024e992ff06994aed24edec3c3efca --- /dev/null +++ b/models/audio-separator-models/roformers/denoise_mel_band_roformer_aufr33_aggr_sdr_27.9768_config.yaml @@ -0,0 +1,71 @@ +audio: + chunk_size: 352800 + dim_f: 1024 + dim_t: 801 + hop_length: 441 + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 000 + +model: + dim: 384 + depth: 6 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + num_bands: 60 + dim_head: 64 + heads: 8 + attn_dropout: 0 + ff_dropout: 0 + flash_attn: True + dim_freqs_in: 1025 + sample_rate: 44100 # needed for mel filter bank from librosa + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: False + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + batch_size: 2 + gradient_accumulation_steps: 1 + grad_clip: 0 + instruments: + - dry + - other + lr: 1.0e-05 + patience: 8 + reduce_factor: 0.95 + target_instrument: dry + num_epochs: 1000 + num_steps: 4032 + augmentation: false # enable augmentations by audiomentations and pedalboard + augmentation_type: null + use_mp3_compress: false # Deprecated + augmentation_mix: false # Mix several stems of the same type with some probability + augmentation_loudness: false # randomly change loudness of each stem + augmentation_loudness_type: 1 # Type 1 or 2 + augmentation_loudness_min: 0 + augmentation_loudness_max: 0 + q: 0.95 + coarse_loss_clip: false + ema_momentum: 0.999 + optimizer: adam + other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental + +inference: + batch_size: 2 + dim_t: 801 + num_overlap: 4 \ No newline at end of file diff --git a/models/audio-separator-models/roformers/denoise_mel_band_roformer_aufr33_sdr_27.9959.ckpt b/models/audio-separator-models/roformers/denoise_mel_band_roformer_aufr33_sdr_27.9959.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..8d1b634e263572a5a955577112e77036279c6287 --- /dev/null +++ b/models/audio-separator-models/roformers/denoise_mel_band_roformer_aufr33_sdr_27.9959.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c1c39191edc34e942ca7f2346ce6b6c0e1208a5f76349ffce6f696bd12910de +size 913097300 diff --git a/models/audio-separator-models/roformers/denoise_mel_band_roformer_aufr33_sdr_27.9959_config.yaml b/models/audio-separator-models/roformers/denoise_mel_band_roformer_aufr33_sdr_27.9959_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6a20587dc9024e992ff06994aed24edec3c3efca --- /dev/null +++ b/models/audio-separator-models/roformers/denoise_mel_band_roformer_aufr33_sdr_27.9959_config.yaml @@ -0,0 +1,71 @@ +audio: + chunk_size: 352800 + dim_f: 1024 + dim_t: 801 + hop_length: 441 + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 000 + +model: + dim: 384 + depth: 6 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + num_bands: 60 + dim_head: 64 + heads: 8 + attn_dropout: 0 + ff_dropout: 0 + flash_attn: True + dim_freqs_in: 1025 + sample_rate: 44100 # needed for mel filter bank from librosa + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: False + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + batch_size: 2 + gradient_accumulation_steps: 1 + grad_clip: 0 + instruments: + - dry + - other + lr: 1.0e-05 + patience: 8 + reduce_factor: 0.95 + target_instrument: dry + num_epochs: 1000 + num_steps: 4032 + augmentation: false # enable augmentations by audiomentations and pedalboard + augmentation_type: null + use_mp3_compress: false # Deprecated + augmentation_mix: false # Mix several stems of the same type with some probability + augmentation_loudness: false # randomly change loudness of each stem + augmentation_loudness_type: 1 # Type 1 or 2 + augmentation_loudness_min: 0 + augmentation_loudness_max: 0 + q: 0.95 + coarse_loss_clip: false + ema_momentum: 0.999 + optimizer: adam + other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental + +inference: + batch_size: 2 + dim_t: 801 + num_overlap: 4 \ No newline at end of file diff --git a/models/audio-separator-models/roformers/dereverb-echo_mel_band_roformer_sdr_10.0169.ckpt b/models/audio-separator-models/roformers/dereverb-echo_mel_band_roformer_sdr_10.0169.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..b2ec32944c1f4f9c802a1ad02d38682929c77a58 --- /dev/null +++ b/models/audio-separator-models/roformers/dereverb-echo_mel_band_roformer_sdr_10.0169.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd2b737a394cfb80cd48cc9fcbaf89f5f4062f6b93066c2911617a06d8b7860a +size 835997896 diff --git a/models/audio-separator-models/roformers/dereverb-echo_mel_band_roformer_sdr_13.4843_v2.ckpt b/models/audio-separator-models/roformers/dereverb-echo_mel_band_roformer_sdr_13.4843_v2.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..5baa2b3d302d0ab984148ff1f42793220b29c7ca --- /dev/null +++ b/models/audio-separator-models/roformers/dereverb-echo_mel_band_roformer_sdr_13.4843_v2.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:396432f5af25992fe82d0286634bd879027c073721db6ab10199e75459708b9f +size 455862568 diff --git a/models/audio-separator-models/roformers/dereverb_big_mbr_ep_362.ckpt b/models/audio-separator-models/roformers/dereverb_big_mbr_ep_362.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..814f35f8718aab66733f43e60a034fa933dd2f7f --- /dev/null +++ b/models/audio-separator-models/roformers/dereverb_big_mbr_ep_362.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0506455e74ffc02bbec700df9863ae243597034003815f1418227c6dee33b6ea +size 455864012 diff --git a/models/audio-separator-models/roformers/dereverb_echo_mbr_fused.ckpt b/models/audio-separator-models/roformers/dereverb_echo_mbr_fused.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..2d495600912d8c596c4f37c1a902c9b918e934fd --- /dev/null +++ b/models/audio-separator-models/roformers/dereverb_echo_mbr_fused.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1596b1063238f487d54a0510a8c92cb28c000c803a271dd618ac49efc99ef3f7 +size 455776577 diff --git a/models/audio-separator-models/roformers/dereverb_mel_band_roformer_anvuew.yaml b/models/audio-separator-models/roformers/dereverb_mel_band_roformer_anvuew.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4a73f63e7daa18b57764de5d6a8144232b870b4c --- /dev/null +++ b/models/audio-separator-models/roformers/dereverb_mel_band_roformer_anvuew.yaml @@ -0,0 +1,76 @@ +audio: + chunk_size: 352800 + dim_f: 1024 + dim_t: 801 + hop_length: 441 + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.000 + +model: + dim: 384 + depth: 6 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + num_bands: 60 + dim_head: 64 + heads: 8 + attn_dropout: 0 + ff_dropout: 0 + flash_attn: True + dim_freqs_in: 1025 + sample_rate: 44100 # needed for mel filter bank from librosa + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: False + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + batch_size: 3 + gradient_accumulation_steps: 1 + grad_clip: 0 + instruments: + - noreverb + - reverb + lr: 5.0e-05 + patience: 2 + reduce_factor: 0.95 + target_instrument: noreverb + num_epochs: 1000 + num_steps: 4000 + q: 0.95 + coarse_loss_clip: false + ema_momentum: 0.999 + optimizer: adamw + other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental + use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true + +augmentations: + enable: true # enable or disable all augmentations (to fast disable if needed) + loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max) + loudness_min: 0.1 + loudness_max: 1.0 + mixup: false # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3) + mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02) + - 0.2 + - 0.02 + mixup_loudness_min: 0.5 + mixup_loudness_max: 1.5 + +inference: + batch_size: 1 + dim_t: 801 + num_overlap: 2 \ No newline at end of file diff --git a/models/audio-separator-models/roformers/dereverb_mel_band_roformer_anvuew_sdr_19.1729.ckpt b/models/audio-separator-models/roformers/dereverb_mel_band_roformer_anvuew_sdr_19.1729.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..8f21995361b651a6c446cb4e1e7664d94eb42bff --- /dev/null +++ b/models/audio-separator-models/roformers/dereverb_mel_band_roformer_anvuew_sdr_19.1729.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9262877b87e9ebb0fb808a456b0a411fa677f5df31c8383c1254af531c078970 +size 913107578 diff --git a/models/audio-separator-models/roformers/dereverb_mel_band_roformer_less_aggressive_anvuew_sdr_18.8050.ckpt b/models/audio-separator-models/roformers/dereverb_mel_band_roformer_less_aggressive_anvuew_sdr_18.8050.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..45d5062aae81caa8df3565e7d10c818f076cc588 --- /dev/null +++ b/models/audio-separator-models/roformers/dereverb_mel_band_roformer_less_aggressive_anvuew_sdr_18.8050.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0db8f1b41c00cead1112e967262a12802fd32e76c0c3a8eb207e772bae25d07b +size 913107578 diff --git a/models/audio-separator-models/roformers/dereverb_mel_band_roformer_mono_anvuew.ckpt b/models/audio-separator-models/roformers/dereverb_mel_band_roformer_mono_anvuew.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..06855114df2802baa49c15faf4b39a743b84e27b --- /dev/null +++ b/models/audio-separator-models/roformers/dereverb_mel_band_roformer_mono_anvuew.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f099ee717eb57fb0ad5eb0e7c9ad6787c36168140b61ce2b158b90c2c4ecee79 +size 913097978 diff --git a/models/audio-separator-models/roformers/dereverb_super_big_mbr_ep_346.ckpt b/models/audio-separator-models/roformers/dereverb_super_big_mbr_ep_346.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..bc9d947ff83f741caa2504b673eed1983e6ac008 --- /dev/null +++ b/models/audio-separator-models/roformers/dereverb_super_big_mbr_ep_346.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26dda242bce4405555f2d6086d079fe8cc23f1f04e02e501d2689bfe3ece0489 +size 455864012 diff --git a/models/audio-separator-models/roformers/deverb_bs_roformer_8_384dim_10depth.ckpt b/models/audio-separator-models/roformers/deverb_bs_roformer_8_384dim_10depth.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..17a6af84d21d7c3cecf0c627811f42fee20b6d8e --- /dev/null +++ b/models/audio-separator-models/roformers/deverb_bs_roformer_8_384dim_10depth.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c38653aaa5e49f2f7b84dd3be2b6b679e0cbea23978e6b48389ee6f0a914768 +size 361499604 diff --git a/models/audio-separator-models/roformers/deverb_bs_roformer_8_384dim_10depth_config.yaml b/models/audio-separator-models/roformers/deverb_bs_roformer_8_384dim_10depth_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7aa36fa5e356bd5ece008126289ba05f8cd67bf4 --- /dev/null +++ b/models/audio-separator-models/roformers/deverb_bs_roformer_8_384dim_10depth_config.yaml @@ -0,0 +1,137 @@ +audio: + chunk_size: 352768 + dim_f: 1024 + dim_t: 801 + hop_length: 441 + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.001 + +model: + dim: 384 + depth: 10 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + freqs_per_bands: !!python/tuple + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 128 + - 129 + dim_head: 64 + heads: 8 + attn_dropout: 0.1 + ff_dropout: 0.1 + flash_attn: true + dim_freqs_in: 1025 + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: false + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + batch_size: 1 + gradient_accumulation_steps: 1 + grad_clip: 0 + instruments: + - noreverb + - reverb + lr: 5.0e-05 + patience: 2 + reduce_factor: 0.95 + target_instrument: noreverb + num_epochs: 1000 + num_steps: 1000 + q: 0.95 + coarse_loss_clip: true + ema_momentum: 0.999 + optimizer: adam + other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental + use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true + +augmentations: + enable: true # enable or disable all augmentations (to fast disable if needed) + loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max) + loudness_min: 0.5 + loudness_max: 1.5 + mixup: false # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3) + mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02) + - 0.2 + - 0.02 + mixup_loudness_min: 0.5 + mixup_loudness_max: 1.5 + +inference: + batch_size: 4 + dim_t: 801 + num_overlap: 4 diff --git a/models/audio-separator-models/roformers/mel_band_roformer_bleed_suppressor_v1.ckpt b/models/audio-separator-models/roformers/mel_band_roformer_bleed_suppressor_v1.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..4bf56278572c098907e036b63e094bd7c63c1a20 --- /dev/null +++ b/models/audio-separator-models/roformers/mel_band_roformer_bleed_suppressor_v1.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9a9d10faa7f8997676a78e66d741d7acb9cc449334763f3c8f626d68ec6e575 +size 913102724 diff --git a/models/audio-separator-models/roformers/mel_band_roformer_crowd_aufr33_viperx_sdr_8.7144.ckpt b/models/audio-separator-models/roformers/mel_band_roformer_crowd_aufr33_viperx_sdr_8.7144.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..8d59dd9875ffadf10ca18f66aa3d20a0e289ce90 --- /dev/null +++ b/models/audio-separator-models/roformers/mel_band_roformer_crowd_aufr33_viperx_sdr_8.7144.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca8799531fe51c94172cc047226209ed48bf7d8c02e04671795a15d2a1c318af +size 913096801 diff --git a/models/audio-separator-models/roformers/mel_band_roformer_crowd_aufr33_viperx_sdr_8.7144_config.yaml b/models/audio-separator-models/roformers/mel_band_roformer_crowd_aufr33_viperx_sdr_8.7144_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..18687f8b8bb38020fc951d3ade6a54bbf58f094e --- /dev/null +++ b/models/audio-separator-models/roformers/mel_band_roformer_crowd_aufr33_viperx_sdr_8.7144_config.yaml @@ -0,0 +1,71 @@ +audio: + chunk_size: 352800 + dim_f: 1024 + dim_t: 801 + hop_length: 441 + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 000 + +model: + dim: 384 + depth: 6 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + num_bands: 60 + dim_head: 64 + heads: 8 + attn_dropout: 0 + ff_dropout: 0 + flash_attn: True + dim_freqs_in: 1025 + sample_rate: 44100 # needed for mel filter bank from librosa + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: False + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + batch_size: 2 + gradient_accumulation_steps: 1 + grad_clip: 0 + instruments: + - crowd + - other + lr: 1.0e-05 + patience: 8 + reduce_factor: 0.95 + target_instrument: crowd + num_epochs: 1000 + num_steps: 4032 + augmentation: false # enable augmentations by audiomentations and pedalboard + augmentation_type: null + use_mp3_compress: false # Deprecated + augmentation_mix: false # Mix several stems of the same type with some probability + augmentation_loudness: false # randomly change loudness of each stem + augmentation_loudness_type: 1 # Type 1 or 2 + augmentation_loudness_min: 0 + augmentation_loudness_max: 0 + q: 0.95 + coarse_loss_clip: false + ema_momentum: 0.999 + optimizer: adam + other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental + +inference: + batch_size: 1 + dim_t: 801 + num_overlap: 4 \ No newline at end of file diff --git a/models/audio-separator-models/roformers/mel_band_roformer_denoise_debleed_gabox.ckpt b/models/audio-separator-models/roformers/mel_band_roformer_denoise_debleed_gabox.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..8b595aaf271c7d97cbca025436c370b5954e7786 --- /dev/null +++ b/models/audio-separator-models/roformers/mel_band_roformer_denoise_debleed_gabox.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91aa7a546ed2e93482e4629c982d35b0d258bb3de6eeab497fd91658cc86c7fd +size 913026650 diff --git a/models/audio-separator-models/roformers/mel_band_roformer_instrumental_2_gabox.ckpt b/models/audio-separator-models/roformers/mel_band_roformer_instrumental_2_gabox.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..7758aa5f0d19e21b8cb82550a8eddb7a7ab10e23 --- /dev/null +++ b/models/audio-separator-models/roformers/mel_band_roformer_instrumental_2_gabox.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e03ca459c339f88b7521c367c897d0c3f5362b38a6cdb96cb28e625ca0f9931e +size 913026650 diff --git a/models/audio-separator-models/roformers/mel_band_roformer_instrumental_3_gabox.ckpt b/models/audio-separator-models/roformers/mel_band_roformer_instrumental_3_gabox.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..1596ed4c464e4346d614bc48bb64a996975bc445 --- /dev/null +++ b/models/audio-separator-models/roformers/mel_band_roformer_instrumental_3_gabox.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9ec9f299cf617bf6afe1c382f4b0761cd9bee78323da94889951812328e10fb +size 913026650 diff --git a/models/audio-separator-models/roformers/mel_band_roformer_instrumental_becruily.ckpt b/models/audio-separator-models/roformers/mel_band_roformer_instrumental_becruily.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..9aa6c9ab355e9dfc090251f9703e299b9da7893d --- /dev/null +++ b/models/audio-separator-models/roformers/mel_band_roformer_instrumental_becruily.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8da6632a1c25efb1c9be783ce9ea367d226d4b918cd6c3717c8b1d7a396041d +size 913106900 diff --git a/models/audio-separator-models/roformers/mel_band_roformer_instrumental_bleedless_v1_gabox.ckpt b/models/audio-separator-models/roformers/mel_band_roformer_instrumental_bleedless_v1_gabox.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..65745b5be537052b903ae5f13f4e26f51912f6db --- /dev/null +++ b/models/audio-separator-models/roformers/mel_band_roformer_instrumental_bleedless_v1_gabox.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de972fb724601beef237abe94c8b934c73218e9baf3e344ab4c2333276e5cfe7 +size 913026650 diff --git a/models/audio-separator-models/roformers/mel_band_roformer_instrumental_bleedless_v2_gabox.ckpt b/models/audio-separator-models/roformers/mel_band_roformer_instrumental_bleedless_v2_gabox.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..881d22e5fc7fde06ca2c5de68277c4ab8bbceee7 --- /dev/null +++ b/models/audio-separator-models/roformers/mel_band_roformer_instrumental_bleedless_v2_gabox.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6109687febb8f18cd5a45207fee35f18ba8b9467b18a4b2e982a3b7dc04a9d72 +size 913026650 diff --git a/models/audio-separator-models/roformers/mel_band_roformer_instrumental_bleedless_v3_gabox.ckpt b/models/audio-separator-models/roformers/mel_band_roformer_instrumental_bleedless_v3_gabox.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..651d00bdc63039e222bd04f9e901d82044d7ff84 --- /dev/null +++ b/models/audio-separator-models/roformers/mel_band_roformer_instrumental_bleedless_v3_gabox.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5578545e094e584835b3184310ed1b12072f15d4b6ed8f4359ecc17358a66676 +size 913026650 diff --git a/models/audio-separator-models/roformers/mel_band_roformer_instrumental_fullness_noise_v4_gabox.ckpt b/models/audio-separator-models/roformers/mel_band_roformer_instrumental_fullness_noise_v4_gabox.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..5a49522bec839a399c6545d2f812bae2ff5f32a1 --- /dev/null +++ b/models/audio-separator-models/roformers/mel_band_roformer_instrumental_fullness_noise_v4_gabox.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f50296e913b9af3b5b3b961e92877ef0d4a74f9a433e796e89960c4c2b1abe53 +size 913026650 diff --git a/models/audio-separator-models/roformers/mel_band_roformer_instrumental_fullness_v1_gabox.ckpt b/models/audio-separator-models/roformers/mel_band_roformer_instrumental_fullness_v1_gabox.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..e73ab6f21e270a489e7310d9fe81a150cdfceb13 --- /dev/null +++ b/models/audio-separator-models/roformers/mel_band_roformer_instrumental_fullness_v1_gabox.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31140eccf271d2a9e8a538b092b1f70dfd6471aa5ad163b22bccc758b9f38b62 +size 913026650 diff --git a/models/audio-separator-models/roformers/mel_band_roformer_instrumental_fullness_v2_gabox.ckpt b/models/audio-separator-models/roformers/mel_band_roformer_instrumental_fullness_v2_gabox.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..6d849f3079743541477aeb8748cd2dac3f05e0d0 --- /dev/null +++ b/models/audio-separator-models/roformers/mel_band_roformer_instrumental_fullness_v2_gabox.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c446c34551333dd3d45b8d0708658a10f28c5e289f8ec27b5f0e22803681bef3 +size 913026650 diff --git a/models/audio-separator-models/roformers/mel_band_roformer_instrumental_fullness_v3_gabox.ckpt b/models/audio-separator-models/roformers/mel_band_roformer_instrumental_fullness_v3_gabox.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..c90caf360ea211abf7467c96c8bde8bf656fb521 --- /dev/null +++ b/models/audio-separator-models/roformers/mel_band_roformer_instrumental_fullness_v3_gabox.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbb229209a8942d34664e19d2f4862e357ea3108a4e8c04b69aa0aba523a4481 +size 913026650 diff --git a/models/audio-separator-models/roformers/mel_band_roformer_instrumental_fv7z_gabox.ckpt b/models/audio-separator-models/roformers/mel_band_roformer_instrumental_fv7z_gabox.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..9cd07358dc11187373fb9052aaee3b7434ee4269 --- /dev/null +++ b/models/audio-separator-models/roformers/mel_band_roformer_instrumental_fv7z_gabox.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef229f0dcd370c1767e4292981c59e5248814da45f32bfacebcc0f28adaa30a1 +size 913031195 diff --git a/models/audio-separator-models/roformers/mel_band_roformer_instrumental_fv8_gabox.ckpt b/models/audio-separator-models/roformers/mel_band_roformer_instrumental_fv8_gabox.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..b017d3834de1d9d48cafa54dc7807d19c2ea0880 --- /dev/null +++ b/models/audio-separator-models/roformers/mel_band_roformer_instrumental_fv8_gabox.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50bfa127d21f419e0da89730867d28c7ac4484c9473e6f313d036bc8b429df80 +size 913026650 diff --git a/models/audio-separator-models/roformers/mel_band_roformer_instrumental_fvx_gabox.ckpt b/models/audio-separator-models/roformers/mel_band_roformer_instrumental_fvx_gabox.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..93fa162e3077a085662452f147e5fbe0bacd108a --- /dev/null +++ b/models/audio-separator-models/roformers/mel_band_roformer_instrumental_fvx_gabox.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:545ef13b0cdbac505818a38db98e09c54e7c03ea17b4e0c895a531bfa352fa59 +size 913026650 diff --git a/models/audio-separator-models/roformers/mel_band_roformer_instrumental_gabox.ckpt b/models/audio-separator-models/roformers/mel_band_roformer_instrumental_gabox.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..6ecff8779cfc8fa3961db8589050b44a43131203 --- /dev/null +++ b/models/audio-separator-models/roformers/mel_band_roformer_instrumental_gabox.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b3ad6bd8bed3aaaa4d9320ea2ca910d140196a2302186db1754f3a8d8e16fb1 +size 913026650 diff --git a/models/audio-separator-models/roformers/mel_band_roformer_instrumental_instv5_gabox.ckpt b/models/audio-separator-models/roformers/mel_band_roformer_instrumental_instv5_gabox.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..1a192beafebebb2f4785e076cb13aba8a074b8ce --- /dev/null +++ b/models/audio-separator-models/roformers/mel_band_roformer_instrumental_instv5_gabox.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38e935cf1e97afcc1de84d0bdb87dd8090bad530fa0df28e707d16448e1d38e2 +size 913026650 diff --git a/models/audio-separator-models/roformers/mel_band_roformer_instrumental_instv5n_gabox.ckpt b/models/audio-separator-models/roformers/mel_band_roformer_instrumental_instv5n_gabox.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..5cea44b8d02d4a8379a8fef7d0dc7e3a920b6294 --- /dev/null +++ b/models/audio-separator-models/roformers/mel_band_roformer_instrumental_instv5n_gabox.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:175203923fac3e52ae00e7e37d41e8a7fef5020b6ee4e4144f4786daabc54b34 +size 913026650 diff --git a/models/audio-separator-models/roformers/mel_band_roformer_instrumental_instv6_gabox.ckpt b/models/audio-separator-models/roformers/mel_band_roformer_instrumental_instv6_gabox.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..9e4e1ed19b5579f8914a2229b05a859b8cb0bf34 --- /dev/null +++ b/models/audio-separator-models/roformers/mel_band_roformer_instrumental_instv6_gabox.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:677951b8556a27abe32e39705640638826e78101fa901a51ad73d20522be6d25 +size 913026650 diff --git a/models/audio-separator-models/roformers/mel_band_roformer_instrumental_instv6n_gabox.ckpt b/models/audio-separator-models/roformers/mel_band_roformer_instrumental_instv6n_gabox.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..edf54dd70e2304cc057d3055a46c02fbb79ef937 --- /dev/null +++ b/models/audio-separator-models/roformers/mel_band_roformer_instrumental_instv6n_gabox.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:802f3e5d183d7c4b50dea147c320e61634f5be6ff55fa899fdebeaf0f3cf7f42 +size 913026650 diff --git a/models/audio-separator-models/roformers/mel_band_roformer_instrumental_instv7_gabox.ckpt b/models/audio-separator-models/roformers/mel_band_roformer_instrumental_instv7_gabox.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..96b9a5ec1eb7dddb73acd155f3962118dc4980ee --- /dev/null +++ b/models/audio-separator-models/roformers/mel_band_roformer_instrumental_instv7_gabox.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b4f12d81fd7fb02f38609216d59f0e42b3dca655fd90ca275ab5321b3e4d9aa +size 913026650 diff --git a/models/audio-separator-models/roformers/mel_band_roformer_instrumental_instv7n_gabox.ckpt b/models/audio-separator-models/roformers/mel_band_roformer_instrumental_instv7n_gabox.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..0bb8eb969ef04c187caa14bc813e8c690d396036 --- /dev/null +++ b/models/audio-separator-models/roformers/mel_band_roformer_instrumental_instv7n_gabox.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0ca36af5d1314be46b56c8a53b6be02f98511fa5d7e3e196fd895755e65be3c +size 913026650 diff --git a/models/audio-separator-models/roformers/mel_band_roformer_instrumental_instv8_gabox.ckpt b/models/audio-separator-models/roformers/mel_band_roformer_instrumental_instv8_gabox.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..9802ecab0b9fa4c33af5610bb61b822076d77dcd --- /dev/null +++ b/models/audio-separator-models/roformers/mel_band_roformer_instrumental_instv8_gabox.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d7d97418012e1d241853260d82330869c3e945ffdef9d7841fd90f5b24f20ff +size 913026650 diff --git a/models/audio-separator-models/roformers/mel_band_roformer_instrumental_instv8n_gabox.ckpt b/models/audio-separator-models/roformers/mel_band_roformer_instrumental_instv8n_gabox.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..29483cf0cabdabb930587ec5c0be8070d73cc969 --- /dev/null +++ b/models/audio-separator-models/roformers/mel_band_roformer_instrumental_instv8n_gabox.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25920c876e601d4ccbf1684b19ecdac4b9fcfcc7f48d2c095d81040ec3fecbea +size 913026650 diff --git a/models/audio-separator-models/roformers/mel_band_roformer_karaoke_aufr33_viperx_sdr_10.1956.ckpt b/models/audio-separator-models/roformers/mel_band_roformer_karaoke_aufr33_viperx_sdr_10.1956.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..977a5c60ab478ab0b78ac5bc7e5296f9185f5549 --- /dev/null +++ b/models/audio-separator-models/roformers/mel_band_roformer_karaoke_aufr33_viperx_sdr_10.1956.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1de20d459332fe8869aeb01327a31df0032262706e1365114e852dc271779813 +size 913096801 diff --git a/models/audio-separator-models/roformers/mel_band_roformer_karaoke_aufr33_viperx_sdr_10.1956_config.yaml b/models/audio-separator-models/roformers/mel_band_roformer_karaoke_aufr33_viperx_sdr_10.1956_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..417f042961dd076caf5ba32c8bba4122748e8c9d --- /dev/null +++ b/models/audio-separator-models/roformers/mel_band_roformer_karaoke_aufr33_viperx_sdr_10.1956_config.yaml @@ -0,0 +1,71 @@ +audio: + chunk_size: 352800 + dim_f: 1024 + dim_t: 801 + hop_length: 441 + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 000 + +model: + dim: 384 + depth: 6 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + num_bands: 60 + dim_head: 64 + heads: 8 + attn_dropout: 0 + ff_dropout: 0 + flash_attn: True + dim_freqs_in: 1025 + sample_rate: 44100 # needed for mel filter bank from librosa + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: False + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + batch_size: 4 + gradient_accumulation_steps: 1 + grad_clip: 0 + instruments: + - Vocals + - Instrumental + lr: 1.0e-05 + patience: 2 + reduce_factor: 0.95 + target_instrument: Vocals + num_epochs: 1000 + num_steps: 2000 + augmentation: false # enable augmentations by audiomentations and pedalboard + augmentation_type: null + use_mp3_compress: false # Deprecated + augmentation_mix: false # Mix several stems of the same type with some probability + augmentation_loudness: false # randomly change loudness of each stem + augmentation_loudness_type: 1 # Type 1 or 2 + augmentation_loudness_min: 0 + augmentation_loudness_max: 0 + q: 0.95 + coarse_loss_clip: false + ema_momentum: 0.999 + optimizer: adam + other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental + use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true +inference: + batch_size: 1 + dim_t: 801 + num_overlap: 4 \ No newline at end of file diff --git a/models/audio-separator-models/roformers/mel_band_roformer_karaoke_becruily.ckpt b/models/audio-separator-models/roformers/mel_band_roformer_karaoke_becruily.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..ceaad63144a44e85b64617f842686673070bf5dc --- /dev/null +++ b/models/audio-separator-models/roformers/mel_band_roformer_karaoke_becruily.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3aa262ac01df870b9fc033e9c7b6cad33fe04fc9c148b6c40841326a515a0e0 +size 1719139254 diff --git a/models/audio-separator-models/roformers/mel_band_roformer_karaoke_gabox.ckpt b/models/audio-separator-models/roformers/mel_band_roformer_karaoke_gabox.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..69688626b9c8b8655e6eb549e7909f64ae862819 --- /dev/null +++ b/models/audio-separator-models/roformers/mel_band_roformer_karaoke_gabox.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:303fc631e7aa587e9dc1e6ac4bb3667c6ba53aacb6b6a90abcfcf57935b92bd8 +size 913026650 diff --git a/models/audio-separator-models/roformers/mel_band_roformer_karaoke_gabox_v2.ckpt b/models/audio-separator-models/roformers/mel_band_roformer_karaoke_gabox_v2.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..f1f8bd05fe7cd5bc814cbc757048f09e5df8f195 --- /dev/null +++ b/models/audio-separator-models/roformers/mel_band_roformer_karaoke_gabox_v2.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec34be50327aeaf1a996c27977f5c30d1ac80c0076d69683d3e5184c31ea29d3 +size 913090472 diff --git a/models/audio-separator-models/roformers/mel_band_roformer_kim_ft2_bleedless_unwa.ckpt b/models/audio-separator-models/roformers/mel_band_roformer_kim_ft2_bleedless_unwa.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..eb05d0fc17641b53deabf68c1940dd237695026c --- /dev/null +++ b/models/audio-separator-models/roformers/mel_band_roformer_kim_ft2_bleedless_unwa.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c450bd66a98b49dd03231fc5ebb84121eef8418236b179423c2b171d62b04d9 +size 913101368 diff --git a/models/audio-separator-models/roformers/mel_band_roformer_kim_ft2_unwa.ckpt b/models/audio-separator-models/roformers/mel_band_roformer_kim_ft2_unwa.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..64a9c082838de39b001e92ac622efe3de0c810da --- /dev/null +++ b/models/audio-separator-models/roformers/mel_band_roformer_kim_ft2_unwa.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ed7b9e4c2eebbec7a7e5e8113058f7b68ba5e6048db8eaccfbbeb884c7884c0 +size 913100690 diff --git a/models/audio-separator-models/roformers/mel_band_roformer_kim_ft3_unwa.ckpt b/models/audio-separator-models/roformers/mel_band_roformer_kim_ft3_unwa.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..816aa053cbceb5323a1b6eb5b5c0717b9e0b7a66 --- /dev/null +++ b/models/audio-separator-models/roformers/mel_band_roformer_kim_ft3_unwa.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b10db0095c42576a32aeb3a1c8054af9dea0333e1c38477091d78316a007e52 +size 913090472 diff --git a/models/audio-separator-models/roformers/mel_band_roformer_kim_ft_unwa.ckpt b/models/audio-separator-models/roformers/mel_band_roformer_kim_ft_unwa.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..2912039564c3940b1fb51563a795e3f241841226 --- /dev/null +++ b/models/audio-separator-models/roformers/mel_band_roformer_kim_ft_unwa.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6bd8d333880191254a6ef6be3cb0ffa4dda9d3282e36b0cce2e88a660e00d39 +size 913100690 diff --git a/models/audio-separator-models/roformers/mel_band_roformer_vocal_fullness_aname.ckpt b/models/audio-separator-models/roformers/mel_band_roformer_vocal_fullness_aname.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..380b1baf015018a1c4ae18ef1ade63a6ab76e441 --- /dev/null +++ b/models/audio-separator-models/roformers/mel_band_roformer_vocal_fullness_aname.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a64a27a672b457de23d9decd1fc7b58b0664a9f4f24bb43af154708e2ef07d2f +size 913090472 diff --git a/models/audio-separator-models/roformers/mel_band_roformer_vocals_becruily.ckpt b/models/audio-separator-models/roformers/mel_band_roformer_vocals_becruily.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..fda579c578c11b43fc8267f25fdbe0552ec36c1b --- /dev/null +++ b/models/audio-separator-models/roformers/mel_band_roformer_vocals_becruily.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a05961310cc55fbb901290c2e8be02682942f73522b6ac76bf2ec11e347ed95a +size 913107578 diff --git a/models/audio-separator-models/roformers/mel_band_roformer_vocals_fv1_gabox.ckpt b/models/audio-separator-models/roformers/mel_band_roformer_vocals_fv1_gabox.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..5b6fdc61e56b41297707de241f7c353a916c0494 --- /dev/null +++ b/models/audio-separator-models/roformers/mel_band_roformer_vocals_fv1_gabox.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4dff354d81152d1b4321f6491f242c060919148239fbfe22a1015513de4a7fe +size 913026650 diff --git a/models/audio-separator-models/roformers/mel_band_roformer_vocals_fv2_gabox.ckpt b/models/audio-separator-models/roformers/mel_band_roformer_vocals_fv2_gabox.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..dccb5cbb2e7b8aa143e8cd8b803144646998abd5 --- /dev/null +++ b/models/audio-separator-models/roformers/mel_band_roformer_vocals_fv2_gabox.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2888813aa5b519941fa8548efc5a4331d63c61909007eb17fe95c367be230196 +size 913026650 diff --git a/models/audio-separator-models/roformers/mel_band_roformer_vocals_fv3_gabox.ckpt b/models/audio-separator-models/roformers/mel_band_roformer_vocals_fv3_gabox.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..f5b0031173c6b54221895b9d66e9553037777cd5 --- /dev/null +++ b/models/audio-separator-models/roformers/mel_band_roformer_vocals_fv3_gabox.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49d81446b34a7848446efde7898b25bdc32fe872c2393617acb5356649f7ea93 +size 913026650 diff --git a/models/audio-separator-models/roformers/mel_band_roformer_vocals_fv4_gabox.ckpt b/models/audio-separator-models/roformers/mel_band_roformer_vocals_fv4_gabox.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..16f6cdd40cf3df7b1fbbf3f3702c1f326cdd8f94 --- /dev/null +++ b/models/audio-separator-models/roformers/mel_band_roformer_vocals_fv4_gabox.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a9657de5fd3ed87ad4fd1a9d2069743ecb33424836973ad0f3288e2a64e90bc +size 913026650 diff --git a/models/audio-separator-models/roformers/mel_band_roformer_vocals_fv5_gabox.ckpt b/models/audio-separator-models/roformers/mel_band_roformer_vocals_fv5_gabox.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..144ccb6ac58988f75e37cb1e16dd29d7ae754760 --- /dev/null +++ b/models/audio-separator-models/roformers/mel_band_roformer_vocals_fv5_gabox.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ede0504ddc55cb44b966a8212dac75a364f8157974cc40c8e92b9f5d4f17ce2 +size 913026650 diff --git a/models/audio-separator-models/roformers/mel_band_roformer_vocals_fv6_gabox.ckpt b/models/audio-separator-models/roformers/mel_band_roformer_vocals_fv6_gabox.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..09ba0100d657171cb6f17292df03f215102f5bef --- /dev/null +++ b/models/audio-separator-models/roformers/mel_band_roformer_vocals_fv6_gabox.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25033d944288cb032fc51faab044bbd7f90bb81e82cada637ecdec699c2ff773 +size 913031195 diff --git a/models/audio-separator-models/roformers/mel_band_roformer_vocals_gabox.ckpt b/models/audio-separator-models/roformers/mel_band_roformer_vocals_gabox.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..709869d299af3267439c266eede088a1c19b0745 --- /dev/null +++ b/models/audio-separator-models/roformers/mel_band_roformer_vocals_gabox.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff802a67501fac70587c3ff4e8dbc89c2558e7d8911c92222dfea2aaac208517 +size 913026650 diff --git a/models/audio-separator-models/roformers/mel_band_roformer_vocals_v2_gabox.ckpt b/models/audio-separator-models/roformers/mel_band_roformer_vocals_v2_gabox.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..e3589e9ee8a2eba303868270993bbe81cc684625 --- /dev/null +++ b/models/audio-separator-models/roformers/mel_band_roformer_vocals_v2_gabox.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fac81dbebc0992503df55110d64d86c4fb74a1529527a819a253f3d20ef72bc1 +size 913031195 diff --git a/models/audio-separator-models/roformers/melband_roformer_big_beta4.ckpt b/models/audio-separator-models/roformers/melband_roformer_big_beta4.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..62cb0ed8e023394924b1108a5335daa8b91d78d4 --- /dev/null +++ b/models/audio-separator-models/roformers/melband_roformer_big_beta4.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:700a9bd3831d4f7f44cc0019b238774e31045bcbc361fbb69235535c40fc1454 +size 1574477088 diff --git a/models/audio-separator-models/roformers/melband_roformer_big_beta5e.ckpt b/models/audio-separator-models/roformers/melband_roformer_big_beta5e.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..a59473cd797f3c5fc6d6eb420d324145dc7c1104 --- /dev/null +++ b/models/audio-separator-models/roformers/melband_roformer_big_beta5e.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32b876e1163716a9a007438b5a5107069586aa9b9ca653a5f63013b1edf6920c +size 1479749810 diff --git a/models/audio-separator-models/roformers/melband_roformer_big_beta6.ckpt b/models/audio-separator-models/roformers/melband_roformer_big_beta6.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..3e62cb6881f880ecadb2131661a7bffc1c651889 --- /dev/null +++ b/models/audio-separator-models/roformers/melband_roformer_big_beta6.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f51cbb94b4ed5c36cb36fd2024236a8af3ed6886567981702ad6f094b2c6c820 +size 1557078584 diff --git a/models/audio-separator-models/roformers/melband_roformer_big_beta6x.ckpt b/models/audio-separator-models/roformers/melband_roformer_big_beta6x.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..8a45e7206470ce16a36e6c5c60726331d16221db --- /dev/null +++ b/models/audio-separator-models/roformers/melband_roformer_big_beta6x.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e16d702f4e20f13d60b293541c1dea75cb4414a5846b36780e28ef70352a4e5c +size 1708527586 diff --git a/models/audio-separator-models/roformers/melband_roformer_inst_v1.ckpt b/models/audio-separator-models/roformers/melband_roformer_inst_v1.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..0a5775dcf471eae22321cf27576da3155bd773e9 --- /dev/null +++ b/models/audio-separator-models/roformers/melband_roformer_inst_v1.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f88d96958b2b7dec32286b0ced00bbcbd37e28741cad9038758b1eaf9b5c057 +size 913100690 diff --git a/models/audio-separator-models/roformers/melband_roformer_inst_v1_plus.ckpt b/models/audio-separator-models/roformers/melband_roformer_inst_v1_plus.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..ba0efaf8a870d3f5921d78c2fe7107ebab72c7e6 --- /dev/null +++ b/models/audio-separator-models/roformers/melband_roformer_inst_v1_plus.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25fe6ab4db95a6c20468f6c082f9bfc30904b8727ed8d069110bd0960da4879a +size 913090472 diff --git a/models/audio-separator-models/roformers/melband_roformer_inst_v1e.ckpt b/models/audio-separator-models/roformers/melband_roformer_inst_v1e.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..f8370873342b453fb65d96b2fe069cbc0f8130e9 --- /dev/null +++ b/models/audio-separator-models/roformers/melband_roformer_inst_v1e.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df2bcdb8838b88264f5381dbb0ccd84a9926c9775cf548c34d8846f5cd20fe96 +size 913102724 diff --git a/models/audio-separator-models/roformers/melband_roformer_inst_v1e_plus.ckpt b/models/audio-separator-models/roformers/melband_roformer_inst_v1e_plus.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..d96e91089fa7a07f5c4052f55449fd6ed4047ce2 --- /dev/null +++ b/models/audio-separator-models/roformers/melband_roformer_inst_v1e_plus.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a4ddba739f0352407fb6e18b29206b82318ec427fe37fcedb0f83241e4e15fb +size 913090472 diff --git a/models/audio-separator-models/roformers/melband_roformer_inst_v2.ckpt b/models/audio-separator-models/roformers/melband_roformer_inst_v2.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..e59c3365c4976b3c225f0cf9ab34342531dc6955 --- /dev/null +++ b/models/audio-separator-models/roformers/melband_roformer_inst_v2.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd19766620f7d6f58fdf7aaada7e89907fe41bc64490ce3faa9a6dab15d6e1f2 +size 1574477088 diff --git a/models/audio-separator-models/roformers/melband_roformer_instvoc_duality_v1.ckpt b/models/audio-separator-models/roformers/melband_roformer_instvoc_duality_v1.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..2b99b0c8e53949042dce6f1db2300642212b31f7 --- /dev/null +++ b/models/audio-separator-models/roformers/melband_roformer_instvoc_duality_v1.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4640a59d454bf9f69d67460592ab71e7cdce3afa0c0a6f0cf4500bb4ac0b8381 +size 1719116358 diff --git a/models/audio-separator-models/roformers/melband_roformer_instvox_duality_v2.ckpt b/models/audio-separator-models/roformers/melband_roformer_instvox_duality_v2.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..e4042424f484dbb10d5f76f807c04f9b7f307e23 --- /dev/null +++ b/models/audio-separator-models/roformers/melband_roformer_instvox_duality_v2.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4a69558708f2857e36ac86a0e03ed95c4e3d8b9c5b8113963987d0d7df7e20f +size 1719116358 diff --git a/models/audio-separator-models/roformers/model_bs_roformer_ep_317_sdr_12.9755.ckpt b/models/audio-separator-models/roformers/model_bs_roformer_ep_317_sdr_12.9755.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..1d4b892da79c875b3b3028f9f4d2504ebafe72e1 --- /dev/null +++ b/models/audio-separator-models/roformers/model_bs_roformer_ep_317_sdr_12.9755.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b84f37e8d444c8cb30c79d77f613a41c05868ff9c9ac6c7049c00aefae115aa +size 639331213 diff --git a/models/audio-separator-models/roformers/model_bs_roformer_ep_317_sdr_12.9755.yaml b/models/audio-separator-models/roformers/model_bs_roformer_ep_317_sdr_12.9755.yaml new file mode 100644 index 0000000000000000000000000000000000000000..527a3ad8a10366aaa562d007714e74d271a18cbf --- /dev/null +++ b/models/audio-separator-models/roformers/model_bs_roformer_ep_317_sdr_12.9755.yaml @@ -0,0 +1,133 @@ +audio: + chunk_size: 352800 + dim_f: 1024 + dim_t: 801 # don't work (use in model) + hop_length: 441 # don't work (use in model) + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.001 + +model: + dim: 512 + depth: 12 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + freqs_per_bands: !!python/tuple + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 128 + - 129 + dim_head: 64 + heads: 8 + attn_dropout: 0.1 + ff_dropout: 0.1 + flash_attn: true + dim_freqs_in: 1025 + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: false + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + batch_size: 16 + gradient_accumulation_steps: 1 + grad_clip: 0 + instruments: + - Vocals + - Instrumental + lr: 5.0e-05 + patience: 2 + reduce_factor: 0.95 + target_instrument: Vocals + num_epochs: 1000 + num_steps: 1000 + augmentation: false # enable augmentations by audiomentations and pedalboard + augmentation_type: simple1 + use_mp3_compress: false # Deprecated + augmentation_mix: true # Mix several stems of the same type with some probability + augmentation_loudness: true # randomly change loudness of each stem + augmentation_loudness_type: 1 # Type 1 or 2 + augmentation_loudness_min: 0.5 + augmentation_loudness_max: 1.5 + q: 0.95 + coarse_loss_clip: true + ema_momentum: 0.999 + optimizer: adam + other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental + use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true + +inference: + batch_size: 1 + dim_t: 801 + num_overlap: 4 \ No newline at end of file diff --git a/models/audio-separator-models/roformers/model_bs_roformer_ep_368_sdr_12.9628.ckpt b/models/audio-separator-models/roformers/model_bs_roformer_ep_368_sdr_12.9628.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..929fbf0bc57f788a3d6b758ee4feedd61976298b --- /dev/null +++ b/models/audio-separator-models/roformers/model_bs_roformer_ep_368_sdr_12.9628.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6c94864adfb73bbb0ca58ec14d58dd0b364549e9fb61433ae51916f3e2f8d0b +size 639317465 diff --git a/models/audio-separator-models/roformers/model_bs_roformer_ep_368_sdr_12.9628.yaml b/models/audio-separator-models/roformers/model_bs_roformer_ep_368_sdr_12.9628.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3703706964572e9f52a8d9003935eb808f094875 --- /dev/null +++ b/models/audio-separator-models/roformers/model_bs_roformer_ep_368_sdr_12.9628.yaml @@ -0,0 +1,133 @@ +audio: + chunk_size: 352800 + dim_f: 1024 + dim_t: 801 # don't work (use in model) + hop_length: 441 # don't work (use in model) + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.001 + +model: + dim: 512 + depth: 12 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + freqs_per_bands: !!python/tuple + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 128 + - 129 + dim_head: 64 + heads: 8 + attn_dropout: 0.1 + ff_dropout: 0.1 + flash_attn: true + dim_freqs_in: 1025 + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: false + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + batch_size: 16 + gradient_accumulation_steps: 1 + grad_clip: 0 + instruments: + - Vocals + - Instrumental + lr: 5.0e-05 + patience: 2 + reduce_factor: 0.95 + target_instrument: Vocals + num_epochs: 1000 + num_steps: 1000 + augmentation: false # enable augmentations by audiomentations and pedalboard + augmentation_type: simple1 + use_mp3_compress: false # Deprecated + augmentation_mix: true # Mix several stems of the same type with some probability + augmentation_loudness: true # randomly change loudness of each stem + augmentation_loudness_type: 1 # Type 1 or 2 + augmentation_loudness_min: 0.5 + augmentation_loudness_max: 1.5 + q: 0.95 + coarse_loss_clip: true + ema_momentum: 0.999 + optimizer: adam + other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental + use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true + +inference: + batch_size: 1 + dim_t: 801 + num_overlap: 4 diff --git a/models/audio-separator-models/roformers/model_bs_roformer_ep_937_sdr_10.5309.ckpt b/models/audio-separator-models/roformers/model_bs_roformer_ep_937_sdr_10.5309.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..b30e4893bf1e4b198ea8005346b01efaa135c8b7 --- /dev/null +++ b/models/audio-separator-models/roformers/model_bs_roformer_ep_937_sdr_10.5309.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2e825a03bc908cb04dbd88eddeefbf5147dd1cf1f95cebf453d9dbfabec494b +size 393068365 diff --git a/models/audio-separator-models/roformers/model_bs_roformer_ep_937_sdr_10.5309.yaml b/models/audio-separator-models/roformers/model_bs_roformer_ep_937_sdr_10.5309.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9ea5c4b701d730d9f752aa87f9a82827e40b7b6e --- /dev/null +++ b/models/audio-separator-models/roformers/model_bs_roformer_ep_937_sdr_10.5309.yaml @@ -0,0 +1,138 @@ +audio: + chunk_size: 131584 + dim_f: 1024 + dim_t: 256 + hop_length: 512 + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.001 + +model: + dim: 384 + depth: 12 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + linear_transformer_depth: 0 + freqs_per_bands: !!python/tuple + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 2 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 4 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 12 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 24 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 48 + - 128 + - 129 + dim_head: 64 + heads: 8 + attn_dropout: 0.1 + ff_dropout: 0.1 + flash_attn: true + dim_freqs_in: 1025 + stft_n_fft: 2048 + stft_hop_length: 512 + stft_win_length: 2048 + stft_normalized: false + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + batch_size: 4 + gradient_accumulation_steps: 1 + grad_clip: 0 + instruments: + - No Drum-Bass + - Drum-Bass + lr: 5.0e-05 + patience: 2 + reduce_factor: 0.95 + target_instrument: No Drum-Bass + num_epochs: 1000 + num_steps: 1000 + q: 0.95 + coarse_loss_clip: true + ema_momentum: 0.999 + optimizer: adam + other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental + use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true + +augmentations: + enable: true # enable or disable all augmentations (to fast disable if needed) + loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max) + loudness_min: 0.5 + loudness_max: 1.5 + mixup: true # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3) + mixup_probs: !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02) + - 0.2 + - 0.02 + mixup_loudness_min: 0.5 + mixup_loudness_max: 1.5 + +inference: + batch_size: 1 + dim_t: 512 + num_overlap: 4 \ No newline at end of file diff --git a/models/audio-separator-models/roformers/model_chorus_bs_roformer_ep_267_sdr_24.1275.ckpt b/models/audio-separator-models/roformers/model_chorus_bs_roformer_ep_267_sdr_24.1275.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..4f9492413f4671a4c218f73cd6df2963e6b75fb8 --- /dev/null +++ b/models/audio-separator-models/roformers/model_chorus_bs_roformer_ep_267_sdr_24.1275.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:123c00786bdbc6bd462dddb35cd21fd6ae99ab8319f93f63a8abc1012e593d94 +size 527121477 diff --git a/models/audio-separator-models/roformers/model_mel_band_roformer_ep_3005_sdr_11.4360.ckpt b/models/audio-separator-models/roformers/model_mel_band_roformer_ep_3005_sdr_11.4360.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..1cb36d54103c4fdd30f7e8a089164cba50cfb0b0 --- /dev/null +++ b/models/audio-separator-models/roformers/model_mel_band_roformer_ep_3005_sdr_11.4360.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21b9d0958e35b8ebfbe2afe69bbd5444e5ffe2f5d80ae0d583b833d2f3c0d139 +size 1007816988 diff --git a/models/audio-separator-models/roformers/model_mel_band_roformer_ep_3005_sdr_11.4360.yaml b/models/audio-separator-models/roformers/model_mel_band_roformer_ep_3005_sdr_11.4360.yaml new file mode 100644 index 0000000000000000000000000000000000000000..16802cec559bad5ad47225053feb5d0d0c3b65a8 --- /dev/null +++ b/models/audio-separator-models/roformers/model_mel_band_roformer_ep_3005_sdr_11.4360.yaml @@ -0,0 +1,72 @@ +audio: + chunk_size: 352800 + dim_f: 1024 + dim_t: 801 # don't work (use in model) + hop_length: 441 # don't work (use in model) + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.001 + +model: + dim: 384 + depth: 12 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + num_bands: 60 + dim_head: 64 + heads: 8 + attn_dropout: 0.1 + ff_dropout: 0.1 + flash_attn: True + dim_freqs_in: 1025 + sample_rate: 44100 # needed for mel filter bank from librosa + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: False + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + batch_size: 9 + gradient_accumulation_steps: 8 + grad_clip: 0 + instruments: + - Vocals + - Instrumental + lr: 4.0e-05 + patience: 2 + reduce_factor: 0.95 + target_instrument: Vocals + num_epochs: 1000 + num_steps: 1000 + augmentation: false # enable augmentations by audiomentations and pedalboard + augmentation_type: simple1 + use_mp3_compress: false # Deprecated + augmentation_mix: true # Mix several stems of the same type with some probability + augmentation_loudness: true # randomly change loudness of each stem + augmentation_loudness_type: 1 # Type 1 or 2 + augmentation_loudness_min: 0.5 + augmentation_loudness_max: 1.5 + q: 0.95 + coarse_loss_clip: true + ema_momentum: 0.999 + optimizer: adam + other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental + use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true + +inference: + batch_size: 1 + dim_t: 801 + num_overlap: 4 \ No newline at end of file diff --git a/models/audio-separator-models/roformers/vocals_mel_band_roformer.ckpt b/models/audio-separator-models/roformers/vocals_mel_band_roformer.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..e9269937826d8cedf1855096bc9c1d49298bb4f8 --- /dev/null +++ b/models/audio-separator-models/roformers/vocals_mel_band_roformer.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87201f4d31afb5bc79993230fc49446918425574db48c01c405e44f365c7559e +size 913106900 diff --git a/models/audio-separator-models/roformers/vocals_mel_band_roformer.yaml b/models/audio-separator-models/roformers/vocals_mel_band_roformer.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9cb005e7a97c66d5fb23bba8bb36bec9619cdd8f --- /dev/null +++ b/models/audio-separator-models/roformers/vocals_mel_band_roformer.yaml @@ -0,0 +1,50 @@ +audio: + chunk_size: 352800 + dim_f: 1024 + dim_t: 256 + hop_length: 441 + n_fft: 2048 + num_channels: 2 + sample_rate: 44100 + min_mean_abs: 0.001 + +model: + dim: 384 + depth: 6 + stereo: true + num_stems: 1 + time_transformer_depth: 1 + freq_transformer_depth: 1 + num_bands: 60 + dim_head: 64 + heads: 8 + attn_dropout: 0 + ff_dropout: 0 + flash_attn: True + dim_freqs_in: 1025 + sample_rate: 44100 # needed for mel filter bank from librosa + stft_n_fft: 2048 + stft_hop_length: 441 + stft_win_length: 2048 + stft_normalized: False + mask_estimator_depth: 2 + multi_stft_resolution_loss_weight: 1.0 + multi_stft_resolutions_window_sizes: !!python/tuple + - 4096 + - 2048 + - 1024 + - 512 + - 256 + multi_stft_hop_size: 147 + multi_stft_normalized: False + +training: + instruments: + - vocals + - other + target_instrument: vocals + +inference: + dim_t: 1101 + num_overlap: 1 + chunk_size: 352800 \ No newline at end of file diff --git a/models/audio-separator-models/source.txt b/models/audio-separator-models/source.txt new file mode 100644 index 0000000000000000000000000000000000000000..a031824f31440defb4bcdc3e89d6a25e31b7db7a --- /dev/null +++ b/models/audio-separator-models/source.txt @@ -0,0 +1 @@ +https://huggingface.co/Eddycrack864/audio-separator-models \ No newline at end of file