Uploaded checkpoint + yaml
Browse files
Full release of my rifforge model
Characteristics:
This is a dimension-512, depth-24 model (so a fairly large file size at 1.9 GB!), with an SDR of 14.2436.
It was fine-tuned from an older Mel-Band RoFormer checkpoint with an SDR of 13.7.
The model can have some quirks (just like most models), but it's clean enough all around for me to release.
Metrics from my own validation data:
sdr: 14.2436
l1_frequency: 21.6864
bleedless: 57.1301
fullness: 33.3656
config_rifforge_full_mesk.yaml
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
audio:
|
| 2 |
+
chunk_size: 733824
|
| 3 |
+
dim_f: 1024
|
| 4 |
+
dim_t: 1665
|
| 5 |
+
hop_length: 441
|
| 6 |
+
n_fft: 2048
|
| 7 |
+
num_channels: 2
|
| 8 |
+
sample_rate: 44100
|
| 9 |
+
min_mean_abs: 0.000
|
| 10 |
+
|
| 11 |
+
model:
|
| 12 |
+
dim: 512
|
| 13 |
+
depth: 24
|
| 14 |
+
stereo: true
|
| 15 |
+
num_stems: 1
|
| 16 |
+
time_transformer_depth: 1
|
| 17 |
+
freq_transformer_depth: 1
|
| 18 |
+
num_bands: 60
|
| 19 |
+
dim_head: 64
|
| 20 |
+
heads: 8
|
| 21 |
+
attn_dropout: 0
|
| 22 |
+
ff_dropout: 0
|
| 23 |
+
flash_attn: True
|
| 24 |
+
dim_freqs_in: 1025
|
| 25 |
+
sample_rate: 44100 # needed for mel filter bank from librosa
|
| 26 |
+
stft_n_fft: 2048
|
| 27 |
+
stft_hop_length: 441
|
| 28 |
+
stft_win_length: 2048
|
| 29 |
+
stft_normalized: False
|
| 30 |
+
mask_estimator_depth: 2
|
| 31 |
+
multi_stft_resolution_loss_weight: 1.0
|
| 32 |
+
multi_stft_resolutions_window_sizes: !!python/tuple
|
| 33 |
+
- 4096
|
| 34 |
+
- 2048
|
| 35 |
+
- 1024
|
| 36 |
+
- 512
|
| 37 |
+
- 256
|
| 38 |
+
multi_stft_hop_size: 147
|
| 39 |
+
multi_stft_normalized: False
|
| 40 |
+
use_torch_checkpoint: true
|
| 41 |
+
|
| 42 |
+
training:
|
| 43 |
+
batch_size: 1
|
| 44 |
+
gradient_accumulation_steps: 1
|
| 45 |
+
grad_clip: 0
|
| 46 |
+
instruments:
|
| 47 |
+
- vocals
|
| 48 |
+
- other
|
| 49 |
+
lr: 1.0e-05
|
| 50 |
+
patience: 1000
|
| 51 |
+
reduce_factor: 0.95
|
| 52 |
+
target_instrument: other
|
| 53 |
+
num_epochs: 1000
|
| 54 |
+
num_steps: 1000
|
| 55 |
+
augmentation: false # enable augmentations by audiomentations and pedalboard
|
| 56 |
+
augmentation_type: null
|
| 57 |
+
use_mp3_compress: false # Deprecated
|
| 58 |
+
augmentation_mix: false # Mix several stems of the same type with some probability
|
| 59 |
+
augmentation_loudness: false # randomly change loudness of each stem
|
| 60 |
+
augmentation_loudness_type: 1 # Type 1 or 2
|
| 61 |
+
augmentation_loudness_min: 0
|
| 62 |
+
augmentation_loudness_max: 0
|
| 63 |
+
q: 0.95
|
| 64 |
+
coarse_loss_clip: false
|
| 65 |
+
ema_momentum: 0.999
|
| 66 |
+
optimizer: adamw8bit
|
| 67 |
+
other_fix: true # it's needed for checking on multisong dataset if other is actually instrumental
|
| 68 |
+
use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
|
| 69 |
+
use_torch_checkpoint: true
|
| 70 |
+
|
| 71 |
+
inference:
|
| 72 |
+
batch_size: 2
|
| 73 |
+
dim_t: 1665
|
| 74 |
+
num_overlap: 2
|
rifforge_full_sdr_14.2436.ckpt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:22f4eb72181d43336a1437b5d85079fa678d7b39d293283db56b3cb48908d5d5
|
| 3 |
+
size 2011438773
|