Upload folder using huggingface_hub

Browse files

Files changed (5) hide show

bs_roformer/bs_inst_exp_vlp_unwa.ckpt +3 -0
bs_roformer/bs_inst_exp_vlp_unwa_config.yaml +122 -0
mel_band_roformer/mbr_lead_rhythm_guitar_listra92_config.yaml +102 -108
mel_band_roformer/mbr_vocalsfv7_gabox.ckpt +3 -0
mel_band_roformer/mbr_vocalsfv7_gabox_config.yaml +70 -0

bs_roformer/bs_inst_exp_vlp_unwa.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c035e2a102243405e45bf33faa175f62fd7118f63b62771fafdf81062b804131
+size 393351501

bs_roformer/bs_inst_exp_vlp_unwa_config.yaml ADDED Viewed

	@@ -0,0 +1,122 @@

+audio:
+  chunk_size: 485100
+  dim_f: 1024
+  dim_t: 801
+  hop_length: 441
+  n_fft: 2048
+  num_channels: 2
+  sample_rate: 44100
+  min_mean_abs: 0.0
+model:
+  dim: 384
+  depth: 12
+  stereo: true
+  num_stems: 1
+  time_transformer_depth: 1
+  freq_transformer_depth: 1
+  freqs_per_bands: !!python/tuple
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 2
+    - 4
+    - 4
+    - 4
+    - 4
+    - 4
+    - 4
+    - 4
+    - 4
+    - 4
+    - 4
+    - 4
+    - 4
+    - 12
+    - 12
+    - 12
+    - 12
+    - 12
+    - 12
+    - 12
+    - 12
+    - 24
+    - 24
+    - 24
+    - 24
+    - 24
+    - 24
+    - 24
+    - 24
+    - 48
+    - 48
+    - 48
+    - 48
+    - 48
+    - 48
+    - 48
+    - 48
+    - 128
+    - 129
+  dim_head: 64
+  heads: 8
+  attn_dropout: 0
+  ff_dropout: 0
+  flash_attn: true
+  dim_freqs_in: 1025
+  stft_n_fft: 2048
+  stft_hop_length: 441
+  stft_win_length: 2048
+  stft_normalized: false
+  mask_estimator_depth: 2
+  multi_stft_resolution_loss_weight: 1.0
+  multi_stft_resolutions_window_sizes: !!python/tuple
+    - 4096
+    - 2048
+    - 1024
+    - 512
+    - 256
+  multi_stft_hop_size: 147
+  multi_stft_normalized: false
+training:
+  batch_size: 1
+  gradient_accumulation_steps: 1
+  grad_clip: 0
+  instruments:
+    - vocals
+    - other
+  lr: 0.0001
+  patience: 2
+  reduce_factor: 0.95
+  target_instrument: other
+  num_epochs: 1
+  num_steps: 1000
+  q: 0.95
+  coarse_loss_clip: true
+  ema_momentum: 0.999
+  optimizer: adamw
+  other_fix: true
+  use_amp: true
+inference:
+  batch_size: 1
+  dim_t: 1101
+  num_overlap: 2

mel_band_roformer/mbr_lead_rhythm_guitar_listra92_config.yaml CHANGED Viewed

@@ -1,108 +1,102 @@
-audio:
-  chunk_size: 132300
-  dim_f: 1024
-  dim_t: 256
-  hop_length: 441
-  n_fft: 2048
-  num_channels: 2
-  sample_rate: 44100
-  min_mean_abs: 000
-model:
-  dim: 384
-  depth: 4
-  stereo: true
-  num_stems: 1
-  time_transformer_depth: 1
-  freq_transformer_depth: 1
-  num_bands: 60
-  dim_head: 64
-  heads: 8
-  attn_dropout: 0
-  ff_dropout: 0
-  flash_attn: true
-  dim_freqs_in: 1025
-  sample_rate: 44100  # needed for mel filter bank from librosa
-  stft_n_fft: 2048
-  stft_hop_length: 441
-  stft_win_length: 2048
-  stft_normalized: false
-  mask_estimator_depth: 2
-  multi_stft_resolution_loss_weight: 2.0
-  multi_stft_resolutions_window_sizes: !!python/tuple
-    - 4096
-    - 2048
-    - 1024
-    - 512
-    - 256
-  multi_stft_hop_size: 147
-  multi_stft_normalized: false
-  mlp_expansion_factor: 2 # Probably too big (requires a lot of memory for weights)
-  use_torch_checkpoint: false  # it allows to greatly reduce GPU memory consumption during training (not fully tested)
-  skip_connection: false # Enable skip connection between transformer blocks - can solve problem with gradients and probably faster training
-loss_multistft:
-  fft_sizes:
-    - 1024
-    - 2048
-    - 4096
-  hop_sizes:
-    - 512
-    - 1024
-    - 2048
-  win_lengths:
-    - 1024
-    - 2048
-    - 4096
-  window: "hann_window"
-  scale: "mel"
-  n_bins: 128
-  sample_rate: 44100
-  perceptual_weighting: true
-  w_sc: 1.0
-  w_log_mag: 1.0
-  w_lin_mag: 0.0
-  w_phs: 0.0
-  mag_distance: "L1"
-training:
-  batch_size: 2
-  gradient_accumulation_steps: 2
-  grad_clip: 0
-  instruments:
-    - Lead
-    - Rhythm
-  lr: 1.0e-04
-  patience: 5
-  reduce_factor: 0.95
-  target_instrument: Lead
-  num_epochs: 1000
-  num_steps: 1000
-  q: 0.95
-  coarse_loss_clip: true
-  ema_momentum: 0.999
-  optimizer: adamw
-  other_fix: false # it's needed for checking on multisong dataset if other is actually instrumental
-  use_amp: true # enable or disable usage of mixed precision (float16) - usually it must be true
-augmentations:
-  enable: true # enable or disable all augmentations (to fast disable if needed)
-  loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
-  loudness_min: 0.5
-  loudness_max: 1.5
-  difference:
-    channel_shuffle: 0.5 # Set 0 or lower to disable
-    random_inverse: 0.01 # inverse track (better lower probability)
-    random_polarity: 0.5 # polarity change (multiply waveform to -1)
-inference:
-  batch_size: 12
-  dim_t: 256
-  num_overlap: 1
-lora:
-  r: 8
-  lora_alpha: 16. #alpha / rank > 1
-  lora_dropout: 0.05
-  merge_weights: true
-  fan_in_fan_out: false

+audio:
+  chunk_size: 132300
+  dim_f: 1024
+  dim_t: 256
+  hop_length: 441
+  n_fft: 2048
+  num_channels: 2
+  sample_rate: 44100
+  min_mean_abs: 0
+model:
+  dim: 384
+  depth: 4
+  stereo: true
+  num_stems: 1
+  time_transformer_depth: 1
+  freq_transformer_depth: 1
+  num_bands: 60
+  dim_head: 64
+  heads: 8
+  attn_dropout: 0
+  ff_dropout: 0
+  flash_attn: true
+  dim_freqs_in: 1025
+  sample_rate: 44100
+  stft_n_fft: 2048
+  stft_hop_length: 441
+  stft_win_length: 2048
+  stft_normalized: false
+  mask_estimator_depth: 2
+  multi_stft_resolution_loss_weight: 2.0
+  multi_stft_resolutions_window_sizes: !!python/tuple
+    - 4096
+    - 2048
+    - 1024
+    - 512
+    - 256
+  multi_stft_hop_size: 147
+  multi_stft_normalized: false
+  mlp_expansion_factor: 2
+  use_torch_checkpoint: false
+  skip_connection: false
+loss_multistft:
+  fft_sizes:
+    - 1024
+    - 2048
+    - 4096
+  hop_sizes:
+    - 512
+    - 1024
+    - 2048
+  win_lengths:
+    - 1024
+    - 2048
+    - 4096
+  window: hann_window
+  scale: mel
+  n_bins: 128
+  sample_rate: 44100
+  perceptual_weighting: true
+  w_sc: 1.0
+  w_log_mag: 1.0
+  w_lin_mag: 0.0
+  w_phs: 0.0
+  mag_distance: L1
+training:
+  batch_size: 2
+  gradient_accumulation_steps: 2
+  grad_clip: 0
+  instruments:
+    - Lead
+    - Rhythm
+  lr: 0.0001
+  patience: 5
+  reduce_factor: 0.95
+  target_instrument: Lead
+  num_epochs: 1000
+  num_steps: 1000
+  q: 0.95
+  coarse_loss_clip: true
+  ema_momentum: 0.999
+  optimizer: adamw
+  other_fix: false
+  use_amp: true
+augmentations:
+  enable: true
+  loudness: true
+  loudness_min: 0.5
+  loudness_max: 1.5
+  difference:
+    channel_shuffle: 0.5
+    random_inverse: 0.01
+    random_polarity: 0.5
+inference:
+  batch_size: 1
+  dim_t: 256
+  num_overlap: 2
+lora:
+  r: 8
+  lora_alpha: 16.0
+  lora_dropout: 0.05
+  merge_weights: true
+  fan_in_fan_out: false

mel_band_roformer/mbr_vocalsfv7_gabox.ckpt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:77b9e4b54802d670c02daceb30c61ec825dc54ca6c29c34b03cdd5e9f78382b6
+size 489571079

mel_band_roformer/mbr_vocalsfv7_gabox_config.yaml ADDED Viewed

	@@ -0,0 +1,70 @@

+audio:
+  chunk_size: 352800
+  dim_f: 1024
+  dim_t: 256
+  hop_length: 441
+  n_fft: 2048
+  num_channels: 2
+  sample_rate: 44100
+  min_mean_abs: 0.0
+model:
+  dim: 256
+  depth: 12
+  stereo: true
+  num_stems: 1
+  time_transformer_depth: 1
+  freq_transformer_depth: 1
+  num_bands: 60
+  dim_head: 64
+  heads: 8
+  attn_dropout: 0
+  ff_dropout: 0
+  flash_attn: true
+  dim_freqs_in: 1025
+  sample_rate: 44100
+  stft_n_fft: 2048
+  stft_hop_length: 441
+  stft_win_length: 2048
+  stft_normalized: false
+  mask_estimator_depth: 2
+  multi_stft_resolution_loss_weight: 1.0
+  multi_stft_resolutions_window_sizes: !!python/tuple
+    - 4096
+    - 2048
+    - 1024
+    - 512
+    - 256
+  multi_stft_hop_size: 147
+  multi_stft_normalized: false
+  use_torch_checkpoint: true
+training:
+  batch_size: 1
+  gradient_accumulation_steps: 1
+  grad_clip: 0
+  instruments:
+    - Vocals
+    - Instrumental
+  lr: 1.0e-05
+  patience: 100000000
+  reduce_factor: 0.95
+  target_instrument: Vocals
+  num_epochs: 1000
+  num_steps: 1000
+  augmentation: false
+  augmentation_type: simple1
+  use_mp3_compress: false
+  augmentation_mix: false
+  augmentation_loudness: true
+  augmentation_loudness_type: 1
+  augmentation_loudness_min: 0
+  augmentation_loudness_max: 0
+  q: 0.95
+  coarse_loss_clip: false
+  ema_momentum: 0.999
+  optimizer: adamw
+  other_fix: false
+  use_amp: true
+inference:
+  batch_size: 1
+  dim_t: 1101
+  num_overlap: 2