added choirsep models by concert.isolations.business@gmail.com

Browse files

Files changed (2) hide show

demucs_choirsep/config_htdemucs_choirsep.yaml +151 -0
scnet_choirsep/config_scnet_choirsep.yaml +107 -0

demucs_choirsep/config_htdemucs_choirsep.yaml ADDED Viewed

	@@ -0,0 +1,151 @@

+# Audio chunking settings for the HTDemucs choir-separation config.
+audio:
+  chunk_size: 132300 # training.samplerate * training.segment = 44100 * 3
+  min_mean_abs: 0.001
+  hop_length: 1024
+# Training hyperparameters for the HTDemucs choir model.
+training:
+  batch_size: 4
+  gradient_accumulation_steps: 1
+  grad_clip: 0
+  segment: 3
+  shift: 1
+  samplerate: 44100
+  channels: 2
+  normalize: true
+  # The four choir voices that are separated.
+  instruments:
+  - alto
+  - bass
+  - soprano
+  - tenor
+  target_instrument: null
+  num_epochs: 1000
+  num_steps: 1000
+  optimizer: adam
+  lr: 1.0e-4
+  patience: 2
+  reduce_factor: 0.95
+  q: 0.95
+  coarse_loss_clip: true
+  ema_momentum: 0.999
+  # needed when checking on the multisong dataset whether "other" is actually instrumental
+  other_fix: false
+  # enable or disable mixed precision (float16); usually it must be true
+  use_amp: false
+# STFT loss settings; fft_sizes, hop_sizes and win_lengths each carry
+# three entries.
+loss_multistft:
+  fft_sizes: [1024, 2048, 4096]
+  hop_sizes: [512, 1024, 2048]
+  win_lengths: [1024, 2048, 4096]
+  window: 'hann_window'
+  scale: 'mel'
+  n_bins: 128
+  sample_rate: 44100
+  perceptual_weighting: true
+  # Per-term weights; the two terms set to 0.0 are switched off.
+  w_sc: 1.0
+  w_log_mag: 1.0
+  w_lin_mag: 0.0
+  w_phs: 0.0
+  mag_distance: 'L1'
+# Training-time augmentations; `enable` is the master switch.
+augmentations:
+  # enable or disable all augmentations (quick way to turn everything off)
+  enable: false
+  # randomly change loudness of each stem within (loudness_min; loudness_max)
+  loudness: true
+  loudness_min: 0.5
+  loudness_max: 1.5
+  # mix several stems of the same type with some probability
+  # (only works for dataset types: 1, 2, 3)
+  mixup: true
+  mixup_probs:
+  - 0.2
+  - 0.02
+  mixup_loudness_min: 0.5
+  mixup_loudness_max: 1.5
+  all:
+    # set 0 or lower to disable
+    channel_shuffle: 0.5
+    # inverse track (better to keep the probability low)
+    random_inverse: 0.1
+    # polarity change (multiply waveform by -1)
+    random_polarity: 0.5
+# Inference-time settings; batch_size here is separate from training.batch_size.
+inference:
+  num_overlap: 4 # NOTE(review): presumably the chunk overlap factor - confirm against the consumer
+  batch_size: 8
+# Model selector; it names the `htdemucs` mapping that follows.
+model: htdemucs
+htdemucs:  # see demucs/htdemucs.py for a detailed description
+  # Channels
+  channels: 48
+  channels_time: null  # explicitly unset (a bare empty value also parses as null)
+  growth: 2
+  # STFT
+  num_subbands: 1
+  nfft: 4096
+  wiener_iters: 0
+  end_iters: 0
+  wiener_residual: false
+  cac: true
+  # Main structure
+  depth: 4
+  rewrite: true
+  # Frequency Branch
+  multi_freqs: []
+  multi_freqs_depth: 3
+  freq_emb: 0.2
+  emb_scale: 10
+  emb_smooth: true
+  # Convolutions
+  kernel_size: 8
+  stride: 4
+  time_stride: 2
+  context: 1
+  context_enc: 0
+  # normalization
+  norm_starts: 4
+  norm_groups: 4
+  # DConv residual branch
+  dconv_mode: 3
+  dconv_depth: 2
+  dconv_comp: 8
+  # Written with a decimal point on purpose: YAML 1.1 resolvers such as
+  # PyYAML read a bare `1e-3` as a string, not a float.
+  dconv_init: 1.0e-3
+  # Before the Transformer
+  bottom_channels: 0
+  # CrossTransformer
+  # ------ Common to all
+  # Regular parameters
+  t_layers: 5
+  t_hidden_scale: 4.0
+  t_heads: 8
+  t_dropout: 0.0
+  t_layer_scale: true
+  t_gelu: true
+  # ------------- Positional Embedding
+  t_emb: sin
+  t_max_positions: 10000 # for the scaled embedding
+  t_max_period: 10000.0
+  t_weight_pos_embed: 1.0
+  t_cape_mean_normalize: true
+  t_cape_augment: true
+  t_cape_glob_loc_scale: [5000.0, 1.0, 1.4]
+  t_sin_random_shift: 0
+  # ------------- norm before a transformer encoder
+  t_norm_in: true
+  t_norm_in_group: false
+  # ------------- norm inside the encoder
+  t_group_norm: false
+  t_norm_first: true
+  t_norm_out: true
+  # ------------- optim
+  t_weight_decay: 0.0
+  t_lr: null  # explicitly unset (a bare empty value also parses as null)
+  # ------------- sparsity
+  t_sparse_self_attn: false
+  t_sparse_cross_attn: false
+  t_mask_type: diag
+  t_mask_random_seed: 42
+  t_sparse_attn_window: 400
+  t_global_window: 100
+  t_sparsity: 0.95
+  t_auto_sparsity: false
+  # Cross Encoder First (false)
+  t_cross_first: false
+  # Weight init
+  rescale: 0.1

scnet_choirsep/config_scnet_choirsep.yaml ADDED Viewed

	@@ -0,0 +1,107 @@

+# Audio settings for the SCNet choir-separation config.
+audio:
+  chunk_size: 131072 # 2**17 samples, about 2.97 s at sample_rate 44100 (not 44100 * 11, which is 485100)
+  num_channels: 2
+  sample_rate: 44100
+  min_mean_abs: 0.000
+# SCNet model hyperparameters; `sources` holds the same four names as
+# training.instruments.
+model:
+  sources: [alto, bass, soprano, tenor]
+  audio_channels: 2
+  dims: [4, 32, 64, 128]
+  # STFT framing
+  nfft: 4096
+  hop_size: 1024
+  win_size: 4096
+  normalized: true
+  # band_SR, band_stride and band_kernel each carry three entries;
+  # the band_SR values sum to 1.0.
+  band_SR: [0.175, 0.392, 0.433]
+  band_stride: [1, 4, 16]
+  band_kernel: [3, 4, 16]
+  conv_depths: [3, 2, 1]
+  compress: 4
+  conv_kernel: 3
+  num_dplayer: 6
+  expand: 1
+# Training hyperparameters for the SCNet choir model.
+training:
+  batch_size: 9
+  gradient_accumulation_steps: 1
+  grad_clip: 0
+  instruments: [alto, bass, soprano, tenor]
+  lr: 5.0e-4
+  patience: 6
+  reduce_factor: 0.95
+  target_instrument: null
+  num_epochs: 1000
+  num_steps: 1000
+  q: 0.95
+  coarse_loss_clip: true
+  ema_momentum: 0.999
+  optimizer: adamw8bit
+  # needed when checking on the multisong dataset whether "other" is actually instrumental
+  other_fix: false
+  # enable or disable mixed precision (float16); usually it must be true
+  use_amp: true
+# STFT loss settings; fft_sizes, hop_sizes and win_lengths each carry
+# three entries.
+loss_multistft:
+  fft_sizes: [1024, 2048, 4096]
+  hop_sizes: [512, 1024, 2048]
+  win_lengths: [1024, 2048, 4096]
+  window: 'hann_window'
+  scale: 'mel'
+  n_bins: 128
+  sample_rate: 44100
+  perceptual_weighting: true
+  # Per-term weights; the two terms set to 0.0 are switched off.
+  w_sc: 1.0
+  w_log_mag: 1.0
+  w_lin_mag: 0.0
+  w_phs: 0.0
+  mag_distance: 'L1'
+# Training-time augmentations; `enable` is the master switch and is off here.
+augmentations:
+  enable: false # enable or disable all augmentations (quick way to turn everything off)
+  loudness: true # randomly change loudness of each stem on the range (loudness_min; loudness_max)
+  loudness_min: 0.5
+  loudness_max: 1.5
+  mixup: false # mix several stems of same type with some probability (only works for dataset types: 1, 2, 3)
+  # NOTE(review): `!!python/tuple` is a PyYAML language-specific tag; safe
+  # loaders reject it. Confirm which loader the consumer uses before
+  # replacing it with a plain list.
+  mixup_probs:
+    !!python/tuple # 2 additional stems of the same type (1st with prob 0.2, 2nd with prob 0.02)
+    - 0.2
+    - 0.02
+  mixup_loudness_min: 0.5
+  mixup_loudness_max: 1.5
+# Inference-time settings; batch_size here is separate from training.batch_size.
+inference:
+  batch_size: 16
+  dim_t: 256
+  num_overlap: 1 # NOTE(review): presumably the chunk overlap factor - confirm against the consumer
+  normalize: false