# sepacap / modelMusicSep.yaml
# Hugging Face upload header (scrape residue, kept as comments):
#   Tino3141's picture — "Upload modelMusicSep.yaml" — commit 15aa140 verified
project: "[Project] SepReformer" ### Don't change
notes: "SepReformer final version" ### Describe your changes here (please write details!)
# ------------------------------------------------------------------------------------------------------------------------------ #
config:
  # ------------------------------------------------------------ #
  # Dataset: clips of max_len samples at sampling_rate Hz (96000 / 24000 = 4 s).
  dataset:
    max_len: 96000
    sampling_rate: 24000
    type: "ja_capella_power"
    train: "train"
    # NOTE(review): validation and test both point at the "test" split — confirm intended.
    val: "test"
    test: "test"
  # ------------------------------------------------------------ #
  dataloader:
    batch_size: 2
    pin_memory: false
    num_workers: 4
    drop_last: false
    shuffle: true
    prefetch_factor: 10
  # ------------------------------------------------------------ #
  # SepReformer model. Anchors (&name) declare shared dimensions once; aliases
  # (*name) reuse them so encoder / separator / decoder shapes stay in sync.
  model:
    num_stages: &var_model_num_stages 3 # R
    num_spks: &var_model_num_spks 7
    module_audio_enc:
      in_channels: 1
      out_channels: &var_model_audio_enc_out_channels 256
      kernel_size: &var_model_audio_enc_kernel_size 32 # L
      stride: &var_model_audio_enc_stride 8 # S
      groups: 1
      bias: false
    module_feature_projector:
      num_channels: *var_model_audio_enc_out_channels
      in_channels: *var_model_audio_enc_out_channels
      out_channels: &feature_projector_out_channels 128 # F
      kernel_size: 1
      bias: false
    module_separator:
      num_stages: *var_model_num_stages
      relative_positional_encoding:
        in_channels: *feature_projector_out_channels
        num_heads: 8
        maxlen: 2000
        embed_v: false
      enc_stage:
        num_patterns: 2
        global_blocks:
          in_channels: *feature_projector_out_channels
          num_mha_heads: 8
          dropout_rate: 0.05
        local_blocks:
          in_channels: *feature_projector_out_channels
          kernel_size: 65
          dropout_rate: 0.05
        down_conv_layer:
          in_channels: *feature_projector_out_channels
          # NOTE(review): this anchor is never aliased within this file — confirm it is read elsewhere.
          samp_kernel_size: &var_model_samp_kernel_size 5
      spk_split_stage:
        in_channels: *feature_projector_out_channels
        num_spks: *var_model_num_spks
      simple_fusion:
        out_channels: *feature_projector_out_channels
      dec_stage:
        num_spks: *var_model_num_spks
        num_patterns: 3
        global_blocks:
          in_channels: *feature_projector_out_channels
          num_mha_heads: 8
          dropout_rate: 0.05
        local_blocks:
          in_channels: *feature_projector_out_channels
          kernel_size: 65
          dropout_rate: 0.05
        spk_attention:
          in_channels: *feature_projector_out_channels
          num_mha_heads: 8
          dropout_rate: 0.05
    module_output_layer:
      in_channels: *var_model_audio_enc_out_channels
      out_channels: *feature_projector_out_channels
      num_spks: *var_model_num_spks
    module_audio_dec:
      in_channels: *var_model_audio_enc_out_channels
      out_channels: 1
      kernel_size: *var_model_audio_enc_kernel_size
      stride: *var_model_audio_enc_stride
      bias: false
  # ------------------------------------------------------------ #
  # Loss definitions. Per-loss "lambda" weights also appear under
  # engine.losses below — NOTE(review): confirm which set the trainer reads.
  losses:
    PIT_SPECTRAL:
      lambda: 0.3
      weights: [1, 1, 1]
      window_lengths: [256, 512, 1024]
      hop_lengths: [64, 128, 256]
    PIT_MEL:
      # overall loss weight
      lambda: 0.7
      # equal weight for each of the 7 scales
      weights: [1, 1, 1, 1, 1, 1, 1]
      # number of mel bins at each scale (very coarse → very fine)
      mels: [5, 10, 20, 40, 80, 160, 320]
      # analysis window lengths (in samples)
      window_lengths: [32, 64, 128, 256, 512, 1024, 2048]
      # hop lengths (here 25% overlap)
      hop_lengths: [8, 16, 32, 64, 128, 256, 512]
      # all start from 0 Hz
      mel_fmin: [0, 0, 0, 0, 0, 0, 0]
      # all capped at 8 kHz. NOTE(review): original comment assumed 16 kHz audio
      # (Nyquist 8 kHz); sampling_rate above is 24 kHz (Nyquist 12 kHz) — confirm
      # 8 kHz is the intended ceiling.
      mel_fmax: [8000, 8000, 8000, 8000, 8000, 8000, 8000]
    PIT_L1:
      lambda: 1
    loss_g:
      lambda: 1
    loss_f:
      lambda: 1
  # ------------------------------------------------------------ #
  # GAN discriminator banks (multi-rate / multi-period / multi-FFT).
  discriminator:
    rates: [2, 3, 5, 7, 11]
    periods: [2, 3, 5, 7, 11]
    fft_sizes: [2048, 1024, 512]
  # ------------------------------------------------------------ #
  engine:
    # Checkpoint to resume from; empty string = none — TODO confirm semantics.
    ckpt_path_model: ""
    max_epochs: 200
    accum_steps: 10
    gpuid: "0" ### "0" (single-gpu) or "0, 1" (multi-gpu)
    gpu_ids: [0, 1, 2, 3]
    mvn: false
    clip_norm: 5
    start_scheduling: 50
    test_epochs: [100, 120, 150, 170]
    # Written with an explicit mantissa dot: YAML 1.1 loaders (e.g. PyYAML)
    # parse bare "5e-4" as the STRING "5e-4", not a float.
    learning_rate: 5.0e-4
    learning_rate_disc: 2.0e-4
    weight_decay: 1.0e-2
    log_interval: 20
    ckpt_interval: 1000
    seed: 42
    losses:
      PIT_MEL: 0.7
      # NOTE(review): named PIT_STFT here but PIT_SPECTRAL in config.losses
      # above — confirm which key the training engine looks up.
      PIT_STFT: 0.3
      PIT_L1: 1
      loss_g: 1
      loss_f: 1
      PIT_SDR: 1
    evaluation_metrics:
      PESQ:
        mode: "nb" # narrow-band PESQ
        sr: 8000