---
# SepReformer experiment configuration.
# Run metadata lives at the top level; every tunable lives under `config`.
project: "[Project] SepReformer"
notes: "SepReformer final version"

config:
  # Dataset selection and the split name used for each phase.
  dataset:
    # presumably counted in samples (96000 / 24000 = 4 s) - TODO confirm unit
    max_len: 96000
    sampling_rate: 24000
    type: "ja_capella_power"
    train: "train"
    # NOTE(review): val and test both point at the "test" split - confirm
    # this is intended.
    val: "test"
    test: "test"

  # Batch loading options.
  dataloader:
    batch_size: 2
    pin_memory: false
    num_workers: 4
    drop_last: false
    shuffle: true
    prefetch_factor: 10

  # Model hyper-parameters. Anchors (&name) defined here are reused through
  # aliases (*name) further down so dependent settings cannot drift apart.
  model:
    num_stages: &var_model_num_stages 3
    num_spks: &var_model_num_spks 7
    module_audio_enc:
      in_channels: 1
      out_channels: &var_model_audio_enc_out_channels 256
      kernel_size: &var_model_audio_enc_kernel_size 32
      stride: &var_model_audio_enc_stride 8
      groups: 1
      bias: false
    module_feature_projector:
      num_channels: *var_model_audio_enc_out_channels
      in_channels: *var_model_audio_enc_out_channels
      out_channels: &feature_projector_out_channels 128
      kernel_size: 1
      bias: false
    module_separator:
      num_stages: *var_model_num_stages
      relative_positional_encoding:
        in_channels: *feature_projector_out_channels
        num_heads: 8
        maxlen: 2000
        embed_v: false
      enc_stage:
        num_patterns: 2
        global_blocks:
          in_channels: *feature_projector_out_channels
          num_mha_heads: 8
          dropout_rate: 0.05
        local_blocks:
          in_channels: *feature_projector_out_channels
          kernel_size: 65
          dropout_rate: 0.05
        down_conv_layer:
          in_channels: *feature_projector_out_channels
          samp_kernel_size: &var_model_samp_kernel_size 5
      spk_split_stage:
        in_channels: *feature_projector_out_channels
        num_spks: *var_model_num_spks
      simple_fusion:
        out_channels: *feature_projector_out_channels
      dec_stage:
        num_spks: *var_model_num_spks
        num_patterns: 3
        global_blocks:
          in_channels: *feature_projector_out_channels
          num_mha_heads: 8
          dropout_rate: 0.05
        local_blocks:
          in_channels: *feature_projector_out_channels
          kernel_size: 65
          dropout_rate: 0.05
        spk_attention:
          in_channels: *feature_projector_out_channels
          num_mha_heads: 8
          dropout_rate: 0.05
    module_output_layer:
      in_channels: *var_model_audio_enc_out_channels
      out_channels: *feature_projector_out_channels
      num_spks: *var_model_num_spks
    # The decoder reuses the encoder's channel count, kernel size and stride.
    module_audio_dec:
      in_channels: *var_model_audio_enc_out_channels
      out_channels: 1
      kernel_size: *var_model_audio_enc_kernel_size
      stride: *var_model_audio_enc_stride
      bias: false

  # Per-loss settings.
  # NOTE(review): assumed to be a sibling of `model` under `config` rather
  # than a child of `model` - confirm against the code that reads this file.
  losses:
    PIT_SPECTRAL:
      lambda: 0.3
      weights: [1, 1, 1]
      window_lengths: [256, 512, 1024]
      hop_lengths: [64, 128, 256]
    # Every list below has seven entries, presumably one per mel resolution.
    PIT_MEL:
      lambda: 0.7
      weights: [1, 1, 1, 1, 1, 1, 1]
      mels: [5, 10, 20, 40, 80, 160, 320]
      window_lengths: [32, 64, 128, 256, 512, 1024, 2048]
      hop_lengths: [8, 16, 32, 64, 128, 256, 512]
      mel_fmin: [0, 0, 0, 0, 0, 0, 0]
      mel_fmax: [8000, 8000, 8000, 8000, 8000, 8000, 8000]
    PIT_L1:
      lambda: 1
    loss_g:
      lambda: 1
    loss_f:
      lambda: 1

  discriminator:
    rates: [2, 3, 5, 7, 11]
    periods: [2, 3, 5, 7, 11]
    fft_sizes: [2048, 1024, 512]

  # Training-loop settings.
  engine:
    ckpt_path_model: ""
    max_epochs: 200
    accum_steps: 10
    # NOTE(review): both `gpuid` and `gpu_ids` are set and they disagree
    # (one device vs. four) - confirm which one the engine reads.
    gpuid: "0"
    gpu_ids: [0, 1, 2, 3]
    mvn: false
    clip_norm: 5
    start_scheduling: 50
    test_epochs: [100, 120, 150, 170]
    # Written with an explicit decimal point on purpose: YAML 1.1 loaders
    # (for example PyYAML) resolve `5e-4` as a string, not a float.
    learning_rate: 5.0e-4
    learning_rate_disc: 2.0e-4
    weight_decay: 1.0e-2
    log_interval: 20
    ckpt_interval: 1000
    seed: 42
    # NOTE(review): assumed to be nested under `engine`. These values repeat
    # the `lambda` entries of `config.losses`, but this map says `PIT_STFT`
    # where that one says `PIT_SPECTRAL`, and `PIT_SDR` has no settings block
    # there - confirm which names the engine looks up.
    losses:
      PIT_MEL: 0.7
      PIT_STFT: 0.3
      PIT_L1: 1
      loss_g: 1
      loss_f: 1
      PIT_SDR: 1
    # NOTE(review): assumed to be nested under `engine` - confirm. `sr` here
    # (8000) differs from dataset.sampling_rate (24000); presumably audio is
    # resampled before scoring - verify.
    evaluation_metrics:
      PESQ:
        mode: "nb"
        sr: 8000