Tino3141
/

sepacap

Model card Files Files and versions

xet

Community

Tino3141 committed on Jan 20

Commit

15aa140

verified ·

1 Parent(s): 07717c5

Upload modelMusicSep.yaml

Browse files

Files changed (1) hide show

modelMusicSep.yaml +154 -0

modelMusicSep.yaml ADDED Viewed

	@@ -0,0 +1,154 @@

+project: "[Project] SepReformer" ### Don't change
+notes: "SepReformer final version" ### Describe your changes here (please write details!)
+# ------------------------------------------------------------------------------------------------------------------------------ #
+config:
+    # ------------------------------------------------------------ #
+    dataset:
+        max_len: 96000
+        sampling_rate: 24000
+        type: "ja_capella_power"
+        train: "train"
+        val: "test"
+        test: "test"
+    # ------------------------------------------------------------ #
+    dataloader:
+        batch_size: 2
+        pin_memory: false
+        num_workers: 4
+        drop_last: false
+        shuffle: true
+        prefetch_factor: 10
+    # ------------------------------------------------------------ #
+    model:
+        num_stages: &var_model_num_stages 3 # R
+        num_spks: &var_model_num_spks 7
+        module_audio_enc:
+            in_channels: 1
+            out_channels: &var_model_audio_enc_out_channels 256
+            kernel_size: &var_model_audio_enc_kernel_size 32 # L
+            stride: &var_model_audio_enc_stride 8 # S
+            groups: 1
+            bias: false
+        module_feature_projector:
+            num_channels: *var_model_audio_enc_out_channels
+            in_channels: *var_model_audio_enc_out_channels
+            out_channels: &feature_projector_out_channels 128 # F
+            kernel_size: 1
+            bias: false
+        module_separator:
+            num_stages: *var_model_num_stages
+            relative_positional_encoding:
+                in_channels: *feature_projector_out_channels
+                num_heads: 8
+                maxlen: 2000
+                embed_v: false
+            enc_stage:
+                num_patterns: 2
+                global_blocks:
+                    in_channels: *feature_projector_out_channels
+                    num_mha_heads: 8
+                    dropout_rate: 0.05
+                local_blocks:
+                    in_channels: *feature_projector_out_channels
+                    kernel_size: 65
+                    dropout_rate: 0.05
+                down_conv_layer:
+                    in_channels: *feature_projector_out_channels
+                    samp_kernel_size: &var_model_samp_kernel_size 5
+            spk_split_stage:
+                in_channels: *feature_projector_out_channels
+                num_spks: *var_model_num_spks
+            simple_fusion:
+                out_channels: *feature_projector_out_channels
+            dec_stage:
+                num_spks: *var_model_num_spks
+                num_patterns: 3
+                global_blocks:
+                    in_channels: *feature_projector_out_channels
+                    num_mha_heads: 8
+                    dropout_rate: 0.05
+                local_blocks:
+                    in_channels: *feature_projector_out_channels
+                    kernel_size: 65
+                    dropout_rate: 0.05
+                spk_attention:
+                    in_channels: *feature_projector_out_channels
+                    num_mha_heads: 8
+                    dropout_rate: 0.05
+        module_output_layer:
+            in_channels: *var_model_audio_enc_out_channels
+            out_channels: *feature_projector_out_channels
+            num_spks: *var_model_num_spks
+        module_audio_dec:
+            in_channels: *var_model_audio_enc_out_channels
+            out_channels: 1
+            kernel_size: *var_model_audio_enc_kernel_size
+            stride: *var_model_audio_enc_stride
+            bias: false
+    losses:
+        PIT_SPECTRAL:
+            lambda: 0.3
+            weights: [1, 1, 1]
+            window_lengths: [256, 512, 1024]
+            hop_lengths: [64, 128, 256]
+        PIT_MEL:
+            # overall loss weight
+            lambda: 0.7
+            # equal weight for each of the 7 scales
+            weights: [1, 1, 1, 1, 1, 1, 1]
+            # number of mel bins at each scale (very coarse → very fine)
+            mels: [5, 10, 20, 40, 80, 160, 320]
+            # analysis window lengths (in samples)
+            window_lengths: [32, 64, 128, 256, 512, 1024, 2048]
+            # hop lengths (each is window_length / 4, i.e. 75% overlap between frames)
+            hop_lengths: [8, 16, 32, 64, 128, 256, 512]
+            # all start from 0 Hz
+            mel_fmin: [0, 0, 0, 0, 0, 0, 0]
+            # all capped at 8 kHz (note: Nyquist is 12 kHz at the configured 24 kHz sampling rate)
+            mel_fmax: [8000, 8000, 8000, 8000, 8000, 8000, 8000]
+        PIT_L1:
+            lambda: 1
+        loss_g:
+            lambda: 1
+        loss_f:
+            lambda: 1
+    # ------------------------------------------------------------ #
+    discriminator:
+        rates: [2, 3, 5, 7, 11]
+        periods: [2, 3, 5, 7, 11]
+        fft_sizes: [2048, 1024, 512]
+    # ------------------------------------------------------------ #
+    engine:
+        ckpt_path_model: ""
+        max_epochs: 200
+        accum_steps: 10
+        gpuid: "0" ### "0"(single-gpu) or "0, 1" (multi-gpu)
+        gpu_ids: [0,1,2,3]
+        mvn: false
+        clip_norm: 5
+        start_scheduling: 50
+        test_epochs: [100, 120, 150, 170]
+        learning_rate: 5e-4
+        learning_rate_disc: 2e-4
+        weight_decay: 1.0e-2
+        log_interval: 20
+        ckpt_interval: 1000
+        seed: 42
+        losses:
+            PIT_MEL: 0.7
+            PIT_STFT: 0.3
+            PIT_L1: 1
+            loss_g: 1
+            loss_f: 1
+            PIT_SDR: 1
+    evaluation_metrics:
+        PESQ:
+            mode: "nb"
+            sr: 8000