| data: |
| block_size: 512 |
| dataset_path: ../datasets/vctk-partial |
| duration: 1.8 |
| encoder: dpwavlmbase |
| encoder_ckpt: models/pretrained/dphubert/DPWavLM-sp0.75.pth |
| encoder_hop_size: 320 |
| encoder_out_channels: 768 |
| encoder_sample_rate: 16000 |
| extensions: |
| - wav |
| f0_extractor: rmvpe |
| f0_max: 1200 |
| f0_min: 65 |
| sampling_rate: 44100 |
| spk_embed_channels: 256 |
| spk_embed_encoder: pyannote.audio |
| spk_embed_encoder_ckpt: ./models/pretrained/pyannote.audio/wespeaker-voxceleb-resnet34-LM/pytorch_model.bin |
| spk_embed_encoder_sample_rate: 16000 |
| volume_window_size: 8 |
| device: cuda |
| env: |
| expdir: ../datasets/exp/vctk-partial |
| gpu_id: 0 |
| loss: |
| beta: 0.8 |
| fft_max: 2048 |
| fft_min: 256 |
| n_scale: 4 |
| overlap: 0.5 |
| use_dual_scale: false |
| use_dual_scale_log_freq: true |
| model: |
| f0_input_variance: 0.0 |
| f0_offset_size_downsamples: 8 |
| harmonic_env_size_downsamples: 8 |
| no_use_embed_conv: false |
| noise_env_size_downsamples: 8 |
| noise_seed: 289 |
| noise_to_harmonic_phase: true |
| type: CombSubMinimumNoisedPhase |
| units_hidden_channels: 256 |
| units_layers: |
| - - 10 |
| - 11 |
| use_f0_offset: true |
| use_harmonic_env: false |
| use_noise_env: true |
| use_speaker_embed: true |
| win_length: 2048 |
| train: |
| amp_dtype: fp32 |
| batch_size: 48 |
| cache_all_data: true |
| cache_device: cuda |
| cache_fp16: true |
| epochs: 50000 |
| frame_hop_random_max: 64 |
| frame_hop_random_min: 32 |
| interval_log: 10 |
| interval_val: 2000 |
| loss_variation: 0.1 |
| low_similar_loss_variation: 0.7 |
| lr: 0.0005 |
| num_workers: 2 |
| only_u2c_stack: false |
| save_opt: false |
| sched_cooldown: 2 |
| sched_factor: 0.5 |
| sched_min_lr: 3.0e-06 |
| sched_patience: 30 |
| sched_threshold: 1.0e-05 |
| sched_threshold_mode: rel |
| weight_decay: 0 |
|
|