| # ─── GLOBAL ───────────────────────────────────────────────────────── | |
| log_dir: logs/pod_90h_30k_second_v2 | |
| device: "cuda" | |
| batch_size: 12 # 40 GB A100, fp16 | |
| max_len: 300 # ≈ 3.75 s if counted in mel frames (300 × 12.5 ms; hop_length 300 @ 24 kHz) | |
| epochs_1st: 25 # first-stage schedule | |
| epochs_2nd: 20 # second-stage schedule (later) | |
| save_freq: 1 | |
| log_interval: 50 | |
| # leave blank on first run | |
| pretrained_model: "/workspace/styletts2/logs/pod_90h_30k_second_v2/epoch_2nd_00005.pth" | |
| second_stage_load_pretrained: true | |
| load_only_params: false | |
| # ─── PRE-PROCESS ──────────────────────────────────────────────────── | |
| preprocess_params: | |
| sr: 24000 | |
| spect_params: # required by Mel extractor | |
| n_fft: 2048 | |
| win_length: 1200 | |
| hop_length: 300 | |
| # ─── DATA ─────────────────────────────────────────────────────────── | |
| data_params: | |
| root_path: /workspace | |
| train_data: /workspace/styletts2/data/train_list.txt | |
| val_data: /workspace/styletts2/data/val_list.txt | |
| min_length: 50 # sample until texts with this size are obtained for OOD texts | |
| OOD_data: /workspace/styletts2/data/OOD_texts.txt | |
| # ─── LOSS SCHEDULE ────────────────────────────────────────────────── | |
| loss_params: | |
| lambda_mel: 5. # mel reconstruction loss | |
| lambda_gen: 1. # generator loss | |
| lambda_slm: 1. # slm feature matching loss | |
| lambda_mono: 1. # monotonic alignment loss (1st stage, TMA) | |
| lambda_s2s: 1. # sequence-to-sequence loss (1st stage, TMA) | |
| TMA_epoch: 14 # TMA starting epoch (1st stage) | |
| lambda_F0: 1. # F0 reconstruction loss (2nd stage) | |
| lambda_norm: 1. # norm reconstruction loss (2nd stage) | |
| lambda_dur: 1. # duration loss (2nd stage) | |
| lambda_ce: 20. # duration predictor probability output CE loss (2nd stage) | |
| lambda_sty: 1. # style reconstruction loss (2nd stage) | |
| lambda_diff: 1. # score matching loss (2nd stage) | |
| diff_epoch: 1 # style diffusion starting epoch (2nd stage) | |
| joint_epoch: 5 # joint training starting epoch (2nd stage) | |
| # ─── OPTIMISER ────────────────────────────────────────────────────── | |
| optimizer_params: | |
| lr: 0.0001 | |
| bert_lr: 0.00001 | |
| ft_lr: 0.00001 | |
| grad_accum_steps: 2 | |
| # ─── MODEL (core network & sub-modules) ───────────────────────────── | |
| model_params: | |
| multispeaker: true # speaker-ID column present | |
| dim_in: 64 | |
| hidden_dim: 512 | |
| max_conv_dim: 512 | |
| n_layer: 3 | |
| n_mels: 80 | |
| n_token: 178 # 178 phonemes | |
| max_dur: 50 | |
| style_dim: 128 | |
| dropout: 0.2 | |
| decoder: | |
| type: hifigan | |
| resblock_kernel_sizes: [3, 7, 11] | |
| upsample_rates: [10, 5, 3, 2] | |
| upsample_initial_channel: 512 | |
| resblock_dilation_sizes: [[1,3,5],[1,3,5],[1,3,5]] | |
| upsample_kernel_sizes: [20, 10, 6, 4] | |
| slm: | |
| model: microsoft/wavlm-base-plus | |
| sr: 16000 | |
| hidden: 768 | |
| nlayers: 13 | |
| initial_channel: 64 | |
| diffusion: | |
| embedding_mask_proba: 0.1 | |
| transformer: | |
| num_layers: 3 | |
| num_heads: 8 | |
| head_features: 64 | |
| multiplier: 2 | |
| dist: | |
| sigma_data: 0.2 # ← placeholder; code will overwrite it if estimate_sigma_data is true | |
| estimate_sigma_data: true | |
| mean: -3.0 | |
| std: 1.0 | |
| # ─── EXTERNAL CHECKPOINTS ─────────────────────────────────────────── | |
| F0_path: "Utils/JDC/bst.t7" | |
| ASR_config: "Utils/ASR/config.yml" | |
| ASR_path: "Utils/ASR/epoch_00080.pth" | |
| PLBERT_dir: 'Utils/PLBERT/' | |
| first_stage_path: "/workspace/styletts2/stage1_final.pth" # filled automatically after this run | |
| # ─── SLM ADVERSARIAL (ignored in stage-1, kept default) ───────────── | |
| slmadv_params: | |
| min_len: 400 | |
| max_len: 500 | |
| batch_percentage: 0.5 | |
| iter: 20 | |
| thresh: 5 | |
| scale: 0.01 | |
| sig: 1.5 |