---
# Top-level training/run settings for StyleTTS2 fine-tuning.
log_dir: "StyleTTS2/pretrained/StyleTTS2-LJSpeech"  # directory to save model checkpoints
save_freq: 5  # save every N epochs
log_interval: 10  # log training info every N steps
device: "cuda"  # device to run training (cuda or cpu)
epochs: 50  # number of finetuning epochs (1 hour of data)
batch_size: 2  # batch size
max_len: 350  # maximum number of frames (truncates long audio)
pretrained_model: "StyleTTS2/pretrained/StyleTTS2-LibriTTS/epochs_2nd_00020.pth"  # path to pretrained model
second_stage_load_pretrained: true  # set to true if pretrained model is for second stage
load_only_params: true  # load only model parameters, ignore optimizer states
F0_path: "JDC/bst.t7"  # path to pretrained F0 extractor
ASR_config: "ASR/config.yml"  # ASR model config
ASR_path: "ASR/epoch_00080.pth"  # pretrained ASR model path
PLBERT_dir: "PLBERT/"  # PLBERT directory
# Dataset locations and sampling constraints.
data_params:
  train_data: "StyleTTS2/Data/train_list.txt"  # training text file
  val_data: "StyleTTS2/Data/val_list.txt"  # validation text file
  root_path: "StyleTTS2/Data/wavs"  # directory where audio files are stored
  OOD_data: "StyleTTS2/Data/OOD_texts.txt"  # out-of-domain (OOD) text file
  min_length: 50  # minimum text length when sampling OOD texts
# Audio preprocessing / spectrogram extraction settings.
preprocess_params:
  sr: 24000  # sampling rate
  spect_params:
    n_fft: 2048  # FFT size
    win_length: 1200  # window size
    hop_length: 300  # hop size
# Acoustic model architecture (encoder, decoder, SLM discriminator, style diffusion).
model_params:
  multispeaker: false  # whether to use multi-speaker embeddings
  dim_in: 64  # input dimension to encoder
  hidden_dim: 512  # hidden dimension size
  max_conv_dim: 512  # maximum convolutional layer dimension
  n_layer: 3  # number of encoder layers
  n_mels: 80  # number of mel channels
  n_token: 178  # number of phoneme tokens
  max_dur: 50  # maximum phoneme duration
  style_dim: 128  # dimension of style vector
  dropout: 0.2  # dropout rate

  decoder:
    type: "hifigan"  # type of decoder (hifigan or istftnet)
    resblock_kernel_sizes: [3, 7, 11]  # resblock kernel sizes
    upsample_rates: [10, 5, 3, 2]  # upsample rates for each layer
    upsample_initial_channel: 512  # initial channel size for upsampling
    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]  # dilation sizes for resblocks
    upsample_kernel_sizes: [20, 10, 6, 4]  # kernel sizes for upsampling

  slm:
    model: "microsoft/wavlm-base-plus"  # SLM model (self-supervised speech model)
    sr: 16000  # sampling rate for SLM
    hidden: 768  # hidden size of SLM
    nlayers: 13  # number of SLM transformer layers
    initial_channel: 64  # initial channels for SLM discriminator head

  diffusion:
    embedding_mask_proba: 0.1  # probability to mask embeddings during diffusion training
    transformer:
      num_layers: 3  # number of layers in diffusion transformer
      num_heads: 8  # number of heads
      head_features: 64  # size per attention head
      multiplier: 2  # dimension multiplier
    dist:
      sigma_data: 0.2  # placeholder sigma if not estimated dynamically
      estimate_sigma_data: true  # dynamically estimate sigma from batch
      mean: -3.0  # mean for noise distribution
      std: 1.0  # std dev for noise distribution
# Loss weights and training-stage schedule.
loss_params:
  lambda_mel: 5.0  # weight for mel-spectrogram reconstruction loss
  lambda_gen: 1.0  # weight for generator adversarial loss
  lambda_slm: 1.0  # weight for SLM feature matching loss
  lambda_mono: 1.0  # weight for monotonic alignment loss (TMA)
  lambda_s2s: 1.0  # weight for sequence-to-sequence loss (TMA)
  lambda_F0: 1.0  # weight for F0 reconstruction loss
  lambda_norm: 1.0  # weight for normalization reconstruction loss
  lambda_dur: 1.0  # weight for duration prediction loss
  lambda_ce: 20.0  # weight for cross-entropy loss on duration prediction
  lambda_sty: 1.0  # weight for style reconstruction loss
  lambda_diff: 1.0  # weight for score matching loss
  diff_epoch: 10  # epoch to start style diffusion training
  joint_epoch: 110  # epoch to start joint training (stage 1 + 2)
# Learning rates for the different module groups.
optimizer_params:
  lr: 0.0001  # general learning rate
  bert_lr: 0.00001  # learning rate for PLBERT modules
  ft_lr: 0.0001  # learning rate for fine-tuning acoustic models
# SLM adversarial training settings.
slmadv_params:
  min_len: 400  # minimum sequence length for SLM adversarial training
  max_len: 500  # maximum sequence length for SLM adversarial training
  batch_percentage: 0.5  # use only part of the batch to save memory
  iter: 10  # discriminator is updated every N generator steps
  thresh: 5  # gradient clipping threshold
  scale: 0.01  # gradient scaling factor for SLM heads
  sig: 1.5  # sigma for differentiable duration modeling