---
# StyleTTS2 fine-tuning configuration (LJSpeech, single-speaker).
# NOTE(review): the original file was flattened onto 3 physical lines; as YAML,
# everything after the first " #" per line parsed as a comment, silently
# dropping almost every key. Reconstructed into proper block-style YAML.

log_dir: "StyleTTS2/pretrained/StyleTTS2-LJSpeech"  # directory to save model checkpoints
save_freq: 5  # save every N epochs
log_interval: 10  # log training info every N steps
device: "cuda"  # device to run training (cuda or cpu)
epochs: 50  # number of finetuning epochs (1 hour of data)
batch_size: 2  # batch size
max_len: 350  # maximum number of frames (truncates long audio)
pretrained_model: "StyleTTS2/pretrained/StyleTTS2-LibriTTS/epochs_2nd_00020.pth"  # path to pretrained model
second_stage_load_pretrained: true  # set to true if pretrained model is for second stage
load_only_params: true  # load only model parameters, ignore optimizer states

F0_path: "JDC/bst.t7"  # path to pretrained F0 extractor
ASR_config: "ASR/config.yml"  # ASR model config
ASR_path: "ASR/epoch_00080.pth"  # pretrained ASR model path
PLBERT_dir: "PLBERT/"  # PLBERT directory

data_params:
  train_data: "StyleTTS2/Data/train_list.txt"  # training text file
  val_data: "StyleTTS2/Data/val_list.txt"  # validation text file
  root_path: "StyleTTS2/Data/wavs"  # directory where audio files are stored
  OOD_data: "StyleTTS2/Data/OOD_texts.txt"  # out-of-domain (OOD) text file
  min_length: 50  # minimum text length when sampling OOD texts

preprocess_params:
  sr: 24000  # sampling rate
  spect_params:
    n_fft: 2048  # FFT size
    win_length: 1200  # window size
    hop_length: 300  # hop size

model_params:
  multispeaker: false  # whether to use multi-speaker embeddings

  dim_in: 64  # input dimension to encoder
  hidden_dim: 512  # hidden dimension size
  max_conv_dim: 512  # maximum convolutional layer dimension
  n_layer: 3  # number of encoder layers
  n_mels: 80  # number of mel channels
  n_token: 178  # number of phoneme tokens
  max_dur: 50  # maximum phoneme duration
  style_dim: 128  # dimension of style vector
  dropout: 0.2  # dropout rate

  decoder:
    type: "hifigan"  # type of decoder (hifigan or istftnet)
    resblock_kernel_sizes: [3, 7, 11]  # resblock kernel sizes
    upsample_rates: [10, 5, 3, 2]  # upsample rates for each layer
    upsample_initial_channel: 512  # initial channel size for upsampling
    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]  # dilation sizes for resblocks
    upsample_kernel_sizes: [20, 10, 6, 4]  # kernel sizes for upsampling

  slm:
    model: "microsoft/wavlm-base-plus"  # SLM model (self-supervised speech model)
    sr: 16000  # sampling rate for SLM
    hidden: 768  # hidden size of SLM
    nlayers: 13  # number of SLM transformer layers
    initial_channel: 64  # initial channels for SLM discriminator head

  diffusion:
    embedding_mask_proba: 0.1  # probability to mask embeddings during diffusion training
    transformer:
      num_layers: 3  # number of layers in diffusion transformer
      num_heads: 8  # number of heads
      head_features: 64  # size per attention head
      multiplier: 2  # dimension multiplier
    dist:
      sigma_data: 0.2  # placeholder sigma if not estimated dynamically
      estimate_sigma_data: true  # dynamically estimate sigma from batch
      mean: -3.0  # mean for noise distribution
      std: 1.0  # std dev for noise distribution

loss_params:
  lambda_mel: 5.0  # weight for mel-spectrogram reconstruction loss
  lambda_gen: 1.0  # weight for generator adversarial loss
  lambda_slm: 1.0  # weight for SLM feature matching loss

  lambda_mono: 1.0  # weight for monotonic alignment loss (TMA)
  lambda_s2s: 1.0  # weight for sequence-to-sequence loss (TMA)

  lambda_F0: 1.0  # weight for F0 reconstruction loss
  lambda_norm: 1.0  # weight for normalization reconstruction loss
  lambda_dur: 1.0  # weight for duration prediction loss
  lambda_ce: 20.0  # weight for cross-entropy loss on duration prediction
  lambda_sty: 1.0  # weight for style reconstruction loss
  lambda_diff: 1.0  # weight for score matching loss

  diff_epoch: 10  # epoch to start style diffusion training
  joint_epoch: 110  # epoch to start joint training (stage 1 + 2)

optimizer_params:
  lr: 0.0001  # general learning rate
  bert_lr: 0.00001  # learning rate for PLBERT modules
  ft_lr: 0.0001  # learning rate for fine-tuning acoustic models

slmadv_params:
  min_len: 400  # minimum sequence length for SLM adversarial training
  max_len: 500  # maximum sequence length for SLM adversarial training
  batch_percentage: 0.5  # use only part of the batch to save memory
  iter: 10  # discriminator is updated every N generator steps
  thresh: 5  # gradient clipping threshold
  scale: 0.01  # gradient scaling factor for SLM heads
  sig: 1.5  # sigma for differentiable duration modeling