---
# Top-level training / checkpointing settings.
log_dir: "StyleTTS2/pretrained/StyleTTS2-LJSpeech"  # directory to save model checkpoints
save_freq: 5  # save a checkpoint every N epochs
log_interval: 10  # log training info every N steps
device: "cuda"  # device to run training on ("cuda" or "cpu")
epochs: 50  # number of finetuning epochs (sized for ~1 hour of data)
batch_size: 2  # batch size
max_len: 350  # maximum number of frames per sample (longer audio is truncated)
pretrained_model: "StyleTTS2/pretrained/StyleTTS2-LibriTTS/epochs_2nd_00020.pth"  # path to the pretrained model to finetune from
second_stage_load_pretrained: true  # set to true if the pretrained model is a second-stage checkpoint
load_only_params: true  # load only model parameters, ignore optimizer states

# Paths to auxiliary pretrained models; presumably resolved relative to the
# working directory of the training script — TODO confirm against the trainer.
F0_path: "JDC/bst.t7"  # pretrained F0 (pitch) extractor checkpoint
ASR_config: "ASR/config.yml"  # ASR model config
ASR_path: "ASR/epoch_00080.pth"  # pretrained ASR model checkpoint
PLBERT_dir: "PLBERT/"  # PLBERT directory

# Dataset locations and sampling options.
data_params:
  train_data: "StyleTTS2/Data/train_list.txt"  # training text file
  val_data: "StyleTTS2/Data/val_list.txt"  # validation text file
  root_path: "StyleTTS2/Data/wavs"  # directory where audio files are stored
  OOD_data: "StyleTTS2/Data/OOD_texts.txt"  # out-of-domain (OOD) text file
  min_length: 50  # minimum text length when sampling OOD texts

# Audio preprocessing / spectrogram extraction settings.
preprocess_params:
  sr: 24000  # sampling rate in Hz
  spect_params:
    n_fft: 2048  # FFT size
    win_length: 1200  # window size in samples
    hop_length: 300  # hop size in samples

# Model architecture settings.
model_params:
  multispeaker: false  # whether to use multi-speaker embeddings

  dim_in: 64  # input dimension to encoder
  hidden_dim: 512  # hidden dimension size
  max_conv_dim: 512  # maximum convolutional layer dimension
  n_layer: 3  # number of encoder layers
  n_mels: 80  # number of mel channels

  n_token: 178  # number of phoneme tokens
  max_dur: 50  # maximum phoneme duration
  style_dim: 128  # dimension of style vector

  dropout: 0.2  # dropout rate

  # Vocoder (decoder) settings.
  decoder:
    type: "hifigan"  # type of decoder (hifigan or istftnet)
    resblock_kernel_sizes: [3, 7, 11]  # resblock kernel sizes
    upsample_rates: [10, 5, 3, 2]  # upsample rates for each layer (product = 300, matching preprocess hop_length)
    upsample_initial_channel: 512  # initial channel size for upsampling
    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]  # dilation sizes for resblocks
    upsample_kernel_sizes: [20, 10, 6, 4]  # kernel sizes for upsampling

  # Speech language model (SLM) discriminator settings.
  slm:
    model: "microsoft/wavlm-base-plus"  # SLM model (self-supervised speech model)
    sr: 16000  # sampling rate for SLM input (differs from the 24000 Hz training audio)
    hidden: 768  # hidden size of SLM
    nlayers: 13  # number of SLM transformer layers
    initial_channel: 64  # initial channels for SLM discriminator head

  # Style diffusion settings.
  diffusion:
    embedding_mask_proba: 0.1  # probability to mask embeddings during diffusion training

    transformer:
      num_layers: 3  # number of layers in diffusion transformer
      num_heads: 8  # number of attention heads
      head_features: 64  # size per attention head
      multiplier: 2  # dimension multiplier

    dist:
      sigma_data: 0.2  # placeholder sigma if not estimated dynamically
      estimate_sigma_data: true  # dynamically estimate sigma from batch
      mean: -3.0  # mean for noise distribution
      std: 1.0  # std dev for noise distribution

# Loss weights and training-stage schedule.
loss_params:
  lambda_mel: 5.0  # weight for mel-spectrogram reconstruction loss
  lambda_gen: 1.0  # weight for generator adversarial loss
  lambda_slm: 1.0  # weight for SLM feature matching loss
  lambda_mono: 1.0  # weight for monotonic alignment loss (TMA)
  lambda_s2s: 1.0  # weight for sequence-to-sequence loss (TMA)
  lambda_F0: 1.0  # weight for F0 reconstruction loss
  lambda_norm: 1.0  # weight for normalization reconstruction loss
  lambda_dur: 1.0  # weight for duration prediction loss
  lambda_ce: 20.0  # weight for cross-entropy loss on duration prediction
  lambda_sty: 1.0  # weight for style reconstruction loss
  lambda_diff: 1.0  # weight for score matching loss

  diff_epoch: 10  # epoch to start style diffusion training
  # NOTE(review): joint_epoch (110) exceeds epochs (50), so joint training
  # would never begin with this config — confirm this is intentional
  # (i.e. joint training deliberately disabled) or lower this value.
  joint_epoch: 110  # epoch to start joint training (stage 1 + 2)

# Learning rates for the different parameter groups.
optimizer_params:
  lr: 0.0001  # general learning rate
  bert_lr: 0.00001  # learning rate for PLBERT modules
  ft_lr: 0.0001  # learning rate for fine-tuning acoustic models

# SLM adversarial training settings.
slmadv_params:
  min_len: 400  # minimum sequence length for SLM adversarial training
  max_len: 500  # maximum sequence length for SLM adversarial training
  batch_percentage: 0.5  # use only this fraction of the batch to save memory
  iter: 10  # discriminator is updated every N generator steps
  thresh: 5  # gradient clipping threshold
  scale: 0.01  # gradient scaling factor for SLM heads
  sig: 1.5  # sigma for differentiable duration modeling