# styletts2-pretrained / config_ft.yml
# Uploaded via huggingface_hub by ibrazebra (commit 9d4e434, verified)
log_dir: 'StyleTTS2/pretrained/StyleTTS2-LJSpeech'  # where model checkpoints are written
save_freq: 5  # write a checkpoint every N epochs
log_interval: 10  # emit training metrics every N steps
device: 'cuda'  # training device ('cuda' or 'cpu')
epochs: 50  # total fine-tuning epochs (sized for ~1 hour of data)
batch_size: 2  # samples per training batch
max_len: 350  # frame cap; longer audio is truncated
pretrained_model: 'StyleTTS2/pretrained/StyleTTS2-LibriTTS/epochs_2nd_00020.pth'  # checkpoint to fine-tune from
second_stage_load_pretrained: true  # the pretrained checkpoint is a second-stage model
load_only_params: true  # restore model weights only; discard optimizer state
F0_path: 'JDC/bst.t7'  # pretrained F0 (pitch) extractor weights
ASR_config: 'ASR/config.yml'  # config for the ASR alignment model
ASR_path: 'ASR/epoch_00080.pth'  # pretrained ASR alignment model weights
PLBERT_dir: 'PLBERT/'  # PL-BERT (phoneme-level BERT) directory
# Dataset locations. Child keys must be indented under data_params:
# at column 0 they would be parsed as independent top-level keys and
# data_params itself would load as null.
data_params:
  train_data: "StyleTTS2/Data/train_list.txt" # training text file
  val_data: "StyleTTS2/Data/val_list.txt" # validation text file
  root_path: "StyleTTS2/Data/wavs" # directory where audio files are stored
  OOD_data: "StyleTTS2/Data/OOD_texts.txt" # out-of-domain (OOD) text file
  min_length: 50 # minimum text length when sampling OOD texts
# Audio preprocessing. Restored nesting: sr and spect_params belong under
# preprocess_params; flattened to column 0, this sr collided with the SLM
# sr key later in the file (duplicate top-level key, silent last-wins).
preprocess_params:
  sr: 24000 # sampling rate
  spect_params:
    n_fft: 2048 # FFT size
    win_length: 1200 # window size
    hop_length: 300 # hop size
# Model architecture. Restored the nested structure (decoder / slm /
# diffusion -> transformer, dist) that the flattened paste destroyed;
# without it every key below would load as an unrelated top-level entry.
model_params:
  multispeaker: false # whether to use multi-speaker embeddings
  dim_in: 64 # input dimension to encoder
  hidden_dim: 512 # hidden dimension size
  max_conv_dim: 512 # maximum convolutional layer dimension
  n_layer: 3 # number of encoder layers
  n_mels: 80 # number of mel channels
  n_token: 178 # number of phoneme tokens
  max_dur: 50 # maximum phoneme duration
  style_dim: 128 # dimension of style vector
  dropout: 0.2 # dropout rate

  decoder:
    type: "hifigan" # type of decoder (hifigan or istftnet)
    resblock_kernel_sizes: [3, 7, 11] # resblock kernel sizes
    upsample_rates: [10, 5, 3, 2] # upsample rates for each layer
    upsample_initial_channel: 512 # initial channel size for upsampling
    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]] # dilation sizes for resblocks
    upsample_kernel_sizes: [20, 10, 6, 4] # kernel sizes for upsampling

  slm:
    model: "microsoft/wavlm-base-plus" # SLM model (self-supervised speech model)
    sr: 16000 # sampling rate for SLM
    hidden: 768 # hidden size of SLM
    nlayers: 13 # number of SLM transformer layers
    initial_channel: 64 # initial channels for SLM discriminator head

  diffusion:
    embedding_mask_proba: 0.1 # probability to mask embeddings during diffusion training
    transformer:
      num_layers: 3 # number of layers in diffusion transformer
      num_heads: 8 # number of heads
      head_features: 64 # size per attention head
      multiplier: 2 # dimension multiplier
    dist:
      sigma_data: 0.2 # placeholder sigma if not estimated dynamically
      estimate_sigma_data: true # dynamically estimate sigma from batch
      mean: -3.0 # mean for noise distribution
      std: 1.0 # std dev for noise distribution
# Loss weights and training-stage schedule. Child keys re-indented under
# loss_params so they load as one mapping instead of stray top-level keys.
loss_params:
  lambda_mel: 5.0 # weight for mel-spectrogram reconstruction loss
  lambda_gen: 1.0 # weight for generator adversarial loss
  lambda_slm: 1.0 # weight for SLM feature matching loss
  lambda_mono: 1.0 # weight for monotonic alignment loss (TMA)
  lambda_s2s: 1.0 # weight for sequence-to-sequence loss (TMA)
  lambda_F0: 1.0 # weight for F0 reconstruction loss
  lambda_norm: 1.0 # weight for normalization reconstruction loss
  lambda_dur: 1.0 # weight for duration prediction loss
  lambda_ce: 20.0 # weight for cross-entropy loss on duration prediction
  lambda_sty: 1.0 # weight for style reconstruction loss
  lambda_diff: 1.0 # weight for score matching loss

  diff_epoch: 10 # epoch to start style diffusion training
  joint_epoch: 110 # epoch to start joint training (stage 1 + 2)
# Learning rates, re-indented under optimizer_params so the mapping is
# populated instead of loading as null.
optimizer_params:
  lr: 0.0001 # general learning rate
  bert_lr: 0.00001 # learning rate for PLBERT modules
  ft_lr: 0.0001 # learning rate for fine-tuning acoustic models
# SLM adversarial training settings. Restored nesting: flattened to
# column 0, this max_len silently overwrote the top-level max_len
# (duplicate-key last-wins in most parsers).
slmadv_params:
  min_len: 400 # minimum sequence length for SLM adversarial training
  max_len: 500 # maximum sequence length for SLM adversarial training
  batch_percentage: 0.5 # use only part of the batch to save memory
  iter: 10 # discriminator is updated every N generator steps
  thresh: 5 # gradient clipping threshold
  scale: 0.01 # gradient scaling factor for SLM heads
  sig: 1.5 # sigma for differentiable duration modeling