# styletts2-pretrained / config_ft.yml
# Uploaded via huggingface_hub by ibrazebra (commit 9d4e434, verified)
log_dir: 'StyleTTS2/pretrained/StyleTTS2-LJSpeech'  # where model checkpoints are written
save_freq: 5  # write a checkpoint every N epochs
log_interval: 10  # emit training metrics every N steps
device: 'cuda'  # training device ('cuda' or 'cpu')
epochs: 50  # total fine-tuning epochs (sized for ~1 hour of data)
batch_size: 2  # samples per training batch
max_len: 350  # frame cap; longer audio is truncated
pretrained_model: 'StyleTTS2/pretrained/StyleTTS2-LibriTTS/epochs_2nd_00020.pth'  # checkpoint to fine-tune from
second_stage_load_pretrained: true  # the pretrained checkpoint is a second-stage model
load_only_params: true  # restore model weights only; discard optimizer state
F0_path: 'JDC/bst.t7'  # pretrained F0 (pitch) extractor weights
ASR_config: 'ASR/config.yml'  # config for the ASR alignment model
ASR_path: 'ASR/epoch_00080.pth'  # pretrained ASR alignment model weights
PLBERT_dir: 'PLBERT/'  # PL-BERT (phoneme-level BERT) directory
# Dataset locations. Child keys must be indented under data_params:
# at column 0 they would be parsed as independent top-level keys and
# data_params itself would load as null.
data_params:
  train_data: "StyleTTS2/Data/train_list.txt" # training text file
  val_data: "StyleTTS2/Data/val_list.txt" # validation text file
  root_path: "StyleTTS2/Data/wavs" # directory where audio files are stored
  OOD_data: "StyleTTS2/Data/OOD_texts.txt" # out-of-domain (OOD) text file
  min_length: 50 # minimum text length when sampling OOD texts
# Audio preprocessing. Restored nesting: sr and spect_params belong under
# preprocess_params; flattened to column 0, this sr collided with the SLM
# sr key later in the file (duplicate top-level key, silent last-wins).
preprocess_params:
  sr: 24000 # sampling rate
  spect_params:
    n_fft: 2048 # FFT size
    win_length: 1200 # window size
    hop_length: 300 # hop size
# Model architecture. Restored the nested structure (decoder / slm /
# diffusion -> transformer, dist) that the flattened paste destroyed;
# without it every key below would load as an unrelated top-level entry.
model_params:
  multispeaker: false # whether to use multi-speaker embeddings
  dim_in: 64 # input dimension to encoder
  hidden_dim: 512 # hidden dimension size
  max_conv_dim: 512 # maximum convolutional layer dimension
  n_layer: 3 # number of encoder layers
  n_mels: 80 # number of mel channels
  n_token: 178 # number of phoneme tokens
  max_dur: 50 # maximum phoneme duration
  style_dim: 128 # dimension of style vector
  dropout: 0.2 # dropout rate

  decoder:
    type: "hifigan" # type of decoder (hifigan or istftnet)
    resblock_kernel_sizes: [3, 7, 11] # resblock kernel sizes
    upsample_rates: [10, 5, 3, 2] # upsample rates for each layer
    upsample_initial_channel: 512 # initial channel size for upsampling
    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]] # dilation sizes for resblocks
    upsample_kernel_sizes: [20, 10, 6, 4] # kernel sizes for upsampling

  slm:
    model: "microsoft/wavlm-base-plus" # SLM model (self-supervised speech model)
    sr: 16000 # sampling rate for SLM
    hidden: 768 # hidden size of SLM
    nlayers: 13 # number of SLM transformer layers
    initial_channel: 64 # initial channels for SLM discriminator head

  diffusion:
    embedding_mask_proba: 0.1 # probability to mask embeddings during diffusion training
    transformer:
      num_layers: 3 # number of layers in diffusion transformer
      num_heads: 8 # number of heads
      head_features: 64 # size per attention head
      multiplier: 2 # dimension multiplier
    dist:
      sigma_data: 0.2 # placeholder sigma if not estimated dynamically
      estimate_sigma_data: true # dynamically estimate sigma from batch
      mean: -3.0 # mean for noise distribution
      std: 1.0 # std dev for noise distribution
# Loss weights and training-stage schedule. Child keys re-indented under
# loss_params so they load as one mapping instead of stray top-level keys.
loss_params:
  lambda_mel: 5.0 # weight for mel-spectrogram reconstruction loss
  lambda_gen: 1.0 # weight for generator adversarial loss
  lambda_slm: 1.0 # weight for SLM feature matching loss
  lambda_mono: 1.0 # weight for monotonic alignment loss (TMA)
  lambda_s2s: 1.0 # weight for sequence-to-sequence loss (TMA)
  lambda_F0: 1.0 # weight for F0 reconstruction loss
  lambda_norm: 1.0 # weight for normalization reconstruction loss
  lambda_dur: 1.0 # weight for duration prediction loss
  lambda_ce: 20.0 # weight for cross-entropy loss on duration prediction
  lambda_sty: 1.0 # weight for style reconstruction loss
  lambda_diff: 1.0 # weight for score matching loss

  diff_epoch: 10 # epoch to start style diffusion training
  joint_epoch: 110 # epoch to start joint training (stage 1 + 2)
# Learning rates, re-indented under optimizer_params so the mapping is
# populated instead of loading as null.
optimizer_params:
  lr: 0.0001 # general learning rate
  bert_lr: 0.00001 # learning rate for PLBERT modules
  ft_lr: 0.0001 # learning rate for fine-tuning acoustic models
# SLM adversarial training settings. Restored nesting: flattened to
# column 0, this max_len silently overwrote the top-level max_len
# (duplicate-key last-wins in most parsers).
slmadv_params:
  min_len: 400 # minimum sequence length for SLM adversarial training
  max_len: 500 # maximum sequence length for SLM adversarial training
  batch_percentage: 0.5 # use only part of the batch to save memory
  iter: 10 # discriminator is updated every N generator steps
  thresh: 5 # gradient clipping threshold
  scale: 0.01 # gradient scaling factor for SLM heads
  sig: 1.5 # sigma for differentiable duration modeling