# File: styletts2/Configs/config_ft_single.yml
# Revision: 9c5b5e6 (8 months ago)
# Size: 4.35 kB

	# ─── GLOBAL ──────────────────────────────────────────────────────────
	log_dir: logs/pod_90h_30k_second_v2
	device: "cuda"

	batch_size: 12 # 40 GB A100, fp16
	max_len: 300 # ≈ 8 s (200 × 40 ms)

	epochs_1st: 25 # first-stage schedule
	epochs_2nd: 20 # second-stage schedule (later)
	save_freq: 1
	log_interval: 50

	# leave blank on first run
	pretrained_model: "/workspace/styletts2/logs/pod_90h_30k_second_v2/epoch_2nd_00005.pth"
	second_stage_load_pretrained: true
	load_only_params: false

	# ─── PRE-PROCESS ─────────────────────────────────────────────────────
	preprocess_params:
	sr: 24000
	spect_params: # required by Mel extractor
	n_fft: 2048
	win_length: 1200
	hop_length: 300

	# ─── DATA ────────────────────────────────────────────────────────────
	data_params:
	root_path: /workspace
	train_data: /workspace/styletts2/data/train_list.txt
	val_data: /workspace/styletts2/data/val_list.txt
	min_length: 50 # sample until texts with this size are obtained for OOD texts
	OOD_data: /workspace/styletts2/data/OOD_texts.txt

	# ─── LOSS SCHEDULE ──────────────────────────────────────────────────
	loss_params:
	lambda_mel: 5. # mel reconstruction loss
	lambda_gen: 1. # generator loss
	lambda_slm: 1. # slm feature matching loss

	lambda_mono: 1. # monotonic alignment loss (1st stage, TMA)
	lambda_s2s: 1. # sequence-to-sequence loss (1st stage, TMA)
	TMA_epoch: 14 # TMA starting epoch (1st stage)

	lambda_F0: 1. # F0 reconstruction loss (2nd stage)
	lambda_norm: 1. # norm reconstruction loss (2nd stage)
	lambda_dur: 1. # duration loss (2nd stage)
	lambda_ce: 20. # duration predictor probability output CE loss (2nd stage)
	lambda_sty: 1. # style reconstruction loss (2nd stage)
	lambda_diff: 1. # score matching loss (2nd stage)

	diff_epoch: 1 # style diffusion starting epoch (2nd stage)
	joint_epoch: 5 # joint training starting epoch (2nd stage)

	# ─── OPTIMISER ──────────────────────────────────────────────────────
	optimizer_params:
	lr: 0.0001
	bert_lr: 0.00001
	ft_lr: 0.00001
	grad_accum_steps: 2

	# ─── MODEL (core network & sub-modules) ─────────────────────────────
	model_params:
	multispeaker: true # speaker-ID column present
	dim_in: 64
	hidden_dim: 512
	max_conv_dim: 512
	n_layer: 3
	n_mels: 80

	n_token: 178 # 178 phonemes
	max_dur: 50
	style_dim: 128
	dropout: 0.2

	decoder:
	type: hifigan
	resblock_kernel_sizes: [3, 7, 11]
	upsample_rates: [10, 5, 3, 2]
	upsample_initial_channel: 512
	resblock_dilation_sizes: [[1,3,5],[1,3,5],[1,3,5]]
	upsample_kernel_sizes: [20, 10, 6, 4]

	slm:
	model: microsoft/wavlm-base-plus
	sr: 16000
	hidden: 768
	nlayers: 13
	initial_channel: 64

	diffusion:
	embedding_mask_proba: 0.1
	transformer:
	num_layers: 3
	num_heads: 8
	head_features: 64
	multiplier: 2
	dist:
	sigma_data: 0.2 # ← placeholder; code will overwrite if
	estimate_sigma_data: true
	mean: -3.0
	std: 1.0

	# ─── EXTERNAL CHECKPOINTS ───────────────────────────────────────────
	F0_path: "Utils/JDC/bst.t7"
	ASR_config: "Utils/ASR/config.yml"
	ASR_path: "Utils/ASR/epoch_00080.pth"
	PLBERT_dir: 'Utils/PLBERT/'
	first_stage_path: "/workspace/styletts2/stage1_final.pth" # filled automatically after this run

	# ─── SLM ADVERSARIAL (ignored in stage-1, kept default) ─────────────
	slmadv_params:
	min_len: 400
	max_len: 500
	batch_percentage: 0.5
	iter: 20
	thresh: 5
	scale: 0.01
	sig: 1.5