styletts2-ukrainian

Sleeping

Serhiy Stetskovych

Add multi model

086eb2f 12 months ago

2.73 kB

	# Paths to auxiliary model files (F0 and ASR).
	F0_path: 'weights/jdc.bin'  # presumably the F0 (pitch) extractor weights; confirm with the loader
	ASR_config: 'Utils/ASR/config.yml'  # configuration file for the ASR model
	ASR_path: 'weights/asr.bin'  # presumably the ASR model weights; confirm with the loader


	# Model hyperparameters for the multi-speaker variant (multispeaker: true).
	# Nested sections are indented with 2 spaces relative to this key.
	model_params_multi:
	  multispeaker: true

	  dim_in: 64
	  hidden_dim: 512
	  max_conv_dim: 512
	  n_layer: 3
	  n_mels: 80

	  n_token: 181  # number of phoneme tokens
	  max_dur: 50  # maximum duration of a single phoneme
	  style_dim: 128  # style vector size

	  dropout: 0.2

	  # config for decoder
	  decoder:
	    type: 'hifigan'  # either hifigan or istftnet
	    resblock_kernel_sizes: [3, 7, 11]
	    upsample_rates: [10, 5, 3, 2]
	    upsample_initial_channel: 512
	    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
	    upsample_kernel_sizes: [20, 10, 6, 4]

	  # speech language model config
	  slm:
	    model: ''  # empty string: no SLM model name is set for this variant
	    sr: 16000  # sampling rate of SLM
	    hidden: 768  # hidden size of SLM
	    nlayers: 13  # number of layers of SLM
	    initial_channel: 64  # initial channels of SLM discriminator head

	  # style diffusion model config
	  diffusion:
	    embedding_mask_proba: 0.1

	    # transformer config
	    transformer:
	      num_layers: 3
	      num_heads: 8
	      head_features: 64
	      multiplier: 2

	    # diffusion distribution config
	    dist:
	      sigma_data: 0.19988229232390187  # placeholder used when estimate_sigma_data is false
	      estimate_sigma_data: true  # estimate sigma_data from the current batch if set to true
	      mean: -3.0
	      std: 1.0

	# Model hyperparameters for the single-speaker variant (multispeaker: false).
	# Nested sections are indented with 2 spaces relative to this key.
	model_params_single:
	  multispeaker: false

	  dim_in: 64
	  hidden_dim: 512
	  max_conv_dim: 512
	  n_layer: 3
	  n_mels: 80

	  n_token: 181  # number of phoneme tokens
	  max_dur: 50  # maximum duration of a single phoneme
	  style_dim: 128  # style vector size

	  dropout: 0.2

	  # config for decoder
	  decoder:
	    type: 'istftnet'  # either hifigan or istftnet
	    resblock_kernel_sizes: [3, 7, 11]
	    upsample_rates: [10, 6]
	    upsample_initial_channel: 512
	    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
	    upsample_kernel_sizes: [20, 12]
	    gen_istft_n_fft: 20
	    gen_istft_hop_size: 5

	  # speech language model config
	  # NOTE(review): hidden/nlayers are identical to the multi-speaker variant;
	  # confirm they match the checkpoint named in `model`.
	  slm:
	    model: 'openai/whisper-medium'
	    sr: 16000  # sampling rate of SLM
	    hidden: 768  # hidden size of SLM
	    nlayers: 13  # number of layers of SLM
	    initial_channel: 64  # initial channels of SLM discriminator head

	  # style diffusion model config
	  diffusion:
	    embedding_mask_proba: 0.1

	    # transformer config
	    transformer:
	      num_layers: 3
	      num_heads: 8
	      head_features: 64
	      multiplier: 2

	    # diffusion distribution config
	    dist:
	      sigma_data: 0.18  # placeholder used when estimate_sigma_data is false
	      estimate_sigma_data: true  # estimate sigma_data from the current batch if set to true
	      mean: -3.0
	      std: 1.0