# NeMo / examples / tts / conf / rad-tts_dec_ipa.yaml
# Original config from the NVIDIA NeMo toolkit (mirrored by camenduru, thanks to NVIDIA).
# Upstream commit: 7934b29
# Top-level settings for training a RadTTS model with IPA tokenization.
# Fields set to ??? are mandatory and must be supplied on the command line.
name: RadTTS
sample_rate: 22050

# Dataset manifests, supplementary-data cache, and output locations.
train_dataset: ???
validation_datasets: ???
ckpt_path: null  # optional warm-start checkpoint; null (not the string "None") disables loading
export_dir: ???
sup_data_path: ???
sup_data_types: ["log_mel", "align_prior_matrix", "pitch", "voiced_mask", "p_voiced", "energy"]

# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values
# by running `scripts/dataset_processing/tts/extract_sup_data.py`
pitch_mean: ???  # e.g. 212.35873413085938 for LJSpeech
pitch_std: ???  # e.g. 68.52806091308594 for LJSpeech

# default values from librosa.pyin
pitch_fmin: 65.40639132514966
pitch_fmax: 2093.004522404789

# default values for sample_rate=22050
n_mels: 80
n_window_size: 1024
n_window_stride: 256
n_fft: 1024
lowfreq: 0
highfreq: 8000
window: "hann"

phoneme_dict_path: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv22.10.txt"
heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722"
mapping_file_path: ""
model:
  target: nemo.collections.tts.models.RadTTSModel
  bin_loss_start_ratio: 0.2
  bin_loss_warmup_epochs: 100

  symbols_embedding_dim: 384
  n_mel_channels: ${n_mels}
  pitch_mean: ${pitch_mean}
  pitch_std: ${pitch_std}

  # Text normalization applied to raw transcripts before tokenization.
  text_normalizer:
    _target_: nemo_text_processing.text_normalization.normalize.Normalizer
    lang: en
    input_case: cased

  text_normalizer_call_kwargs:
    verbose: false
    punct_pre_process: true
    punct_post_process: true

  # IPA tokenizer used by the model at inference time.
  text_tokenizer:
    _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer
    punct: true
    apostrophe: true
    pad_with_space: true
    g2p:
      _target_: nemo.collections.tts.g2p.modules.IPAG2P
      phoneme_dict: ${phoneme_dict_path}
      heteronyms: ${heteronyms_path}
      phoneme_probability: 0.5
      # Relies on the heteronyms list for anything that needs to be disambiguated
      ignore_ambiguous_words: true
      use_chars: true
      use_stresses: true

  train_ds:
    dataset:
      _target_: "nemo.collections.tts.data.tts_dataset.TTSDataset"
      manifest_filepath: ${train_dataset}
      sample_rate: ${sample_rate}
      sup_data_path: ${sup_data_path}
      sup_data_types: ${sup_data_types}
      n_fft: ${n_fft}
      win_length: ${n_window_size}
      hop_length: ${n_window_stride}
      window: ${window}
      n_mels: ${n_mels}
      lowfreq: ${lowfreq}
      highfreq: ${highfreq}
      max_duration: null
      min_duration: 0.1
      ignore_file: null
      trim: false
      pitch_fmin: ${pitch_fmin}
      pitch_fmax: ${pitch_fmax}
      # NOTE(review): dataset-side tokenizer is ARPABET-based (EnglishPhonemesTokenizer)
      # while model.text_tokenizer above is IPA — confirm this mismatch is intentional.
      text_tokenizer:
        _target_: "nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer"
        punct: true
        stresses: true
        chars: true
        space: ' '
        silence: null
        apostrophe: true
        sep: '|'
        add_blank_at: null
        pad_with_space: true
        g2p:
          _target_: "nemo.collections.tts.g2p.modules.EnglishG2p"
          phoneme_dict: ${phoneme_dict_path}
          heteronyms: ${heteronyms_path}
          phoneme_probability: 0.5
    dataloader_params:
      drop_last: false
      shuffle: true
      batch_size: 8
      num_workers: 8
      pin_memory: false

  validation_ds:
    dataset:
      _target_: "nemo.collections.tts.data.tts_dataset.TTSDataset"
      manifest_filepath: ${validation_datasets}
      sample_rate: ${sample_rate}
      sup_data_path: ${sup_data_path}
      sup_data_types: ${sup_data_types}
      n_fft: ${n_fft}
      win_length: ${n_window_size}
      hop_length: ${n_window_stride}
      window: ${window}
      n_mels: ${n_mels}
      lowfreq: ${lowfreq}
      highfreq: ${highfreq}
      max_duration: null
      min_duration: 0.1
      ignore_file: null
      trim: false
      pitch_fmin: ${pitch_fmin}
      pitch_fmax: ${pitch_fmax}
      text_tokenizer:
        _target_: "nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer"
        punct: true
        stresses: true
        chars: true
        space: ' '
        silence: null
        apostrophe: true
        sep: '|'
        add_blank_at: null
        pad_with_space: true
        g2p:
          _target_: "nemo.collections.tts.g2p.modules.EnglishG2p"
          phoneme_dict: ${phoneme_dict_path}
          heteronyms: ${heteronyms_path}
          phoneme_probability: 0.5
    dataloader_params:
      drop_last: false
      shuffle: false
      batch_size: 8
      num_workers: 8
      pin_memory: false

  optim:
    name: RAdam
    lr: 0.0001
    betas: [0.9, 0.98]
    weight_decay: 0.000001
    sched:
      name: exp_decay
      warmup_steps: 40000
      last_epoch: -1
      d_model: 1  # Disable scaling based on model dim

  trainerConfig:
    sigma: 1
    iters_per_checkpoint: 3000
    seed: null
    ignore_layers: []
    finetune_layers: []
    include_layers: []
    with_tensorboard: true
    dur_loss_weight: 1
    ctc_loss_weight: 1
    mask_unvoiced_f0: false
    log_step: 1
    binarization_start_iter: 6000
    kl_loss_start_iter: 18000
    loss_weights:
      ctc_loss_weight: 0.1
      dur_loss_weight: 1.0
      f0_loss_weight: 1.0
      energy_loss_weight: 1.0
      vpred_loss_weight: 1.0
    unfreeze_modules: "all"
    load_from_checkpoint: false
    init_from_ptl_ckpt: ${ckpt_path}  # only used when load_from_checkpoint is true

  modelConfig:
    _target_: "nemo.collections.tts.modules.radtts.RadTTSModule"
    n_speakers: 1
    n_speaker_dim: 16
    n_text: 384  # 185
    n_text_dim: 512
    n_flows: 8
    n_conv_layers_per_step: 4
    n_mel_channels: 80
    n_hidden: 1024
    mel_encoder_n_hidden: 512
    dummy_speaker_embedding: false
    n_early_size: 2
    n_early_every: 2
    n_group_size: 2
    affine_model: wavenet
    include_modules: "decatnvpred"
    scaling_fn: tanh
    matrix_decomposition: LUS
    learn_alignments: true
    use_context_lstm: true
    context_lstm_norm: spectral
    context_lstm_w_f0_and_energy: true
    text_encoder_lstm_norm: spectral
    n_f0_dims: 1
    n_energy_avg_dims: 1
    use_first_order_features: false
    unvoiced_bias_activation: "relu"
    decoder_use_partial_padding: false
    decoder_use_unvoiced_bias: true
    ap_pred_log_f0: true
    ap_use_unvoiced_bias: true
    ap_use_voiced_embeddings: true
    dur_model_config: null
    f0_model_config: null
    energy_model_config: null
    v_model_config:
      name: dap
      hparams:
        n_speaker_dim: 16
        take_log_of_input: false
        bottleneck_hparams:
          in_dim: 512
          reduction_factor: 16
          norm: weightnorm
          non_linearity: relu
        arch_hparams:
          out_dim: 1
          n_layers: 2
          n_channels: 256
          kernel_size: 3
          p_dropout: 0.5
# PyTorch Lightning Trainer arguments.
trainer:
  devices: 8
  precision: 16
  max_epochs: 1000
  num_nodes: 1
  accelerator: gpu
  strategy: ddp
  accumulate_grad_batches: 1
  enable_checkpointing: false  # checkpointing is handled by exp_manager below
  logger: false  # logging is handled by exp_manager below
  gradient_clip_val: 1
  log_every_n_steps: 100
  check_val_every_n_epoch: 5
# NeMo experiment manager: output directory, TensorBoard logger, and checkpointing.
exp_manager:
  exp_dir: ${export_dir}
  name: ${name}
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    monitor: val/loss_ctc
    mode: min
    # NOTE(review): `filepath` is a legacy ModelCheckpoint argument (replaced by
    # dirpath/filename in newer Lightning) — confirm the installed version accepts it.
    filepath: ${export_dir}
    filename: model_checkpoint