# Source: NeMo/examples/tts/conf/vits_44100.yaml
#
# Thanks to NVIDIA ❤
#
# Revision 7934b29 (almost 3 years ago)
#
# File size: 5.52 kB

	# This config contains the default values for training VITS model on LJSpeech dataset.
	# If you want to train model on other dataset, you can change config values according to your dataset.
	# Most dataset-specific arguments are in the head of the config file, see below.

	name: VITS

	train_dataset: ???
	validation_datasets: ???
	sup_data_path: ???
	sup_data_types: [speaker_id]

	pitch_fmin: 65.40639132514966
	pitch_fmax: 2093.004522404789

	sample_rate: 44100
	n_mel_channels: 80
	n_window_size: 2048
	n_window_stride: 512
	n_fft: 2048
	lowfreq: 0
	highfreq: null
	window: hann

	phoneme_dict_path: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv22.10.txt"
	heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722"

	model:
	n_speakers: 13000
	segment_size: 16384
	c_mel: 45
	c_kl: 1.
	use_spectral_norm: false

	pitch_fmin: ${pitch_fmin}
	pitch_fmax: ${pitch_fmax}

	sample_rate: ${sample_rate}
	n_mel_channels: ${n_mel_channels}
	n_window_size: ${n_window_size}
	n_window_stride: ${n_window_stride}
	n_fft: ${n_fft}
	lowfreq: ${lowfreq}
	highfreq: ${highfreq}
	window: ${window}

	text_normalizer:
	_target_: nemo_text_processing.text_normalization.normalize.Normalizer
	lang: en
	input_case: cased

	text_normalizer_call_kwargs:
	verbose: false
	punct_pre_process: true
	punct_post_process: true

	text_tokenizer:
	_target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer
	punct: true
	apostrophe: true
	pad_with_space: false
	g2p:
	_target_: nemo.collections.tts.g2p.modules.IPAG2P
	phoneme_dict: ${phoneme_dict_path}
	heteronyms: ${heteronyms_path}
	phoneme_probability: 0.8
	# Relies on the heteronyms list for anything that needs to be disambiguated
	ignore_ambiguous_words: false
	use_chars: true
	use_stresses: true

	train_ds:
	dataset:
	_target_: "nemo.collections.tts.data.tts_dataset.TTSDataset"
	manifest_filepath: ${train_dataset}
	sample_rate: ${model.sample_rate}
	sup_data_path: ${sup_data_path}
	sup_data_types: ${sup_data_types}
	n_fft: ${model.n_fft}
	win_length: ${model.n_window_size}
	hop_length: ${model.n_window_stride}
	window: ${model.window}
	n_mels: ${model.n_mel_channels}
	lowfreq: ${model.lowfreq}
	highfreq: ${model.highfreq}
	max_duration: null
	min_duration: 0.1
	ignore_file: null
	trim: False
	pitch_fmin: ${model.pitch_fmin}
	pitch_fmax: ${model.pitch_fmax}

	dataloader_params:
	num_workers: 8
	pin_memory: false

	batch_sampler:
	batch_size: 32
	boundaries: [32,300,400,500,600,700,800,900,1000]
	num_replicas: ${trainer.devices}
	shuffle: true

	validation_ds:
	dataset:
	_target_: "nemo.collections.tts.data.tts_dataset.TTSDataset"
	manifest_filepath: ${validation_datasets}
	sample_rate: ${model.sample_rate}
	sup_data_path: ${sup_data_path}
	sup_data_types: ${sup_data_types}
	n_fft: ${model.n_fft}
	win_length: ${model.n_window_size}
	hop_length: ${model.n_window_stride}
	window: ${model.window}
	n_mels: ${model.n_mel_channels}
	lowfreq: ${model.lowfreq}
	highfreq: ${model.highfreq}
	max_duration: null
	min_duration: 0.1
	ignore_file: null
	trim: False
	pitch_fmin: ${model.pitch_fmin}
	pitch_fmax: ${model.pitch_fmax}

	dataloader_params:
	drop_last: false
	shuffle: false
	batch_size: 32
	num_workers: 4
	pin_memory: false

	preprocessor:
	_target_: nemo.collections.asr.parts.preprocessing.features.FilterbankFeatures
	nfilt: ${model.n_mel_channels}
	highfreq: ${model.highfreq}
	log: true
	log_zero_guard_type: clamp
	log_zero_guard_value: 1e-05
	lowfreq: ${model.lowfreq}
	n_fft: ${model.n_fft}
	n_window_size: ${model.n_window_size}
	n_window_stride: ${model.n_window_stride}
	pad_to: 1
	pad_value: 0
	sample_rate: ${model.sample_rate}
	window: ${model.window}
	normalize: null
	preemph: null
	dither: 0.0
	frame_splicing: 1
	stft_conv: false
	nb_augmentation_prob : 0
	mag_power: 1.0
	exact_pad: true
	use_grads: true

	synthesizer:
	_target_: nemo.collections.tts.modules.vits_modules.SynthesizerTrn
	inter_channels: 192
	hidden_channels: 192
	filter_channels: 768
	n_heads: 2
	n_layers: 6
	kernel_size: 3
	p_dropout: 0.1
	resblock: "1"
	resblock_kernel_sizes: [3,7,11]
	resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
	upsample_rates: [8,8,4,2]
	upsample_initial_channel: 512
	upsample_kernel_sizes: [16,16,4,4]
	n_speakers: ${model.n_speakers}
	gin_channels: 256 # for multi-speaker

	optim:
	_target_: torch.optim.AdamW
	lr: 2e-4
	betas: [0.9, 0.99]
	eps: 1e-9

	sched:
	name: CosineAnnealing
	max_steps: 1000000
	min_lr: 1e-5

	trainer:
	num_nodes: 1
	devices: 2
	accelerator: gpu
	strategy: ddp
	precision: 32
	# amp_backend: 'apex'
	# amp_level: 'O2'
	# benchmark: true
	max_epochs: -1
	accumulate_grad_batches: 1
	enable_checkpointing: false # Provided by exp_manager
	logger: false # Provided by exp_manager
	log_every_n_steps: 50
	check_val_every_n_epoch: 1

	exp_manager:
	exp_dir: ???
	name: ${name}
	create_tensorboard_logger: true
	create_checkpoint_callback: true
	checkpoint_callback_params:
	monitor: loss_gen_all
	mode: min
	resume_if_exists: false
	resume_ignore_no_checkpoint: false