# Top-level settings for the RadTTS configuration. The sections further
# down read these values through `${...}` interpolation.

# Experiment name; reused as exp_manager.name.
name: RadTTS
# Audio sample rate handed to both dataset sections (presumably in Hz).
sample_rate: 22050

# `???` marks a value that has no default and must be supplied by the user
# (OmegaConf's mandatory-value marker).
train_dataset: ???
validation_datasets: ???
# NOTE(review): YAML loads a bare `None` as the string "None", not as null.
# Left unchanged because model.init_from_ptl_ckpt interpolates this value;
# confirm whether `null` was intended.
ckpt_path: None
export_dir: ???
sup_data_path: ???
# Supplementary data types requested from the datasets.
sup_data_types:
  - "log_mel"
  - "align_prior_matrix"
  - "pitch"
  - "voiced_mask"
  - "p_voiced"
  - "energy"

# Pitch statistics, forwarded to model.pitch_mean / model.pitch_std.
pitch_mean: ???
pitch_std: ???

# Pitch range handed to both datasets. The numbers equal the frequencies of
# the notes C2 and C7 in Hz (presumably the unit used here — confirm).
pitch_fmin: 65.40639132514966
pitch_fmax: 2093.004522404789

# Spectrogram settings shared by the model and both datasets.
n_mels: 80
n_window_size: 1024
n_window_stride: 256
n_fft: 1024
lowfreq: 0
highfreq: 8000
window: "hann"

# Text-processing resources consumed by the tokenizers' g2p sections.
phoneme_dict_path: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv22.10.txt"
heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722"
# No `${mapping_file_path}` reference appears in the sections shown here.
mapping_file_path: ""

# RadTTS model section. Values written as `${...}` are interpolated from the
# top-level settings of this file.
model:
  # NOTE(review): spelled `target` here, while every nested class reference
  # uses `_target_`; left unchanged — confirm which key the consumer reads.
  target: nemo.collections.tts.models.RadTTSModel
  bin_loss_start_ratio: 0.2
  bin_loss_warmup_epochs: 100

  symbols_embedding_dim: 384
  n_mel_channels: ${n_mels}

  pitch_mean: ${pitch_mean}
  pitch_std: ${pitch_std}

  # Text normalizer class and its constructor arguments.
  text_normalizer:
    _target_: nemo_text_processing.text_normalization.normalize.Normalizer
    lang: en
    input_case: cased

  text_normalizer_call_kwargs:
    verbose: false
    punct_pre_process: true
    punct_post_process: true

  # Model-level tokenizer: IPA tokenizer with an IPA grapheme-to-phoneme
  # module built from the shared dictionary and heteronym list.
  text_tokenizer:
    _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer
    punct: true
    apostrophe: true
    pad_with_space: true
    g2p:
      _target_: nemo.collections.tts.g2p.modules.IPAG2P
      phoneme_dict: ${phoneme_dict_path}
      heteronyms: ${heteronyms_path}
      phoneme_probability: 0.5
      ignore_ambiguous_words: true
      use_chars: true
      use_stresses: true

  # Training data: dataset construction arguments plus dataloader settings.
  train_ds:
    dataset:
      _target_: "nemo.collections.tts.data.tts_dataset.TTSDataset"
      manifest_filepath: ${train_dataset}
      sample_rate: ${sample_rate}
      sup_data_path: ${sup_data_path}
      sup_data_types: ${sup_data_types}
      n_fft: ${n_fft}
      win_length: ${n_window_size}
      hop_length: ${n_window_stride}
      window: ${window}
      n_mels: ${n_mels}
      lowfreq: ${lowfreq}
      highfreq: ${highfreq}
      max_duration: null
      min_duration: 0.1
      ignore_file: null
      trim: false
      pitch_fmin: ${pitch_fmin}
      pitch_fmax: ${pitch_fmax}

      # NOTE(review): the dataset tokenizer is EnglishPhonemesTokenizer with
      # EnglishG2p, yet it reads the same `ipa_cmudict` dictionary as the
      # IPA tokenizer configured under model.text_tokenizer — confirm this
      # combination is intended.
      text_tokenizer:
        _target_: "nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer"
        punct: true
        stresses: true
        chars: true
        space: ' '
        silence: null
        apostrophe: true
        sep: '|'
        add_blank_at: null
        pad_with_space: true
        g2p:
          _target_: "nemo.collections.tts.g2p.modules.EnglishG2p"
          phoneme_dict: ${phoneme_dict_path}
          heteronyms: ${heteronyms_path}
          phoneme_probability: 0.5
    dataloader_params:
      drop_last: false
      shuffle: true
      batch_size: 8
      num_workers: 8
      pin_memory: false

  # Validation data: same dataset arguments as train_ds, reading the
  # validation manifest and with shuffling disabled.
  validation_ds:
    dataset:
      _target_: "nemo.collections.tts.data.tts_dataset.TTSDataset"
      manifest_filepath: ${validation_datasets}
      sample_rate: ${sample_rate}
      sup_data_path: ${sup_data_path}
      sup_data_types: ${sup_data_types}
      n_fft: ${n_fft}
      win_length: ${n_window_size}
      hop_length: ${n_window_stride}
      window: ${window}
      n_mels: ${n_mels}
      lowfreq: ${lowfreq}
      highfreq: ${highfreq}
      max_duration: null
      min_duration: 0.1
      ignore_file: null
      trim: false
      pitch_fmin: ${pitch_fmin}
      pitch_fmax: ${pitch_fmax}

      # Same tokenizer settings as the training dataset.
      text_tokenizer:
        _target_: "nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer"
        punct: true
        stresses: true
        chars: true
        space: ' '
        silence: null
        apostrophe: true
        sep: '|'
        add_blank_at: null
        pad_with_space: true
        g2p:
          _target_: "nemo.collections.tts.g2p.modules.EnglishG2p"
          phoneme_dict: ${phoneme_dict_path}
          heteronyms: ${heteronyms_path}
          phoneme_probability: 0.5
    dataloader_params:
      drop_last: false
      shuffle: false
      batch_size: 8
      num_workers: 8
      pin_memory: false

  # Optimizer settings with the learning-rate schedule nested under `sched`.
  optim:
    name: RAdam
    lr: 0.0001
    betas: [0.9, 0.98]
    weight_decay: 0.000001

    sched:
      name: exp_decay
      warmup_steps: 40000
      last_epoch: -1
      d_model: 1

  # Training-loop options.
  trainerConfig:
    sigma: 1
    iters_per_checkpoint: 3000
    seed: null
    ignore_layers: []
    finetune_layers: []
    include_layers: []
    with_tensorboard: true
    # NOTE(review): dur_loss_weight and ctc_loss_weight are set here and
    # again, with a different ctc value, under loss_weights below — confirm
    # which pair the training code reads.
    dur_loss_weight: 1
    ctc_loss_weight: 1
    mask_unvoiced_f0: false
    log_step: 1
    binarization_start_iter: 6000
    kl_loss_start_iter: 18000
    loss_weights:
      ctc_loss_weight: 0.1
      dur_loss_weight: 1.0
      f0_loss_weight: 1.0
      energy_loss_weight: 1.0
      vpred_loss_weight: 1.0
    unfreeze_modules: "all"

  # Checkpoint initialisation; the path comes from the top-level ckpt_path.
  load_from_checkpoint: false
  init_from_ptl_ckpt: ${ckpt_path}

  # Constructor arguments for the RadTTS network module.
  modelConfig:
    _target_: "nemo.collections.tts.modules.radtts.RadTTSModule"
    n_speakers: 1
    n_speaker_dim: 16
    n_text: 384
    n_text_dim: 512
    n_flows: 8
    n_conv_layers_per_step: 4
    # NOTE(review): literal 80 equals the default of the top-level n_mels but
    # does not follow it when n_mels is overridden.
    n_mel_channels: 80
    n_hidden: 1024
    mel_encoder_n_hidden: 512
    dummy_speaker_embedding: false
    n_early_size: 2
    n_early_every: 2
    n_group_size: 2
    affine_model: wavenet
    include_modules: "decatnvpred"
    scaling_fn: tanh
    matrix_decomposition: LUS
    learn_alignments: true
    use_context_lstm: true
    context_lstm_norm: spectral
    context_lstm_w_f0_and_energy: true
    text_encoder_lstm_norm: spectral
    n_f0_dims: 1
    n_energy_avg_dims: 1
    use_first_order_features: false
    unvoiced_bias_activation: "relu"
    decoder_use_partial_padding: false
    decoder_use_unvoiced_bias: true
    ap_pred_log_f0: true
    ap_use_unvoiced_bias: true
    ap_use_voiced_embeddings: true
    # Only the v_model_config predictor is configured; the other three
    # predictor configs are null.
    dur_model_config: null
    f0_model_config: null
    energy_model_config: null
    v_model_config:
      name: dap
      hparams:
        n_speaker_dim: 16
        take_log_of_input: false
        bottleneck_hparams:
          in_dim: 512
          reduction_factor: 16
          norm: weightnorm
          non_linearity: relu
        arch_hparams:
          out_dim: 1
          n_layers: 2
          n_channels: 256
          kernel_size: 3
          p_dropout: 0.5

# Trainer settings. Checkpointing and logging are switched off here; the
# exp_manager section of this file enables its own TensorBoard logger and
# checkpoint callback.
trainer:
  devices: 8
  precision: 16
  max_epochs: 1000
  num_nodes: 1
  accelerator: gpu
  strategy: ddp
  accumulate_grad_batches: 1
  enable_checkpointing: false
  logger: false
  gradient_clip_val: 1
  log_every_n_steps: 100
  check_val_every_n_epoch: 5

# Experiment-manager settings: output directory and experiment name come
# from the top-level export_dir and name values.
exp_manager:
  exp_dir: ${export_dir}
  name: ${name}
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  # The checkpoint callback tracks val/loss_ctc and treats lower as better.
  checkpoint_callback_params:
    monitor: val/loss_ctc
    mode: min
    filepath: ${export_dir}
    filename: model_checkpoint