# NeMo / examples / tts / conf / rad-tts_dec_ipa.yaml
# Original config from the NVIDIA NeMo toolkit (mirrored by camenduru, thanks to NVIDIA).
# Upstream commit: 7934b29
# Top-level settings for training a RadTTS model with IPA tokenization.
# Fields set to ??? are mandatory and must be supplied on the command line.
name: RadTTS
sample_rate: 22050

# Dataset manifests, supplementary-data cache, and output locations.
train_dataset: ???
validation_datasets: ???
ckpt_path: null  # optional warm-start checkpoint; null (not the string "None") disables loading
export_dir: ???
sup_data_path: ???
sup_data_types: ["log_mel", "align_prior_matrix", "pitch", "voiced_mask", "p_voiced", "energy"]

# these frame-wise values depend on pitch_fmin and pitch_fmax, you can get values
# by running `scripts/dataset_processing/tts/extract_sup_data.py`
pitch_mean: ???  # e.g. 212.35873413085938 for LJSpeech
pitch_std: ???  # e.g. 68.52806091308594 for LJSpeech

# default values from librosa.pyin
pitch_fmin: 65.40639132514966
pitch_fmax: 2093.004522404789

# default values for sample_rate=22050
n_mels: 80
n_window_size: 1024
n_window_stride: 256
n_fft: 1024
lowfreq: 0
highfreq: 8000
window: "hann"

phoneme_dict_path: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv22.10.txt"
heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722"
mapping_file_path: ""
model:
  target: nemo.collections.tts.models.RadTTSModel
  bin_loss_start_ratio: 0.2
  bin_loss_warmup_epochs: 100

  symbols_embedding_dim: 384
  n_mel_channels: ${n_mels}
  pitch_mean: ${pitch_mean}
  pitch_std: ${pitch_std}

  # Text normalization applied to raw transcripts before tokenization.
  text_normalizer:
    _target_: nemo_text_processing.text_normalization.normalize.Normalizer
    lang: en
    input_case: cased

  text_normalizer_call_kwargs:
    verbose: false
    punct_pre_process: true
    punct_post_process: true

  # IPA tokenizer used by the model at inference time.
  text_tokenizer:
    _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer
    punct: true
    apostrophe: true
    pad_with_space: true
    g2p:
      _target_: nemo.collections.tts.g2p.modules.IPAG2P
      phoneme_dict: ${phoneme_dict_path}
      heteronyms: ${heteronyms_path}
      phoneme_probability: 0.5
      # Relies on the heteronyms list for anything that needs to be disambiguated
      ignore_ambiguous_words: true
      use_chars: true
      use_stresses: true

  train_ds:
    dataset:
      _target_: "nemo.collections.tts.data.tts_dataset.TTSDataset"
      manifest_filepath: ${train_dataset}
      sample_rate: ${sample_rate}
      sup_data_path: ${sup_data_path}
      sup_data_types: ${sup_data_types}
      n_fft: ${n_fft}
      win_length: ${n_window_size}
      hop_length: ${n_window_stride}
      window: ${window}
      n_mels: ${n_mels}
      lowfreq: ${lowfreq}
      highfreq: ${highfreq}
      max_duration: null
      min_duration: 0.1
      ignore_file: null
      trim: false
      pitch_fmin: ${pitch_fmin}
      pitch_fmax: ${pitch_fmax}
      # NOTE(review): dataset-side tokenizer is ARPABET-based (EnglishPhonemesTokenizer)
      # while model.text_tokenizer above is IPA — confirm this mismatch is intentional.
      text_tokenizer:
        _target_: "nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer"
        punct: true
        stresses: true
        chars: true
        space: ' '
        silence: null
        apostrophe: true
        sep: '|'
        add_blank_at: null
        pad_with_space: true
        g2p:
          _target_: "nemo.collections.tts.g2p.modules.EnglishG2p"
          phoneme_dict: ${phoneme_dict_path}
          heteronyms: ${heteronyms_path}
          phoneme_probability: 0.5
    dataloader_params:
      drop_last: false
      shuffle: true
      batch_size: 8
      num_workers: 8
      pin_memory: false

  validation_ds:
    dataset:
      _target_: "nemo.collections.tts.data.tts_dataset.TTSDataset"
      manifest_filepath: ${validation_datasets}
      sample_rate: ${sample_rate}
      sup_data_path: ${sup_data_path}
      sup_data_types: ${sup_data_types}
      n_fft: ${n_fft}
      win_length: ${n_window_size}
      hop_length: ${n_window_stride}
      window: ${window}
      n_mels: ${n_mels}
      lowfreq: ${lowfreq}
      highfreq: ${highfreq}
      max_duration: null
      min_duration: 0.1
      ignore_file: null
      trim: false
      pitch_fmin: ${pitch_fmin}
      pitch_fmax: ${pitch_fmax}
      text_tokenizer:
        _target_: "nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer"
        punct: true
        stresses: true
        chars: true
        space: ' '
        silence: null
        apostrophe: true
        sep: '|'
        add_blank_at: null
        pad_with_space: true
        g2p:
          _target_: "nemo.collections.tts.g2p.modules.EnglishG2p"
          phoneme_dict: ${phoneme_dict_path}
          heteronyms: ${heteronyms_path}
          phoneme_probability: 0.5
    dataloader_params:
      drop_last: false
      shuffle: false
      batch_size: 8
      num_workers: 8
      pin_memory: false

  optim:
    name: RAdam
    lr: 0.0001
    betas: [0.9, 0.98]
    weight_decay: 0.000001
    sched:
      name: exp_decay
      warmup_steps: 40000
      last_epoch: -1
      d_model: 1  # Disable scaling based on model dim

  trainerConfig:
    sigma: 1
    iters_per_checkpoint: 3000
    seed: null
    ignore_layers: []
    finetune_layers: []
    include_layers: []
    with_tensorboard: true
    dur_loss_weight: 1
    ctc_loss_weight: 1
    mask_unvoiced_f0: false
    log_step: 1
    binarization_start_iter: 6000
    kl_loss_start_iter: 18000
    loss_weights:
      ctc_loss_weight: 0.1
      dur_loss_weight: 1.0
      f0_loss_weight: 1.0
      energy_loss_weight: 1.0
      vpred_loss_weight: 1.0
    unfreeze_modules: "all"
    load_from_checkpoint: false
    init_from_ptl_ckpt: ${ckpt_path}  # only used when load_from_checkpoint is true

  modelConfig:
    _target_: "nemo.collections.tts.modules.radtts.RadTTSModule"
    n_speakers: 1
    n_speaker_dim: 16
    n_text: 384  # 185
    n_text_dim: 512
    n_flows: 8
    n_conv_layers_per_step: 4
    n_mel_channels: 80
    n_hidden: 1024
    mel_encoder_n_hidden: 512
    dummy_speaker_embedding: false
    n_early_size: 2
    n_early_every: 2
    n_group_size: 2
    affine_model: wavenet
    include_modules: "decatnvpred"
    scaling_fn: tanh
    matrix_decomposition: LUS
    learn_alignments: true
    use_context_lstm: true
    context_lstm_norm: spectral
    context_lstm_w_f0_and_energy: true
    text_encoder_lstm_norm: spectral
    n_f0_dims: 1
    n_energy_avg_dims: 1
    use_first_order_features: false
    unvoiced_bias_activation: "relu"
    decoder_use_partial_padding: false
    decoder_use_unvoiced_bias: true
    ap_pred_log_f0: true
    ap_use_unvoiced_bias: true
    ap_use_voiced_embeddings: true
    dur_model_config: null
    f0_model_config: null
    energy_model_config: null
    v_model_config:
      name: dap
      hparams:
        n_speaker_dim: 16
        take_log_of_input: false
        bottleneck_hparams:
          in_dim: 512
          reduction_factor: 16
          norm: weightnorm
          non_linearity: relu
        arch_hparams:
          out_dim: 1
          n_layers: 2
          n_channels: 256
          kernel_size: 3
          p_dropout: 0.5
# PyTorch Lightning Trainer arguments.
trainer:
  devices: 8
  precision: 16
  max_epochs: 1000
  num_nodes: 1
  accelerator: gpu
  strategy: ddp
  accumulate_grad_batches: 1
  enable_checkpointing: false  # checkpointing is handled by exp_manager below
  logger: false  # logging is handled by exp_manager below
  gradient_clip_val: 1
  log_every_n_steps: 100
  check_val_every_n_epoch: 5
# NeMo experiment manager: output directory, TensorBoard logger, and checkpointing.
exp_manager:
  exp_dir: ${export_dir}
  name: ${name}
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    monitor: val/loss_ctc
    mode: min
    # NOTE(review): `filepath` is a legacy ModelCheckpoint argument (replaced by
    # dirpath/filename in newer Lightning) — confirm the installed version accepts it.
    filepath: ${export_dir}
    filename: model_checkpoint