File size: 4,783 Bytes
7934b29 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 | # This config contains the default values for training Aligner model on LJSpeech dataset.
# If you want to train model on other dataset, you can change config values according to your dataset.
# Most dataset-specific arguments are in the head of the config file, see below.
name: Aligner
train_dataset: ???
validation_datasets: ???
sup_data_path: ???
sup_data_types: [ "align_prior_matrix" ]
# Default values for dataset with sample_rate=22050
sample_rate: 22050
n_mel_channels: 80
n_window_size: 1024
n_window_stride: 256
n_fft: 1024
lowfreq: 0
highfreq: 8000
window: hann
phoneme_dict_path: "scripts/tts_dataset_files/cmudict-0.7b_nv22.10"
heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722"
model:
symbols_embedding_dim: 384
bin_loss_start_ratio: 0.2
bin_loss_warmup_epochs: 100
sample_rate: ${sample_rate}
n_mel_channels: ${n_mel_channels}
n_window_size: ${n_window_size}
n_window_stride: ${n_window_stride}
n_fft: ${n_fft}
lowfreq: ${lowfreq}
highfreq: ${highfreq}
window: ${window}
text_normalizer:
_target_: nemo_text_processing.text_normalization.normalize.Normalizer
lang: en
input_case: cased
text_normalizer_call_kwargs:
verbose: false
punct_pre_process: true
punct_post_process: true
text_tokenizer:
_target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.EnglishPhonemesTokenizer
punct: true
stresses: true
chars: true
apostrophe: true
pad_with_space: true
g2p:
_target_: nemo.collections.tts.g2p.modules.EnglishG2p
phoneme_dict: ${phoneme_dict_path}
heteronyms: ${heteronyms_path}
train_ds:
dataset:
_target_: nemo.collections.tts.data.tts_dataset.TTSDataset
manifest_filepath: ${train_dataset}
sample_rate: ${model.sample_rate}
sup_data_path: ${sup_data_path}
sup_data_types: ${sup_data_types}
n_fft: ${model.n_fft}
win_length: ${model.n_window_size}
hop_length: ${model.n_window_stride}
window: ${model.window}
n_mels: ${model.n_mel_channels}
lowfreq: ${model.lowfreq}
highfreq: ${model.highfreq}
max_duration: null
min_duration: 0.1
ignore_file: null
trim: false
dataloader_params:
drop_last: false
shuffle: true
batch_size: 64
num_workers: 4
pin_memory: true
validation_ds:
dataset:
_target_: nemo.collections.tts.data.tts_dataset.TTSDataset
manifest_filepath: ${validation_datasets}
sample_rate: ${model.sample_rate}
sup_data_path: ${sup_data_path}
sup_data_types: ${sup_data_types}
n_fft: ${model.n_fft}
win_length: ${model.n_window_size}
hop_length: ${model.n_window_stride}
window: ${model.window}
n_mels: ${model.n_mel_channels}
lowfreq: ${model.lowfreq}
highfreq: ${model.highfreq}
max_duration: null
min_duration: 0.1
ignore_file: null
trim: false
dataloader_params:
drop_last: false
shuffle: false
batch_size: 64
num_workers: 1
pin_memory: true
preprocessor:
_target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
features: ${model.n_mel_channels}
lowfreq: ${model.lowfreq}
highfreq: ${model.highfreq}
n_fft: ${model.n_fft}
n_window_size: ${model.n_window_size}
window_size: false
n_window_stride: ${model.n_window_stride}
window_stride: false
pad_to: 1
pad_value: -11.52
sample_rate: ${model.sample_rate}
window: ${model.window}
normalize: null
preemph: null
dither: 0.0
frame_splicing: 1
log: true
log_zero_guard_type: clamp
log_zero_guard_value: 1e-05
mag_power: 1.0
alignment_encoder:
_target_: nemo.collections.tts.modules.aligner.AlignmentEncoder
n_mel_channels: ${model.n_mel_channels}
n_text_channels: ${model.symbols_embedding_dim}
n_att_channels: ${model.n_mel_channels}
optim:
name: adam
lr: 1e-3
weight_decay: 1e-6
sched:
name: CosineAnnealing
min_lr: 5e-5
warmup_ratio: 0.35
trainer:
devices: 1
num_nodes: 1
accelerator: gpu
strategy: ddp
precision: 32
max_epochs: 1000
accumulate_grad_batches: 1
gradient_clip_val: 1000.0
enable_checkpointing: false # Provided by exp_manager
logger: false # Provided by exp_manager
log_every_n_steps: 100
check_val_every_n_epoch: 1
benchmark: false
exp_manager:
exp_dir: null
name: ${name}
create_tensorboard_logger: true
create_checkpoint_callback: true
checkpoint_callback_params:
monitor: val_forward_sum_loss
mode: min
create_wandb_logger: false
wandb_logger_kwargs:
name: null
project: null
entity: null
resume_if_exists: false
resume_ignore_no_checkpoint: false
|