NeMo / examples /tts /conf /vits_44100.yaml
camenduru's picture
thanks to NVIDIA ❤
7934b29
# This config contains the default values for training VITS model on LJSpeech dataset.
# If you want to train model on other dataset, you can change config values according to your dataset.
# Most dataset-specific arguments are in the head of the config file, see below.
# Experiment name; reused elsewhere in this file through ${name}.
name: VITS

# Dataset locations. "???" is left as-is: these values must be supplied
# by the user (for example as command-line overrides).
train_dataset: ???
validation_datasets: ???
sup_data_path: ???
sup_data_types:
  - speaker_id

# Pitch range bounds, referenced below as ${pitch_fmin} / ${pitch_fmax}.
pitch_fmin: 65.40639132514966
pitch_fmax: 2093.004522404789

# Audio and spectrogram settings, referenced below via ${...} interpolation.
sample_rate: 44100
n_mel_channels: 80
n_window_size: 2048
n_window_stride: 512
n_fft: 2048
lowfreq: 0
highfreq: null
window: hann

# Grapheme-to-phoneme resources, referenced below via ${...} interpolation.
phoneme_dict_path: 'scripts/tts_dataset_files/ipa_cmudict-0.7b_nv22.10.txt'
heteronyms_path: 'scripts/tts_dataset_files/heteronyms-052722'
# Model section. Everything below is nested under `model` so that the
# ${model.*} interpolations used by the datasets, preprocessor and
# synthesizer resolve to the values defined here.
model:
  # NOTE(review): c_mel / c_kl are presumably the mel and KL loss weights;
  # confirm against the model code.
  n_speakers: 13000
  segment_size: 16384
  c_mel: 45
  c_kl: 1.0
  use_spectral_norm: false

  # Audio settings mirrored from the top-level keys of the same name.
  pitch_fmin: ${pitch_fmin}
  pitch_fmax: ${pitch_fmax}
  sample_rate: ${sample_rate}
  n_mel_channels: ${n_mel_channels}
  n_window_size: ${n_window_size}
  n_window_stride: ${n_window_stride}
  n_fft: ${n_fft}
  lowfreq: ${lowfreq}
  highfreq: ${highfreq}
  window: ${window}

  text_normalizer:
    _target_: nemo_text_processing.text_normalization.normalize.Normalizer
    lang: en
    input_case: cased

  text_normalizer_call_kwargs:
    verbose: false
    punct_pre_process: true
    punct_post_process: true

  text_tokenizer:
    _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer
    punct: true
    apostrophe: true
    pad_with_space: false
    g2p:
      _target_: nemo.collections.tts.g2p.modules.IPAG2P
      phoneme_dict: ${phoneme_dict_path}
      heteronyms: ${heteronyms_path}
      phoneme_probability: 0.8
      # Relies on the heteronyms list for anything that needs to be disambiguated
      ignore_ambiguous_words: false
      use_chars: true
      use_stresses: true

  train_ds:
    dataset:
      _target_: "nemo.collections.tts.data.tts_dataset.TTSDataset"
      manifest_filepath: ${train_dataset}
      sample_rate: ${model.sample_rate}
      sup_data_path: ${sup_data_path}
      sup_data_types: ${sup_data_types}
      # Spectrogram arguments point at model.* so they match the preprocessor.
      n_fft: ${model.n_fft}
      win_length: ${model.n_window_size}
      hop_length: ${model.n_window_stride}
      window: ${model.window}
      n_mels: ${model.n_mel_channels}
      lowfreq: ${model.lowfreq}
      highfreq: ${model.highfreq}
      max_duration: null
      min_duration: 0.1
      ignore_file: null
      trim: false
      pitch_fmin: ${model.pitch_fmin}
      pitch_fmax: ${model.pitch_fmax}

    dataloader_params:
      num_workers: 8
      pin_memory: false

    batch_sampler:
      batch_size: 32
      boundaries: [32, 300, 400, 500, 600, 700, 800, 900, 1000]
      # NOTE(review): this equals trainer.devices only; if num_replicas is
      # meant to be the total process count, revisit it when num_nodes > 1.
      num_replicas: ${trainer.devices}
      shuffle: true

  validation_ds:
    dataset:
      _target_: "nemo.collections.tts.data.tts_dataset.TTSDataset"
      manifest_filepath: ${validation_datasets}
      sample_rate: ${model.sample_rate}
      sup_data_path: ${sup_data_path}
      sup_data_types: ${sup_data_types}
      # Spectrogram arguments point at model.* so they match the preprocessor.
      n_fft: ${model.n_fft}
      win_length: ${model.n_window_size}
      hop_length: ${model.n_window_stride}
      window: ${model.window}
      n_mels: ${model.n_mel_channels}
      lowfreq: ${model.lowfreq}
      highfreq: ${model.highfreq}
      max_duration: null
      min_duration: 0.1
      ignore_file: null
      trim: false
      pitch_fmin: ${model.pitch_fmin}
      pitch_fmax: ${model.pitch_fmax}

    dataloader_params:
      drop_last: false
      shuffle: false
      batch_size: 32
      num_workers: 4
      pin_memory: false

  preprocessor:
    _target_: nemo.collections.asr.parts.preprocessing.features.FilterbankFeatures
    nfilt: ${model.n_mel_channels}
    highfreq: ${model.highfreq}
    log: true
    log_zero_guard_type: clamp
    log_zero_guard_value: 1.0e-05
    lowfreq: ${model.lowfreq}
    n_fft: ${model.n_fft}
    n_window_size: ${model.n_window_size}
    n_window_stride: ${model.n_window_stride}
    pad_to: 1
    pad_value: 0
    sample_rate: ${model.sample_rate}
    window: ${model.window}
    normalize: null
    preemph: null
    dither: 0.0
    frame_splicing: 1
    stft_conv: false
    nb_augmentation_prob: 0
    mag_power: 1.0
    exact_pad: true
    use_grads: true

  synthesizer:
    _target_: nemo.collections.tts.modules.vits_modules.SynthesizerTrn
    inter_channels: 192
    hidden_channels: 192
    filter_channels: 768
    n_heads: 2
    n_layers: 6
    kernel_size: 3
    p_dropout: 0.1
    # Quoted on purpose: the value is the string "1", not the integer 1.
    resblock: "1"
    resblock_kernel_sizes: [3, 7, 11]
    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
    upsample_rates: [8, 8, 4, 2]
    upsample_initial_channel: 512
    upsample_kernel_sizes: [16, 16, 4, 4]
    n_speakers: ${model.n_speakers}
    gin_channels: 256  # for multi-speaker

  optim:
    _target_: torch.optim.AdamW
    # Exponent floats carry an explicit ".0" so that YAML 1.1 loaders also
    # read them as floats rather than strings.
    lr: 2.0e-4
    betas: [0.9, 0.99]
    eps: 1.0e-9

    sched:
      name: CosineAnnealing
      max_steps: 1000000
      min_lr: 1.0e-5
# Trainer settings. `devices` is also read by the training batch sampler
# through ${trainer.devices}, so these keys must stay nested under `trainer`.
trainer:
  num_nodes: 1
  devices: 2
  accelerator: gpu
  strategy: ddp
  precision: 32
  # amp_backend: 'apex'
  # amp_level: 'O2'
  # benchmark: true
  max_epochs: -1
  accumulate_grad_batches: 1
  enable_checkpointing: false  # Provided by exp_manager
  logger: false  # Provided by exp_manager
  log_every_n_steps: 50
  check_val_every_n_epoch: 1
# Experiment manager settings: output directory, logging and checkpointing.
# The trainer section disables its own logger and checkpointing and defers
# both to this section.
exp_manager:
  # "???" is left as-is: the output directory must be supplied by the user.
  exp_dir: ???
  name: ${name}
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    # Checkpoints are ranked by this metric; lower is better (mode: min).
    monitor: loss_gen_all
    mode: min
  resume_if_exists: false
  resume_ignore_no_checkpoint: false