| |
| |
| |
|
|
| name: VITS |
|
|
| train_dataset: ??? |
| validation_datasets: ??? |
| sup_data_path: ??? |
| sup_data_types: [speaker_id] |
|
|
| pitch_fmin: 65.40639132514966 |
| pitch_fmax: 2093.004522404789 |
|
|
| sample_rate: 44100 |
| n_mel_channels: 80 |
| n_window_size: 2048 |
| n_window_stride: 512 |
| n_fft: 2048 |
| lowfreq: 0 |
| highfreq: null |
| window: hann |
|
|
| phoneme_dict_path: "scripts/tts_dataset_files/ipa_cmudict-0.7b_nv22.10.txt" |
| heteronyms_path: "scripts/tts_dataset_files/heteronyms-052722" |
|
|
| model: |
| n_speakers: 13000 |
| segment_size: 16384 |
| c_mel: 45 |
| c_kl: 1. |
| use_spectral_norm: false |
|
|
| pitch_fmin: ${pitch_fmin} |
| pitch_fmax: ${pitch_fmax} |
|
|
| sample_rate: ${sample_rate} |
| n_mel_channels: ${n_mel_channels} |
| n_window_size: ${n_window_size} |
| n_window_stride: ${n_window_stride} |
| n_fft: ${n_fft} |
| lowfreq: ${lowfreq} |
| highfreq: ${highfreq} |
| window: ${window} |
|
|
| text_normalizer: |
| _target_: nemo_text_processing.text_normalization.normalize.Normalizer |
| lang: en |
| input_case: cased |
|
|
| text_normalizer_call_kwargs: |
| verbose: false |
| punct_pre_process: true |
| punct_post_process: true |
| |
| text_tokenizer: |
| _target_: nemo.collections.common.tokenizers.text_to_speech.tts_tokenizers.IPATokenizer |
| punct: true |
| apostrophe: true |
| pad_with_space: false |
| g2p: |
| _target_: nemo.collections.tts.g2p.modules.IPAG2P |
| phoneme_dict: ${phoneme_dict_path} |
| heteronyms: ${heteronyms_path} |
| phoneme_probability: 0.8 |
| |
| ignore_ambiguous_words: false |
| use_chars: true |
| use_stresses: true |
|
|
| train_ds: |
| dataset: |
| _target_: "nemo.collections.tts.data.tts_dataset.TTSDataset" |
| manifest_filepath: ${train_dataset} |
| sample_rate: ${model.sample_rate} |
| sup_data_path: ${sup_data_path} |
| sup_data_types: ${sup_data_types} |
| n_fft: ${model.n_fft} |
| win_length: ${model.n_window_size} |
| hop_length: ${model.n_window_stride} |
| window: ${model.window} |
| n_mels: ${model.n_mel_channels} |
| lowfreq: ${model.lowfreq} |
| highfreq: ${model.highfreq} |
| max_duration: null |
| min_duration: 0.1 |
| ignore_file: null |
| trim: false |
| pitch_fmin: ${model.pitch_fmin} |
| pitch_fmax: ${model.pitch_fmax} |
| |
| dataloader_params: |
| num_workers: 8 |
| pin_memory: false |
|
|
| batch_sampler: |
| batch_size: 32 |
| boundaries: [32,300,400,500,600,700,800,900,1000] |
| num_replicas: ${trainer.devices} |
| shuffle: true |
|
|
| validation_ds: |
| dataset: |
| _target_: "nemo.collections.tts.data.tts_dataset.TTSDataset" |
| manifest_filepath: ${validation_datasets} |
| sample_rate: ${model.sample_rate} |
| sup_data_path: ${sup_data_path} |
| sup_data_types: ${sup_data_types} |
| n_fft: ${model.n_fft} |
| win_length: ${model.n_window_size} |
| hop_length: ${model.n_window_stride} |
| window: ${model.window} |
| n_mels: ${model.n_mel_channels} |
| lowfreq: ${model.lowfreq} |
| highfreq: ${model.highfreq} |
| max_duration: null |
| min_duration: 0.1 |
| ignore_file: null |
| trim: false |
| pitch_fmin: ${model.pitch_fmin} |
| pitch_fmax: ${model.pitch_fmax} |
|
|
| dataloader_params: |
| drop_last: false |
| shuffle: false |
| batch_size: 32 |
| num_workers: 4 |
| pin_memory: false |
|
|
| preprocessor: |
| _target_: nemo.collections.asr.parts.preprocessing.features.FilterbankFeatures |
| nfilt: ${model.n_mel_channels} |
| highfreq: ${model.highfreq} |
| log: true |
| log_zero_guard_type: clamp |
| log_zero_guard_value: 1e-05 |
| lowfreq: ${model.lowfreq} |
| n_fft: ${model.n_fft} |
| n_window_size: ${model.n_window_size} |
| n_window_stride: ${model.n_window_stride} |
| pad_to: 1 |
| pad_value: 0 |
| sample_rate: ${model.sample_rate} |
| window: ${model.window} |
| normalize: null |
| preemph: null |
| dither: 0.0 |
| frame_splicing: 1 |
| stft_conv: false |
| nb_augmentation_prob: 0 |
| mag_power: 1.0 |
| exact_pad: true |
| use_grads: true |
| |
| synthesizer: |
| _target_: nemo.collections.tts.modules.vits_modules.SynthesizerTrn |
| inter_channels: 192 |
| hidden_channels: 192 |
| filter_channels: 768 |
| n_heads: 2 |
| n_layers: 6 |
| kernel_size: 3 |
| p_dropout: 0.1 |
| resblock: "1" |
| resblock_kernel_sizes: [3,7,11] |
| resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]] |
| upsample_rates: [8,8,4,2] |
| upsample_initial_channel: 512 |
| upsample_kernel_sizes: [16,16,4,4] |
| n_speakers: ${model.n_speakers} |
| gin_channels: 256 |
|
|
| optim: |
| _target_: torch.optim.AdamW |
| lr: 2e-4 |
| betas: [0.9, 0.99] |
| eps: 1e-9 |
| |
| sched: |
| name: CosineAnnealing |
| max_steps: 1000000 |
| min_lr: 1e-5 |
|
|
| trainer: |
| num_nodes: 1 |
| devices: 2 |
| accelerator: gpu |
| strategy: ddp |
| precision: 32 |
| |
| |
| |
| max_epochs: -1 |
| accumulate_grad_batches: 1 |
| enable_checkpointing: false |
| logger: false |
| log_every_n_steps: 50 |
| check_val_every_n_epoch: 1 |
|
|
| exp_manager: |
| exp_dir: ??? |
| name: ${name} |
| create_tensorboard_logger: true |
| create_checkpoint_callback: true |
| checkpoint_callback_params: |
| monitor: loss_gen_all |
| mode: min |
| resume_if_exists: false |
| resume_ignore_no_checkpoint: false |
|
|