# Training configuration for the experiment named by `name` (MelCodec).
#
# Conventions used below (OmegaConf/Hydra syntax):
#   * `???`      - mandatory value with no default; it must be supplied at launch.
#   * `${key}`   - interpolation; resolves to the value of the referenced top-level key.
#   * `_target_` - dotted path of the class that the section is instantiated as.
#
# NOTE(review): this file was restored from a copy whose indentation had been flattened and whose
# lines were wrapped in table pipes. Keys, values and their order are unchanged; the nesting was
# reconstructed from the section layout and should be confirmed against the consuming code.

name: MelCodec

max_epochs: ???
# Per-step batch size used by the training dataloader (model.train_ds.dataloader_params).
batch_size: 16
# Forwarded to both model.steps_per_epoch and the training dataset.
# null here; presumably that disables weighted sampling - confirm with VocoderDataset.
weighted_sampling_steps_per_epoch: null

# Dataset metadata for the training and validation datasets (user-supplied).
train_ds_meta: ???
val_ds_meta: ???

# Dataset metadata and output directory used by model.log_config (user-supplied).
log_ds_meta: ???
log_dir: ???

# Audio / spectrogram parameters shared by the datasets, encoder and decoder below.
sample_rate: 22050
win_length: 1024
hop_length: 256
train_n_samples: 8192  # 8192 / 22050 Hz is roughly 0.37 seconds per training example
# 8 * 8 * 2 * 2 = 256, which equals hop_length.
# Presumably the product must stay equal to hop_length - confirm with HiFiGANDecoder.
up_sample_rates: [8, 8, 2, 2]

model:
  max_epochs: ${max_epochs}
  steps_per_epoch: ${weighted_sampling_steps_per_epoch}

  sample_rate: ${sample_rate}
  samples_per_frame: ${hop_length}

  # Weights of the individual loss terms; terms set to 0.0 do not contribute.
  mel_loss_l1_scale: 1.0
  mel_loss_l2_scale: 0.0
  stft_loss_scale: 20.0
  time_domain_loss_scale: 0.0
  commit_loss_scale: 0.0

  # Discriminator update schedule.
  # Presumably 1 discriminator update per period of 2 training steps - confirm with the model code.
  disc_updates_per_period: 1
  disc_update_period: 2

  # Seven resolutions for the spectral losses; mel_loss_dims also has seven entries,
  # presumably paired with the resolutions by position.
  # NOTE(review): each triple looks like [n_fft, hop_length, win_length] - confirm.
  loss_resolutions: [
    [32, 8, 32], [64, 16, 64], [128, 32, 128], [256, 64, 256], [512, 128, 512], [1024, 256, 1024], [2048, 512, 2048]
  ]
  mel_loss_dims: [5, 10, 20, 40, 80, 160, 320]
  mel_loss_log_guard: 1.0
  stft_loss_log_guard: 1.0
  feature_loss_type: absolute

  # Training data: fixed-length excerpts of train_n_samples samples.
  train_ds:
    dataset:
      _target_: nemo.collections.tts.data.vocoder_dataset.VocoderDataset
      dataset_meta: ${train_ds_meta}
      weighted_sampling_steps_per_epoch: ${weighted_sampling_steps_per_epoch}
      sample_rate: ${sample_rate}
      n_samples: ${train_n_samples}
      min_duration: 0.4
      max_duration: null

    dataloader_params:
      batch_size: ${batch_size}
      drop_last: true
      num_workers: 4

  # Validation data: no fixed n_samples and no duration filter; trunc_duration is set to 10.0.
  validation_ds:
    dataset:
      _target_: nemo.collections.tts.data.vocoder_dataset.VocoderDataset
      sample_rate: ${sample_rate}
      n_samples: null
      min_duration: null
      max_duration: null
      trunc_duration: 10.0
      dataset_meta: ${val_ds_meta}

    dataloader_params:
      batch_size: 4
      num_workers: 2

  # Artifact logging during training; output goes to log_dir.
  # Both tensorboard and wandb logging are switched off here.
  log_config:
    log_dir: ${log_dir}
    log_epochs: [10, 50, 100, 150, 200]
    epoch_frequency: 100
    log_tensorboard: false
    log_wandb: false

    generators:
      - _target_: nemo.collections.tts.parts.utils.callbacks.AudioCodecArtifactGenerator
        log_audio: true
        log_encoding: false
        log_dequantized: false

    # Same dataset settings as validation_ds, but reading log_ds_meta.
    dataset:
      _target_: nemo.collections.tts.data.vocoder_dataset.VocoderDataset
      sample_rate: ${sample_rate}
      n_samples: null
      min_duration: null
      max_duration: null
      trunc_duration: 10.0
      dataset_meta: ${log_ds_meta}

    dataloader_params:
      batch_size: 4
      num_workers: 2

  # Eight mel bands of width 10 covering indices 0..80 (mel_dim is 80).
  # 8 bands * out_channels 4 = 32, which equals audio_decoder.input_dim.
  audio_encoder:
    _target_: nemo.collections.tts.modules.audio_codec_modules.MultiBandMelEncoder
    mel_bands: [[0, 10], [10, 20], [20, 30], [30, 40], [40, 50], [50, 60], [60, 70], [70, 80]]
    out_channels: 4
    hidden_channels: 128
    filters: 256
    mel_processor:
      _target_: nemo.collections.tts.modules.audio_codec_modules.MelSpectrogramProcessor
      mel_dim: 80
      sample_rate: ${sample_rate}
      win_length: ${win_length}
      hop_length: ${hop_length}

  audio_decoder:
    _target_: nemo.collections.tts.modules.audio_codec_modules.HiFiGANDecoder
    up_sample_rates: ${up_sample_rates}
    input_dim: 32
    base_channels: 1024

  # 8 groups with 4 level entries each; presumably 8 * 4 = 32 matches the
  # encoder output / decoder input width - confirm with GroupFiniteScalarQuantizer.
  vector_quantizer:
    _target_: nemo.collections.tts.modules.audio_codec_modules.GroupFiniteScalarQuantizer
    num_groups: 8
    num_levels_per_group: [8, 5, 5, 5]

  # Wrapper combining a multi-resolution STFT discriminator and a multi-period discriminator.
  discriminator:
    _target_: nemo.collections.tts.modules.audio_codec_modules.Discriminator
    discriminators:
      - _target_: nemo.collections.tts.modules.encodec_modules.MultiResolutionDiscriminatorSTFT
        resolutions: [[128, 32, 128], [256, 64, 256], [512, 128, 512], [1024, 256, 1024], [2048, 512, 2048]]
      - _target_: nemo.collections.tts.modules.audio_codec_modules.MultiPeriodDiscriminator

  generator_loss:
    _target_: nemo.collections.tts.losses.audio_codec_loss.GeneratorSquaredLoss

  discriminator_loss:
    _target_: nemo.collections.tts.losses.audio_codec_loss.DiscriminatorSquaredLoss

  optim:
    _target_: torch.optim.Adam
    lr: 2e-4
    betas: [0.8, 0.99]

    # NOTE(review): scheduler placed under optim - confirm this is where the model reads it.
    sched:
      name: ExponentialLR
      gamma: 0.998

trainer:
  num_nodes: 1
  devices: 1
  accelerator: gpu
  strategy: ddp_find_unused_parameters_true
  precision: 16
  max_epochs: ${max_epochs}
  accumulate_grad_batches: 1
  # Trainer-side checkpointing and logging are off; exp_manager below has
  # create_checkpoint_callback enabled - presumably it owns checkpointing. Confirm.
  enable_checkpointing: False
  logger: false
  log_every_n_steps: 100
  check_val_every_n_epoch: 5
  benchmark: false

exp_manager:
  exp_dir: null
  name: ${name}
  create_tensorboard_logger: false
  create_wandb_logger: false
  wandb_logger_kwargs:
    name: null
    project: null
  create_checkpoint_callback: true
  # Keep the 5 checkpoints with the lowest val_loss.
  checkpoint_callback_params:
    monitor: val_loss
    mode: min
    save_top_k: 5
    save_best_model: true
    always_save_nemo: true
  resume_if_exists: false
  resume_ignore_no_checkpoint: false