# Top-level defaults for training the audio codec model.
# The model and trainer sections reference these keys through ${...}
# interpolation, so most per-dataset changes are made here.
# A value of ??? is the OmegaConf mandatory-value marker: it has to be supplied
# (for example as a command-line override) before the key can be read.

name: AudioCodec

max_epochs: ???
max_steps: 200000

batch_size: 32

# Forwarded to model.steps_per_epoch and to the training dataset.
# NOTE(review): null presumably disables weighted sampling; confirm against
# the dataset implementation.
weighted_sampling_steps_per_epoch: null

# Dataset metadata for the training and validation datasets.
train_ds_meta: ???
val_ds_meta: ???

# Dataset metadata and output directory used by model.log_config.
log_ds_meta: ???
log_dir: ???

# Audio settings. up_sample_rates is down_sample_rates in reverse order.
sample_rate: 16000
train_n_samples: 16000
down_sample_rates: [2, 4, 5, 5]
up_sample_rates: [5, 5, 4, 2]
# 200 = 2 * 4 * 5 * 5, the product of down_sample_rates.
# NOTE(review): presumably must be kept in sync when the rates change.
samples_per_frame: 200

# Model configuration. Each ${...} value is an interpolation that resolves to
# the top-level key named inside the braces.
# NOTE(review): the nesting below was rebuilt from flattened text. Confirm that
# `sched` belongs under `optim` and that `generators`, `dataset` and
# `dataloader_params` belong under `log_config`.
model:
  max_epochs: ${max_epochs}
  steps_per_epoch: ${weighted_sampling_steps_per_epoch}
  max_steps: ${max_steps}

  sample_rate: ${sample_rate}
  samples_per_frame: ${samples_per_frame}

  # Scale factors applied to the individual loss terms.
  mel_loss_l1_scale: 1.0
  mel_loss_l2_scale: 1.0
  stft_loss_scale: 0.0
  time_domain_loss_scale: 0.1

  # Discriminator update schedule: 2 updates per period of 3.
  # NOTE(review): the period is presumably counted in training steps; confirm.
  disc_updates_per_period: 2
  disc_update_period: 3

  # One triple per loss resolution; mel_loss_dims has one entry per triple.
  # NOTE(review): triples are presumably [n_fft, hop_length, win_length].
  loss_resolutions:
    - [32, 8, 32]
    - [64, 16, 64]
    - [128, 32, 128]
    - [256, 64, 256]
    - [512, 128, 512]
    - [1024, 256, 1024]
    - [2048, 512, 2048]
  mel_loss_dims: [64, 64, 64, 64, 64, 64, 64]
  # Written with an explicit mantissa dot so that YAML 1.1 loaders also parse
  # the value as a float rather than a string.
  mel_loss_log_guard: 1.0e-5
  stft_loss_log_guard: 1.0

  train_ds:
    dataset:
      _target_: nemo.collections.tts.data.vocoder_dataset.VocoderDataset
      weighted_sampling_steps_per_epoch: ${weighted_sampling_steps_per_epoch}
      sample_rate: ${sample_rate}
      n_samples: ${train_n_samples}
      min_duration: 1.01
      max_duration: null
      dataset_meta: ${train_ds_meta}

    dataloader_params:
      batch_size: ${batch_size}
      drop_last: true
      num_workers: 4

  validation_ds:
    dataset:
      _target_: nemo.collections.tts.data.vocoder_dataset.VocoderDataset
      sample_rate: ${sample_rate}
      n_samples: null
      min_duration: null
      max_duration: null
      trunc_duration: 10.0
      dataset_meta: ${val_ds_meta}

    dataloader_params:
      batch_size: 8
      num_workers: 2

  # Artifact logging during training, written under ${log_dir}.
  # NOTE(review): removing this section presumably disables artifact logging;
  # confirm against the model code.
  log_config:
    log_dir: ${log_dir}
    log_epochs: [1, 2, 3, 4, 5, 6]
    epoch_frequency: 1
    log_tensorboard: false
    log_wandb: false

    generators:
      - _target_: nemo.collections.tts.parts.utils.callbacks.AudioCodecArtifactGenerator
        log_audio: true
        log_encoding: false
        log_dequantized: false

    dataset:
      _target_: nemo.collections.tts.data.vocoder_dataset.VocoderDataset
      sample_rate: ${sample_rate}
      n_samples: null
      min_duration: null
      max_duration: null
      trunc_duration: 15.0
      dataset_meta: ${log_ds_meta}

    dataloader_params:
      batch_size: 4
      num_workers: 2

  audio_encoder:
    _target_: nemo.collections.tts.modules.encodec_modules.SEANetEncoder
    down_sample_rates: ${down_sample_rates}

  audio_decoder:
    _target_: nemo.collections.tts.modules.encodec_modules.SEANetDecoder
    up_sample_rates: ${up_sample_rates}

  vector_quantizer:
    _target_: nemo.collections.tts.modules.encodec_modules.ResidualVectorQuantizer
    num_codebooks: 8

  # Uses the five largest of the seven loss_resolutions triples.
  discriminator:
    _target_: nemo.collections.tts.modules.encodec_modules.MultiResolutionDiscriminatorSTFT
    resolutions:
      - [128, 32, 128]
      - [256, 64, 256]
      - [512, 128, 512]
      - [1024, 256, 1024]
      - [2048, 512, 2048]

  generator_loss:
    _target_: nemo.collections.tts.losses.audio_codec_loss.GeneratorSquaredLoss

  discriminator_loss:
    _target_: nemo.collections.tts.losses.audio_codec_loss.DiscriminatorSquaredLoss

  optim:
    _target_: torch.optim.AdamW
    # Explicit mantissa dot: a bare `1e-4` is read as a string by YAML 1.1
    # loaders that lack a patched float resolver.
    lr: 1.0e-4
    betas: [0.8, 0.9]

    # With step_size 1 the learning rate is multiplied by gamma on every
    # scheduler step.
    sched:
      name: StepLR
      gamma: 0.999996
      step_size: 1

# Trainer settings. max_steps and max_epochs are taken from the top-level keys
# of the same name.
trainer:
  num_nodes: 1
  devices: 1
  accelerator: gpu
  strategy: ddp_find_unused_parameters_true
  precision: 32
  max_steps: ${max_steps}
  max_epochs: ${max_epochs}
  accumulate_grad_batches: 1
  # The trainer's own checkpointing and logger are switched off here, while
  # exp_manager sets create_checkpoint_callback and create_tensorboard_logger
  # to true.
  enable_checkpointing: false
  logger: false
  log_every_n_steps: 100
  check_val_every_n_epoch: 1
  benchmark: false

# Experiment manager settings: experiment naming, logger and checkpoint
# callback creation, and resume behaviour.
exp_manager:
  exp_dir: null
  name: ${name}
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  create_wandb_logger: false
  # NOTE(review): only `monitor` is nested here; the resume_* keys are placed
  # at exp_manager level. The nesting was rebuilt from flattened text; confirm.
  checkpoint_callback_params:
    monitor: val_loss
  resume_if_exists: false
  resume_ignore_no_checkpoint: false