# Experiment name. It is reused by exp_manager.name through the ${name}
# interpolation.
name: flow_matching_generative_ssl_pretraining

# Model section: datasets, spectrogram encoder/decoder, estimator network,
# flow, sampler, SSL masking, loss, validation metrics and optimizer.
#
# Floats in scientific notation are written with an explicit mantissa dot
# (1.0e-4 rather than 1e-4) so that YAML 1.1 loaders also parse them as
# floats instead of strings.
model:
  type: flow_matching
  sample_rate: 16000
  skip_nan_grad: true
  num_outputs: 1
  p_cond: 0.9  # presumably the probability of using the conditional input - confirm
  normalize_input: true
  max_utts_evaluation_metrics: 125

  train_ds:
    shar_path: ???  # mandatory value (OmegaConf `???`); must be supplied by the user
    use_lhotse: true
    truncate_duration: 4.09  # presumably in seconds - confirm against the dataset code
    truncate_offset_type: random
    batch_size: 8
    shuffle: true
    num_workers: 8
    pin_memory: true

  validation_ds:
    manifest_filepath: ???  # mandatory value (OmegaConf `???`); must be supplied by the user
    # Input and target read the same manifest key.
    input_key: clean_filepath
    target_key: clean_filepath
    random_offset: false
    batch_size: 8
    shuffle: false
    num_workers: 4
    pin_memory: true

  log_config:
    log_tensorboard: true
    log_wandb: false
    max_utts: 8

  encoder:
    _target_: nemo.collections.audio.modules.transforms.AudioToSpectrogram
    # NOTE(review): for a one-sided STFT this gives 510 // 2 + 1 = 256
    # frequency bins - confirm against AudioToSpectrogram.
    fft_length: 510
    hop_length: 128
    magnitude_power: 0.5
    scale: 0.33

  # The decoder takes all of its parameters from the encoder via interpolation.
  decoder:
    _target_: nemo.collections.audio.modules.transforms.SpectrogramToAudio
    fft_length: ${model.encoder.fft_length}
    hop_length: ${model.encoder.hop_length}
    magnitude_power: ${model.encoder.magnitude_power}
    scale: ${model.encoder.scale}

  estimator:
    _target_: nemo.collections.audio.parts.submodules.transformerunet.SpectrogramTransformerUNet
    in_channels: 2
    out_channels: 1
    depth: 24
    ff_dropout: 0.1
    time_hidden_dim: 1024

  flow:
    _target_: nemo.collections.audio.parts.submodules.flow.OptimalTransportFlow
    sigma_start: 1.0
    sigma_end: 1.0e-4

  sampler:
    _target_: nemo.collections.audio.parts.submodules.flow.ConditionalFlowMatchingEulerSampler
    num_steps: 20
    time_min: 1.0e-8
    time_max: 1.0

  ssl_pretrain_masking:
    _target_: nemo.collections.audio.modules.ssl_pretrain_masking.SSLPretrainWithMaskedPatch
    patch_size: 10
    mask_fraction: 0.7

  loss:
    _target_: nemo.collections.audio.losses.MSELoss
    ndim: 4

  metrics:
    val:
      sisdr:
        _target_: torchmetrics.audio.ScaleInvariantSignalDistortionRatio
      estoi:
        _target_: torchmetrics.audio.ShortTimeObjectiveIntelligibility
        fs: ${model.sample_rate}
        extended: true
      pesq:
        _target_: torchmetrics.audio.PerceptualEvaluationSpeechQuality
        fs: ${model.sample_rate}
        mode: wb

  optim:
    name: adam
    lr: 5.0e-5

    # Optimizer arguments.
    betas: [0.9, 0.999]
    weight_decay: 0.0

    # Learning-rate scheduler.
    sched:
      name: CosineAnnealing

      warmup_steps: 5000
      warmup_ratio: null
      min_lr: 1.0e-5

# Trainer settings (the keys look like PyTorch Lightning Trainer arguments -
# confirm against the consuming script).
trainer:
  devices: -1
  num_nodes: 1
  max_epochs: -1
  max_steps: 10000
  limit_train_batches: 1000
  val_check_interval: 1.0
  accelerator: auto
  strategy: ddp
  use_distributed_sampler: false
  accumulate_grad_batches: 1
  gradient_clip_val: 0.2
  precision: 32
  log_every_n_steps: 25
  enable_progress_bar: true
  num_sanity_val_steps: 0
  check_val_every_n_epoch: 1
  sync_batchnorm: true
  # NOTE(review): checkpointing and logging are disabled here, presumably
  # because the exp_manager section configures both - confirm.
  enable_checkpointing: false
  logger: false

# Experiment manager: output directory, EMA, loggers, checkpointing,
# early stopping and resume behaviour.
exp_manager:
  exp_dir: null
  name: ${name}  # resolves to the top-level experiment name

  # Exponential moving average settings.
  ema:
    enable: true
    decay: 0.999
    cpu_offload: false
    every_n_steps: 1
    validate_original_weights: false

  # Logging.
  create_tensorboard_logger: true

  # Checkpointing.
  create_checkpoint_callback: true
  checkpoint_callback_params:
    # NOTE(review): checkpoints are ranked by val_pesq while early stopping
    # (below) watches val_sisdr - confirm that this split is intentional.
    monitor: val_pesq
    mode: max
    save_top_k: 3
    always_save_nemo: true

  # Early stopping.
  create_early_stopping_callback: true
  early_stopping_callback_params:
    monitor: val_sisdr
    mode: max
    min_delta: 0.0
    patience: 20
    verbose: true
    strict: false

  # Resume behaviour; all three resume options are off by default.
  resume_from_checkpoint: null
  resume_if_exists: false
  resume_ignore_no_checkpoint: false

  # Weights & Biases logger (disabled). name and project are left unset.
  create_wandb_logger: false
  wandb_logger_kwargs:
    name: null
    project: null