| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
# Experiment name. The exp_manager section reads it back through the ${name} interpolation.
| | name: &name "ContextNet-8x-Stride-SSL" |
| |
|
# Model configuration root; other sections refer to its values through ${model.*} interpolations.
# NOTE(review): leading indentation is not visible in this view, so which keys nest under `model`
# has to be inferred from those interpolation paths - confirm against the original file.
| | model: |
# Referenced as ${model.sample_rate} by train_ds, validation_ds and preprocessor (presumably Hz - confirm).
| | sample_rate: &sample_rate 16000 |
| |
|
# Training dataset settings.
| | train_ds: |
# `???` is OmegaConf's marker for a mandatory value with no default - assuming this file is loaded
# through OmegaConf/Hydra (suggested by the ${...} interpolations and _target_ keys), it must be
# supplied at launch time.
| | manifest_filepath: ??? |
| | sample_rate: ${model.sample_rate} |
| | batch_size: 16 |
| | trim_silence: false |
# Duration window for training samples (presumably seconds - confirm). validation_ds uses the same
# min_duration but sets no max_duration.
| | max_duration: 16.7 |
| | min_duration: 8.0 |
| | shuffle: true |
| | use_start_end_token: false |
| | num_workers: 16 |
| | pin_memory: true |
| | |
# Tarred-dataset options; disabled here (is_tarred is false and no tar paths are given).
| | is_tarred: false |
| | tarred_audio_filepaths: null |
| | tarred_shard_strategy: "scatter" |
| | shuffle_n: 2048 |
| | |
# Bucketing options; bucketing_batch_size is left unset (null).
| | bucketing_strategy: "synced_randomized" |
| | bucketing_batch_size: null |
| |
|
# Validation dataset settings: same sample rate as the model, no shuffling, and a smaller batch
# size than train_ds (8 vs 16).
| | validation_ds: |
# Mandatory value (`???`), must be provided by the caller.
| | manifest_filepath: ??? |
| | sample_rate: ${model.sample_rate} |
| | batch_size: 8 |
| | shuffle: false |
| | use_start_end_token: false |
| | num_workers: 16 |
| | pin_memory: true |
# Same lower bound as train_ds.min_duration; no upper bound is set for validation.
| | min_duration: 8.0 |
| |
|
# Shared values that the encoder entries, decoders and scheduler pull in through
# ${model.model_defaults.*} interpolations, so each is defined once.
| | model_defaults: |
# Used as `filters` by every jasper entry except the last one (which uses enc_hidden).
| | filters: 1024 |
# Used as `repeat` by every jasper entry except the first and last (both hard-code repeat: 1).
| | repeat: 5 |
# Used as `dropout` by every jasper entry except the first and last (both hard-code dropout: 0.0).
| | dropout: 0.1 |
| | separable: true |
| | se: true |
# NOTE(review): -1 presumably selects a whole-sequence squeeze-excitation context - confirm in the
# ConvASREncoder documentation.
| | se_context_size: -1 |
| | kernel_size_factor: 1.0 |
# Used as `filters` of the last jasper entry, `feat_in` of both decoders and `d_model` of the scheduler.
| | enc_hidden: 640 |
# Used as the contrastive decoder's `feat_out` and the contrastive loss's `proj_dim`.
| | decoder_out_channels: 128 |
| |
|
# Audio feature extractor; the class named in _target_ is presumably instantiated by Hydra with
# the remaining keys as arguments (confirm).
| | preprocessor: |
| | _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor |
| | sample_rate: ${model.sample_rate} |
| | normalize: "per_feature" |
# presumably seconds (25 ms window, 10 ms hop) - confirm against the preprocessor docs
| | window_size: 0.025 |
| | window_stride: 0.01 |
| | window: "hann" |
# Anchored as &n_mels: reused by encoder.feat_in (*n_mels) and read by the contrastive loss as
# ${model.preprocessor.features}.
| | features: &n_mels 80 |
| | n_fft: 512 |
| | frame_splicing: 1 |
| | dither: 0.00001 |
| | pad_to: 16 |
| | stft_conv: false |
| |
|
# Masking-based augmentation applied via the class named in _target_.
| | spec_augment: |
| | _target_: nemo.collections.asr.modules.MaskedPatchAugmentation |
| | freq_masks: 3 |
| | freq_width: 20 |
| | patch_size: 48 |
# NOTE(review): a value below 1 presumably means a fraction rather than a patch count - confirm
# in the MaskedPatchAugmentation documentation.
| | mask_patches: 0.5 |
| |
|
# Encoder settings; the per-block layout follows in the `jasper` list.
| | encoder: |
| | _target_: nemo.collections.asr.modules.ConvASREncoder |
# YAML alias of preprocessor.features (&n_mels, 80), so the encoder input width tracks the feature count.
| | feat_in: *n_mels |
| | activation: swish |
| | conv_mask: true |
| | init_mode: "tds_uniform" |
| |
|
# List of encoder block specifications (23 entries in total in this file).
| | jasper: |
# First entry: a single repeat with no residual and no dropout. Unlike every later entry it does
# not set kernel_size_factor.
| | - filters: ${model.model_defaults.filters} |
| | repeat: 1 |
| | kernel: [5] |
| | stride: [1] |
| | dilation: [1] |
| | dropout: 0.0 |
| | residual: false |
| | separable: ${model.model_defaults.separable} |
| | se: ${model.model_defaults.se} |
| | se_context_size: ${model.model_defaults.se_context_size} |
| |
|
# Entries 1-3: two stride-1 residual entries followed by the first stride-2 entry. All take
# filters/repeat/dropout/separable/se settings from model_defaults.
| | - filters: ${model.model_defaults.filters} |
| | repeat: ${model.model_defaults.repeat} |
| | kernel: [5] |
| | stride: [1] |
| | dilation: [1] |
| | dropout: ${model.model_defaults.dropout} |
| | residual: true |
| | separable: ${model.model_defaults.separable} |
| | se: ${model.model_defaults.se} |
| | se_context_size: ${model.model_defaults.se_context_size} |
| | kernel_size_factor: ${model.model_defaults.kernel_size_factor} |
| |
|
| | - filters: ${model.model_defaults.filters} |
| | repeat: ${model.model_defaults.repeat} |
| | kernel: [5] |
| | stride: [1] |
| | dilation: [1] |
| | dropout: ${model.model_defaults.dropout} |
| | residual: true |
| | separable: ${model.model_defaults.separable} |
| | se: ${model.model_defaults.se} |
| | se_context_size: ${model.model_defaults.se_context_size} |
| | kernel_size_factor: ${model.model_defaults.kernel_size_factor} |
| |
|
# First of three stride-2 entries in the list (2 * 2 * 2 = 8, consistent with "8x-Stride" in the
# experiment name). Stride entries additionally set stride_last and residual_mode: "stride_add".
| | - filters: ${model.model_defaults.filters} |
| | repeat: ${model.model_defaults.repeat} |
| | kernel: [5] |
| | stride: [2] |
| | dilation: [1] |
| | dropout: ${model.model_defaults.dropout} |
| | residual: true |
| | separable: ${model.model_defaults.separable} |
| | se: ${model.model_defaults.se} |
| | se_context_size: ${model.model_defaults.se_context_size} |
| | stride_last: true |
| | residual_mode: "stride_add" |
| | kernel_size_factor: ${model.model_defaults.kernel_size_factor} |
| |
|
# Entries 4-7: three stride-1 residual entries followed by the second stride-2 entry.
| | - filters: ${model.model_defaults.filters} |
| | repeat: ${model.model_defaults.repeat} |
| | kernel: [5] |
| | stride: [1] |
| | dilation: [1] |
| | dropout: ${model.model_defaults.dropout} |
| | residual: true |
| | separable: ${model.model_defaults.separable} |
| | se: ${model.model_defaults.se} |
| | se_context_size: ${model.model_defaults.se_context_size} |
| | kernel_size_factor: ${model.model_defaults.kernel_size_factor} |
| |
|
| | - filters: ${model.model_defaults.filters} |
| | repeat: ${model.model_defaults.repeat} |
| | kernel: [5] |
| | stride: [1] |
| | dilation: [1] |
| | dropout: ${model.model_defaults.dropout} |
| | residual: true |
| | separable: ${model.model_defaults.separable} |
| | se: ${model.model_defaults.se} |
| | se_context_size: ${model.model_defaults.se_context_size} |
| | kernel_size_factor: ${model.model_defaults.kernel_size_factor} |
| |
|
| | - filters: ${model.model_defaults.filters} |
| | repeat: ${model.model_defaults.repeat} |
| | kernel: [5] |
| | stride: [1] |
| | dilation: [1] |
| | dropout: ${model.model_defaults.dropout} |
| | residual: true |
| | separable: ${model.model_defaults.separable} |
| | se: ${model.model_defaults.se} |
| | se_context_size: ${model.model_defaults.se_context_size} |
| | kernel_size_factor: ${model.model_defaults.kernel_size_factor} |
| |
|
# Second stride-2 entry; same extra keys (stride_last, residual_mode) as the other stride entries.
| | - filters: ${model.model_defaults.filters} |
| | repeat: ${model.model_defaults.repeat} |
| | kernel: [5] |
| | stride: [2] |
| | dilation: [1] |
| | dropout: ${model.model_defaults.dropout} |
| | residual: true |
| | separable: ${model.model_defaults.separable} |
| | se: ${model.model_defaults.se} |
| | se_context_size: ${model.model_defaults.se_context_size} |
| | stride_last: true |
| | residual_mode: "stride_add" |
| | kernel_size_factor: ${model.model_defaults.kernel_size_factor} |
| |
|
# Entries 8-14: six stride-1 residual entries followed by the third (last) stride-2 entry.
| | - filters: ${model.model_defaults.filters} |
| | repeat: ${model.model_defaults.repeat} |
| | kernel: [5] |
| | stride: [1] |
| | dilation: [1] |
| | dropout: ${model.model_defaults.dropout} |
| | residual: true |
| | separable: ${model.model_defaults.separable} |
| | se: ${model.model_defaults.se} |
| | se_context_size: ${model.model_defaults.se_context_size} |
| | kernel_size_factor: ${model.model_defaults.kernel_size_factor} |
| |
|
| | - filters: ${model.model_defaults.filters} |
| | repeat: ${model.model_defaults.repeat} |
| | kernel: [5] |
| | stride: [1] |
| | dilation: [1] |
| | dropout: ${model.model_defaults.dropout} |
| | residual: true |
| | separable: ${model.model_defaults.separable} |
| | se: ${model.model_defaults.se} |
| | se_context_size: ${model.model_defaults.se_context_size} |
| | kernel_size_factor: ${model.model_defaults.kernel_size_factor} |
| |
|
| | - filters: ${model.model_defaults.filters} |
| | repeat: ${model.model_defaults.repeat} |
| | kernel: [5] |
| | stride: [1] |
| | dilation: [1] |
| | dropout: ${model.model_defaults.dropout} |
| | residual: true |
| | separable: ${model.model_defaults.separable} |
| | se: ${model.model_defaults.se} |
| | se_context_size: ${model.model_defaults.se_context_size} |
| | kernel_size_factor: ${model.model_defaults.kernel_size_factor} |
| |
|
| | - filters: ${model.model_defaults.filters} |
| | repeat: ${model.model_defaults.repeat} |
| | kernel: [5] |
| | stride: [1] |
| | dilation: [1] |
| | dropout: ${model.model_defaults.dropout} |
| | residual: true |
| | separable: ${model.model_defaults.separable} |
| | se: ${model.model_defaults.se} |
| | se_context_size: ${model.model_defaults.se_context_size} |
| | kernel_size_factor: ${model.model_defaults.kernel_size_factor} |
| |
|
| | - filters: ${model.model_defaults.filters} |
| | repeat: ${model.model_defaults.repeat} |
| | kernel: [5] |
| | stride: [1] |
| | dilation: [1] |
| | dropout: ${model.model_defaults.dropout} |
| | residual: true |
| | separable: ${model.model_defaults.separable} |
| | se: ${model.model_defaults.se} |
| | se_context_size: ${model.model_defaults.se_context_size} |
| | kernel_size_factor: ${model.model_defaults.kernel_size_factor} |
| |
|
| | - filters: ${model.model_defaults.filters} |
| | repeat: ${model.model_defaults.repeat} |
| | kernel: [5] |
| | stride: [1] |
| | dilation: [1] |
| | dropout: ${model.model_defaults.dropout} |
| | residual: true |
| | separable: ${model.model_defaults.separable} |
| | se: ${model.model_defaults.se} |
| | se_context_size: ${model.model_defaults.se_context_size} |
| | kernel_size_factor: ${model.model_defaults.kernel_size_factor} |
| |
|
# Third stride-2 entry; no further striding appears in the entries after this one.
| | - filters: ${model.model_defaults.filters} |
| | repeat: ${model.model_defaults.repeat} |
| | kernel: [5] |
| | stride: [2] |
| | dilation: [1] |
| | dropout: ${model.model_defaults.dropout} |
| | residual: true |
| | separable: ${model.model_defaults.separable} |
| | se: ${model.model_defaults.se} |
| | se_context_size: ${model.model_defaults.se_context_size} |
| | stride_last: true |
| | residual_mode: "stride_add" |
| | kernel_size_factor: ${model.model_defaults.kernel_size_factor} |
| |
|
# Entries 15-21: seven identical stride-1 residual entries, all driven by model_defaults.
| | - filters: ${model.model_defaults.filters} |
| | repeat: ${model.model_defaults.repeat} |
| | kernel: [5] |
| | stride: [1] |
| | dilation: [1] |
| | dropout: ${model.model_defaults.dropout} |
| | residual: true |
| | separable: ${model.model_defaults.separable} |
| | se: ${model.model_defaults.se} |
| | se_context_size: ${model.model_defaults.se_context_size} |
| | kernel_size_factor: ${model.model_defaults.kernel_size_factor} |
| |
|
| | - filters: ${model.model_defaults.filters} |
| | repeat: ${model.model_defaults.repeat} |
| | kernel: [5] |
| | stride: [1] |
| | dilation: [1] |
| | dropout: ${model.model_defaults.dropout} |
| | residual: true |
| | separable: ${model.model_defaults.separable} |
| | se: ${model.model_defaults.se} |
| | se_context_size: ${model.model_defaults.se_context_size} |
| | kernel_size_factor: ${model.model_defaults.kernel_size_factor} |
| |
|
| | - filters: ${model.model_defaults.filters} |
| | repeat: ${model.model_defaults.repeat} |
| | kernel: [5] |
| | stride: [1] |
| | dilation: [1] |
| | dropout: ${model.model_defaults.dropout} |
| | residual: true |
| | separable: ${model.model_defaults.separable} |
| | se: ${model.model_defaults.se} |
| | se_context_size: ${model.model_defaults.se_context_size} |
| | kernel_size_factor: ${model.model_defaults.kernel_size_factor} |
| |
|
| | - filters: ${model.model_defaults.filters} |
| | repeat: ${model.model_defaults.repeat} |
| | kernel: [5] |
| | stride: [1] |
| | dilation: [1] |
| | dropout: ${model.model_defaults.dropout} |
| | residual: true |
| | separable: ${model.model_defaults.separable} |
| | se: ${model.model_defaults.se} |
| | se_context_size: ${model.model_defaults.se_context_size} |
| | kernel_size_factor: ${model.model_defaults.kernel_size_factor} |
| |
|
| | - filters: ${model.model_defaults.filters} |
| | repeat: ${model.model_defaults.repeat} |
| | kernel: [5] |
| | stride: [1] |
| | dilation: [1] |
| | dropout: ${model.model_defaults.dropout} |
| | residual: true |
| | separable: ${model.model_defaults.separable} |
| | se: ${model.model_defaults.se} |
| | se_context_size: ${model.model_defaults.se_context_size} |
| | kernel_size_factor: ${model.model_defaults.kernel_size_factor} |
| |
|
| | - filters: ${model.model_defaults.filters} |
| | repeat: ${model.model_defaults.repeat} |
| | kernel: [5] |
| | stride: [1] |
| | dilation: [1] |
| | dropout: ${model.model_defaults.dropout} |
| | residual: true |
| | separable: ${model.model_defaults.separable} |
| | se: ${model.model_defaults.se} |
| | se_context_size: ${model.model_defaults.se_context_size} |
| | kernel_size_factor: ${model.model_defaults.kernel_size_factor} |
| |
|
| | - filters: ${model.model_defaults.filters} |
| | repeat: ${model.model_defaults.repeat} |
| | kernel: [5] |
| | stride: [1] |
| | dilation: [1] |
| | dropout: ${model.model_defaults.dropout} |
| | residual: true |
| | separable: ${model.model_defaults.separable} |
| | se: ${model.model_defaults.se} |
| | se_context_size: ${model.model_defaults.se_context_size} |
| | kernel_size_factor: ${model.model_defaults.kernel_size_factor} |
| |
|
# Final entry: switches the width from model_defaults.filters to model_defaults.enc_hidden, which
# is also what both decoders read as feat_in. Single repeat, no residual, no dropout.
| | - filters: ${model.model_defaults.enc_hidden} |
| | repeat: 1 |
| | kernel: [5] |
| | stride: [1] |
| | dilation: [1] |
| | dropout: 0.0 |
| | residual: false |
| | separable: ${model.model_defaults.separable} |
| | se: ${model.model_defaults.se} |
| | se_context_size: ${model.model_defaults.se_context_size} |
| | kernel_size_factor: ${model.model_defaults.kernel_size_factor} |
| |
|
# Named training losses; each entry pairs a `decoder` with a `loss`. This file defines two:
# `contrastive` (here) and `mlm`.
| | loss_list: |
| | contrastive: |
| | decoder: |
| | _target_: nemo.collections.asr.modules.ConvASRDecoderReconstruction |
# Decoder input width follows the last encoder entry (both read model_defaults.enc_hidden).
| | feat_in: ${model.model_defaults.enc_hidden} |
| | feat_hidden: 128 |
| | |
# Same interpolation as loss.proj_dim below, so decoder output and loss projection sizes stay equal.
| | feat_out: ${model.model_defaults.decoder_out_channels} |
# NOTE(review): presumably one transposed stride layer brings the 8x-strided encoder output to the
# rate implied by loss.combine_time_steps: 4 - confirm against the decoder/loss documentation.
| | stride_layers: 1 |
| | |
| | |
| | |
| | non_stride_layers: 0 |
| | stride_transpose: true |
| | apply_softmax: false |
| | loss: |
| | _target_: nemo.collections.asr.losses.ContrastiveLoss |
# Reads the preprocessor feature count (80) directly.
| | in_dim: ${model.preprocessor.features} |
| | proj_dim: ${model.model_defaults.decoder_out_channels} |
| | combine_time_steps: 4 |
| | quantized_targets: true |
| | |
# codebook_size ** num_groups = 300 ** 2 = 90000, which equals mlm.decoder.feat_out in this file;
# presumably the two must be changed together (confirm).
| | codebook_size: 300 |
| | num_groups: 2 |
| | num_negatives: 100 |
| | sample_from_same_utterance_only: true |
| | sample_from_non_masked: false |
| |
|
# Second loss entry; its targets come from the `contrastive` entry (see targets_from_loss).
| | mlm: |
| | decoder: |
| | _target_: nemo.collections.asr.modules.ConvASRDecoderReconstruction |
| | feat_in: ${model.model_defaults.enc_hidden} |
| | feat_hidden: 128 |
| | |
# Hard-coded; equals contrastive.loss codebook_size ** num_groups (300 ** 2). NOTE(review):
# presumably one output class per combined codebook entry - keep in sync if those values change.
| | feat_out: 90000 |
| | |
| | stride_layers: 1 |
| | stride_transpose: true |
| | activation: "identity" |
# Unlike the contrastive decoder (apply_softmax: false), this decoder applies softmax.
| | apply_softmax: true |
| | loss: |
| | _target_: nemo.collections.asr.losses.MLMLoss |
# Same value as contrastive.loss.combine_time_steps.
| | combine_time_steps: 4 |
# Names the loss_list entry that supplies the targets.
| | targets_from_loss: "contrastive" |
# NOTE(review): presumably a weight applied to this loss when losses are combined - confirm.
| | loss_alpha: 1000. |
| |
|
# Optimizer settings, followed by the learning-rate scheduler (`sched`).
| | optim: |
| | name: adamw |
# NOTE(review): 5.0 is large for a raw AdamW step size; presumably NoamAnnealing treats it as a
# scale factor combined with d_model and warmup_steps - confirm before changing the scheduler.
| | lr: 5.0 |
| | |
| | betas: [0.9, 0.98] |
# Written without a decimal point. OmegaConf reads this as a float, but plain PyYAML (YAML 1.1
# float rules) would read `1e-3` as a string - relevant if the file is parsed outside OmegaConf.
| | weight_decay: 1e-3 |
| |
|
| | |
# Learning-rate schedule.
| | sched: |
| | name: NoamAnnealing |
# Tied to the encoder output width (model_defaults.enc_hidden, 640).
| | d_model: ${model.model_defaults.enc_hidden} |
| | |
# Warm-up is given in steps; the ratio alternative is left unset (null).
| | warmup_steps: 25000 |
| | warmup_ratio: null |
# Same exponent-only float form as weight_decay above.
| | min_lr: 1e-6 |
| |
|
# Trainer settings (key names match PyTorch Lightning's Trainer arguments - presumably passed
# through to it; confirm against the training script).
| | trainer: |
# NOTE(review): -1 presumably means "use all available devices" - confirm.
| | devices: -1 |
| | num_nodes: 1 |
# NOTE(review): max_steps is -1 while max_epochs is 1000, so the run is presumably bounded by
# epochs only - confirm.
| | max_epochs: 1000 |
| | max_steps: -1 |
| | val_check_interval: 1.0 |
| | accelerator: auto |
| | strategy: ddp |
| | accumulate_grad_batches: 1 |
| | gradient_clip_val: 1.0 |
| | precision: 32 |
| | log_every_n_steps: 10 |
# `True`/`False` are capitalised here and on enable_checkpointing, unlike the lowercase booleans
# used elsewhere in this file; both spellings are YAML booleans.
| | enable_progress_bar: True |
| | resume_from_checkpoint: null |
| | num_sanity_val_steps: 0 |
| | check_val_every_n_epoch: 1 |
| | sync_batchnorm: true |
# Checkpointing and logging are switched off here, while exp_manager sets
# create_checkpoint_callback and create_tensorboard_logger to true - presumably exp_manager
# provides both instead (confirm).
| | enable_checkpointing: False |
| | logger: false |
| | benchmark: false |
| |
|
# Experiment-manager settings: output location, loggers, checkpointing and resume behaviour.
| | exp_manager: |
| | exp_dir: null |
# Resolves to the top-level `name` value.
| | name: ${name} |
| | create_tensorboard_logger: true |
| | create_checkpoint_callback: true |
# Checkpoint selection: track "val_loss", lower is better ("min"), keep the 5 best.
| | checkpoint_callback_params: |
| | |
| | monitor: "val_loss" |
| | mode: "min" |
| | save_top_k: 5 |
| |
|
| | |
# Resume behaviour; both flags are off in this file.
| | resume_if_exists: false |
| | resume_ignore_no_checkpoint: false |
| |
|
| | |
# Weights & Biases logging is disabled; run name and project are left unset (null).
| | create_wandb_logger: false |
| | wandb_logger_kwargs: |
| | name: null |
| | project: null |
| |
|