# Default values for self-supervised (SSL) pre-training of the ContextNet encoder.
# In contrast to the original ContextNet, the same number of filters is used throughout the model.
# The default learning parameters are set for an effective batch size of 1K. To train with a
# smaller effective batch size, you may need to re-tune the learning parameters or use a higher
# accumulate_grad_batches.
# Recommended settings for the different ContextNet variants (all other parameters are the same
# as in this file):
#
#  +--------------+---------+------------+
#  | Model        | filters | time_masks |
#  +==============+=========+============+
#  | Small  (14M) |   256   |     2      |
#  +--------------+---------+------------+
#  | Medium (40M) |   512   |     5      |
#  +--------------+---------+------------+
#  | Large (145M) |  1024   |    10      |
#  +--------------+---------+------------+
#
# `filters` is model.model_defaults.filters; the default below (1024) is the Large variant.
# NOTE(review): this file defines no `time_masks` key (spec_augment below only has freq_masks,
# freq_width, patch_size and mask_patches) - confirm which setting that column is meant to map to.

name: &name "ContextNet-8x-Stride-SSL"

model:
  sample_rate: &sample_rate 16000

  train_ds:
    # `???` is the OmegaConf marker for a mandatory value: it must be supplied by the user.
    manifest_filepath: ???
    sample_rate: ${model.sample_rate}
    batch_size: 16  # Can be increased if memory allows or when using smaller model
    trim_silence: false
    max_duration: 16.7
    min_duration: 8.0
    shuffle: true
    use_start_end_token: false
    num_workers: 16
    pin_memory: true
    # tarred datasets
    is_tarred: false
    tarred_audio_filepaths: null
    tarred_shard_strategy: "scatter"
    shuffle_n: 2048
    # bucketing params
    bucketing_strategy: "synced_randomized"
    bucketing_batch_size: null

  validation_ds:
    # `???` is the OmegaConf marker for a mandatory value: it must be supplied by the user.
    manifest_filepath: ???
    sample_rate: ${model.sample_rate}
    batch_size: 8
    shuffle: false
    use_start_end_token: false
    num_workers: 16
    pin_memory: true
    min_duration: 8.0

  # Shared values referenced through ${model.model_defaults.*} by the encoder blocks,
  # the decoders, the contrastive loss and the scheduler below.
  model_defaults:
    filters: 1024
    repeat: 5
    dropout: 0.1
    separable: true
    se: true
    se_context_size: -1
    kernel_size_factor: 1.0
    enc_hidden: 640
    decoder_out_channels: 128

  preprocessor:
    _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
    sample_rate: ${model.sample_rate}
    normalize: "per_feature"
    window_size: 0.025
    window_stride: 0.01
    window: "hann"
    features: &n_mels 80
    n_fft: 512
    frame_splicing: 1
    dither: 0.00001
    pad_to: 16
    stft_conv: false

  spec_augment:
    _target_: nemo.collections.asr.modules.MaskedPatchAugmentation
    freq_masks: 3
    freq_width: 20
    patch_size: 48
    mask_patches: 0.5

  encoder:
    _target_: nemo.collections.asr.modules.ConvASREncoder
    feat_in: *n_mels
    activation: swish
    conv_mask: true
    init_mode: "tds_uniform"

    # 23 blocks in total. Blocks 3, 7 and 14 use stride 2, which multiplies out to the
    # 8x stride referred to in `name` and in the decoder comment further down.
    jasper:
      # Block 0: single repeat, no residual, no dropout.
      - filters: ${model.model_defaults.filters}
        repeat: 1
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: 0.0
        residual: false
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}

      # Blocks 1-2: stride 1.
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # Block 3: first stride-2 block.
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [2]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        stride_last: true
        residual_mode: "stride_add"
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # Blocks 4-6: stride 1.
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # Block 7: second stride-2 block.
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [2]  # *stride
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        stride_last: true
        residual_mode: "stride_add"
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # Blocks 8-13: stride 1.
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # Block 14: third stride-2 block.
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [2]  # stride
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        stride_last: true
        residual_mode: "stride_add"
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # Blocks 15-21: stride 1.
      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

      # Block 22: final block, outputs enc_hidden channels (single repeat, no residual, no dropout).
      - filters: ${model.model_defaults.enc_hidden}
        repeat: 1
        kernel: [5]
        stride: [1]
        dilation: [1]
        dropout: 0.0
        residual: false
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}
        kernel_size_factor: ${model.model_defaults.kernel_size_factor}

  # Each entry pairs a decoder with a loss.
  # NOTE(review): `targets_from_loss` and `loss_alpha` are placed as siblings of
  # `decoder`/`loss` under `mlm`, following the NeMo SSL config layout - confirm.
  loss_list:
    contrastive:
      decoder:
        _target_: nemo.collections.asr.modules.ConvASRDecoderReconstruction
        feat_in: ${model.model_defaults.enc_hidden}
        # features in hidden layer of decoder
        feat_hidden: 128
        feat_out: ${model.model_defaults.decoder_out_channels}
        # if loss.combine_time_steps is different than the encoder stride,
        # then a corresponding amount of stride_layers needs to
        # be added to the decoder (here stride is 8 and combine_time_steps is 4)
        stride_layers: 1
        non_stride_layers: 0
        stride_transpose: true
        apply_softmax: false
      loss:
        _target_: nemo.collections.asr.losses.ContrastiveLoss
        in_dim: ${model.preprocessor.features}
        proj_dim: ${model.model_defaults.decoder_out_channels}
        # how many spectrogram time steps are used for one target/representation for contrastive task
        combine_time_steps: 4
        # should quantizer or linear layer be used
        # (quantizer is required to extract pseudo-labels for other losses)
        quantized_targets: true
        codebook_size: 300  # number of vectors in the quantization codebook per group
        num_groups: 2  # number of groups in the quantizer codebook
        num_negatives: 100  # number of sampled negatives for each target
        # should negatives be sampled only from the same utterance
        sample_from_same_utterance_only: true
        # should negatives be sampled from non-masked steps
        sample_from_non_masked: false

    mlm:
      decoder:
        _target_: nemo.collections.asr.modules.ConvASRDecoderReconstruction
        feat_in: ${model.model_defaults.enc_hidden}
        # features in hidden layer of decoder
        feat_hidden: 128
        # this should be equal to codebook_size^groups in the contrastive loss to match the targets
        # (here 300^2 = 90000)
        feat_out: 90000
        stride_layers: 1
        stride_transpose: true
        activation: "identity"
        apply_softmax: true
      loss:
        _target_: nemo.collections.asr.losses.MLMLoss
        combine_time_steps: 4
      targets_from_loss: "contrastive"
      loss_alpha: 1000.0

  optim:
    name: adamw
    lr: 5.0
    # optimizer arguments
    betas: [0.9, 0.98]
    # Written with an explicit mantissa dot so every YAML loader resolves it as a float.
    weight_decay: 1.0e-3

    # scheduler setup
    sched:
      name: NoamAnnealing
      d_model: ${model.model_defaults.enc_hidden}
      # scheduler config override
      warmup_steps: 25000
      warmup_ratio: null
      # Written with an explicit mantissa dot so every YAML loader resolves it as a float.
      min_lr: 1.0e-6

trainer:
  devices: -1  # number of GPUs, -1 would use all available GPUs
  num_nodes: 1
  max_epochs: 1000
  max_steps: -1  # computed at runtime if not set
  # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
  val_check_interval: 1.0
  accelerator: auto
  strategy: ddp
  accumulate_grad_batches: 1
  gradient_clip_val: 1.0
  precision: 32  # Should be set to 16 for O1 and O2 to enable the AMP.
  log_every_n_steps: 10  # Interval of logging.
  enable_progress_bar: true
  # The path to a checkpoint file to continue the training, restores the whole state
  # including the epoch, step, LR schedulers, apex, etc.
  resume_from_checkpoint: null
  # number of validation steps to run as a sanity check of the validation process
  # before starting the training, setting to 0 disables it
  num_sanity_val_steps: 0
  check_val_every_n_epoch: 1  # number of evaluations on validation every n epochs
  sync_batchnorm: true
  enable_checkpointing: false  # Provided by exp_manager
  logger: false  # Provided by exp_manager
  # needs to be false for models with variable-length speech input as it slows down training
  benchmark: false

exp_manager:
  exp_dir: null
  name: ${name}
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    # in case of multiple validation sets, first one is used
    monitor: "val_loss"
    mode: "min"
    save_top_k: 5

  # you need to set these two to true to continue the training
  resume_if_exists: false
  resume_ignore_no_checkpoint: false

  # You may use this section to create a W&B logger
  create_wandb_logger: false
  wandb_logger_kwargs:
    name: null
    project: null