# Speaker-model configuration "TitaNet-L".
#
# NOTE(review): the nesting below was rebuilt from a flattened copy of this
# file (table-pipe wrappers removed, indentation restored). Scalar values are
# unchanged; confirm the hierarchy against the consuming code before use.
#
# Conventions used in this file:
#   - `&anchor` / `*alias` share a single value between several keys.
#   - `${a.b.c}` is an interpolation resolved by the config loader
#     (presumably OmegaConf/Hydra — confirm).
#   - `???` marks a value the user must supply (presumably OmegaConf's
#     mandatory-value marker — confirm).

# Anchors reused further down via *name and *sample_rate.
name: &name "TitaNet-L"
sample_rate: &sample_rate 16000

model:
  # Training data and on-the-fly augmentation.
  train_ds:
    manifest_filepath: ???
    sample_rate: 16000
    labels: null
    batch_size: 64
    shuffle: true
    is_tarred: false
    tarred_audio_filepaths: null
    tarred_shard_strategy: "scatter"
    augmentor:
      noise:
        manifest_path: null
        prob: 0.5
        min_snr_db: 0
        max_snr_db: 15

      speed:
        prob: 0.3
        sr: *sample_rate
        resample_type: 'kaiser_fast'
        min_speed_rate: 0.95
        max_speed_rate: 1.05

  # Validation data: larger batch, no shuffling, no augmentation configured.
  validation_ds:
    manifest_filepath: ???
    sample_rate: 16000
    labels: null
    batch_size: 128
    shuffle: false

  # Shared defaults; the encoder blocks read these through
  # ${model.model_defaults.*} interpolations.
  model_defaults:
    filters: 1024
    repeat: 3
    dropout: 0.1
    separable: true
    se: true
    se_context_size: -1
    kernel_size_factor: 1.0

  preprocessor:
    _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
    normalize: "per_feature"
    window_size: 0.025
    sample_rate: *sample_rate
    window_stride: 0.01
    window: "hann"
    # Anchored so the encoder's feat_in stays in sync with the feature count.
    features: &n_mels 80
    n_fft: 512
    frame_splicing: 1
    dither: 0.00001

  spec_augment:
    _target_: nemo.collections.asr.modules.SpectrogramAugmentation
    freq_masks: 3
    freq_width: 4
    time_masks: 5
    time_width: 0.03

  encoder:
    _target_: nemo.collections.asr.modules.ConvASREncoder
    feat_in: *n_mels
    activation: relu
    conv_mask: true

    # NOTE(review): `jasper` is placed under `encoder` by reconstruction —
    # confirm. Five blocks: a non-residual entry block (kernel 3), three
    # residual blocks (kernels 7, 11, 15) that take repeat/dropout from
    # model_defaults, and a non-residual 1x1 block that sets the output width.
    jasper:
      - filters: ${model.model_defaults.filters}
        repeat: 1
        kernel: [3]
        stride: [1]
        dilation: [1]
        dropout: 0.0
        residual: false
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}

      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [7]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}

      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [11]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}

      - filters: ${model.model_defaults.filters}
        repeat: ${model.model_defaults.repeat}
        kernel: [15]
        stride: [1]
        dilation: [1]
        dropout: ${model.model_defaults.dropout}
        residual: true
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}

      # Anchored so the decoder's feat_in matches the encoder output width.
      - filters: &enc_feat_out 3072
        repeat: 1
        kernel: [1]
        stride: [1]
        dilation: [1]
        dropout: 0.0
        residual: false
        separable: ${model.model_defaults.separable}
        se: ${model.model_defaults.se}
        se_context_size: ${model.model_defaults.se_context_size}

  decoder:
    _target_: nemo.collections.asr.modules.SpeakerDecoder
    feat_in: *enc_feat_out
    num_classes: 7205
    pool_mode: 'attention'
    emb_sizes: 192

  loss:
    _target_: nemo.collections.asr.losses.angularloss.AngularSoftmaxLoss
    scale: 30
    margin: 0.2

  optim:
    name: sgd
    lr: 0.006
    weight_decay: 0.0002
    momentum: 0.9

    # NOTE(review): `sched` is nested under `optim` by reconstruction —
    # confirm against the consuming code.
    sched:
      name: CosineAnnealing
      warmup_ratio: 0.1
      min_lr: 0.0

trainer:
  devices: 1
  max_epochs: 250
  max_steps: -1
  num_nodes: 1
  accelerator: gpu
  strategy: ddp
  deterministic: true
  # The trainer's own checkpointing and logger are off, while exp_manager
  # below enables a TensorBoard logger and a checkpoint callback
  # (presumably exp_manager owns both — confirm).
  enable_checkpointing: false
  logger: false
  log_every_n_steps: 1
  val_check_interval: 1.0
  gradient_clip_val: 1.0

exp_manager:
  exp_dir: null
  name: *name
  create_tensorboard_logger: true
  create_checkpoint_callback: true