# This file contains the default values for training an autoregressive FastConformer-Transformer AED model with sub-word encoding.
# Architecture and training config:
# Default learning parameters in this config are set for an effective batch size of 2K. To train with smaller effective
# batch sizes, you may need to re-tune the learning parameters or use a higher accumulate_grad_batches.
# Below are the recommended configs for different variants of FastConformer-Transformer; all other parameters are the same as in this config file.
# One extra (linear projection) layer is added between the FastConformer encoder and the Transformer decoder if they have different hidden sizes.
# It is recommended to initialize FastConformer with an ASR/SSL pre-trained encoder for better accuracy and faster convergence.
#
# Canary model family:
# |       Model       | Num Params | encoder.n_layers | transf_decoder.config_dict.num_layers | transf_decoder.config_dict.max_sequence_length | model_defaults.asr_enc_hidden | model_defaults.lm_dec_hidden |
# |:-----------------:|:----------:|:----------------:|:-------------------------------------:|:----------------------------------------------:|:-----------------------------:|:----------------------------:|
# |     canary-1b     |     1B     |        24        |                  24                   |                      512                       |             1024              |             1024             |
# |  canary-1b-flash  |    883M    |        32        |                  4                    |                      1024                      |             1024              |             1024             |
# | canary-180m-flash |    182M    |        17        |                  4                    |                      1024                      |              512              |             1024             |
#
# A typical training manifest entry looks like this:
# {"audio_filepath": "/path/to/audio/file.wav", "duration": 16.192, "text": "Text spoken in the audio.", "source_lang": "en", "target_lang": "en", "taskname": "asr", "pnc": "yes"}
| name: "FastConformer-Transformer-MultiTask" | |
| # Note: for larger models (1B+ params) initializing from a pretrained encoder | |
| # may help (or even be required to) stabilize the training. | |
init_from_nemo_model:
  model0:
    path: "/home/ubuntu/NeMo_Canary/canary_results/Higurashi_ASR/checkpoints/Higurashi_ASR.nemo"
    exclude: ["transf_decoder._embedding.token_embedding", "log_softmax.mlp.layer0"]

# init_from_pretrained_model:
#   model0:
#     name: "nvidia/canary-180m-flash"
#     include: ["encoder"]
# If using the example training script, the settings below are used to instantiate the spl_tokens tokenizer.
# The same can be done manually by calling CanaryTokenizer.build_special_tokenizer(tokens, output_dir).
# If a tokenizer already exists in model_dir, building is skipped and the existing tokenizer is used.
spl_tokens:
  model_dir: ???
  tokens: ["translate", "transcribe", "ja"]
  force_rebuild: False # Set to True to build a new tokenizer each time.
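# A minimal sketch of building this special-tokens tokenizer ahead of time instead of letting the
# training script do it; the import path is taken from model.tokenizer.custom_tokenizer._target_ below,
# and the output directory is whatever spl_tokens.model_dir points at:
#   from nemo.collections.common.tokenizers.canary_tokenizer import CanaryTokenizer
#   CanaryTokenizer.build_special_tokenizer(["translate", "transcribe", "ja"], "/path/to/spl_tokens")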

model:
  sample_rate: 16000
  label_smoothing: 0.0
  use_loss_mask_for_prompt: false
  log_prediction: true # enables logging sample predictions in the output during training

  # Important! Set the prompt format to the one your model and tokenizer expect.
  prompt_format: ??? # Options supported: ["canary", "canary2"]
  prompt_defaults: null

  model_defaults:
    asr_enc_hidden: 1024
    lm_enc_hidden: 512
    lm_dec_hidden: 1024
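  # Example, taken from the Canary model family table above: to configure the smaller canary-180m-flash
  # variant, override the following values (everything else stays as in this file):
  #   model.model_defaults.asr_enc_hidden=512
  #   model.model_defaults.lm_dec_hidden=1024
  #   model.encoder.n_layers=17
  #   model.transf_decoder.config_dict.num_layers=4
  #   model.transf_decoder.config_dict.max_sequence_length=1024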
  train_ds:
    use_lhotse: true
    tarred_audio_filepaths: null
    manifest_filepath: ???
    sample_rate: ${model.sample_rate}
    shuffle: true
    num_workers: 4
    # To understand the settings below, please refer to the Lhotse dataloading documentation:
    # https://github.com/NVIDIA/NeMo/blob/main/docs/source/asr/datasets.rst#lhotse-dataloading
    # You can also check the following configuration dataclass:
    # https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/common/data/lhotse/dataloader.py#L36
    batch_size: null
    batch_duration: 3000
    quadratic_duration: 15
    use_bucketing: True
    num_buckets: 20
    bucket_buffer_size: 20000
    shuffle_buffer_size: 10000
    text_field: "text"
    lang_field: "target_lang"
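    # Rough sanity check for the "effective batch size of 2K" note at the top of this file (hedged,
    # back-of-the-envelope arithmetic, not an exact Lhotse guarantee): with batch_duration=3000 s per GPU
    # and utterances averaging around 12 s, each step sees roughly 3000 / 12 = 250 cuts per GPU, so
    # 8 GPUs with accumulate_grad_batches=1 gives an effective batch of about 2000 cuts.
    # Scale batch_duration, the GPU count, or accumulate_grad_batches if your setup differs.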
  validation_ds:
    use_lhotse: true
    manifest_filepath: ???
    sample_rate: ${model.sample_rate}
    batch_size: 12 # you may increase batch_size if your memory allows
    shuffle: false
    num_workers: 4
    pin_memory: true
    use_start_end_token: true
    use_bucketing: false
    text_field: "text"
    lang_field: "target_lang"
  test_ds:
    use_lhotse: true
    manifest_filepath: ???
    sample_rate: ${model.sample_rate}
    batch_size: 8 # you may increase batch_size if your memory allows
    shuffle: false
    num_workers: 4
    pin_memory: true
    use_start_end_token: true
    use_bucketing: false
  # A small vocab size of 128 or 256 is recommended when using 4x sub-sampling.
  # For details on how to train a tokenizer, see: /scripts/tokenizers/process_asr_text_tokenizer.py
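  # A hedged example of training a SentencePiece BPE tokenizer for the "ja" entry below with that script;
  # the flag names and vocab size are assumptions (check the script's --help in your NeMo checkout):
  #   python scripts/tokenizers/process_asr_text_tokenizer.py \
  #     --manifest=/path/to/train_manifest.json \
  #     --data_root=/path/to/tokenizers/ja \
  #     --vocab_size=1024 --tokenizer=spe --spe_type=bpe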
  tokenizer:
    dir: null # null for aggregate tokenizers
    type: agg # Can be either bpe (SentencePiece tokenizer), wpe (WordPiece tokenizer), or agg (aggregate tokenizer)
    langs:
      spl_tokens: # special tokens model
        dir: null # Passed in the training script
        type: bpe
      ja: # Japanese tokenizer (example; add an entry like this for each additional language)
        dir: ???
        type: bpe
    custom_tokenizer:
      _target_: nemo.collections.common.tokenizers.canary_tokenizer.CanaryTokenizer # Can be replaced with another tokenizer for different prompt formats
      tokenizers: null # Filled at runtime by all the tokenizers inside the aggregate tokenizer
  # Audio Preprocessor
  preprocessor:
    _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
    sample_rate: ${model.sample_rate}
    normalize: "per_feature"
    window_size: 0.025
    window_stride: 0.01
    window: "hann"
    features: 128
    n_fft: 512
    log: true
    frame_splicing: 1
    dither: 0.00001
    pad_to: 0
    pad_value: 0.0
  # SpecAugment is applied either in the model or in the data layer
  spec_augment:
    _target_: nemo.collections.asr.modules.SpectrogramAugmentation
    freq_masks: 2 # set to zero to disable it
    # you may use fewer time masks for smaller models for faster convergence
    time_masks: 10 # set to zero to disable it
    freq_width: 27
    time_width: 0.05
  # FastConformer Encoder
  encoder:
    _target_: nemo.collections.asr.modules.ConformerEncoder
    feat_in: ${model.preprocessor.features}
    feat_out: -1 # set it only if you need an output size different from the default d_model
    n_layers: 24
    d_model: ${model.model_defaults.asr_enc_hidden}

    # Sub-sampling parameters
    subsampling: dw_striding # vggnet, striding, or dw_striding; vggnet may give better results but needs more memory
    subsampling_factor: 8 # must be a power of 2
    subsampling_conv_channels: 256 # -1 sets it to d_model
    causal_downsampling: false
    reduction: null
    reduction_position: null
    reduction_factor: 1
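    # Quick arithmetic on the resulting frame rate (approximate; padding and edge effects are ignored):
    # the preprocessor hop is window_stride = 0.01 s, so with subsampling_factor = 8 each encoder output
    # frame covers about 80 ms, and the 16.192 s manifest example above yields roughly 16.192 / 0.08 = 202 frames.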
    # Feed forward module's params
    ff_expansion_factor: 4

    # Multi-headed Attention Module's params
    self_attention_model: rel_pos # rel_pos or abs_pos
    n_heads: 8 # may need to be lower for smaller d_models
    # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention
    att_context_size: [-1, -1] # -1 means unlimited context
    xscaling: false # scales up the input embeddings by sqrt(d_model)
    untie_biases: true # unties the biases of the TransformerXL layers
    pos_emb_max_len: 4000

    # Convolution module's params
    conv_kernel_size: 9
    conv_norm_type: batch_norm
    conv_context_size: null

    ### regularization
    dropout: 0.1 # The dropout used in most of the Conformer Modules
    dropout_pre_encoder: 0.1
    dropout_emb: 0.0 # The dropout used for embeddings
    dropout_att: 0.1 # The dropout for multi-headed attention modules
  # Optional Transformer encoder sandwiched between the ASR encoder and the Transformer decoder.
  # Only used if num_layers > 0
  transf_encoder:
    _target_: nemo.collections.asr.modules.transformer.transformer_encoders.TransformerEncoder
    num_layers: 0
    hidden_size: ${model.model_defaults.lm_enc_hidden}
    inner_size: ${multiply:${model.model_defaults.lm_enc_hidden}, 4}
    num_attention_heads: 8
    ffn_dropout: 0.1
    attn_score_dropout: 0.1
    attn_layer_dropout: 0.1
    mask_future: False
    pre_ln: True
    pre_ln_final_layer_norm: True
  transf_decoder:
    _target_: nemo.collections.asr.modules.transformer.get_nemo_transformer
    model_name: null
    pretrained: false
    encoder: null
    pre_ln_final_layer_norm: true

    config_dict:
      max_sequence_length: 512
      num_token_types: 0
      embedding_dropout: 0.1
      learn_positional_encodings: false
      hidden_size: ${model.model_defaults.lm_dec_hidden}
      inner_size: ${multiply:${model.model_defaults.lm_dec_hidden}, 4}
      num_layers: 24
      num_attention_heads: 8
      ffn_dropout: 0.1
      attn_score_dropout: 0.1
      attn_layer_dropout: 0.1
      hidden_act: relu
      pre_ln: true
      vocab_size: None # Will be set by the model at runtime
  # Label Prediction Head (Token Classifier)
  head:
    _target_: nemo.collections.asr.parts.submodules.token_classifier.TokenClassifier
    num_layers: 1
    activation: relu
    log_softmax: true
    hidden_size: ${model.transf_decoder.config_dict.hidden_size}
    num_classes: None # Will be set by the model at runtime
    dropout: 0.0
    use_transformer_init: true
  # Decoding Strategy
  decoding:
    strategy: beam
    return_best_hypothesis: true # Return only the most probable hypothesis after beam search
    beam:
      beam_size: 4
      len_pen: 0.0
      max_generation_delta: 50
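  # For faster validation or debugging you may shrink the beam; with beam_size set to 1, beam search
  # effectively reduces to greedy decoding. For example, override at launch time with:
  #   model.decoding.beam.beam_size=1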
  # Loss Config
  loss:
    _target_: nemo.collections.common.losses.smoothed_cross_entropy.SmoothedCrossEntropyLoss
    label_smoothing: ${model.label_smoothing}
    pad_id: null
  optim:
    name: adamw
    lr: 3e-4
    # optimizer arguments
    betas: [0.9, 0.98]
    # There is less need for weight_decay since SpecAugment already provides strong regularization;
    # you may need weight_decay for large models, stable AMP training, small datasets, or when weaker augmentation is used.
    # A weight decay of 0.0 with an lr of 2.0 also works fine.
    weight_decay: 1e-3

    # scheduler setup
    sched:
      name: InverseSquareRootAnnealing
      # scheduler config override
      warmup_steps: 5000
      warmup_ratio: null
      min_lr: 1e-6
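  # A hedged worked example of this schedule, assuming the standard inverse-square-root decay
  # (lr is approximately peak_lr * sqrt(warmup_steps / step) after warmup): the lr ramps up to 3e-4
  # over the first 5000 steps, and by step 20000 it has decayed to roughly 3e-4 * sqrt(5000 / 20000) = 1.5e-4,
  # never dropping below min_lr.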

trainer:
  devices: -1 # number of GPUs; -1 uses all available GPUs
  num_nodes: 1
  max_epochs: -1
  max_steps: 100000 # computed at runtime if not set
  val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or to an int for a number of iterations
  accelerator: auto
  strategy:
    _target_: lightning.pytorch.strategies.DDPStrategy
    gradient_as_bucket_view: true
  accumulate_grad_batches: 1
  gradient_clip_val: 0.0
  precision: bf16-mixed # Set to bf16-mixed or 16-mixed to enable AMP
  log_every_n_steps: 100 # Interval of logging.
  enable_progress_bar: True
  num_sanity_val_steps: 2 # number of validation steps to run as a sanity check before training starts; set to 0 to disable
  check_val_every_n_epoch: 1 # run validation every n epochs
  sync_batchnorm: true
  enable_checkpointing: False # Provided by exp_manager
  logger: false # Provided by exp_manager
  use_distributed_sampler: false # Lhotse has its own distributed sampler
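# A hedged example of launching training with this config; the script path below assumes the standard
# NeMo examples layout (examples/asr/speech_multitask/speech_to_text_aed.py), so adjust it to match your checkout:
#   python examples/asr/speech_multitask/speech_to_text_aed.py \
#     --config-path=<dir containing this file> --config-name=<this file's name> \
#     model.prompt_format=canary2 \
#     spl_tokens.model_dir=/path/to/spl_tokens \
#     model.tokenizer.langs.ja.dir=/path/to/ja_tokenizer \
#     model.train_ds.manifest_filepath=/path/to/train_manifest.json \
#     model.validation_ds.manifest_filepath=/path/to/val_manifest.json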

exp_manager:
  exp_dir: null
  name: ${name}
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    every_n_train_steps: 4990
    every_n_epochs: null # must be set to null to use every_n_train_steps
    monitor: "step" # monitoring "step" with mode "max" ranks the newest checkpoints highest
    mode: "max"
    save_top_k: 5 # keeps only the 5 most recent checkpoints; set to -1 to keep all of them
    save_last: True
    always_save_nemo: True # also saves the checkpoints as .nemo files
  # Alternative: keep the best checkpoints by validation loss instead
  # checkpoint_callback_params:
  #   # in case of multiple validation sets, the first one is used
  #   monitor: "val_loss"
  #   mode: "min"
  #   save_top_k: 5
  #   always_save_nemo: True
  resume_from_checkpoint: /home/ubuntu/NeMo_Canary/canary_results/Higurashi_ASR_v.02/checkpoints/Higurashi_ASR_v.02--step=29940.0000-epoch=1-last.ckpt # The path to a checkpoint file to continue the training from; restores the whole state, including the epoch, step, LR schedulers, apex, etc.
  # you need to set these two to true to continue the training
  resume_if_exists: true
  resume_ignore_no_checkpoint: true
  # You may use this section to create a W&B logger
  create_wandb_logger: false
  wandb_logger_kwargs:
    name: null
    project: null
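  # A hedged example of enabling Weights & Biases logging at launch time; the run and project names
  # below are hypothetical placeholders:
  #   exp_manager.create_wandb_logger=true \
  #   exp_manager.wandb_logger_kwargs.name="higurashi-asr-ja" \
  #   exp_manager.wandb_logger_kwargs.project="canary-finetune"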