# config/fast-conformer_aed.yaml
# This config contains the default values for training an autoregressive FastConformer-Transformer AED model with sub-word encoding.
# Architecture and training config:
# Default learning parameters in this config are set for an effective batch size of 2K. To train with smaller effective
# batch sizes, you may need to re-tune the learning parameters or use a higher accumulate_grad_batches.
# Below are the recommended configs for different variants of FastConformer-Transformer; other parameters are the same as in this config file.
# One extra (linear projection) layer is added between the FastConformer encoder and the Transformer decoder if they have different hidden sizes.
# It is recommended to initialize the FastConformer encoder from an ASR/SSL pre-trained checkpoint for better accuracy and faster convergence.
# Canary model family
# | Model | Num Params | encoder.n_layers | transf_decoder.config_dict.num_layers | transf_decoder.config_dict.max_sequence_length | model_defaults.asr_enc_hidden | model_defaults.lm_dec_hidden |
# |:--------------------:|:----------:|:-----------------:|:-------------------------------------:|:----------------------------------------------:|:-----------------------------:|:----------------------------:|
# | canary-1b | 1B | 24 | 24 | 512 | 1024 | 1024 |
# | canary-1b-flash | 883M | 32 | 4 | 1024 | 1024 | 1024 |
# | canary-180m-flash | 182M | 17 | 4 | 1024 | 512 | 1024 |
#
# a typical training manifest entry looks like this -
# {"audio_filepath": "/path/to/audio/file.wav", "duration": 16.192, "text": "Text spoken in the audio.", "source_lang": "en", "target_lang": "en", "taskname": "asr", "pnc": "yes"}
name: "FastConformer-Transformer-MultiTask"
# Note: for larger models (1B+ params), initializing from a pretrained encoder
# may help (or even be required) to stabilize training.
init_from_nemo_model:
model0:
path: "/home/ubuntu/NeMo_Canary/canary_results/Higurashi_ASR/checkpoints/Higurashi_ASR.nemo"
exclude: ["transf_decoder._embedding.token_embedding", "log_softmax.mlp.layer0"]
# init_from_pretrained_model:
# model0:
# name: "nvidia/canary-180m-flash"
# include: ["encoder"]
# If using the example training script, the settings below are used to instantiate the spl_tokens tokenizer.
# The same can be done manually by calling CanaryTokenizer.build_special_tokenizer(tokens, output_dir).
# If a tokenizer already exists in model_dir, building is skipped and the existing tokenizer is used.
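# A minimal Python sketch of pre-building the special tokens tokenizer via the helper mentioned above
# (the output directory is just a placeholder):
#   from nemo.collections.common.tokenizers.canary_tokenizer import CanaryTokenizer
#   CanaryTokenizer.build_special_tokenizer(["translate", "transcribe", "ja"], "/path/to/spl_tokens")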
spl_tokens:
model_dir: ???
tokens: ["translate", "transcribe", 'ja']
force_rebuild: False # Set to True to build new tokenizer each time.
model:
sample_rate: 16000
label_smoothing: 0.0
use_loss_mask_for_prompt: false
log_prediction: true # enables logging sample predictions in the output during training
# Important! Set the prompt format to the class you need
prompt_format: ??? # Options supported: ["canary", "canary2"]
prompt_defaults: null
model_defaults:
asr_enc_hidden: 1024
lm_enc_hidden: 512
lm_dec_hidden: 1024
train_ds:
use_lhotse: true
tarred_audio_filepaths: null
manifest_filepath: ???
sample_rate: ${model.sample_rate}
shuffle: true
num_workers: 4
# To understand the settings below, please refer to Lhotse Dataloading documentation:
# https://github.com/NVIDIA/NeMo/blob/main/docs/source/asr/datasets.rst#lhotse-dataloading
# You can also check the following configuration dataclass:
# https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/common/data/lhotse/dataloader.py#L36
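# With batch_size: null, Lhotse forms dynamic batches containing up to ~batch_duration seconds of audio;
# quadratic_duration adds a quadratic penalty to each cut's effective duration, so batches of long utterances end up smaller.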
batch_size: null
batch_duration: 3000
quadratic_duration: 15
use_bucketing: True
num_buckets: 20
bucket_buffer_size: 20000
shuffle_buffer_size: 10000
text_field: "text"
lang_field: "target_lang"
validation_ds:
use_lhotse: true
manifest_filepath: ???
sample_rate: ${model.sample_rate}
batch_size: 12 # you may increase batch_size if your memory allows
shuffle: false
num_workers: 4
pin_memory: true
use_start_end_token: true
use_bucketing: false
text_field: "text"
lang_field: "target_lang"
test_ds:
use_lhotse: true
manifest_filepath: ???
sample_rate: ${model.sample_rate}
batch_size: 8 # you may increase batch_size if your memory allows
shuffle: false
num_workers: 4
pin_memory: true
use_start_end_token: true
use_bucketing: false
# a small vocab size of 128 or 256 is recommended when using 4x sub-sampling
# you may find more detail on how to train a tokenizer in scripts/tokenizers/process_asr_text_tokenizer.py (in the NeMo repo)
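# For example (a sketch only; paths, vocab size, and exact flags are illustrative and may differ between NeMo versions):
#   python scripts/tokenizers/process_asr_text_tokenizer.py \
#     --manifest="/path/to/train_manifest.json" \
#     --data_root="/path/to/tokenizer_out" \
#     --vocab_size=1024 --tokenizer="spe" --spe_type="bpe" --log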
tokenizer:
dir: null # Null for aggregate tokenizers
type: agg # Can be either bpe (SentencePiece tokenizer) or wpe (WordPiece tokenizer) or `agg` for aggregate tokenizers
langs:
spl_tokens: # special tokens model
dir: null # Passed in training script
type: bpe
ja: # Japanese tokenizer (example; replace with, or add entries for, whichever languages you need)
dir: ???
type: bpe
custom_tokenizer:
_target_: nemo.collections.common.tokenizers.canary_tokenizer.CanaryTokenizer # Can be replaced with other tokenizer for different prompt formats
tokenizers: null # Filled at runtime by all the tokenizers inside the aggregate tokenizer
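# At runtime the per-language tokenizers above (plus spl_tokens) are wrapped into a single aggregate
# CanaryTokenizer; each utterance is tokenized with the sub-tokenizer matching its lang_field (here target_lang).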
# Audio Preprocessor
preprocessor:
_target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
sample_rate: ${model.sample_rate}
normalize: "per_feature"
window_size: 0.025
window_stride: 0.01
window: "hann"
features: 128
n_fft: 512
log: true
frame_splicing: 1
dither: 0.00001
pad_to: 0
pad_value: 0.0
# SpecAugment is applied either in the model or in the data layer
spec_augment:
_target_: nemo.collections.asr.modules.SpectrogramAugmentation
freq_masks: 2 # set to zero to disable it
# you may use lower time_masks for smaller models to have a faster convergence
time_masks: 10 # set to zero to disable it
freq_width: 27
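# a float time_width below 1.0 is adaptive: each time mask covers up to that fraction (here 5%) of the utterance length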
time_width: 0.05
# FastConformer Encoder
encoder:
_target_: nemo.collections.asr.modules.ConformerEncoder
feat_in: ${model.preprocessor.features}
feat_out: -1 # you may set it if you need different output size other than the default d_model
n_layers: 24
d_model: ${model.model_defaults.asr_enc_hidden}
# Sub-sampling params
subsampling: dw_striding # dw_striding, striding, or vggnet; vggnet may give better results but needs more memory
subsampling_factor: 8 # must be power of 2
subsampling_conv_channels: 256 # -1 sets it to d_model
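# With a 10 ms window_stride and 8x subsampling, the encoder produces one output frame per 80 ms of audio.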
causal_downsampling: false
reduction: null
reduction_position: null
reduction_factor: 1
# Feed forward module's params
ff_expansion_factor: 4
# Multi-headed Attention Module's params
self_attention_model: rel_pos # rel_pos or abs_pos
n_heads: 8 # may need to be lower for smaller d_models
# [left, right] specifies the number of steps to be seen from left and right of each step in self-attention
att_context_size: [-1, -1] # -1 means unlimited context
xscaling: false # scales up the input embeddings by sqrt(d_model)
untie_biases: true # unties the biases of the TransformerXL layers
pos_emb_max_len: 4000
# Convolution module's params
conv_kernel_size: 9
conv_norm_type: batch_norm
conv_context_size: null
### regularization
dropout: 0.1 # The dropout used in most of the Conformer Modules
dropout_pre_encoder: 0.1
dropout_emb: 0.0 # The dropout used for embeddings
dropout_att: 0.1 # The dropout for multi-headed attention modules
# Optional Transformer encoder sandwiched between the ASR encoder and the Transformer decoder.
# Only used if num_layers > 0
transf_encoder:
_target_: nemo.collections.asr.modules.transformer.transformer_encoders.TransformerEncoder
num_layers: 0
hidden_size: ${model.model_defaults.lm_enc_hidden}
inner_size: ${multiply:${model.model_defaults.lm_enc_hidden}, 4}
num_attention_heads: 8
ffn_dropout: 0.1
attn_score_dropout: 0.1
attn_layer_dropout: 0.1
mask_future: False
pre_ln: True
pre_ln_final_layer_norm: True
transf_decoder:
_target_: nemo.collections.asr.modules.transformer.get_nemo_transformer
model_name: null
pretrained: false
encoder: null
pre_ln_final_layer_norm: true
config_dict:
max_sequence_length: 512
num_token_types: 0
embedding_dropout: 0.1
learn_positional_encodings: false
hidden_size: ${model.model_defaults.lm_dec_hidden}
inner_size: ${multiply:${model.model_defaults.lm_dec_hidden}, 4}
num_layers: 24
num_attention_heads: 8
ffn_dropout: 0.1
attn_score_dropout: 0.1
attn_layer_dropout: 0.1
hidden_act: relu
pre_ln: true
vocab_size: None # Will be set by the model at runtime
# Label Prediction Head (Token Classifier)
head:
_target_: nemo.collections.asr.parts.submodules.token_classifier.TokenClassifier
num_layers: 1
activation: relu
log_softmax: true
hidden_size: ${model.transf_decoder.config_dict.hidden_size}
num_classes: None # Will be set by the model at runtime
dropout: 0.0
use_transformer_init: true
# Decoding Strategy
decoding:
strategy: beam
return_best_hypothesis: true # Returns the most probable hypothesis after beam search
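# In the beam config below: beam_size trades decoding speed for accuracy, and len_pen: 0.0 disables length
# normalization of beam scores.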
beam:
beam_size: 4
len_pen: 0.0
max_generation_delta: 50
# Loss Config
loss:
_target_: nemo.collections.common.losses.smoothed_cross_entropy.SmoothedCrossEntropyLoss
label_smoothing: ${model.label_smoothing}
pad_id: null
optim:
name: adamw
lr: 3e-4
# optimizer arguments
betas: [0.9, 0.98]
# less necessity for weight_decay as we already have large augmentations with SpecAug
# you may need weight_decay for large models, stable AMP training, small datasets, or when lower augmentations are used
# weight decay of 0.0 with lr of 2.0 also works fine
weight_decay: 1e-3
# scheduler setup
sched:
name: InverseSquareRootAnnealing
# scheduler config override
warmup_steps: 5000
warmup_ratio: null
min_lr: 1e-6
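# InverseSquareRootAnnealing ramps the LR up roughly linearly over warmup_steps, then decays it as ~1/sqrt(step),
# never dropping below min_lr.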
trainer:
devices: -1 # number of GPUs, -1 would use all available GPUs
num_nodes: 1
max_epochs: -1
max_steps: 100000 # computed at runtime if not set
val_check_interval: 1. # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
accelerator: auto
strategy:
_target_: lightning.pytorch.strategies.DDPStrategy
gradient_as_bucket_view: true
accumulate_grad_batches: 1
gradient_clip_val: 0.0
precision: bf16-mixed # Should be set to bf16-mixed/16-mixed for O1 and O2 to enable the AMP.
log_every_n_steps: 100 # Interval of logging.
enable_progress_bar: True
num_sanity_val_steps: 2 # number of validation steps to run as a sanity check before training starts; set to 0 to disable
check_val_every_n_epoch: 1 # run validation every n epochs
sync_batchnorm: true
enable_checkpointing: False # Provided by exp_manager
logger: false # Provided by exp_manager
use_distributed_sampler: false # Lhotse has its own distributed sampler
exp_manager:
exp_dir: null
name: ${name}
create_tensorboard_logger: true
create_checkpoint_callback: true
checkpoint_callback_params:
every_n_train_steps: 4990
every_n_epochs: null # must be set to null to use every_n_train_steps
monitor: "step" # want all checkpoints, so step + mode: max always succeeds
mode: "min"
save_top_k: 5 # save all models
save_last: True
always_save_nemo: True
# Alternative: checkpoint on validation loss instead of training step
# create_checkpoint_callback: true
# checkpoint_callback_params:
#   # in case of multiple validation sets, the first one is used
#   monitor: "val_loss"
#   mode: "min"
#   save_top_k: 5
#   always_save_nemo: True # saves the checkpoints as nemo files instead of PTL checkpoints
resume_from_checkpoint: /home/ubuntu/NeMo_Canary/canary_results/Higurashi_ASR_v.02/checkpoints/Higurashi_ASR_v.02--step=29940.0000-epoch=1-last.ckpt # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc.
# you need to set these two to True to continue the training
resume_if_exists: true
resume_ignore_no_checkpoint: true
# You may use this section to create a W&B logger
create_wandb_logger: false
wandb_logger_kwargs:
name: null
project: null