# This file contains the default values for training an autoregressive FastConformer-Transformer AED model with sub-word encoding.
# Architecture and training config:
# Default learning parameters in this config are set for an effective batch size of 2K. To train with smaller effective
# batch sizes, you may need to re-tune the learning parameters or use a higher accumulate_grad_batches.
# Below are the recommended configs for different variants of FastConformer-Transformer; all other parameters are the same as in this config file.
# One extra (linear projection) layer is added between the FastConformer encoder and the Transformer decoder if they have different hidden sizes.
# It is recommended to initialize FastConformer with an ASR/SSL pre-trained encoder for better accuracy and faster convergence.
#
# Canary model family:
# |       Model       | Num Params | encoder.n_layers | transf_decoder.config_dict.num_layers | transf_decoder.config_dict.max_sequence_length | model_defaults.asr_enc_hidden | model_defaults.lm_dec_hidden |
# |:-----------------:|:----------:|:----------------:|:-------------------------------------:|:----------------------------------------------:|:-----------------------------:|:----------------------------:|
# |     canary-1b     |     1B     |        24        |                  24                   |                      512                       |             1024              |             1024             |
# |  canary-1b-flash  |    883M    |        32        |                  4                    |                      1024                      |             1024              |             1024             |
# | canary-180m-flash |    182M    |        17        |                  4                    |                      1024                      |              512              |             1024             |
#
# A typical training manifest entry looks like this:
# {"audio_filepath": "/path/to/audio/file.wav", "duration": 16.192, "text": "Text spoken in the audio.", "source_lang": "en", "target_lang": "en", "taskname": "asr", "pnc": "yes"}
| name: "FastConformer-Transformer-MultiTask" | |
| # Note: for larger models (1B+ params) initializing from a pretrained encoder | |
| # may help (or even be required to) stabilize the training. | |
init_from_nemo_model:
  model0:
    path: "/home/ubuntu/NeMo_Canary/canary_results/Higurashi_ASR/checkpoints/Higurashi_ASR.nemo"
    exclude: ["transf_decoder._embedding.token_embedding", "log_softmax.mlp.layer0"]

# init_from_pretrained_model:
#   model0:
#     name: "nvidia/canary-180m-flash"
#     include: ["encoder"]
# If using the example training script, the settings below are used to instantiate the spl_tokens tokenizer.
# The same can be done manually by calling CanaryTokenizer.build_special_tokenizer(tokens, output_dir).
# If a tokenizer already exists in model_dir, building is skipped and the existing tokenizer is used.
spl_tokens:
  model_dir: ???
  tokens: ["translate", "transcribe", "ja"]
  force_rebuild: False # Set to True to build a new tokenizer each time.
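# A minimal sketch of building this special-tokens tokenizer ahead of time instead of letting the
# training script do it; the import path is taken from model.tokenizer.custom_tokenizer._target_ below,
# and the output directory is whatever spl_tokens.model_dir points at:
#   from nemo.collections.common.tokenizers.canary_tokenizer import CanaryTokenizer
#   CanaryTokenizer.build_special_tokenizer(["translate", "transcribe", "ja"], "/path/to/spl_tokens")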

model:
  sample_rate: 16000
  label_smoothing: 0.0
  use_loss_mask_for_prompt: false
  log_prediction: true # enables logging sample predictions in the output during training

  # Important! Set the prompt format to the one your model and tokenizer expect.
  prompt_format: ??? # Options supported: ["canary", "canary2"]
  prompt_defaults: null

  model_defaults:
    asr_enc_hidden: 1024
    lm_enc_hidden: 512
    lm_dec_hidden: 1024
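  # Example, taken from the Canary model family table above: to configure the smaller canary-180m-flash
  # variant, override the following values (everything else stays as in this file):
  #   model.model_defaults.asr_enc_hidden=512
  #   model.model_defaults.lm_dec_hidden=1024
  #   model.encoder.n_layers=17
  #   model.transf_decoder.config_dict.num_layers=4
  #   model.transf_decoder.config_dict.max_sequence_length=1024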
  train_ds:
    use_lhotse: true
    tarred_audio_filepaths: null
    manifest_filepath: ???
    sample_rate: ${model.sample_rate}
    shuffle: true
    num_workers: 4
    # To understand the settings below, please refer to the Lhotse dataloading documentation:
    # https://github.com/NVIDIA/NeMo/blob/main/docs/source/asr/datasets.rst#lhotse-dataloading
    # You can also check the following configuration dataclass:
    # https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/common/data/lhotse/dataloader.py#L36
    batch_size: null
    batch_duration: 3000
    quadratic_duration: 15
    use_bucketing: True
    num_buckets: 20
    bucket_buffer_size: 20000
    shuffle_buffer_size: 10000
    text_field: "text"
    lang_field: "target_lang"
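    # Rough sanity check for the "effective batch size of 2K" note at the top of this file (hedged,
    # back-of-the-envelope arithmetic, not an exact Lhotse guarantee): with batch_duration=3000 s per GPU
    # and utterances averaging around 12 s, each step sees roughly 3000 / 12 = 250 cuts per GPU, so
    # 8 GPUs with accumulate_grad_batches=1 gives an effective batch of about 2000 cuts.
    # Scale batch_duration, the GPU count, or accumulate_grad_batches if your setup differs.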
  validation_ds:
    use_lhotse: true
    manifest_filepath: ???
    sample_rate: ${model.sample_rate}
    batch_size: 12 # you may increase batch_size if your memory allows
    shuffle: false
    num_workers: 4
    pin_memory: true
    use_start_end_token: true
    use_bucketing: false
    text_field: "text"
    lang_field: "target_lang"
  test_ds:
    use_lhotse: true
    manifest_filepath: ???
    sample_rate: ${model.sample_rate}
    batch_size: 8 # you may increase batch_size if your memory allows
    shuffle: false
    num_workers: 4
    pin_memory: true
    use_start_end_token: true
    use_bucketing: false
  # A small vocab size of 128 or 256 is recommended when using 4x sub-sampling.
  # For details on how to train a tokenizer, see: /scripts/tokenizers/process_asr_text_tokenizer.py
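  # A hedged example of training a SentencePiece BPE tokenizer for the "ja" entry below with that script;
  # the flag names and vocab size are assumptions (check the script's --help in your NeMo checkout):
  #   python scripts/tokenizers/process_asr_text_tokenizer.py \
  #     --manifest=/path/to/train_manifest.json \
  #     --data_root=/path/to/tokenizers/ja \
  #     --vocab_size=1024 --tokenizer=spe --spe_type=bpe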
  tokenizer:
    dir: null # null for aggregate tokenizers
    type: agg # Can be either bpe (SentencePiece tokenizer), wpe (WordPiece tokenizer), or agg (aggregate tokenizer)
    langs:
      spl_tokens: # special tokens model
        dir: null # Passed in the training script
        type: bpe
      ja: # Japanese tokenizer (example; add an entry like this for each additional language)
        dir: ???
        type: bpe
    custom_tokenizer:
      _target_: nemo.collections.common.tokenizers.canary_tokenizer.CanaryTokenizer # Can be replaced with another tokenizer for different prompt formats
      tokenizers: null # Filled at runtime by all the tokenizers inside the aggregate tokenizer
  # Audio Preprocessor
  preprocessor:
    _target_: nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor
    sample_rate: ${model.sample_rate}
    normalize: "per_feature"
    window_size: 0.025
    window_stride: 0.01
    window: "hann"
    features: 128
    n_fft: 512
    log: true
    frame_splicing: 1
    dither: 0.00001
    pad_to: 0
    pad_value: 0.0
  # SpecAugment is applied either in the model or in the data layer
  spec_augment:
    _target_: nemo.collections.asr.modules.SpectrogramAugmentation
    freq_masks: 2 # set to zero to disable it
    # you may use fewer time masks for smaller models for faster convergence
    time_masks: 10 # set to zero to disable it
    freq_width: 27
    time_width: 0.05
  # FastConformer Encoder
  encoder:
    _target_: nemo.collections.asr.modules.ConformerEncoder
    feat_in: ${model.preprocessor.features}
    feat_out: -1 # set it only if you need an output size different from the default d_model
    n_layers: 24
    d_model: ${model.model_defaults.asr_enc_hidden}

    # Sub-sampling parameters
    subsampling: dw_striding # vggnet, striding, or dw_striding; vggnet may give better results but needs more memory
    subsampling_factor: 8 # must be a power of 2
    subsampling_conv_channels: 256 # -1 sets it to d_model
    causal_downsampling: false
    reduction: null
    reduction_position: null
    reduction_factor: 1
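    # Quick arithmetic on the resulting frame rate (approximate; padding and edge effects are ignored):
    # the preprocessor hop is window_stride = 0.01 s, so with subsampling_factor = 8 each encoder output
    # frame covers about 80 ms, and the 16.192 s manifest example above yields roughly 16.192 / 0.08 = 202 frames.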
    # Feed forward module's params
    ff_expansion_factor: 4

    # Multi-headed Attention Module's params
    self_attention_model: rel_pos # rel_pos or abs_pos
    n_heads: 8 # may need to be lower for smaller d_models
    # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention
    att_context_size: [-1, -1] # -1 means unlimited context
    xscaling: false # scales up the input embeddings by sqrt(d_model)
    untie_biases: true # unties the biases of the TransformerXL layers
    pos_emb_max_len: 4000

    # Convolution module's params
    conv_kernel_size: 9
    conv_norm_type: batch_norm
    conv_context_size: null

    ### regularization
    dropout: 0.1 # The dropout used in most of the Conformer Modules
    dropout_pre_encoder: 0.1
    dropout_emb: 0.0 # The dropout used for embeddings
    dropout_att: 0.1 # The dropout for multi-headed attention modules
  # Optional Transformer encoder sandwiched between the ASR encoder and the Transformer decoder.
  # Only used if num_layers > 0
  transf_encoder:
    _target_: nemo.collections.asr.modules.transformer.transformer_encoders.TransformerEncoder
    num_layers: 0
    hidden_size: ${model.model_defaults.lm_enc_hidden}
    inner_size: ${multiply:${model.model_defaults.lm_enc_hidden}, 4}
    num_attention_heads: 8
    ffn_dropout: 0.1
    attn_score_dropout: 0.1
    attn_layer_dropout: 0.1
    mask_future: False
    pre_ln: True
    pre_ln_final_layer_norm: True
  transf_decoder:
    _target_: nemo.collections.asr.modules.transformer.get_nemo_transformer
    model_name: null
    pretrained: false
    encoder: null
    pre_ln_final_layer_norm: true

    config_dict:
      max_sequence_length: 512
      num_token_types: 0
      embedding_dropout: 0.1
      learn_positional_encodings: false
      hidden_size: ${model.model_defaults.lm_dec_hidden}
      inner_size: ${multiply:${model.model_defaults.lm_dec_hidden}, 4}
      num_layers: 24
      num_attention_heads: 8
      ffn_dropout: 0.1
      attn_score_dropout: 0.1
      attn_layer_dropout: 0.1
      hidden_act: relu
      pre_ln: true
      vocab_size: None # Will be set by the model at runtime
  # Label Prediction Head (Token Classifier)
  head:
    _target_: nemo.collections.asr.parts.submodules.token_classifier.TokenClassifier
    num_layers: 1
    activation: relu
    log_softmax: true
    hidden_size: ${model.transf_decoder.config_dict.hidden_size}
    num_classes: None # Will be set by the model at runtime
    dropout: 0.0
    use_transformer_init: true
  # Decoding Strategy
  decoding:
    strategy: beam
    return_best_hypothesis: true # Return only the most probable hypothesis after beam search
    beam:
      beam_size: 4
      len_pen: 0.0
      max_generation_delta: 50
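  # For faster validation or debugging you may shrink the beam; with beam_size set to 1, beam search
  # effectively reduces to greedy decoding. For example, override at launch time with:
  #   model.decoding.beam.beam_size=1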
  # Loss Config
  loss:
    _target_: nemo.collections.common.losses.smoothed_cross_entropy.SmoothedCrossEntropyLoss
    label_smoothing: ${model.label_smoothing}
    pad_id: null
  optim:
    name: adamw
    lr: 3e-4
    # optimizer arguments
    betas: [0.9, 0.98]
    # There is less need for weight_decay since SpecAugment already provides strong regularization;
    # you may need weight_decay for large models, stable AMP training, small datasets, or when weaker augmentation is used.
    # A weight decay of 0.0 with an lr of 2.0 also works fine.
    weight_decay: 1e-3

    # scheduler setup
    sched:
      name: InverseSquareRootAnnealing
      # scheduler config override
      warmup_steps: 5000
      warmup_ratio: null
      min_lr: 1e-6
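  # A hedged worked example of this schedule, assuming the standard inverse-square-root decay
  # (lr is approximately peak_lr * sqrt(warmup_steps / step) after warmup): the lr ramps up to 3e-4
  # over the first 5000 steps, and by step 20000 it has decayed to roughly 3e-4 * sqrt(5000 / 20000) = 1.5e-4,
  # never dropping below min_lr.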

trainer:
  devices: -1 # number of GPUs; -1 uses all available GPUs
  num_nodes: 1
  max_epochs: -1
  max_steps: 100000 # computed at runtime if not set
  val_check_interval: 1.0 # Set to 0.25 to check 4 times per epoch, or to an int for a number of iterations
  accelerator: auto
  strategy:
    _target_: lightning.pytorch.strategies.DDPStrategy
    gradient_as_bucket_view: true
  accumulate_grad_batches: 1
  gradient_clip_val: 0.0
  precision: bf16-mixed # Set to bf16-mixed or 16-mixed to enable AMP
  log_every_n_steps: 100 # Interval of logging.
  enable_progress_bar: True
  num_sanity_val_steps: 2 # number of validation steps to run as a sanity check before training starts; set to 0 to disable
  check_val_every_n_epoch: 1 # run validation every n epochs
  sync_batchnorm: true
  enable_checkpointing: False # Provided by exp_manager
  logger: false # Provided by exp_manager
  use_distributed_sampler: false # Lhotse has its own distributed sampler
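# A hedged example of launching training with this config; the script path below assumes the standard
# NeMo examples layout (examples/asr/speech_multitask/speech_to_text_aed.py), so adjust it to match your checkout:
#   python examples/asr/speech_multitask/speech_to_text_aed.py \
#     --config-path=<dir containing this file> --config-name=<this file's name> \
#     model.prompt_format=canary2 \
#     spl_tokens.model_dir=/path/to/spl_tokens \
#     model.tokenizer.langs.ja.dir=/path/to/ja_tokenizer \
#     model.train_ds.manifest_filepath=/path/to/train_manifest.json \
#     model.validation_ds.manifest_filepath=/path/to/val_manifest.json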

exp_manager:
  exp_dir: null
  name: ${name}
  create_tensorboard_logger: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    every_n_train_steps: 4990
    every_n_epochs: null # must be set to null to use every_n_train_steps
    monitor: "step" # monitoring "step" with mode "max" ranks the newest checkpoints highest
    mode: "max"
    save_top_k: 5 # keeps only the 5 most recent checkpoints; set to -1 to keep all of them
    save_last: True
    always_save_nemo: True # also saves the checkpoints as .nemo files
  # Alternative: keep the best checkpoints by validation loss instead
  # checkpoint_callback_params:
  #   # in case of multiple validation sets, the first one is used
  #   monitor: "val_loss"
  #   mode: "min"
  #   save_top_k: 5
  #   always_save_nemo: True
  resume_from_checkpoint: /home/ubuntu/NeMo_Canary/canary_results/Higurashi_ASR_v.02/checkpoints/Higurashi_ASR_v.02--step=29940.0000-epoch=1-last.ckpt # The path to a checkpoint file to continue the training from; restores the whole state, including the epoch, step, LR schedulers, apex, etc.
  # you need to set these two to true to continue the training
  resume_if_exists: true
  resume_ignore_no_checkpoint: true
  # You may use this section to create a W&B logger
  create_wandb_logger: false
  wandb_logger_kwargs:
    name: null
    project: null
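  # A hedged example of enabling Weights & Biases logging at launch time; the run and project names
  # below are hypothetical placeholders:
  #   exp_manager.create_wandb_logger=true \
  #   exp_manager.wandb_logger_kwargs.name="higurashi-asr-ja" \
  #   exp_manager.wandb_logger_kwargs.project="canary-finetune"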