# examples/speechlm2/conf/s2s_duplex_speech_decoder.yaml (from the NeMo_Canary repo)
model:
# Every name/path here starting with 'pretrained' is used to initialize the model weights.
pretrained_llm: TinyLlama/TinyLlama_v1.1
pretrained_audio_codec: ??? # to be released
pretrained_asr: stt_en_fastconformer_hybrid_large_streaming_80ms
scoring_asr: stt_en_fastconformer_transducer_large # used only in validation/evaluation
pretrained_weights: True # When False, the 'pretrained_*' names above are used only to build the architecture, with randomly initialized weights
# Regexp (re.compile) patterns matching parameters to be frozen.
freeze_params:
- "^audio_codec\\..+$" # Keep audio codec frozen as it only provides supervision for training.
prevent_freeze_params: [] # Use to make specific submodules trainable; overrides freeze_params
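# A sketch of overriding freeze_params (the pattern below is hypothetical; adjust it to your submodule names):
# prevent_freeze_params:
# - "^audio_codec\\.decoder\\..+$"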
audio_loss_weight: 4
text_loss_weight: 3
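# Presumably these weight a combined objective, i.e. loss = audio_loss_weight * audio_loss + text_loss_weight * text_loss.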
# Note: Uncomment the block below to enable LoRA on LLM via HuggingFace PEFT library.
# It will automatically freeze LLM parameters even if freeze_params was unused,
# and prevent freezing any parameter that has the string '.lora_' in its name.
# lora:
# task_type: CAUSAL_LM
# r: 8
# lora_alpha: 32
# lora_dropout: 0.1
perception:
target: nemo.collections.speechlm2.modules.perception.AudioPerceptionModule
modality_adapter:
_target_: nemo.collections.asr.modules.ConformerEncoder
feat_in: 512
feat_out: -1 # set this only if you need an output size different from the default d_model
n_layers: 2
d_model: 512
subsampling: dw_striding # vggnet, striding, stacking, stacking_norm, or dw_striding
subsampling_factor: 1 # must be a power of 2 for striding and vggnet
subsampling_conv_channels: 256 # set to -1 to make it equal to the d_model
causal_downsampling: true
ff_expansion_factor: 4
self_attention_model: rel_pos # rel_pos or abs_pos
n_heads: 8 # may need to be lower for smaller d_models
# [left, right] specifies the number of steps to be seen from left and right of each step in self-attention
att_context_size: [70, 1] # -1 means unlimited context
att_context_style: chunked_limited # regular or chunked_limited
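# Rough latency arithmetic (assuming one encoder step per 80 ms frame, since subsampling_factor is 1 and data.frame_length is 0.08):
# [70, 1] gives about 70 * 0.08 s = 5.6 s of left context per step, with a tightly limited lookahead.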
xscaling: true # scales up the input embeddings by sqrt(d_model)
untie_biases: true # unties the biases of the TransformerXL layers
pos_emb_max_len: 5000
conv_kernel_size: 9
conv_norm_type: layer_norm # batch_norm or layer_norm or groupnormN (N specifies the number of groups)
# conv_context_size can be "causal" or a list of two integers such that conv_context_size[0] + conv_context_size[1] + 1 == conv_kernel_size
# null means [(kernel_size-1)//2, (kernel_size-1)//2], and 'causal' means [(kernel_size-1), 0]
conv_context_size: causal
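# With conv_kernel_size: 9, 'causal' resolves to [8, 0]: 8 frames of left context and no lookahead.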
### regularization
dropout: 0 # The dropout used in most of the Conformer Modules
dropout_pre_encoder: 0 # The dropout used before the encoder
dropout_emb: 0.0 # The dropout used for embeddings
dropout_att: 0 # The dropout for multi-headed attention modules
speech_decoder:
n_layers: 12
d_model: 768
d_ffn: 3072
sa_n_heads: 12
kernel_size: 3
p_dropout: 0.1
p_dropout_out: 0.0
has_xattn: false
xa_d_memory: 768
xa_n_heads: 12
is_causal: true
apply_norm_to_cond: true
apply_norm_out: true
max_length_causal_mask: 5000
cond_on_prev_audio_tokens: True
detach_input: False
use_learnable_pos_emb: True
optimizer:
_target_: torch.optim.AdamW
lr: 3e-4
betas: [0.9, 0.98]
weight_decay: 0
foreach: true # set to false if having issues with tensor-parallelism
lr_scheduler:
# _target_: nemo.core.optim.lr_scheduler.InverseSquareRootAnnealing
_target_: nemo.core.optim.lr_scheduler.CosineAnnealing
warmup_steps: 0 # set to e.g. 2500 to enable LR warmup
min_lr: 1e-6
max_steps: ${trainer.max_steps}
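# With these settings, the LR decays from optimizer.lr (3e-4) to min_lr (1e-6) over trainer.max_steps, with no warmup.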
trainer:
devices: -1
accelerator: gpu
num_nodes: 1
precision: bf16-true
logger: False # logger provided by exp_manager
enable_checkpointing: False
use_distributed_sampler: False
max_steps: 1000000
limit_train_batches: 100 # batches per "epoch"; also used as the validation cadence below
val_check_interval: ${trainer.limit_train_batches}
limit_val_batches: 10
log_every_n_steps: 10
num_sanity_val_steps: 1
gradient_clip_val: 1.0
accumulate_grad_batches: 1
strategy:
# Replace DDPStrategy with ModelParallelStrategy to enable model parallelism
_target_: lightning.pytorch.strategies.DDPStrategy
gradient_as_bucket_view: true
find_unused_parameters: true
# _target_: lightning.pytorch.strategies.ModelParallelStrategy
# tensor_parallel_size: 1
# data_parallel_size: 2
data:
frame_length: 0.08
source_sample_rate: 16000
target_sample_rate: 22050
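# Presumably: source_sample_rate is for input (user) audio fed to the ASR encoder, and
# target_sample_rate is for output (agent) audio consumed/produced by the audio codec.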
input_roles: ["user", "User"]
output_roles: ["agent", "Assistant"]
train_ds:
sample_rate: ${data.target_sample_rate}
input_cfg:
- type: lhotse_shar
shar_path: ???
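# Several sources can be mixed; a sketch with hypothetical paths ('weight' is assumed to be
# supported by the Lhotse input_cfg for sampling proportions):
# input_cfg:
#   - type: lhotse_shar
#     shar_path: /data/corpus_a
#     weight: 0.7
#   - type: lhotse_shar
#     shar_path: /data/corpus_b
#     weight: 0.3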
seed: 42
shard_seed: "randomized"
num_workers: 2
batch_size: 4
# Optional bucketing:
# batch_size: null
# batch_duration: 100
# bucket_duration_bins: [8.94766,10.1551,11.64118,19.30376,42.85]
# use_bucketing: true
# num_buckets: 5
# bucket_buffer_size: 5000
validation_ds:
# The entries under 'datasets' are a list of separate dataloaders.
# The structure is <dataset-name>: {<dataloader-dict-config>}
# They inherit all settings from validation_ds, but can individually override them.
datasets:
val_set_0: # rename to your dataset name; add more as needed (see the commented sketch below)
shar_path: ???
sample_rate: ${data.target_sample_rate}
batch_size: 1
seed: 42
shard_seed: "randomized"
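# A sketch of a second entry under 'datasets' that overrides an inherited setting
# (the name and batch_size below are hypothetical):
#   val_set_1:
#     shar_path: ???
#     batch_size: 2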
exp_manager:
exp_dir: null
explicit_log_dir: s2s_sdv2_results/
name: speechlm2
create_tensorboard_logger: false
create_checkpoint_callback: true
use_datetime_version: true
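# max_time_per_run presumably follows Lightning's Timer format DD:HH:MM:SS, i.e. 3 h 50 min here.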
max_time_per_run: 00:03:50:00
resume_from_checkpoint: null # Path to a checkpoint file to continue training from; restores the whole state, including the epoch, step, LR schedulers, apex, etc.
# You need to set the following two flags to True to resume training.
resume_if_exists: true
resume_ignore_no_checkpoint: true
# You may use this section to create a W&B logger
create_wandb_logger: false
wandb_logger_kwargs:
name: development-run
project: speechlm2_speech_decoder
resume: true
checkpoint_callback_params:
filename: "{step}"
monitor: val_asr_bleu
mode: max
every_n_train_steps: null
every_n_epochs: 1
save_top_k: 1
always_save_nemo: false
save_nemo_on_train_end: false