# examples/speechlm2/conf/s2s_duplex_speech_decoder.yaml (from the NeMo_Canary repo)
model:
# Every name/path here starting with 'pretrained' is used to initialize the model weights.
pretrained_llm: TinyLlama/TinyLlama_v1.1
pretrained_audio_codec: ??? # to be released
pretrained_asr: stt_en_fastconformer_hybrid_large_streaming_80ms
scoring_asr: stt_en_fastconformer_transducer_large # used only in validation/evaluation
pretrained_weights: True # When False, the 'pretrained_*' names above are used only to build the architecture, with randomly initialized weights
# Regexp (re.compile) patterns matching parameters to be frozen.
freeze_params:
- "^audio_codec\\..+$" # Keep audio codec frozen as it only provides supervision for training.
prevent_freeze_params: [] # Use to make specific submodules trainable; overrides freeze_params
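# A sketch of overriding freeze_params (the pattern below is hypothetical; adjust it to your submodule names):
# prevent_freeze_params:
# - "^audio_codec\\.decoder\\..+$"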
audio_loss_weight: 4
text_loss_weight: 3
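# Presumably these weight a combined objective, i.e. loss = audio_loss_weight * audio_loss + text_loss_weight * text_loss.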
# Note: Uncomment the block below to enable LoRA on LLM via HuggingFace PEFT library.
# It will automatically freeze LLM parameters even if freeze_params was unused,
# and prevent freezing any parameter that has the string '.lora_' in its name.
# lora:
# task_type: CAUSAL_LM
# r: 8
# lora_alpha: 32
# lora_dropout: 0.1
perception:
target: nemo.collections.speechlm2.modules.perception.AudioPerceptionModule
modality_adapter:
_target_: nemo.collections.asr.modules.ConformerEncoder
feat_in: 512
feat_out: -1 # set this only if you need an output size different from the default d_model
n_layers: 2
d_model: 512
subsampling: dw_striding # vggnet, striding, stacking, stacking_norm, or dw_striding
subsampling_factor: 1 # must be a power of 2 for striding and vggnet
subsampling_conv_channels: 256 # set to -1 to make it equal to the d_model
causal_downsampling: true
ff_expansion_factor: 4
self_attention_model: rel_pos # rel_pos or abs_pos
n_heads: 8 # may need to be lower for smaller d_models
# [left, right] specifies the number of steps to be seen from left and right of each step in self-attention
att_context_size: [70, 1] # -1 means unlimited context
att_context_style: chunked_limited # regular or chunked_limited
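# Rough latency arithmetic (assuming one encoder step per 80 ms frame, since subsampling_factor is 1 and data.frame_length is 0.08):
# [70, 1] gives about 70 * 0.08 s = 5.6 s of left context per step, with a tightly limited lookahead.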
xscaling: true # scales up the input embeddings by sqrt(d_model)
untie_biases: true # unties the biases of the TransformerXL layers
pos_emb_max_len: 5000
conv_kernel_size: 9
conv_norm_type: layer_norm # batch_norm or layer_norm or groupnormN (N specifies the number of groups)
# conv_context_size can be "causal" or a list of two integers such that conv_context_size[0] + conv_context_size[1] + 1 == conv_kernel_size
# null means [(kernel_size-1)//2, (kernel_size-1)//2], and 'causal' means [(kernel_size-1), 0]
conv_context_size: causal
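# With conv_kernel_size: 9, 'causal' resolves to [8, 0]: 8 frames of left context and no lookahead.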
### regularization
dropout: 0 # The dropout used in most of the Conformer Modules
dropout_pre_encoder: 0 # The dropout used before the encoder
dropout_emb: 0.0 # The dropout used for embeddings
dropout_att: 0 # The dropout for multi-headed attention modules
speech_decoder:
n_layers: 12
d_model: 768
d_ffn: 3072
sa_n_heads: 12
kernel_size: 3
p_dropout: 0.1
p_dropout_out: 0.0
has_xattn: false
xa_d_memory: 768
xa_n_heads: 12
is_causal: true
apply_norm_to_cond: true
apply_norm_out: true
max_length_causal_mask: 5000
cond_on_prev_audio_tokens: True
detach_input: False
use_learnable_pos_emb: True
optimizer:
_target_: torch.optim.AdamW
lr: 3e-4
betas: [0.9, 0.98]
weight_decay: 0
foreach: true # set to false if having issues with tensor-parallelism
lr_scheduler:
# _target_: nemo.core.optim.lr_scheduler.InverseSquareRootAnnealing
_target_: nemo.core.optim.lr_scheduler.CosineAnnealing
warmup_steps: 0 # set to e.g. 2500 to enable LR warmup
min_lr: 1e-6
max_steps: ${trainer.max_steps}
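# With these settings, the LR decays from optimizer.lr (3e-4) to min_lr (1e-6) over trainer.max_steps, with no warmup.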
trainer:
devices: -1
accelerator: gpu
num_nodes: 1
precision: bf16-true
logger: False # logger provided by exp_manager
enable_checkpointing: False
use_distributed_sampler: False
max_steps: 1000000
limit_train_batches: 100 # batches per "epoch"; also used as the validation cadence below
val_check_interval: ${trainer.limit_train_batches}
limit_val_batches: 10
log_every_n_steps: 10
num_sanity_val_steps: 1
gradient_clip_val: 1.0
accumulate_grad_batches: 1
strategy:
# Replace DDPStrategy with ModelParallelStrategy to enable model parallelism
_target_: lightning.pytorch.strategies.DDPStrategy
gradient_as_bucket_view: true
find_unused_parameters: true
# _target_: lightning.pytorch.strategies.ModelParallelStrategy
# tensor_parallel_size: 1
# data_parallel_size: 2
data:
frame_length: 0.08
source_sample_rate: 16000
target_sample_rate: 22050
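# Presumably: source_sample_rate is for input (user) audio fed to the ASR encoder, and
# target_sample_rate is for output (agent) audio consumed/produced by the audio codec.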
input_roles: ["user", "User"]
output_roles: ["agent", "Assistant"]
train_ds:
sample_rate: ${data.target_sample_rate}
input_cfg:
- type: lhotse_shar
shar_path: ???
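# Several sources can be mixed; a sketch with hypothetical paths ('weight' is assumed to be
# supported by the Lhotse input_cfg for sampling proportions):
# input_cfg:
#   - type: lhotse_shar
#     shar_path: /data/corpus_a
#     weight: 0.7
#   - type: lhotse_shar
#     shar_path: /data/corpus_b
#     weight: 0.3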
seed: 42
shard_seed: "randomized"
num_workers: 2
batch_size: 4
# Optional bucketing:
# batch_size: null
# batch_duration: 100
# bucket_duration_bins: [8.94766,10.1551,11.64118,19.30376,42.85]
# use_bucketing: true
# num_buckets: 5
# bucket_buffer_size: 5000
validation_ds:
# The entries under 'datasets' are a list of separate dataloaders.
# The structure is <dataset-name>: {<dataloader-dict-config>}
# They inherit all settings from validation_ds, but can individually override them.
datasets:
val_set_0: # rename to your dataset name; add more as needed (see the commented sketch below)
shar_path: ???
sample_rate: ${data.target_sample_rate}
batch_size: 1
seed: 42
shard_seed: "randomized"
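# A sketch of a second entry under 'datasets' that overrides an inherited setting
# (the name and batch_size below are hypothetical):
#   val_set_1:
#     shar_path: ???
#     batch_size: 2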
exp_manager:
exp_dir: null
explicit_log_dir: s2s_sdv2_results/
name: speechlm2
create_tensorboard_logger: false
create_checkpoint_callback: true
use_datetime_version: true
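# max_time_per_run presumably follows Lightning's Timer format DD:HH:MM:SS, i.e. 3 h 50 min here.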
max_time_per_run: 00:03:50:00
resume_from_checkpoint: null # Path to a checkpoint file to continue training from; restores the whole state, including the epoch, step, LR schedulers, apex, etc.
# You need to set the following two flags to True to resume training.
resume_if_exists: true
resume_ignore_no_checkpoint: true
# You may use this section to create a W&B logger
create_wandb_logger: false
wandb_logger_kwargs:
name: development-run
project: speechlm2_speech_decoder
resume: true
checkpoint_callback_params:
filename: "{step}"
monitor: val_asr_bleu
mode: max
every_n_train_steps: null
every_n_epochs: 1
save_top_k: 1
always_save_nemo: false
save_nemo_on_train_end: false