model:
# Every name/path here starting with 'pretrained' is used to initialize the model weights.
pretrained_llm: TinyLlama/TinyLlama_v1.1
pretrained_audio_codec: ??? # to be released
pretrained_asr: stt_en_fastconformer_hybrid_large_streaming_80ms
scoring_asr: stt_en_fastconformer_transducer_large # used only in validation/evaluation
pretrained_weights: True # When False, the pretrained names above are used only to build the architectures, with randomly initialized weights
# Regexp (re.compile) patterns matching parameters to be frozen.
freeze_params:
- "^audio_codec\\..+$" # Keep audio codec frozen as it only provides supervision for training.
prevent_freeze_params: [] # Use to make specific submodules trainable; overrides freeze_params
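# A minimal illustrative sketch (the module paths below are hypothetical and depend on the
# model's actual attribute names): to freeze the whole LLM while keeping its token embeddings
# trainable, the two lists could be combined like this:
# freeze_params:
#   - "^llm\\..+$"
# prevent_freeze_params:
#   - "^llm\\.model\\.embed_tokens\\..+$"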
audio_loss_weight: 4
text_loss_weight: 3
# Note: Uncomment the block below to enable LoRA on the LLM via the HuggingFace PEFT library.
# It automatically freezes the LLM parameters even if freeze_params is not set,
# and prevents freezing any parameter that has the string '.lora_' in its name.
# lora:
# task_type: CAUSAL_LM
# r: 8
# lora_alpha: 32
# lora_dropout: 0.1
perception:
target: nemo.collections.speechlm2.modules.perception.AudioPerceptionModule
modality_adapter:
_target_: nemo.collections.asr.modules.ConformerEncoder
feat_in: 512
feat_out: -1 # set this if you need an output size other than the default d_model
n_layers: 2
d_model: 512
subsampling: dw_striding # vggnet, striding, stacking or stacking_norm, dw_striding
subsampling_factor: 1 # must be power of 2 for striding and vggnet
subsampling_conv_channels: 256 # set to -1 to make it equal to the d_model
causal_downsampling: true
ff_expansion_factor: 4
self_attention_model: rel_pos # rel_pos or abs_pos
n_heads: 8 # may need to be lower for smaller d_models
# [left, right] specifies the number of steps to be seen from left and right of each step in self-attention
att_context_size: [70, 1] # -1 means unlimited context
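# For a rough sense of scale (assuming ~80 ms encoder frames, matching the 80 ms streaming ASR above),
# [70, 1] corresponds to about 5.6 s of left context and one frame of right context.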
att_context_style: chunked_limited # regular or chunked_limited
xscaling: true # scales up the input embeddings by sqrt(d_model)
untie_biases: true # unties the biases of the TransformerXL layers
pos_emb_max_len: 5000
conv_kernel_size: 9
conv_norm_type: layer_norm # batch_norm or layer_norm or groupnormN (N specifies the number of groups)
# conv_context_size can be "causal" or a list of two integers such that conv_context_size[0]+conv_context_size[1]+1==conv_kernel_size
# null means [(kernel_size-1)//2, (kernel_size-1)//2], and 'causal' means [(kernel_size-1), 0]
conv_context_size: causal
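# For example, with conv_kernel_size: 9 above, 'causal' corresponds to [8, 0],
# and null would correspond to [4, 4].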
### regularization
dropout: 0 # The dropout used in most of the Conformer Modules
dropout_pre_encoder: 0 # The dropout used before the encoder
dropout_emb: 0.0 # The dropout used for embeddings
dropout_att: 0 # The dropout for multi-headed attention modules
speech_decoder:
n_layers: 12
d_model: 768
d_ffn: 3072
sa_n_heads: 12
kernel_size: 3
p_dropout: 0.1
p_dropout_out: 0.0
has_xattn: false
xa_d_memory: 768
xa_n_heads: 12
is_causal: true
apply_norm_to_cond: true
apply_norm_out: true
max_length_causal_mask: 5000
cond_on_prev_audio_tokens: True
detach_input: False
use_learnable_pos_emb: True
optimizer:
_target_: torch.optim.AdamW
lr: 3e-4
betas: [0.9, 0.98]
weight_decay: 0
foreach: true # set to false if you run into issues with tensor parallelism
lr_scheduler:
# _target_: nemo.core.optim.lr_scheduler.InverseSquareRootAnnealing
_target_: nemo.core.optim.lr_scheduler.CosineAnnealing
warmup_steps: 0 #2500
min_lr: 1e-6
max_steps: ${trainer.max_steps}
trainer:
devices: -1
accelerator: gpu
num_nodes: 1
precision: bf16-true
logger: False # logger provided by exp_manager
enable_checkpointing: False
use_distributed_sampler: False
max_steps: 1000000
limit_train_batches: 100 # "epoch" size
val_check_interval: ${trainer.limit_train_batches}
limit_val_batches: 10
log_every_n_steps: 10
num_sanity_val_steps: 1
gradient_clip_val: 1.0
accumulate_grad_batches: 1
strategy:
# Replace DDPStrategy with ModelParallelStrategy to enable model parallelism
_target_: lightning.pytorch.strategies.DDPStrategy
gradient_as_bucket_view: true
find_unused_parameters: true
# _target_: lightning.pytorch.strategies.ModelParallelStrategy
# tensor_parallel_size: 1
# data_parallel_size: 2
data:
frame_length: 0.08
source_sample_rate: 16000
target_sample_rate: 22050
input_roles: ["user", "User"]
output_roles: ["agent", "Assistant"]
train_ds:
sample_rate: ${data.target_sample_rate}
input_cfg:
- type: lhotse_shar
shar_path: ???
seed: 42
shard_seed: "randomized"
num_workers: 2
batch_size: 4
# Optional bucketing:
# batch_size: null
# batch_duration: 100
# bucket_duration_bins: [8.94766,10.1551,11.64118,19.30376,42.85]
# use_bucketing: true
# num_buckets: 5
# bucket_buffer_size: 5000
validation_ds:
# The entries under 'datasets' are a list of separate dataloaders.
# The structure is <dataset-name>: {<dataloader-dict-config>}
# They inherit all settings from validation_ds, but can individually override them.
datasets:
val_set_0: # rename to your dataset name, add more as needed
shar_path: ???
sample_rate: ${data.target_sample_rate}
batch_size: 1
seed: 42
shard_seed: "randomized"
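# For example, a second dataloader could be added next to val_set_0
# (the name and path below are placeholders):
#   val_set_1:
#     shar_path: /path/to/another/val/shar/dir
#     batch_size: 2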
exp_manager:
exp_dir: null
explicit_log_dir: s2s_sdv2_results/
name: speechlm2
create_tensorboard_logger: false
create_checkpoint_callback: true
use_datetime_version: true
max_time_per_run: 00:03:50:00
resume_from_checkpoint: null # Path to a checkpoint file to continue training from; restores the whole state, including the epoch, step, LR schedulers, apex, etc.
# The two options below must be set to true to resume training.
resume_if_exists: true
resume_ignore_no_checkpoint: true
# You may use this section to create a W&B logger
create_wandb_logger: false
wandb_logger_kwargs:
name: development-run
project: speechlm2_speech_decoder
resume: true
checkpoint_callback_params:
filename: "{step}"
monitor: val_asr_bleu
mode: max
every_n_train_steps: null
every_n_epochs: 1
save_top_k: 1
always_save_nemo: false
save_nemo_on_train_end: false