File size: 6,605 Bytes
b386992
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
model:
  # Every name/path here starting with 'pretrained' is used to initialize the model weights.
  pretrained_llm: TinyLlama/TinyLlama_v1.1
  pretrained_audio_codec: ???  # to be released
  pretrained_asr: stt_en_fastconformer_hybrid_large_streaming_80ms
  scoring_asr: stt_en_fastconformer_transducer_large  # used only in validation/evaluation

  pretrained_weights: true  # When false, we use pretrained_name to load the architecture, but with random init

  # Regexp (re.compile) patterns matching parameters to be frozen.
  freeze_params:
    - "^audio_codec\\..+$"  # Keep audio codec frozen as it only provides supervision for training.
  prevent_freeze_params: []  # Use to make specific submodules trainable; overrides freeze_params

  # Relative weights of the audio-token and text-token loss terms.
  audio_loss_weight: 4
  text_loss_weight: 3

  # Note: Uncomment the block below to enable LoRA on LLM via HuggingFace PEFT library.
  #   It will automatically freeze LLM parameters even if freeze_params was unused,
  #   and prevent freezing any parameter that has the string '.lora_' in its name.
  # lora:
  #   task_type: CAUSAL_LM
  #   r: 8
  #   lora_alpha: 32
  #   lora_dropout: 0.1

  perception:
    target: nemo.collections.speechlm2.modules.perception.AudioPerceptionModule
    modality_adapter:
      _target_: nemo.collections.asr.modules.ConformerEncoder
      feat_in: 512
      feat_out: -1  # you may set it if you need different output size other than the default d_model
      n_layers: 2
      d_model: 512
      subsampling: dw_striding  # vggnet, striding, stacking or stacking_norm, dw_striding
      subsampling_factor: 1  # must be power of 2 for striding and vggnet
      subsampling_conv_channels: 256  # set to -1 to make it equal to the d_model
      causal_downsampling: true
      ff_expansion_factor: 4
      self_attention_model: rel_pos  # rel_pos or abs_pos
      n_heads: 8  # may need to be lower for smaller d_models
      # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention
      att_context_size: [70, 1]  # -1 means unlimited context
      att_context_style: chunked_limited  # regular or chunked_limited
      xscaling: true  # scales up the input embeddings by sqrt(d_model)
      untie_biases: true  # unties the biases of the TransformerXL layers
      pos_emb_max_len: 5000
      conv_kernel_size: 9
      conv_norm_type: layer_norm  # batch_norm or layer_norm or groupnormN (N specifies the number of groups)
      # conv_context_size can be "causal" or a list of two integers while conv_context_size[0]+conv_context_size[1]+1==conv_kernel_size
      # null means [(kernel_size-1)//2, (kernel_size-1)//2], and 'causal' means [(kernel_size-1), 0]
      conv_context_size: causal
      ### regularization
      dropout: 0  # The dropout used in most of the Conformer Modules
      dropout_pre_encoder: 0  # The dropout used before the encoder
      dropout_emb: 0.0  # The dropout used for embeddings
      dropout_att: 0  # The dropout for multi-headed attention modules

  speech_decoder:
    n_layers: 12
    d_model: 768
    d_ffn: 3072
    sa_n_heads: 12
    kernel_size: 3
    p_dropout: 0.1
    p_dropout_out: 0.0
    has_xattn: false
    xa_d_memory: 768
    xa_n_heads: 12
    is_causal: true
    apply_norm_to_cond: true
    apply_norm_out: true
    max_length_causal_mask: 5000
    cond_on_prev_audio_tokens: true
    detach_input: false
    use_learnable_pos_emb: true

  optimizer:
    _target_: torch.optim.AdamW
    lr: 3e-4
    betas: [0.9, 0.98]
    weight_decay: 0
    foreach: true  # set to false if having issues with tensor-parallelism

  lr_scheduler:
    # _target_: nemo.core.optim.lr_scheduler.InverseSquareRootAnnealing
    _target_: nemo.core.optim.lr_scheduler.CosineAnnealing
    warmup_steps: 0  # 2500
    min_lr: 1e-6
    max_steps: ${trainer.max_steps}

trainer:
  devices: -1
  accelerator: gpu
  num_nodes: 1
  precision: bf16-true
  logger: false  # logger provided by exp_manager
  enable_checkpointing: false
  use_distributed_sampler: false
  max_steps: 1000000
  limit_train_batches: 100  # "epoch" size
  val_check_interval: ${trainer.limit_train_batches}
  limit_val_batches: 10
  log_every_n_steps: 10
  num_sanity_val_steps: 1
  gradient_clip_val: 1.0
  accumulate_grad_batches: 1
  strategy:
    # Replace DDPStrategy with ModelParallelStrategy to enable model parallelism
    _target_: lightning.pytorch.strategies.DDPStrategy
    gradient_as_bucket_view: true
    find_unused_parameters: true
    # _target_: lightning.pytorch.strategies.ModelParallelStrategy
    # tensor_parallel_size: 1
    # data_parallel_size: 2

data:
  # NOTE(review): presumably the codec/ASR frame duration in seconds (0.08 s = 80 ms,
  # matching the 80ms streaming ASR above) — confirm against the dataloader.
  frame_length: 0.08
  # Sampling rates in Hz for the input (source) and generated (target) audio.
  source_sample_rate: 16000
  target_sample_rate: 22050
  # Conversation role labels; both capitalizations are accepted.
  input_roles: ["user", "User"]
  output_roles: ["agent", "Assistant"]

  train_ds:
    sample_rate: ${data.target_sample_rate}
    input_cfg:
      - type: lhotse_shar
        shar_path: ???  # mandatory (OmegaConf '???'): must be provided at launch time
    seed: 42
    shard_seed: "randomized"
    num_workers: 2
    batch_size: 4
    # Optional bucketing:
    # batch_size: null
    # batch_duration: 100
    # bucket_duration_bins: [8.94766,10.1551,11.64118,19.30376,42.85]
    # use_bucketing: true
    # num_buckets: 5
    # bucket_buffer_size: 5000

  validation_ds:
    # The entries under 'datasets' are a list of separate dataloaders.
    # The structure is <dataset-name>: {<dataloader-dict-config>}
    # They inherit all settings from validation_ds, but can individually override them.
    datasets:
      val_set_0:  # rename to your dataset name, add more as needed
        shar_path: ???  # mandatory (OmegaConf '???'): must be provided at launch time
    sample_rate: ${data.target_sample_rate}
    batch_size: 1
    seed: 42
    shard_seed: "randomized"

exp_manager:
  exp_dir: null
  explicit_log_dir: s2s_sdv2_results/
  name: speechlm2
  create_tensorboard_logger: false
  create_checkpoint_callback: true
  use_datetime_version: true
  # Quoted so the colon-separated DD:HH:MM:SS value is always read as a string by any YAML parser.
  max_time_per_run: "00:03:50:00"

  resume_from_checkpoint: null  # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc.
  # you need to set these two to true to continue the training
  resume_if_exists: true
  resume_ignore_no_checkpoint: true

  # You may use this section to create a W&B logger
  create_wandb_logger: false
  wandb_logger_kwargs:
    name: development-run
    project: speechlm2_speech_decoder
    resume: true

  checkpoint_callback_params:
    filename: "{step}"
    monitor: val_asr_bleu
    mode: max
    every_n_train_steps: null
    every_n_epochs: 1
    save_top_k: 1
    always_save_nemo: false
    save_nemo_on_train_end: false