File size: 6,605 Bytes
b386992
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
model:
  # Every name/path here starting with 'pretrained' is used to initialize the model weights.
  pretrained_llm: TinyLlama/TinyLlama_v1.1
  pretrained_audio_codec: ???  # to be released
  pretrained_asr: stt_en_fastconformer_hybrid_large_streaming_80ms
  scoring_asr: stt_en_fastconformer_transducer_large  # used only in validation/evaluation

  pretrained_weights: true  # When false, we use pretrained_name to load the architecture, but with random init

  # Regexp (re.compile) patterns matching parameters to be frozen.
  freeze_params:
    - "^audio_codec\\..+$"  # Keep audio codec frozen as it only provides supervision for training.
  prevent_freeze_params: []  # Use to make specific submodules trainable; overrides freeze_params

  # Relative weights of the audio-token and text-token loss terms.
  audio_loss_weight: 4
  text_loss_weight: 3

  # Note: Uncomment the block below to enable LoRA on LLM via HuggingFace PEFT library.
  #   It will automatically freeze LLM parameters even if freeze_params was unused,
  #   and prevent freezing any parameter that has the string '.lora_' in its name.
  # lora:
  #   task_type: CAUSAL_LM
  #   r: 8
  #   lora_alpha: 32
  #   lora_dropout: 0.1

  perception:
    target: nemo.collections.speechlm2.modules.perception.AudioPerceptionModule
    modality_adapter:
      _target_: nemo.collections.asr.modules.ConformerEncoder
      feat_in: 512
      feat_out: -1  # you may set it if you need different output size other than the default d_model
      n_layers: 2
      d_model: 512
      subsampling: dw_striding  # vggnet, striding, stacking or stacking_norm, dw_striding
      subsampling_factor: 1  # must be power of 2 for striding and vggnet
      subsampling_conv_channels: 256  # set to -1 to make it equal to the d_model
      causal_downsampling: true
      ff_expansion_factor: 4
      self_attention_model: rel_pos  # rel_pos or abs_pos
      n_heads: 8  # may need to be lower for smaller d_models
      # [left, right] specifies the number of steps to be seen from left and right of each step in self-attention
      att_context_size: [70, 1]  # -1 means unlimited context
      att_context_style: chunked_limited  # regular or chunked_limited
      xscaling: true  # scales up the input embeddings by sqrt(d_model)
      untie_biases: true  # unties the biases of the TransformerXL layers
      pos_emb_max_len: 5000
      conv_kernel_size: 9
      conv_norm_type: layer_norm  # batch_norm or layer_norm or groupnormN (N specifies the number of groups)
      # conv_context_size can be "causal" or a list of two integers while conv_context_size[0]+conv_context_size[1]+1==conv_kernel_size
      # null means [(kernel_size-1)//2, (kernel_size-1)//2], and 'causal' means [(kernel_size-1), 0]
      conv_context_size: causal
      ### regularization
      dropout: 0  # The dropout used in most of the Conformer Modules
      dropout_pre_encoder: 0  # The dropout used before the encoder
      dropout_emb: 0.0  # The dropout used for embeddings
      dropout_att: 0  # The dropout for multi-headed attention modules

  speech_decoder:
    n_layers: 12
    d_model: 768
    d_ffn: 3072
    sa_n_heads: 12
    kernel_size: 3
    p_dropout: 0.1
    p_dropout_out: 0.0
    has_xattn: false
    xa_d_memory: 768
    xa_n_heads: 12
    is_causal: true
    apply_norm_to_cond: true
    apply_norm_out: true
    max_length_causal_mask: 5000
    cond_on_prev_audio_tokens: true
    detach_input: false
    use_learnable_pos_emb: true

  optimizer:
    _target_: torch.optim.AdamW
    lr: 3e-4
    betas: [0.9, 0.98]
    weight_decay: 0
    foreach: true  # set to false if having issues with tensor-parallelism

  lr_scheduler:
    # _target_: nemo.core.optim.lr_scheduler.InverseSquareRootAnnealing
    _target_: nemo.core.optim.lr_scheduler.CosineAnnealing
    warmup_steps: 0  # 2500
    min_lr: 1e-6
    max_steps: ${trainer.max_steps}

trainer:
  devices: -1
  accelerator: gpu
  num_nodes: 1
  precision: bf16-true
  logger: false  # logger provided by exp_manager
  enable_checkpointing: false
  use_distributed_sampler: false
  max_steps: 1000000
  limit_train_batches: 100  # "epoch" size
  val_check_interval: ${trainer.limit_train_batches}
  limit_val_batches: 10
  log_every_n_steps: 10
  num_sanity_val_steps: 1
  gradient_clip_val: 1.0
  accumulate_grad_batches: 1
  strategy:
    # Replace DDPStrategy with ModelParallelStrategy to enable model parallelism
    _target_: lightning.pytorch.strategies.DDPStrategy
    gradient_as_bucket_view: true
    find_unused_parameters: true
    # _target_: lightning.pytorch.strategies.ModelParallelStrategy
    # tensor_parallel_size: 1
    # data_parallel_size: 2

data:
  # NOTE(review): presumably the codec/ASR frame duration in seconds (0.08 s = 80 ms,
  # matching the 80ms streaming ASR above) — confirm against the dataloader.
  frame_length: 0.08
  # Sampling rates in Hz for the input (source) and generated (target) audio.
  source_sample_rate: 16000
  target_sample_rate: 22050
  # Conversation role labels; both capitalizations are accepted.
  input_roles: ["user", "User"]
  output_roles: ["agent", "Assistant"]

  train_ds:
    sample_rate: ${data.target_sample_rate}
    input_cfg:
      - type: lhotse_shar
        shar_path: ???  # mandatory (OmegaConf '???'): must be provided at launch time
    seed: 42
    shard_seed: "randomized"
    num_workers: 2
    batch_size: 4
    # Optional bucketing:
    # batch_size: null
    # batch_duration: 100
    # bucket_duration_bins: [8.94766,10.1551,11.64118,19.30376,42.85]
    # use_bucketing: true
    # num_buckets: 5
    # bucket_buffer_size: 5000

  validation_ds:
    # The entries under 'datasets' are a list of separate dataloaders.
    # The structure is <dataset-name>: {<dataloader-dict-config>}
    # They inherit all settings from validation_ds, but can individually override them.
    datasets:
      val_set_0:  # rename to your dataset name, add more as needed
        shar_path: ???  # mandatory (OmegaConf '???'): must be provided at launch time
    sample_rate: ${data.target_sample_rate}
    batch_size: 1
    seed: 42
    shard_seed: "randomized"

exp_manager:
  exp_dir: null
  explicit_log_dir: s2s_sdv2_results/
  name: speechlm2
  create_tensorboard_logger: false
  create_checkpoint_callback: true
  use_datetime_version: true
  # Quoted so the colon-separated DD:HH:MM:SS value is always read as a string by any YAML parser.
  max_time_per_run: "00:03:50:00"

  resume_from_checkpoint: null  # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc.
  # you need to set these two to true to continue the training
  resume_if_exists: true
  resume_ignore_no_checkpoint: true

  # You may use this section to create a W&B logger
  create_wandb_logger: false
  wandb_logger_kwargs:
    name: development-run
    project: speechlm2_speech_decoder
    resume: true

  checkpoint_callback_params:
    filename: "{step}"
    monitor: val_asr_bleu
    mode: max
    every_n_train_steps: null
    every_n_epochs: 1
    save_top_k: 1
    always_save_nemo: false
    save_nemo_on_train_end: false