# Generated 2025-02-12 from:
# /home/hamidovaslon1/speechbrain/recipes/CommonVoice/ASR/transformer/hparams/staging_5.yaml
# yamllint disable
# Random seed. The same literal value (1101) is passed to
# speechbrain.utils.seed_everything on the next line.
seed: 1101
__set_seed: !apply:speechbrain.utils.seed_everything [1101]
# Experiment outputs: every path below sits under
# uz_transformer_4000/model_saved.
output_folder: uz_transformer_4000/model_saved
output_wer_folder: uz_transformer_4000/model_saved/
save_folder: uz_transformer_4000/model_saved/save
train_log: uz_transformer_4000/model_saved/train_log.txt

# Data files
# Labeled data: CommonVoice folder for the target language.
data_folder: /mnt/data/commonvoice/uz  # e.g., /localscratch/cv-corpus-5.1-2020-06-22/fr
# Unlabeled audio root (presumably the YouTube audio listed in
# unlabeled_tsv_file -- confirm).
data_folder_unlabeled: /mnt/data/youtube_audio
train_tsv_file: /mnt/data/commonvoice/uz/train.tsv  # Standard CommonVoice .tsv file
dev_tsv_file: /mnt/data/commonvoice/uz/dev.tsv  # Standard CommonVoice .tsv file
test_tsv_file: /mnt/data/commonvoice/uz/test.tsv  # Standard CommonVoice .tsv file
unlabeled_tsv_file: audio_data_loader/dataloader/youtube_gcp.tsv  # Path to the YouTube dataset listing
accented_letters: false
language: uz  # use 'it' for Italian, 'rw' for Kinyarwanda, 'en' for English
# CSV manifests; the three labeled ones are located in the output folder.
train_csv: uz_transformer_4000/model_saved/train.csv
valid_csv: uz_transformer_4000/model_saved/dev.csv
test_csv: uz_transformer_4000/model_saved/test.csv
# NOTE(review): the original note here read "CREATE IN THE DIRECTORY" --
# presumably this CSV must exist at this path before training; confirm
# whether it is written by data preparation or has to be created by hand.
unlabeled_csv: audio_data_loader/dataloader/unlabeled_ogg.csv
skip_prep: false  # Skip data preparation
convert_to_wav: false  # Set this to true to convert all mp3 files to wav.

# We remove utterances longer than this threshold from the train/dev/test
# sets, as longer sentences certainly correspond to "open microphones".
# NOTE(review): this comment used to say "10s" while the configured value is
# 50 -- confirm which one is intended (unit presumably seconds).
avoid_if_longer_than: 50

# THIS IS TERRIBLE BUT WE HAVE NO CHOICE.
# Some versions of the CV dataset may contain one or two files of more than
# 2 min in the validation and/or test sets. This is an error by design of the
# dataset, as these files contain 90% of silence. We exclude them.
avoid_if_longer_than_val_test: 100.0

ckpt_interval_minutes: 15  # save a checkpoint every N minutes

####################### Training Parameters ####################################
# Same value as the `limit` of the EpochCounter defined in the checkpointer.
number_of_epochs: 100
optimizer_step_limit: 30000
# This works with a 32GB GPU! Keep (bs * nb_gpu * accum) > 128.
# NOTE(review): dynamic_batching is enabled further down in this file --
# confirm whether this fixed batch size is still used by the training script.
batch_size: 200
# Same weight as ctc_weight_decode and the scorer's `ctc` weight below.
ctc_weight: 0.3
grad_accumulation_factor: 3
# Same reduction as the `reduction` of ctc_cost and seq_cost below.
loss_reduction: batchmean
sorting: random
num_workers: 4
precision: fp16  # one of: bf16, fp16, fp32

# Stage-related parameters
# lr_adam / weight_decay repeat the `lr` / `weight_decay` given to the `Adam`
# entry below; lr_adam / warmup_steps repeat `lr_initial` / `n_warmup_steps`
# of noam_annealing.
lr_adam: 0.0008
weight_decay: 0.01
warmup_steps: 1000
# Not referenced by any other entry of this file; presumably read directly by
# the training script -- confirm.
augment_warmup: 6000

# BPE parameters
token_type: bpe   # one of: "unigram", "bpe", "char"
character_coverage: 1.0

# Feature parameters (the same three values are given to compute_features,
# the Fbank entry near the end of this file)
sample_rate: 16000
n_fft: 400
n_mels: 80

# This setup works well for an A100 80GB GPU; adapt it to your needs.
# Or turn it off (but training speed will decrease).
dynamic_batching: true
max_batch_length_train: 300
max_batch_length_val: 100  # we reduce it as the beam is much wider (VRAM)
num_bucket: 200
shuffle: true  # if true, re-creates batches at each epoch, shuffling examples.
batch_ordering: random
max_batch_ex: 256

# Sampler settings for training. The values repeat the scalars above
# (max_batch_length_train, num_bucket, shuffle, batch_ordering, max_batch_ex)
# as literals, so both places must be edited together.
dynamic_batch_sampler_train:
  max_batch_length: 300
  num_buckets: 200
  shuffle: true
  batch_ordering: random
  max_batch_ex: 256

# Sampler settings for validation. Identical to the training ones except for
# max_batch_length, which repeats max_batch_length_val.
dynamic_batch_sampler_valid:
  max_batch_length: 100
  num_buckets: 200
  shuffle: true
  batch_ordering: random
  max_batch_ex: 256

# Dataloader options
# batch_size repeats `batch_size` (200) from the training parameters;
# num_workers repeats `num_workers` (4).
# NOTE(review): dynamic_batching is true above -- confirm whether
# batch_size/shuffle from these mappings are used in that mode.
train_dataloader_opts:
  batch_size: 200
  shuffle: true
  num_workers: 4

valid_dataloader_opts:
  batch_size: 200

test_dataloader_opts:
  batch_size: 200


####################### Model Parameters ###########################
# Transformer
# NOTE: the values below are repeated as literals in the Transformer, ctc_lin
# and seq_lin entries further down; editing them here alone does not change
# those entries.
d_model: 384
nhead: 8
num_encoder_layers: 8
num_decoder_layers: 4
d_ffn: 1536
transformer_dropout: 0.1
# Anchored as id001; aliased by the Transformer entry for both `activation`
# and `conformer_activation`.
activation: &id001 !name:torch.nn.GELU
output_neurons: 4000

# Outputs
# blank_index and pad_index share the same index (0).
blank_index: 0
label_smoothing: 0.1
pad_index: 0
bos_index: 1
eos_index: 2

# Decoding parameters
# These are repeated as literals in valid_search, test_search and scorer below.
min_decode_ratio: 0.0
max_decode_ratio: 1.0
valid_search_interval: 1
valid_beam_size: 1  # We do greedy here so it's faster to decode ...
test_beam_size: 10
ctc_weight_decode: 0.3
scorer_beam_scale: 0.3

############################## models ################################

# Convolutional front-end, anchored as id002: two blocks with one layer each.
# Every per-block option below is a 2-tuple, one entry per block.
CNN: &id002 !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
  # Last dimension (80) equals n_mels above; presumably
  # (batch, time, features) -- TODO confirm.
  input_shape: (8, 10, 80)
  num_blocks: 2
  num_layers_per_block: 1
  out_channels: (64, 32)
  kernel_sizes: (3, 3)
  strides: (2, 2)
  residuals: (False, False)

# Encoder-decoder ASR model, anchored as id003. Literal values mirror the
# "Model Parameters" scalars (d_model, nhead, num_encoder_layers,
# num_decoder_layers, d_ffn, transformer_dropout, output_neurons).
Transformer: &id003 !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR
  # NOTE(review): presumably the flattened CNN output size
  # (80 / (2 * 2) * 32 = 640 with the CNN settings above) -- confirm.
  input_size: 640
  tgt_vocab: 4000
  d_model: 384
  nhead: 8
  num_encoder_layers: 8
  num_decoder_layers: 4
  d_ffn: 1536
  dropout: 0.1
  # Both activations alias the same torch.nn.GELU reference (id001).
  conformer_activation: *id001
  activation: *id001
  encoder_module: conformer
  attention_type: RelPosMHAXL
  normalize_before: true
  causal: false

# Two linear output layers with identical sizes: input_size 384 (= d_model)
# and n_neurons 4000 (= output_neurons).
# ctc_lin (id005) is also handed to ctc_scorer as `ctc_fc`.
ctc_lin: &id005 !new:speechbrain.nnet.linear.Linear

  input_size: 384
  n_neurons: 4000

# seq_lin (id004) is also listed in the `modules` of both beam searchers.
seq_lin: &id004 !new:speechbrain.nnet.linear.Linear
  input_size: 384
  n_neurons: 4000

# Named handles on the four components defined above.
modules:
  CNN: *id002
  Transformer: *id003
  seq_lin: *id004
  ctc_lin: *id005
# The same four objects wrapped in one ModuleList (id008); this is the object
# registered as `model` in the checkpointer's recoverables.
model: &id008 !new:torch.nn.ModuleList
- [*id002, *id003, *id004, *id005]
# Note: despite the key name, this refers to torch.optim.AdamW.
# lr / weight_decay repeat lr_adam / weight_decay from the scalars above.
Adam: !name:torch.optim.AdamW
  lr: 0.0008
  weight_decay: 0.01

# Scorer
# CTC scorer (id006) built on the ctc_lin layer (id005); eos_index and
# blank_index repeat the "Outputs" scalars.
ctc_scorer: &id006 !new:speechbrain.decoders.scorer.CTCScorer
  eos_index: 2
  blank_index: 0
  ctc_fc: *id005
# Scorer bundle (id007) shared by valid_search and test_search. The `ctc`
# weight repeats ctc_weight_decode; scorer_beam_scale repeats the scalar of
# the same name.
scorer: &id007 !new:speechbrain.decoders.scorer.ScorerBuilder

  full_scorers: [*id006]
  weights:
    ctc: 0.3
  scorer_beam_scale: 0.3

# Beam searchers. Both decode with the Transformer (id003) and seq_lin (id004)
# and share the scorer bundle (id007); index and ratio values repeat the
# "Outputs" / "Decoding parameters" scalars.

# Validation: beam of 1 (valid_beam_size), EOS threshold off, length
# normalization on.
valid_search: !new:speechbrain.decoders.S2STransformerBeamSearcher
  modules:
    - *id003
    - *id004
  scorer: *id007
  beam_size: 1
  bos_index: 1
  eos_index: 2
  min_decode_ratio: 0.0
  max_decode_ratio: 1.0
  length_normalization: true
  using_eos_threshold: false

# Test: beam of 10 (test_beam_size), temperature 1.15, EOS threshold on.
test_search: !new:speechbrain.decoders.S2STransformerBeamSearcher
  modules:
    - *id003
    - *id004
  scorer: *id007
  beam_size: 10
  bos_index: 1
  eos_index: 2
  min_decode_ratio: 0.0
  max_decode_ratio: 1.0
  temperature: 1.15
  using_eos_threshold: true
# Log-softmax over the last dimension (dim: -1).
log_softmax: !new:torch.nn.LogSoftmax
  dim: -1

# CTC loss reference; blank_index and reduction repeat the scalars
# blank_index (0) and loss_reduction (batchmean).
ctc_cost: !name:speechbrain.nnet.losses.ctc_loss
  blank_index: 0
  reduction: batchmean

# KL-divergence loss reference; label_smoothing and reduction repeat the
# scalars label_smoothing (0.1) and loss_reduction (batchmean).
seq_cost: !name:speechbrain.nnet.losses.kldiv_loss
  label_smoothing: 0.1
  reduction: batchmean

# Noam scheduler (id009), registered in the checkpointer as `noam_scheduler`.
# lr_initial repeats lr_adam; n_warmup_steps repeats warmup_steps.
noam_annealing: &id009 !new:speechbrain.nnet.schedulers.NoamScheduler
  lr_initial: 0.0008
  n_warmup_steps: 1000

# Checkpointer writing to checkpoints_dir (same path as save_folder).
# `recoverables` registers the model, the scheduler, the input normalizer and
# the epoch counter.
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
  checkpoints_dir: uz_transformer_4000/model_saved/save
  recoverables:
    model: *id008
    noam_scheduler: *id009
    # Defined inline here (id011) and exposed at top level as `normalize`.
    normalizer: &id011 !new:speechbrain.processing.features.InputNormalization
      norm_type: global
      update_until_epoch: 4
    # Defined inline here (id010) and exposed at top level as `epoch_counter`.
    # `limit` repeats number_of_epochs (100).
    counter: &id010 !new:speechbrain.utils.epoch_loop.EpochCounter

      limit: 100

# Top-level aliases of the two objects anchored inside `recoverables` above.
epoch_counter: *id010
normalize: *id011

############################## Augmentations ###################################

# Time Drop
# Anchored as id012 and listed in fea_augment's `augmentations`.
# drop_count_low equals drop_count_high (3). Operates on dim 1
# (presumably the time axis -- confirm).
time_drop: &id012 !new:speechbrain.augment.freq_domain.SpectrogramDrop
  drop_length_low: 15
  drop_length_high: 25
  drop_count_low: 3
  drop_count_high: 3
  replace: zeros
  dim: 1

# Frequency Drop
# Anchored as id013 and listed in fea_augment's `augmentations`.
# Same class as time_drop, but on dim 2 (presumably the frequency axis --
# confirm), with drop_count_low equal to drop_count_high (2).
freq_drop: &id013 !new:speechbrain.augment.freq_domain.SpectrogramDrop
  drop_length_low: 25
  drop_length_high: 35
  drop_count_low: 2
  drop_count_high: 2
  replace: zeros
  dim: 2

# Time warp
# Anchored as id014; no arguments are given here.
time_warp: &id014 !new:speechbrain.augment.freq_domain.Warping

# Feature-level augmenter combining time_drop (id012), freq_drop (id013) and
# time_warp (id014). min_augmentations and max_augmentations both equal the
# number of listed augmentations (3) and augment_prob is 1.0, so presumably
# all three are applied every time -- confirm against the Augmenter docs.
fea_augment: !new:speechbrain.augment.augmenter.Augmenter
  augmentations:
    - *id012
    - *id013
    - *id014
  augment_prob: 1.0
  min_augmentations: 3
  max_augmentations: 3

# Filterbank feature extractor; the three values repeat the
# "Feature parameters" scalars (sample_rate, n_fft, n_mels).
compute_features: !new:speechbrain.lobes.features.Fbank
  sample_rate: 16000
  n_fft: 400
  n_mels: 80

# File logger writing to the same path as `train_log`.
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
  save_file: uz_transformer_4000/model_saved/train_log.txt

# Metric references. cer_computer uses the same class as error_rate_computer
# but with split_tokens: true (presumably character-level scoring -- confirm).
error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
acc_computer: !name:speechbrain.utils.Accuracy.AccuracyStats
cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
  split_tokens: true