# Generated 2025-02-12 from:
# /home/hamidovaslon1/speechbrain/recipes/CommonVoice/ASR/transformer/hparams/staging_5.yaml
# yamllint disable

# Seed value and the call that applies it when this file is loaded.
seed: 1101
__set_seed: !apply:speechbrain.utils.seed_everything [1101]

# Output locations for this run.
output_folder: uz_transformer_4000/model_saved
output_wer_folder: uz_transformer_4000/model_saved/
save_folder: uz_transformer_4000/model_saved/save
train_log: uz_transformer_4000/model_saved/train_log.txt

# Data files
data_folder: /mnt/data/commonvoice/uz  # e.g., /localscratch/cv-corpus-5.1-2020-06-22/fr
data_folder_unlabeled: /mnt/data/youtube_audio
train_tsv_file: /mnt/data/commonvoice/uz/train.tsv  # Standard CommonVoice .tsv files
dev_tsv_file: /mnt/data/commonvoice/uz/dev.tsv  # Standard CommonVoice .tsv files
test_tsv_file: /mnt/data/commonvoice/uz/test.tsv  # Standard CommonVoice .tsv files
unlabeled_tsv_file: audio_data_loader/dataloader/youtube_gcp.tsv  # Path to the youtube dataset
accented_letters: false
language: uz  # use 'it' for Italian, 'rw' for Kinyarwanda, 'en' for English
train_csv: uz_transformer_4000/model_saved/train.csv
valid_csv: uz_transformer_4000/model_saved/dev.csv
test_csv: uz_transformer_4000/model_saved/test.csv
unlabeled_csv: audio_data_loader/dataloader/unlabeled_ogg.csv  # CREATE IN THE DIRECTORY
skip_prep: false  # Skip data preparation
convert_to_wav: false  # Switch this to true to convert all mp3 files to wav.

# We remove utterances longer than this threshold in the train/dev/test sets,
# as longer sentences certainly correspond to "open microphones".
# NOTE(review): the upstream comment said 10 s, but the value configured here
# is 50 -- confirm which one is intended.
avoid_if_longer_than: 50

# THIS IS TERRIBLE BUT WE HAVE NO CHOICE.
# Some versions of the CV dataset may contain one or two files of more than
# 2 min in the validation and/or test sets. This is an error by design of the
# dataset, as these files contain 90% silence. We exclude them.
avoid_if_longer_than_val_test: 100.0
ckpt_interval_minutes: 15  # save checkpoint every N min

####################### Training Parameters ####################################
number_of_epochs: 100
optimizer_step_limit: 30000
batch_size: 200  # This works with a 32GB GPU ! (bs * nb_gpu * accum) > 128 !
ctc_weight: 0.3
grad_accumulation_factor: 3
loss_reduction: batchmean
sorting: random
num_workers: 4
precision: fp16  # bf16, fp16 or fp32

# Stage-related parameters
lr_adam: 0.0008
weight_decay: 0.01
warmup_steps: 1000
augment_warmup: 6000

# BPE parameters
token_type: bpe  # ["unigram", "bpe", "char"]
character_coverage: 1.0

# Feature parameters
sample_rate: 16000
n_fft: 400
n_mels: 80

# This setup works well for an A100 80GB GPU; adapt it to your needs.
# Or turn it off (but training speed will decrease).
dynamic_batching: true
max_batch_length_train: 300
max_batch_length_val: 100  # we reduce it as the beam is much wider (VRAM)
num_bucket: 200
shuffle: true  # if true, re-creates batches at each epoch, shuffling examples.
batch_ordering: random
max_batch_ex: 256

# Sampler settings for training; values repeat the top-level ones above.
dynamic_batch_sampler_train:
  max_batch_length: 300
  num_buckets: 200
  shuffle: true
  batch_ordering: random
  max_batch_ex: 256

# Sampler settings for validation; only max_batch_length differs (100).
dynamic_batch_sampler_valid:
  max_batch_length: 100
  num_buckets: 200
  shuffle: true
  batch_ordering: random
  max_batch_ex: 256

# Dataloader options
train_dataloader_opts:
  batch_size: 200
  shuffle: true
  num_workers: 4

valid_dataloader_opts:
  batch_size: 200

test_dataloader_opts:
  batch_size: 200

####################### Model Parameters ###########################
# Transformer
d_model: 384
nhead: 8
num_encoder_layers: 8
num_decoder_layers: 4
d_ffn: 1536
transformer_dropout: 0.1
# Anchor &id001 is aliased later in the file; keep the anchor name unchanged.
activation: &id001 !name:torch.nn.GELU
output_neurons: 4000

# Outputs
blank_index: 0
label_smoothing: 0.1
pad_index: 0
bos_index: 1
eos_index: 2

# Decoding parameters
min_decode_ratio: 0.0
max_decode_ratio: 1.0
valid_search_interval: 1
valid_beam_size: 1  # We do greedy here so it's faster to decode ...
test_beam_size: 10
ctc_weight_decode: 0.3
scorer_beam_scale: 0.3

############################## models ################################
CNN: &id002 !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
  input_shape: (8, 10, 80)
  num_blocks: 2
  num_layers_per_block: 1
  out_channels: (64, 32)
  kernel_sizes: (3, 3)
  strides: (2, 2)
  residuals: (False, False)

Transformer: &id003 !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR  # yamllint disable-line rule:line-length
  input_size: 640
  tgt_vocab: 4000
  d_model: 384
  nhead: 8
  num_encoder_layers: 8
  num_decoder_layers: 4
  d_ffn: 1536
  dropout: 0.1
  conformer_activation: *id001
  activation: *id001
  encoder_module: conformer
  attention_type: RelPosMHAXL
  normalize_before: true
  causal: false

ctc_lin: &id005 !new:speechbrain.nnet.linear.Linear
  input_size: 384
  n_neurons: 4000

seq_lin: &id004 !new:speechbrain.nnet.linear.Linear
  input_size: 384
  n_neurons: 4000

# Named handles onto the four modules defined above.
modules:
  CNN: *id002
  Transformer: *id003
  seq_lin: *id004
  ctc_lin: *id005

# The same four modules, passed as a single list argument to ModuleList.
model: &id008 !new:torch.nn.ModuleList
  - [*id002, *id003, *id004, *id005]

Adam: !name:torch.optim.AdamW
  lr: 0.0008
  weight_decay: 0.01

# Scorer
ctc_scorer: &id006 !new:speechbrain.decoders.scorer.CTCScorer
  eos_index: 2
  blank_index: 0
  ctc_fc: *id005

scorer: &id007 !new:speechbrain.decoders.scorer.ScorerBuilder
  full_scorers: [*id006]
  weights:
    ctc: 0.3
  scorer_beam_scale: 0.3

# Validation search: beam_size 1.
valid_search: !new:speechbrain.decoders.S2STransformerBeamSearcher
  modules: [*id003, *id004]
  bos_index: 1
  eos_index: 2
  min_decode_ratio: 0.0
  max_decode_ratio: 1.0
  beam_size: 1
  using_eos_threshold: false
  length_normalization: true
  scorer: *id007

# Test search: beam_size 10.
test_search: !new:speechbrain.decoders.S2STransformerBeamSearcher
  modules: [*id003, *id004]
  bos_index: 1
  eos_index: 2
  min_decode_ratio: 0.0
  max_decode_ratio: 1.0
  beam_size: 10
  temperature: 1.15
  using_eos_threshold: true
  scorer: *id007

log_softmax: !new:torch.nn.LogSoftmax
  dim: -1

ctc_cost: !name:speechbrain.nnet.losses.ctc_loss
  blank_index: 0
  reduction: batchmean

seq_cost: !name:speechbrain.nnet.losses.kldiv_loss
  label_smoothing: 0.1
  reduction: batchmean

noam_annealing: &id009 !new:speechbrain.nnet.schedulers.NoamScheduler
  lr_initial: 0.0008
  n_warmup_steps: 1000

# The normalizer (&id011) and epoch counter (&id010) are defined inline as
# recoverables; the top-level `epoch_counter` and `normalize` keys below are
# aliases of these same objects.
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
  checkpoints_dir: uz_transformer_4000/model_saved/save
  recoverables:
    model: *id008
    noam_scheduler: *id009
    normalizer: &id011 !new:speechbrain.processing.features.InputNormalization
      norm_type: global
      update_until_epoch: 4
    counter: &id010 !new:speechbrain.utils.epoch_loop.EpochCounter
      limit: 100

epoch_counter: *id010
normalize: *id011

############################## Augmentations ###################################

# Time Drop
time_drop: &id012 !new:speechbrain.augment.freq_domain.SpectrogramDrop
  drop_length_low: 15
  drop_length_high: 25
  drop_count_low: 3
  drop_count_high: 3
  replace: zeros
  dim: 1

# Frequency Drop
freq_drop: &id013 !new:speechbrain.augment.freq_domain.SpectrogramDrop
  drop_length_low: 25
  drop_length_high: 35
  drop_count_low: 2
  drop_count_high: 2
  replace: zeros
  dim: 2

# Time warp
time_warp: &id014 !new:speechbrain.augment.freq_domain.Warping

# min and max are both 3 and three augmentations are listed.
fea_augment: !new:speechbrain.augment.augmenter.Augmenter
  min_augmentations: 3
  max_augmentations: 3
  augment_prob: 1.0
  augmentations: [*id012, *id013, *id014]

compute_features: !new:speechbrain.lobes.features.Fbank
  sample_rate: 16000
  n_fft: 400
  n_mels: 80

train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
  save_file: uz_transformer_4000/model_saved/train_log.txt

error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
acc_computer: !name:speechbrain.utils.Accuracy.AccuracyStats
cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
  split_tokens: true