sinarashidi
/

s2st_fa-en_cvss

Model card · Files and versions

xet

Community

sinarashidi committed on May 19, 2024

Commit

801e44e

verified ·

1 Parent(s): ebe92f1

Upload hyperparams.yaml with huggingface_hub

Browse files

Files changed (1) hide show

hyperparams.yaml +228 -0

hyperparams.yaml ADDED Viewed

	@@ -0,0 +1,228 @@

+###################################
+# Experiment Parameters and setup #
+###################################
+# Every output path in this file is written out literally under
+# results/s2ut/888, i.e. the seed value is baked into each path string.
+seed: 888
+# HyperPyYAML `!apply:` tag: calls torch.manual_seed with the same value
+# as `seed` when the file is loaded.
+__set_seed: !apply:torch.manual_seed [888]
+output_folder: results/s2ut/888
+save_folder: results/s2ut/888/save
+# Same file as `save_file` of train_logger further down.
+train_log: results/s2ut/888/train_log.txt
+# Same value as `limit` of epoch_counter further down.
+epochs: 100
+use_tensorboard: true
+# The progress_* and evaluation_interval keys are not referenced by any
+# object in this file; presumably they are read directly by the training
+# script -- TODO confirm their exact meaning there.
+progress_samples: true
+progress_sample_path: results/s2ut/888/samples
+progress_samples_interval: 1
+progress_batch_sample_size: 4
+evaluation_interval: 10
+#################################
+# Data files and pre-processing #
+#################################
+# Corpus roots. Unlike the results/ paths used elsewhere in this file,
+# these two are absolute paths.
+# NOTE(review): the example paths in the inline comments mention French
+# data, while the wav2vec2 encoder configured below is a Persian
+# checkpoint -- confirm which language the source folder actually holds.
+src_data_folder: /workspace/speechbrain/common_voice # e.g., /corpus/CommonVoice/fr (French data)
+tgt_data_folder: /workspace/speechbrain/cvss # e.g., /corpus/CV4/fr (English data)
+# presumably in Hz -- TODO confirm against the data preparation code
+sample_rate: 16000
+# Data manifests; all four live in the save folder.
+train_json: results/s2ut/888/save/train.json
+valid_json: results/s2ut/888/save/valid.json
+valid_small_json: results/s2ut/888/save/valid_small.json
+test_json: results/s2ut/888/save/test.json
+# Split names; each one has a matching *_json entry above.
+splits: [train, valid_small, valid, test]
+skip_prep: false
+# SSL model used to encode target features
+encoder_source: facebook/hubert-base-ls960
+# presumably the encoder layer whose features are quantized (the k-means
+# repository name below contains "l6") -- TODO confirm
+layer: 6
+# Same repository id as vocoder_source below.
+kmeans_source: speechbrain/tts-hifigan-unit-hubert-l6-k100-ljspeech
+codes_folder: results/s2ut/888/save/codes
+skip_extract: false
+# Vocoder model used for evaluation
+vocoder_source: speechbrain/tts-hifigan-unit-hubert-l6-k100-ljspeech
+vocoder_download_path: results/s2ut/888/save/pretrained_models/vocoder
+# ASR model used for evaluation
+asr_source: speechbrain/asr-wav2vec2-librispeech
+asr_download_path: results/s2ut/888/save/pretrained_models/asr
+# Wav2vec2 encoder. Both values are repeated literally as `source` and
+# `save_path` of the wav2vec2 object further down.
+wav2vec2_source: m3hrdadfi/wav2vec2-large-xlsr-persian-v3
+wav2vec2_download_path: results/s2ut/888/save/pretrained_models
+# wav2vec2 encoder specific parameters
+# `wav2vec2_frozen` matches `freeze: false` of the wav2vec2 object below.
+wav2vec2_frozen: false
+# Not referenced by any object in this file; presumably consumed by the
+# training script -- TODO confirm.
+wav2vec2_freeze_steps: 10000
+####################### Training Parameters ####################################
+# `lr` is repeated in opt_class and noam_annealing further down;
+# `lr_wav2vec` is repeated in wav2vec_opt_class and wav2vec_annealing.
+lr: 0.0005
+lr_wav2vec: 0.00001
+# Same value as `reduction` of seq_cost further down.
+loss_reduction: batchmean
+# Outputs
+# Special-token indices. The pad index (102) is also the padding value
+# passed to the collate function of both dataloaders below.
+# (disabled) blank_index: 102
+bos_index: 100
+eos_index: 101
+pad_index: 102
+# Same value as `label_smoothing` of seq_cost further down.
+label_smoothing: 0.2
+# Dynamic batching
+# The scalar values below are repeated inside dynamic_batch_sampler and
+# the dataloader option mappings.
+sorting: random
+num_workers: 4
+dynamic_batching: true
+max_batch_len: 80 # 40 GB GPU
+# NOTE: spelled `num_bucket` here but `num_buckets` inside
+# dynamic_batch_sampler; both hold 200.
+num_bucket: 200
+train_batch_size: 32 # if not using dynamic batching
+valid_batch_size: 1
+# Plain mapping of sampler options (no HyperPyYAML tag on this node).
+dynamic_batch_sampler:
+  max_batch_len: 80
+  num_buckets: 200
+  shuffle_ex: true   # if true, re-creates batches at each epoch, shuffling examples.
+  batch_ordering: random
+  max_batch_ex: 128
+# Training dataloader options. `collate_fn` uses the HyperPyYAML `!name:`
+# tag (a callable reference with `padding_kwargs` pre-bound, not an
+# instance).
+train_dataloader_opts:
+  batch_size: 32
+  drop_last: false
+  num_workers: 4
+  collate_fn: !name:speechbrain.dataio.batch.PaddedBatch
+    padding_kwargs:
+      value: 102
+# Validation dataloader options; same collate function and padding value
+# as for training, with batch size 1.
+valid_dataloader_opts:
+  batch_size: 1
+  num_workers: 4
+  collate_fn: !name:speechbrain.dataio.batch.PaddedBatch
+    padding_kwargs:
+      value: 102
+################################
+# Model Parameters and model   #
+################################
+# The scalars in this section are repeated literally inside the model
+# objects defined further down (enc, transformer, seq_lin, searchers).
+# Feature parameters (W2V2 etc)
+# Same value as the last entry of `input_shape` of enc.
+features_dim: 1024 # large wav2vec output dimension, for base replace by 768
+# Length Regulator
+# Same values as `kernel_size` and `stride` of enc.
+enc_kernel_size: 3
+enc_stride: 2
+# Transformer
+# 512 is also the value of enc `out_channels`, transformer `input_size`
+# and seq_lin `input_size`.
+embedding_size: 512
+d_model: 512
+nhead: 8
+num_encoder_layers: 0
+num_decoder_layers: 6
+d_ffn: 2048
+transformer_dropout: 0.1
+# Anchored as id001 and aliased by `activation` of the transformer object.
+activation: &id001 !name:torch.nn.GELU
+# Same value as transformer `tgt_vocab` and seq_lin `n_neurons`.
+output_neurons: 103 # /!\ must be changed according to the vocabulary size
+attention_type: RelPosMHAXL   # "RelPosMHAXL" or "regularMHA"
+# Decoding parameters
+# `test_bs` has the same value as `beam_size` of test_search; the decode
+# ratios are repeated in both searchers.
+test_bs: 10
+min_decode_ratio: 0.0
+max_decode_ratio: 1.0
+############################## models ################################
+# Objects built with the HyperPyYAML `!new:` tag. The `&idNNN` anchors
+# are aliased by modules, model, the searchers and the checkpointer.
+#
+# wav2vec2 speech encoder loaded from the Persian XLS-R checkpoint, not
+# frozen (`freeze` and `freeze_feature_extractor` are both false).
+wav2vec2: &id002 !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2
+  source: m3hrdadfi/wav2vec2-large-xlsr-persian-v3
+  output_norm: true   # Test in baseline_v2
+  freeze: false
+  freeze_feature_extractor: false
+  save_path: results/s2ut/888/save/pretrained_models
+  apply_spec_augment: true
+# 1-D convolution over 1024-dimensional inputs with 512 output channels,
+# kernel size 3 and stride 2 (the "Length Regulator" parameters above).
+enc: &id003 !new:speechbrain.nnet.CNN.Conv1d
+  input_shape: [null, null, 1024]
+  out_channels: 512
+  kernel_size: 3
+  stride: 2
+# Transformer with zero encoder layers and six decoder layers.
+# yamllint disable-line rule:line-length
+transformer: &id004 !new:speechbrain.lobes.models.transformer.TransformerST.TransformerST
+  input_size: 512
+  tgt_vocab: 103
+  d_model: 512
+  nhead: 8
+  num_encoder_layers: 0
+  num_decoder_layers: 6
+  d_ffn: 2048
+  dropout: 0.1
+  activation: *id001
+  attention_type: RelPosMHAXL
+  normalize_before: true
+  causal: true
+# Softmax configured with `apply_log: true`. It has no anchor and is not
+# listed in `modules` or `model` below.
+log_softmax: !new:speechbrain.nnet.activations.Softmax
+  apply_log: true
+# Linear layer from 512 inputs to 103 outputs (the `output_neurons`
+# value above).
+seq_lin: &id005 !new:speechbrain.nnet.linear.Linear
+  input_size: 512
+  n_neurons: 103
+# Name -> object mapping for the four model objects defined above.
+modules:
+  wav2vec2: *id002
+  enc: *id003
+  transformer: *id004
+  seq_lin: *id005
+# ModuleList built from a single positional argument: the list
+# [enc, transformer, seq_lin]. wav2vec2 is not part of it. The list is
+# anchored as id006 and registered with the checkpointer as `model`.
+model: &id006 !new:torch.nn.ModuleList
+- [*id003, *id004, *id005]
+# Optimizer factories (`!name:` tag: callable references with the listed
+# keyword arguments pre-bound, not optimizer instances).
+# `lr` here equals the top-level `lr`; `betas` is written as a
+# parenthesised tuple literal.
+opt_class: !name:torch.optim.AdamW
+  lr: 0.0005
+  betas: (0.9, 0.98)
+# Separate AdamW factory whose `lr` equals the top-level `lr_wav2vec`.
+wav2vec_opt_class: !name:torch.optim.AdamW
+  lr: 0.00001
+# Loss factory: nll_loss with the same label smoothing and reduction as
+# the top-level `label_smoothing` and `loss_reduction`.
+seq_cost: !name:speechbrain.nnet.losses.nll_loss
+  label_smoothing: 0.2
+  reduction: batchmean
+# Noam scheduler; `lr_initial` equals the top-level `lr`. Anchored as
+# id008 and saved by the checkpointer as `noam_scheduler`.
+noam_annealing: &id008 !new:speechbrain.nnet.schedulers.NoamScheduler
+  lr_initial: 0.0005
+  n_warmup_steps: 5000
+# NewBob scheduler; `initial_value` equals the top-level `lr_wav2vec`.
+# Anchored as id009 and saved by the checkpointer as `wav2vec_scheduler`.
+wav2vec_annealing: &id009 !new:speechbrain.nnet.schedulers.NewBobScheduler
+  initial_value: 0.00001
+  improvement_threshold: 0.0025
+  annealing_factor: 0.98
+# Epoch object
+# `limit` equals the top-level `epochs`. Anchored as id007 and saved by
+# the checkpointer as `counter`.
+epoch_counter: &id007 !new:speechbrain.utils.epoch_loop.EpochCounter
+  limit: 100
+# File logger; `save_file` is the same path as the top-level `train_log`.
+train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
+  save_file: results/s2ut/888/train_log.txt
+# Greedy searcher for validation, built on the transformer (id004) and
+# seq_lin (id005) objects. bos/eos indices and decode ratios repeat the
+# top-level values.
+# NOTE(review): `modules` has a third, null entry here but only two
+# entries in test_search -- confirm this matches what each searcher
+# class expects.
+valid_search: !new:speechbrain.decoders.seq2seq.S2STransformerGreedySearcher
+  modules: [*id004, *id005, null]
+  bos_index: 100
+  eos_index: 101
+  min_decode_ratio: 0.0
+  max_decode_ratio: 1.0
+  temperature: 1.0
+# Beam searcher for testing, built on the same two objects; `beam_size`
+# has the same value as the top-level `test_bs`.
+test_search: !new:speechbrain.decoders.seq2seq.S2STransformerBeamSearcher
+  modules: [*id004, *id005]
+  bos_index: 100
+  eos_index: 101
+  min_decode_ratio: 0.0
+  max_decode_ratio: 1.0
+  beam_size: 10
+# Metric factories (`!name:` tag: callable references, not instances).
+acc_computer: !name:speechbrain.utils.Accuracy.AccuracyStats
+# BLEU factory with `merge_words` pre-bound to false.
+bleu_computer: !name:speechbrain.utils.bleu.BLEUStats
+  merge_words: false
+# Checkpointer
+# Writes to the save folder and registers five recoverables by alias:
+# the ModuleList (id006), the wav2vec2 encoder (id002), the epoch
+# counter (id007) and both schedulers (id008, id009).
+checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
+  checkpoints_dir: results/s2ut/888/save
+  recoverables:
+    model: *id006
+    wav2vec2: *id002
+    counter: *id007
+    noam_scheduler: *id008
+    wav2vec_scheduler: *id009