Haopeng committed on
Commit
0e0d443
·
verified ·
1 Parent(s): 0bcd3f2

Upload folder using huggingface_hub

Browse files
Files changed (7) hide show
  1. CKPT.yaml +6 -0
  2. inference.yaml +312 -0
  3. model.ckpt +3 -0
  4. mpd.txt +0 -0
  5. per.txt +0 -0
  6. perceived_ssl.ckpt +3 -0
  7. tokenizer.ckpt +3 -0
CKPT.yaml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # yamllint disable
2
+ PER: 17.544039328144205
3
+ end-of-epoch: true
4
+ epoch: 140
5
+ mpd_f1: 0.7108831073653353
6
+ unixtime: 1770905887.9787815
inference.yaml ADDED
@@ -0,0 +1,312 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Hyperparameters toggles
2
+ prefix: ""
3
+
4
+ lab_enc_file: /home/m64000/work/IF-MDD/exp_iqra/wavlm_large_None_PhnMonoSSL_ottc_confEnc/save/label_encoder.txt
5
+ ctc_loss_type: "crottc" # Options: "ctc", "ottc", "crctc" — NOTE(review): current value "crottc" is not in this list; confirm it is a supported option
6
+ encoder_type: "conformer" # Options: None, "conformer", "zipformer", "rvq"
7
+
8
+ wandb_project: "iqra_extra"
9
+ # Wandb Tags
10
+ tags:
11
+ - PhnMonoSSL
12
+ - crottc
13
+ - ConformerEncoder
14
+ - iqra_extra
15
+ - TTS_FT
16
+
17
+ ## SSL features Selection
18
+ pretrained_models_path: pretrained_models/
19
+ # pretrained_models:
20
+ # {
21
+ # "wav2vec2_base": "facebook/wav2vec2-base", # 768
22
+ # "hubert_base": "facebook/hubert-base-ls960", # 768
23
+ # "wavlm_base": "microsoft/wavlm-base", # 768
24
+ # "wavlm_base_plus": "microsoft/wavlm-base-plus", # 768
25
+ # "hubert_multilingual": "utter-project/mHuBERT-147", # 768
26
+ # "clap" : "laion/clap-htsat-fused", # 768
27
+ # "data2vec_base": "facebook/data2vec-audio-base", # 768
28
+
29
+ # "wav2vec2_large": "facebook/wav2vec2-large", # 1024
30
+ # "hubert_large": "facebook/hubert-large-ls960", # 1024
31
+ # "wavlm_large": "microsoft/wavlm-large-plus", # 1024
32
+ # "data2vec_large": "facebook/data2vec-audio-large", #1024
33
+ # "whisper_medium": "openai/whisper-medium", # 1024
34
+
35
+ # "whisper_large_v3_turbo": "openai/whisper-large-v3-turbo", # 1280
36
+ # }
37
+
38
+
39
+
40
+ # select pretrained SSL models
41
+ perceived_ssl_model: "wavlm_large" # in pretrained_models
42
+ canonical_ssl_model: Null
43
+
44
+ # # models hidden size, varies by model
45
+ ENCODER_DIM: 1024
46
+
47
+ # # How to fuse the features
48
+ feature_fusion: "mono" # Options: "mono" for single ssl, "dual_ssl_enc" for dual ssl encoder, "dual_loss" for single SSL dual ssl loss
49
+ blend_alpha: 0.5 # If using "blend" fusion
50
+
51
+ # Input files
52
+ # Data files
53
+ # data_folder_save: "/home/kevingenghaopeng/MDD/IF-MDD/data_iqra/demo_data"
54
+ data_folder_save: "/home/m64000/work/dataset/data_iqra_extra_is26"
55
+ train_annotation: !ref <data_folder_save>/iqra_extra_is26_train_aligned.json
56
+ valid_annotation: !ref <data_folder_save>/iqra_extra_is26_dev_aligned.json
57
+ test_annotation: !ref <data_folder_save>/iqra_extra_is26_test_aligned.json
58
+ # Extra data
59
+ train_annotation_extra: !ref <data_folder_save>/train-train_with_extra.json
60
+ use_extra_train_data: False
61
+
62
+ evaluate_key: "PER" # use "mpd_f1_seq" for Transformer decoder path best mpd f1
63
+ # "PER_seq" for Transformer decoder's best error rate
64
+ # "PER" for ctc path best error rate
65
+ # "mpd_f1" for ctc path best mpd f1
66
+ max_save_models: 3 # Maximum number of saved models for each metrics
67
+ # generate training id for output folder
68
+ # generate_training_id: !apply:trainer.generate_training_id.generate_training_id [!ref <perceived_ssl_model_id>, !ref <canonical_ssl_model_id>, !ref <feature_fusion>, !ref <prefix>]
69
+
70
+ # output files
71
+ output_folder: !ref exp_iqra/<perceived_ssl_model>_<canonical_ssl_model>_<feature_fusion>_<prefix>
72
+ per_file: !ref <output_folder>/per.txt
73
+ mpd_file: !ref <output_folder>/mpd.txt
74
+ save_folder: !ref <output_folder>/save
75
+ train_log: !ref <output_folder>/train_log.txt
76
+
77
+ on_training_test_wer_folder: !ref <output_folder>/on_training_test_wer
78
+ on_training_test_mpd_folder: !ref <output_folder>/on_training_test_mpd
79
+
80
+ # Training Target
81
+ training_target: "target" # "target": deduplicated canonical phoneme sequence; "target_with_repeats": with repeats
82
+ # "canonical"
83
+ # "perceived": deduplicated perceived phoneme sequence
84
+ # Modules (SpeechBrain lobes)
85
+ # modules:
86
+ # canonical_ssl: !ref <canonical_ssl>
87
+ # perceived_ssl: !ref <perceived_ssl>
88
+ # enc: !ref <enc>
89
+ # ConformerEncoder: !ref <ConformerEncoder>
90
+ # ctc_lin: !ref <ctc_lin>
91
+ # lm_weight: !ref <lm_weight>
92
+
93
+ perceived_ssl: !apply:trainer.AutoSSLoader.AutoSSLLoader
94
+ model_name: !ref <perceived_ssl_model>
95
+ freeze: !ref <freeze_perceived_ssl>
96
+ freeze_feature_extractor: !ref <freeze_perceived_feature_extractor>
97
+ save_path: !ref <pretrained_models_path>
98
+ output_all_hiddens: False
99
+ preceived_ssl_emb_layer: -1
100
+
101
+ canonical_ssl: !apply:trainer.AutoSSLoader.AutoSSLLoader
102
+ model_name: !ref <canonical_ssl_model>
103
+ freeze: !ref <freeze_canonical_ssl>
104
+ freeze_feature_extractor: !ref <freeze_perceived_feature_extractor>
105
+ save_path: !ref <pretrained_models_path>
106
+ output_all_hiddens: False
107
+
108
+ canonical_ssl_emb_layer: -1
109
+
110
+ enc: !new:torch.nn.Sequential
111
+ - !new:speechbrain.lobes.models.VanillaNN.VanillaNN
112
+ input_shape: [null, null, !ref <ENCODER_DIM>]
113
+ activation: !ref <activation>
114
+ dnn_blocks: !ref <dnn_layers>
115
+ dnn_neurons: !ref <dnn_neurons>
116
+ - !new:torch.nn.LayerNorm
117
+ normalized_shape: !ref <dnn_neurons>
118
+
119
+
120
+ kernel_size: 7
121
+ attention_type: "RoPEMHA" # Options: "standard", "RoPE"
122
+ ConformerEncoder: !new:speechbrain.lobes.models.transformer.Conformer.ConformerEncoder
123
+ num_layers: 2
124
+ nhead: 8
125
+ d_ffn: !ref <dnn_neurons>
126
+ d_model: !ref <dnn_neurons>
127
+ dropout: 0.1
128
+ kernel_size: !ref <kernel_size>
129
+ attention_type: !ref <attention_type>
130
+
131
+ ctc_lin: !new:speechbrain.nnet.linear.Linear
132
+ input_size: !ref <dnn_neurons>
133
+ n_neurons: !ref <output_neurons> # 40 phonemes + 1 blank + 1 err
134
+
135
+ # lm_weight for OTTC's alpha prediction
136
+ lm_weight: !new:speechbrain.nnet.linear.Linear
137
+ input_size: !ref <dnn_neurons>
138
+   n_neurons: 1 # single scalar output (OTTC alpha prediction), not the phoneme vocabulary
139
+
140
+ # Model parameters
141
+ activation: !name:torch.nn.LeakyReLU
142
+ dnn_layers: 2
143
+ dnn_neurons: 384
144
+ freeze_perceived_ssl: False
145
+ freeze_canonical_ssl: False
146
+ freeze_perceived_feature_extractor: True # freeze the CNN extractor in wav2vec
147
+ freeze_canonical_feature_extractor: True # Freeze Whisper encoder?
148
+
149
+ log_softmax: !new:speechbrain.nnet.activations.Softmax
150
+ apply_log: True
151
+
152
+ # ctc_cost: !name:speechbrain.nnet.losses.ctc_loss
153
+ # blank_index: !ref <blank_index>
154
+
155
+ # ctc_cost: !new:utils.CTCLossWithLabelPriors.CTCLossWithLabelPriors
156
+ # prior_scaling_factor: 0.3
157
+ # ctc_implementation: 'k2'
158
+ # blank: !ref <blank_index>
159
+ # reduction: 'sum'
160
+
161
+ ctc_cost: !name:utils.losses.ot_loss.batched_ottc_loss_bucketized
162
+
163
+
164
+ ctc_cost_mispro: !name:speechbrain.nnet.losses.ctc_loss
165
+ blank_index: !ref <blank_index>
166
+
167
+ # Outputs
168
+ output_neurons: 71 # l2arctic: 40 phns(sil) + err + blank + eos + bos = 44 — NOTE(review): comment arithmetic (44) does not match the value 71; verify against the label encoder
169
+ blank_index: 0
170
+
171
+ model: !new:torch.nn.ModuleList
172
+ - [!ref <enc>, !ref <ctc_lin>, ]
173
+
174
+ adam_opt_class: !name:torch.optim.Adam
175
+ lr: !ref <lr>
176
+
177
+ pretrained_opt_class: !name:torch.optim.Adam
178
+ lr: !ref <lr_pretrained>
179
+
180
+ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
181
+ checkpoints_dir: !ref <save_folder>
182
+ recoverables:
183
+ model: !ref <model>
184
+ perceived_ssl: !ref <perceived_ssl>
185
+ counter: !ref <epoch_counter>
186
+ allow_partial_load: True
187
+ # canonical_ssl: !ref <canonical_ssl>
188
+ # augmentation: !new:speechbrain.augment.time_domain.SpeedPerturb
189
+ # orig_freq: !ref <sample_rate>
190
+ # speeds: [95, 100, 105]
191
+
192
+ spec_augmentation: !new:speechbrain.augment.freq_domain.SpectrogramDrop
193
+ drop_length_low: 5
194
+ drop_length_high: 27
195
+ drop_count_low: 1
196
+ drop_count_high: 3
197
+ replace: 'zeros'
198
+
199
+ freq_chunk_augmentation: !new:speechbrain.augment.time_domain.DropFreq
200
+ drop_freq_low: 1e-14
201
+ drop_freq_high: 1
202
+ drop_freq_count_low: 1
203
+ drop_freq_count_high: 3
204
+ drop_freq_width: 0.10
205
+ epsilon: 1e-12
206
+
207
+ drop_length_high: 3000
208
+ time_chunk_augmentation: !new:speechbrain.augment.time_domain.DropChunk
209
+ drop_length_low: 1000
210
+ drop_length_high: !ref <drop_length_high>
211
+ drop_count_low: 1
212
+ drop_count_high: 3
213
+
214
+ speed_augmentation: !new:speechbrain.augment.time_domain.SpeedPerturb
215
+ orig_freq: !ref <sample_rate>
216
+ speeds: [95, 100, 105]
217
+
218
+ timewarp_augmentation: !new:speechbrain.augment.freq_domain.Warping
219
+ warp_window: 5
220
+ dim: 1 # time
221
+
222
+ augmentation: !new:speechbrain.augment.augmenter.Augmenter
223
+ augmentations:
224
+ - !ref <freq_chunk_augmentation>
225
+ - !ref <time_chunk_augmentation>
226
+ # - !new:speechbrain.augment.time_domain.SpeedPerturb # Apply speed perturbation ahead so the copy of
227
+ # orig_freq: !ref <sample_rate>
228
+ # speeds: [95, 100, 105]
229
+
230
+ epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
231
+ limit: !ref <number_of_epochs>
232
+
233
+ train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
234
+ save_file: !ref <train_log>
235
+
236
+ # ctc_stats: !name:speechbrain.utils.metric_stats.MetricStats
237
+ # metric: !new:utils.CTCLossWithLabelPriors.CTCLossWithLabelPriors
238
+ # prior_scaling_factor: 0.3
239
+ # ctc_implementation: 'k2'
240
+ # blank: !ref <blank_index>
241
+ # reduction: 'none'
242
+
243
+ ctc_stats: !name:speechbrain.utils.metric_stats.MetricStats
244
+ metric: !name:speechbrain.nnet.losses.ctc_loss
245
+ blank_index: !ref <blank_index>
246
+ reduction: batch
247
+
248
+ per_stats: !name:speechbrain.utils.metric_stats.ErrorRateStats
249
+
250
+ # # TIMIT
251
+ # timit_local_data_folder: "/common/db/TIMIT" # Path to TIMIT dataset
252
+
253
+ seed: 3047
254
+ __set_seed: !apply:torch.manual_seed [!ref <seed>]
255
+
256
+ # training parameters
257
+ number_of_epochs: 300
258
+ batch_size: 16
259
+ lr: 0.0003
260
+ sorting: ascending
261
+ sample_rate: 16000
262
+ gradient_accumulation: 2
263
+ lr_pretrained: 0.00001
264
+
265
+ # Mixed-Precision Training
266
+ auto_mix_prec: true
267
+ # or
268
+ precision: fp16 # supports "fp32", "fp16", or "bf16"
269
+ eval_precision: fp32 # evaluation/inference precision — NOTE(review): original comment said "also switch inference to FP16" but the value is fp32; confirm intent
270
+
271
+ # Dataloader options
272
+ train_dataloader_opts:
273
+ batch_size: !ref <batch_size>
274
+
275
+
276
+ valid_dataloader_opts:
277
+ batch_size: !ref <batch_size>
278
+
279
+
280
+ test_dataloader_opts:
281
+ batch_size: !ref <batch_size>
282
+
283
+ # # resume_from_pretrainer, to fine-tune from a saved pretrainer checkpoint
284
+ # resume_from: /home/m64000/work/IF-MDD/exp_iqra_tts/wavlm_large_None_PhnMonoSSL_crottc_confEnc_RoPE_k7/save/CKPT+088_PER_6.2082_F1_0.9074.ckpt
285
+
286
+ # resume_from_pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
287
+ # collect_in: !ref <resume_from>/
288
+ # loadables:
289
+ # perceived_ssl: !ref <perceived_ssl>
290
+ # model: !ref <model>
291
+ # #
292
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
293
+ collect_in: !ref <save_folder>/
294
+ loadables:
295
+ perceived_ssl: !ref <perceived_ssl>
296
+ model: !ref <model>
297
+ tokenizer: !ref <tokenizer>
298
+
299
+ encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
300
+ perceived_ssl: !ref <perceived_ssl>
301
+ enc: !ref <enc>
302
+ ctc_lin: !ref <ctc_lin>
303
+ log_softmax: !ref <log_softmax>
304
+
305
+ decoding_function: !name:speechbrain.decoders.ctc_greedy_decode
306
+ blank_id: !ref <blank_index>
307
+
308
+ tokenizer: !new:speechbrain.dataio.encoder.CTCTextEncoder
309
+ load_from_file: /home/kevingenghaopeng/MDD/IF-MDD/pretrained_models/iqra_extra_acou_model/ottc_k7_RoPE_TTS_FT/label_encoder.txt
310
+
311
+ modules:
312
+ encoder: !ref <encoder>
model.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e54409e3c2414c7263b9ab63c9228a1d7a231b6644324b2448b8bb3b3aeb744
3
+ size 2241500
mpd.txt ADDED
The diff for this file is too large to render. See raw diff
 
per.txt ADDED
The diff for this file is too large to render. See raw diff
 
perceived_ssl.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1ff5b43b55c412e73381e8b257c9af3c2237fa71b76bac5119ca8b31a531ec4
3
+ size 1262009130
tokenizer.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98cee9707ab67c3e29ee337debf4ba319cbc61c3777024db6b8f3494f0df5bfe
3
+ size 583