iqra_OTTC / inference.yaml

Upload folder using huggingface_hub

dbdf54a verified 2 months ago

10.7 kB

	# Hyperparameter toggles
	# Free-form suffix; it is the last component of the output_folder name.
	prefix: ""

	# NOTE(review): machine-specific absolute path into one training run's save folder.
	lab_enc_file: /home/m64000/work/IF-MDD/exp_iqra/wavlm_large_None_PhnMonoSSL_ottc_confEnc/save/label_encoder.txt
	ctc_loss_type: "crottc" # Options listed so far: "ctc", "ottc", "crctc". NOTE(review): the active value "crottc" is missing from this list - confirm the supported set
	encoder_type: "conformer" # Options: None, "conformer", "zipformer", "rvq"

	wandb_project: "iqra_extra"
	# Tags attached to the wandb run (short leaf list, written in flow style)
	tags: [PhnMonoSSL, crottc, ConformerEncoder, iqra_extra, TTS_FT]

	## SSL feature selection
	# Directory handed to the SSL loaders below as their save_path.
	pretrained_models_path: pretrained_models/
	# pretrained_models:
	# {
	# "wav2vec2_base": "facebook/wav2vec2-base", # 768
	# "hubert_base": "facebook/hubert-base-ls960", # 768
	# "wavlm_base": "microsoft/wavlm-base", # 768
	# "wavlm_base_plus": "microsoft/wavlm-base-plus", # 768
	# "hubert_multilingual": "utter-project/mHuBERT-147", # 768
	# "clap" : "laion/clap-htsat-fused", # 768
	# "data2vec_base": "facebook/data2vec-audio-base", # 768

	# "wav2vec2_large": "facebook/wav2vec2-large", # 1024
	# "hubert_large": "facebook/hubert-large-ls960", # 1024
	# "wavlm_large": "microsoft/wavlm-large-plus", # 1024
	# "data2vec_large": "facebook/data2vec-audio-large", #1024
	# "whisper_medium": "openai/whisper-medium", # 1024

	# "whisper_large_v3_turbo": "openai/whisper-large-v3-turbo", # 1280
	# }



	# Select the pretrained SSL models (keys from the pretrained_models list above)
	perceived_ssl_model: "wavlm_large" # in pretrained_models
	canonical_ssl_model: Null

	# Hidden size of the selected SSL model; varies by model (the list above gives 1024 for wavlm_large)
	ENCODER_DIM: 1024

	# How to fuse the features
	feature_fusion: "mono" # Options: "mono" for single ssl, "dual_ssl_enc" for dual ssl encoder, "dual_loss" for single SSL dual ssl loss
	blend_alpha: 0.5 # Only relevant for "blend" fusion. NOTE(review): "blend" is not among the feature_fusion options listed above - confirm it is still supported

	# Input files
	# Data files
	# data_folder_save: "/home/kevingenghaopeng/MDD/IF-MDD/data_iqra/demo_data"
	# Root folder; every annotation path below is built from it via !ref.
	data_folder_save: "/home/m64000/work/dataset/data_iqra_extra_is26"
	train_annotation: !ref <data_folder_save>/iqra_extra_is26_train_aligned.json
	valid_annotation: !ref <data_folder_save>/iqra_extra_is26_dev_aligned.json
	test_annotation: !ref <data_folder_save>/iqra_extra_is26_test_aligned.json
	# Extra training data; switched off by use_extra_train_data below
	train_annotation_extra: !ref <data_folder_save>/train-train_with_extra.json
	use_extra_train_data: False

	# Evaluation key (presumably selects which metric ranks checkpoints - confirm). Documented choices:
	#   "PER"        - CTC path, best error rate
	#   "mpd_f1"     - CTC path, best MPD F1
	#   "PER_seq"    - Transformer decoder, best error rate
	#   "mpd_f1_seq" - Transformer decoder path, best MPD F1
	evaluate_key: "PER"
	max_save_models: 3 # Maximum number of saved models for each metric
	# generate training id for output folder
	# generate_training_id: !apply:trainer.generate_training_id.generate_training_id [!ref <perceived_ssl_model_id>, !ref <canonical_ssl_model_id>, !ref <feature_fusion>, !ref <prefix>]

	# Output files
	# The experiment folder name is assembled from the two SSL model keys, the fusion mode and the prefix.
	# NOTE(review): canonical_ssl_model is Null and prefix is "" in this file - check how those render in the path.
	output_folder: !ref exp_iqra/<perceived_ssl_model>_<canonical_ssl_model>_<feature_fusion>_<prefix>
	per_file: !ref <output_folder>/per.txt
	mpd_file: !ref <output_folder>/mpd.txt
	save_folder: !ref <output_folder>/save
	train_log: !ref <output_folder>/train_log.txt

	# Sub-folders of output_folder; their consumers are not visible in this file.
	on_training_test_wer_folder: !ref <output_folder>/on_training_test_wer
	on_training_test_mpd_folder: !ref <output_folder>/on_training_test_mpd

	# Training target. Documented choices:
	#   "target": deduplicated canonical phoneme sequence
	#   "target_with_repeats": with repeats
	#   "canonical"
	#   "perceived": deduplicated perceived phoneme sequence
	training_target: "target"
	# Modules (SpeechBrain lobes) - disabled module map kept for reference;
	# the active `modules` entry is defined at the end of this file.
	# modules:
	# canonical_ssl: !ref <canonical_ssl>
	# perceived_ssl: !ref <perceived_ssl>
	# enc: !ref <enc>
	# ConformerEncoder: !ref <ConformerEncoder>
	# ctc_lin: !ref <ctc_lin>
	# lm_weight: !ref <lm_weight>

	# SSL model for the perceived branch: the result of calling AutoSSLLoader with the settings below.
	perceived_ssl: !apply:trainer.AutoSSLoader.AutoSSLLoader
	model_name: !ref <perceived_ssl_model>
	freeze: !ref <freeze_perceived_ssl>
	freeze_feature_extractor: !ref <freeze_perceived_feature_extractor>
	save_path: !ref <pretrained_models_path>
	output_all_hiddens: False
	# NOTE(review): the key is spelled "preceived" (compare canonical_ssl_emb_layer); kept unchanged because
	# consumers may look it up by this exact name.
	preceived_ssl_emb_layer: -1

	# SSL model for the canonical branch: the result of calling AutoSSLLoader with the settings below.
	# canonical_ssl_model is currently Null in this file.
	canonical_ssl: !apply:trainer.AutoSSLoader.AutoSSLLoader
	model_name: !ref <canonical_ssl_model>
	freeze: !ref <freeze_canonical_ssl>
	# Uses the canonical switch; this previously pointed at <freeze_perceived_feature_extractor>
	# (copy-paste from perceived_ssl), which left freeze_canonical_feature_extractor unused.
	freeze_feature_extractor: !ref <freeze_canonical_feature_extractor>
	save_path: !ref <pretrained_models_path>
	output_all_hiddens: False

	canonical_ssl_emb_layer: -1

	# Projection stack applied to the SSL features: a VanillaNN fed ENCODER_DIM-wide inputs,
	# followed by a LayerNorm over dnn_neurons.
	enc: !new:torch.nn.Sequential
	- !new:speechbrain.lobes.models.VanillaNN.VanillaNN
	input_shape: [null, null, !ref <ENCODER_DIM>]
	activation: !ref <activation>
	dnn_blocks: !ref <dnn_layers>
	dnn_neurons: !ref <dnn_neurons>
	- !new:torch.nn.LayerNorm
	normalized_shape: !ref <dnn_neurons>


	# Shared values referenced by the ConformerEncoder definition below.
	kernel_size: 7
	attention_type: "RoPEMHA" # NOTE(review): the earlier option list ("standard", "RoPE") did not include this value; SpeechBrain's ConformerEncoder documents names such as "regularMHA", "RelPosMHAXL" and "RoPEMHA" - confirm against the installed version
	# Two-layer Conformer whose model and feed-forward widths both equal dnn_neurons.
	ConformerEncoder: !new:speechbrain.lobes.models.transformer.Conformer.ConformerEncoder
	num_layers: 2
	nhead: 8
	d_ffn: !ref <dnn_neurons>
	d_model: !ref <dnn_neurons>
	dropout: 0.1
	kernel_size: !ref <kernel_size>
	attention_type: !ref <attention_type>

	# Linear output layer: dnn_neurons inputs, output_neurons outputs.
	ctc_lin: !new:speechbrain.nnet.linear.Linear
	input_size: !ref <dnn_neurons>
	n_neurons: !ref <output_neurons> # one unit per output label (see output_neurons)

	# lm_weight for OTTC's alpha prediction
	lm_weight: !new:speechbrain.nnet.linear.Linear
	input_size: !ref <dnn_neurons>
	n_neurons: 1 # single output unit

	# Model parameters
	activation: !name:torch.nn.LeakyReLU
	dnn_layers: 2
	# Width shared by enc, ConformerEncoder, ctc_lin and lm_weight.
	dnn_neurons: 384
	freeze_perceived_ssl: False
	freeze_canonical_ssl: False
	freeze_perceived_feature_extractor: True # freeze the CNN extractor in wav2vec
	freeze_canonical_feature_extractor: True # Freeze Whisper encoder?

	# Softmax with apply_log enabled.
	log_softmax: !new:speechbrain.nnet.activations.Softmax
	apply_log: True

	# Disabled alternatives for ctc_cost, kept for reference:
	# ctc_cost: !name:speechbrain.nnet.losses.ctc_loss
	# blank_index: !ref <blank_index>

	# ctc_cost: !new:utils.CTCLossWithLabelPriors.CTCLossWithLabelPriors
	# prior_scaling_factor: 0.3
	# ctc_implementation: 'k2'
	# blank: !ref <blank_index>
	# reduction: 'sum'

	# Active ctc_cost: the bucketized batched OTTC loss.
	ctc_cost: !name:utils.losses.ot_loss.batched_ottc_loss_bucketized


	# SpeechBrain CTC loss under a separate key ("mispro" presumably = mispronunciation - confirm).
	ctc_cost_mispro: !name:speechbrain.nnet.losses.ctc_loss
	blank_index: !ref <blank_index>

	# Outputs
	output_neurons: 71 # NOTE(review): the earlier note described l2arctic as 40phns(sil)+err+blank+eos+bos = 44, which does not match 71 - confirm against the label encoder in use
	blank_index: 0

	# ModuleList holding enc and ctc_lin.
	# NOTE(review): ConformerEncoder and lm_weight are defined above but are not part of this list,
	# although encoder_type is "conformer" - confirm this is intended.
	model: !new:torch.nn.ModuleList
	- [!ref <enc>, !ref <ctc_lin>, ]

	# Adam optimizer class bound to <lr>.
	adam_opt_class: !name:torch.optim.Adam
	lr: !ref <lr>

	# Adam optimizer class bound to <lr_pretrained> (presumably for the pretrained SSL model - confirm).
	pretrained_opt_class: !name:torch.optim.Adam
	lr: !ref <lr_pretrained>

	# Checkpointer rooted at save_folder; recoverables are model, perceived_ssl and the epoch counter.
	checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
	checkpoints_dir: !ref <save_folder>
	recoverables:
	model: !ref <model>
	perceived_ssl: !ref <perceived_ssl>
	counter: !ref <epoch_counter>
	allow_partial_load: True
	# Disabled entries kept for reference:
	# canonical_ssl: !ref <canonical_ssl>
	# augmentation: !new:speechbrain.augment.time_domain.SpeedPerturb
	# orig_freq: !ref <sample_rate>
	# speeds: [95, 100, 105]

	# Spectrogram drop augmentation; not referenced by the `augmentation` Augmenter in this file.
	spec_augmentation: !new:speechbrain.augment.freq_domain.SpectrogramDrop
	drop_length_low: 5
	drop_length_high: 27
	drop_count_low: 1
	drop_count_high: 3
	replace: 'zeros'

	# Frequency drop augmentation; referenced by the `augmentation` Augmenter.
	freq_chunk_augmentation: !new:speechbrain.augment.time_domain.DropFreq
	drop_freq_low: 1e-14
	drop_freq_high: 1
	drop_freq_count_low: 1
	drop_freq_count_high: 3
	drop_freq_width: 0.10
	epsilon: 1e-12

	# Upper bound referenced by time_chunk_augmentation below via !ref.
	drop_length_high: 3000
	# Time-domain chunk drop; referenced by the `augmentation` Augmenter.
	time_chunk_augmentation: !new:speechbrain.augment.time_domain.DropChunk
	drop_length_low: 1000
	drop_length_high: !ref <drop_length_high>
	drop_count_low: 1
	drop_count_high: 3

	# Speed perturbation; not referenced by the `augmentation` Augmenter in this file.
	speed_augmentation: !new:speechbrain.augment.time_domain.SpeedPerturb
	orig_freq: !ref <sample_rate>
	speeds: [95, 100, 105]

	# Warping along dim 1; not referenced by the `augmentation` Augmenter in this file.
	timewarp_augmentation: !new:speechbrain.augment.freq_domain.Warping
	warp_window: 5
	dim: 1 # time

	# Augmenter combining the frequency-drop and time-chunk-drop augmentations defined above.
	augmentation: !new:speechbrain.augment.augmenter.Augmenter
	augmentations:
	- !ref <freq_chunk_augmentation>
	- !ref <time_chunk_augmentation>
	# Disabled inline speed perturbation, kept for reference:
	# - !new:speechbrain.augment.time_domain.SpeedPerturb # Apply speed perturbation ahead so the copy of
	# orig_freq: !ref <sample_rate>
	# speeds: [95, 100, 105]

	# Epoch counter limited to number_of_epochs.
	epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
	limit: !ref <number_of_epochs>

	# File logger writing to train_log.
	train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
	save_file: !ref <train_log>

	# Disabled ctc_stats variant using CTCLossWithLabelPriors, kept for reference:
	# ctc_stats: !name:speechbrain.utils.metric_stats.MetricStats
	# metric: !new:utils.CTCLossWithLabelPriors.CTCLossWithLabelPriors
	# prior_scaling_factor: 0.3
	# ctc_implementation: 'k2'
	# blank: !ref <blank_index>
	# reduction: 'none'

	# Active ctc_stats: MetricStats over SpeechBrain's ctc_loss with batch reduction.
	ctc_stats: !name:speechbrain.utils.metric_stats.MetricStats
	metric: !name:speechbrain.nnet.losses.ctc_loss
	blank_index: !ref <blank_index>
	reduction: batch

	per_stats: !name:speechbrain.utils.metric_stats.ErrorRateStats

	# # TIMIT
	# timit_local_data_folder: "/common/db/TIMIT" # Path to TIMIT dataset

	seed: 3047
	# Calls torch.manual_seed(seed) when this file is loaded.
	__set_seed: !apply:torch.manual_seed [!ref <seed>]

	# Training schedule
	number_of_epochs: 300
	gradient_accumulation: 2
	# Learning rates: lr is bound to adam_opt_class, lr_pretrained to pretrained_opt_class
	lr: 0.0003
	lr_pretrained: 0.00001
	# Batching and audio settings
	batch_size: 16
	sorting: "ascending"
	sample_rate: 16000

	# Mixed-precision training
	auto_mix_prec: true
	# or
	precision: fp16 # supports "fp32", "fp16" or "bf16"
	eval_precision: fp32 # NOTE(review): the original note said inference is also switched to FP16, but the value is fp32 - confirm which is intended

	# Dataloader options; train, valid and test all reuse the shared batch_size.
	train_dataloader_opts:
	batch_size: !ref <batch_size>


	valid_dataloader_opts:
	batch_size: !ref <batch_size>


	test_dataloader_opts:
	batch_size: !ref <batch_size>

	# Disabled: resume_from_pretrainer, to fine-tune from a saved pretrainer checkpoint
	# resume_from: /home/m64000/work/IF-MDD/exp_iqra_tts/wavlm_large_None_PhnMonoSSL_crottc_confEnc_RoPE_k7/save/CKPT+088_PER_6.2082_F1_0.9074.ckpt

	# resume_from_pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
	# collect_in: !ref <resume_from>/
	# loadables:
	# perceived_ssl: !ref <perceived_ssl>
	# model: !ref <model>
	# #
	# Active pretrainer: collects into save_folder; loadables are perceived_ssl, model and tokenizer.
	pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
	collect_in: !ref <save_folder>/
	loadables:
	perceived_ssl: !ref <perceived_ssl>
	model: !ref <model>
	tokenizer: !ref <tokenizer>

	# Inference chain: perceived_ssl -> enc -> ctc_lin -> log_softmax.
	# NOTE(review): ConformerEncoder is not part of this chain although encoder_type is "conformer" - confirm.
	encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
	perceived_ssl: !ref <perceived_ssl>
	enc: !ref <enc>
	ctc_lin: !ref <ctc_lin>
	log_softmax: !ref <log_softmax>

	# Greedy CTC decoding with the shared blank index.
	decoding_function: !name:speechbrain.decoders.ctc_greedy_decode
	blank_id: !ref <blank_index>

	# NOTE(review): machine-specific absolute path, under a different home directory than lab_enc_file.
	tokenizer: !new:speechbrain.dataio.encoder.CTCTextEncoder
	load_from_file: /home/kevingenghaopeng/MDD/IF-MDD/pretrained_models/iqra_extra_acou_model/ottc_k7_RoPE_TTS_FT/label_encoder.txt

	modules:
	encoder: !ref <encoder>