speechbrain
/

PIQ-ESC50

@@ -1,6 +1,3 @@
-# Generated 2023-07-14 from:
-# /data2/cloned_repos/speechbrain-clone/recipes/ESC50/interpret/hparams/piq.yaml
-# yamllint disable
 # #################################
 # The recipe for training PIQ on the ESC50 dataset.
 #
@@ -10,47 +7,6 @@
 #  (based on the SpeechBrain UrbanSound8k recipe)
 # #################################
-# Seed needs to be set at top of yaml, before objects with parameters are made
-seed: 1234
-__set_seed: !!python/object/apply:torch.manual_seed [1234]
-# Set up folders for reading from and writing to
-# Dataset must already exist at `audio_data_folder`
-data_folder: /data2/ESC-50-master
-                          # e.g., /localscratch/UrbanSound8K
-audio_data_folder: /data2/ESC-50-master/audio
-experiment_name: piq
-output_folder: ./results/piq/1234
-save_folder: ./results/piq/1234/save
-train_log: ./results/piq/1234/train_log.txt
-test_only: false
-save_interpretations: true
-interpret_period: 10
-# Tensorboard logs
-use_tensorboard: false
-tensorboard_logs_folder: ./results/piq/1234/tb_logs/
-# Path where data manifest files will be stored
-train_annotation: /data2/ESC-50-master/manifest/train.json
-valid_annotation: /data2/ESC-50-master/manifest/valid.json
-test_annotation: /data2/ESC-50-master/manifest/test.json
-# To standardize results, UrbanSound8k has pre-separated samples into
-# 10 folds for multi-fold validation
-train_fold_nums: [1, 2, 3]
-valid_fold_nums: [4]
-test_fold_nums: [5]
-skip_manifest_creation: false
-ckpt_interval_minutes: 15 # save checkpoint every N min
-# Training parameters
-number_of_epochs: 200
-batch_size: 16
-lr: 0.0002
 sample_rate: 16000
 use_vq: true
 rec_loss_coef: 1
@@ -65,43 +21,6 @@ n_mels: 80
 # Number of classes
 out_n_neurons: 50
-shuffle: true
-dataloader_options:
-  batch_size: 16
-  shuffle: true
-  num_workers: 0
-epoch_counter: &id001 !new:speechbrain.utils.epoch_loop.EpochCounter
-  limit: 200
-opt_class: !name:torch.optim.Adam
-  lr: 0.0002
-  weight_decay: 0.000002
-lr_annealing: !new:speechbrain.nnet.schedulers.ReduceLROnPlateau
-  factor: 0.5
-  patience: 3
-  dont_halve_until_epoch: 100
-# Logging + checkpoints
-train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
-  save_file: ./results/piq/1234/train_log.txt
-checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
-  checkpoints_dir: ./results/piq/1234/save
-  recoverables:
-    psi_model: &id004 !new:speechbrain.lobes.models.PIQ.VectorQuantizedPSI_Audio
-      dim: 256
-      K: 1024
-      shared_keys: 0
-      activate_class_partitioning: true
-      use_adapter: true
-      adapter_reduce_dim: true
-    counter: *id001
-use_pretrained: true
 # embedding_model: !new:custom_models.Conv2dEncoder_v2
 embedding_model: &id002 !new:speechbrain.lobes.models.PIQ.Conv2dEncoder_v2
   dim: 256
@@ -111,7 +30,6 @@ classifier: &id003 !new:speechbrain.lobes.models.ECAPA_TDNN.Classifier
   out_neurons: 50
   lin_blocks: 1
 # Interpretation hyperparams
 K: 1024
@@ -138,11 +56,13 @@ compute_istft: &id007 !new:speechbrain.processing.features.ISTFT
 label_encoder: !new:speechbrain.dataio.encoder.CategoricalEncoder
 psi_model: *id004
 modules:
-  compute_stft: *id005
-  compute_fbank: *id006
-  compute_istft: *id007
-  psi: *id004
   embedding_model: !ref <embedding_model>
   classifier: !ref <classifier>
@@ -156,5 +76,4 @@ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
     embedding_model: speechbrain/PIQ-ESC50/embedding_modelft.ckpt
     classifier: speechbrain/PIQ-ESC50/classifier.ckpt
     psi: speechbrain/PIQ-ESC50/psi_model.ckpt
-    label_encoder: speechbrain/cnn14-esc50/label_encoder.txt

 # #################################
 # The recipe for training PIQ on the ESC50 dataset.
 #
 #  (based on the SpeechBrain UrbanSound8k recipe)
 # #################################
 sample_rate: 16000
 use_vq: true
 rec_loss_coef: 1
 # Number of classes
 out_n_neurons: 50
 # embedding_model: !new:custom_models.Conv2dEncoder_v2
 embedding_model: &id002 !new:speechbrain.lobes.models.PIQ.Conv2dEncoder_v2
   dim: 256
   out_neurons: 50
   lin_blocks: 1
 # Interpretation hyperparams
 K: 1024
 label_encoder: !new:speechbrain.dataio.encoder.CategoricalEncoder
 psi_model: *id004
 modules:
+  compute_stft: !ref <compute_stft>
+  compute_fbank: !ref <compute_fbank>
+  compute_istft: !ref <compute_istft>
+  psi: !ref <psi_model>
   embedding_model: !ref <embedding_model>
   classifier: !ref <classifier>
     embedding_model: speechbrain/PIQ-ESC50/embedding_modelft.ckpt
     classifier: speechbrain/PIQ-ESC50/classifier.ckpt
     psi: speechbrain/PIQ-ESC50/psi_model.ckpt
+    label_encoder: speechbrain/cnn14-esc50/label_encoder.txt