speechbrain
/

cnn14-esc50

English

Sound Classification

CNN14

Model card Files Files and versions

xet

Community

cemsubakan committed on Feb 25, 2024

Commit

8e55a46

verified ·

1 Parent(s): 590fd20

Update hyperparams.yaml

Browse files

Files changed (1) hide show

hyperparams.yaml +13 -141

hyperparams.yaml CHANGED Viewed

@@ -1,91 +1,10 @@
-# Generated 2022-11-21 from:
-# /home/cem/Dropbox/speechbrain-1/recipes/ESC50/classification/hparams/cnn14.yaml
-# yamllint disable
-# #################################
-# Basic training parameters for sound classification using the ESC50 dataset.
-# This recipe uses the CNN14 backbone (with the ECAPA-TDNN classifier head) for classification.
-#
-# Author:
-#  * Cem Subakan
-#  (based on the SpeechBrain UrbanSound8k recipe)
-# #################################
-# Seed needs to be set at top of yaml, before objects with parameters are made
-seed: 11
-__set_seed: !!python/object/apply:torch.manual_seed [11]
-# Set up folders for reading from and writing to
-# Dataset must already exist at `audio_data_folder`
-data_folder: /data2/ESC-50-master
-                          # e.g., /localscratch/UrbanSound8K
-open_rir_folder: <data_folder>/RIRS # Change if needed
-audio_data_folder: /data2/ESC-50-master/audio
-# TODO: the following folder will contain the resampled audio
-# files (mono channel, at the configured sample rate) to train on
-#resampled_audio_data_folder: !ref <data_folder>/audio_mono16kHz
-#
-experiment_name: cnn14
-output_folder: ./results/cnn14/11
-save_folder: ./results/cnn14/11/save
-train_log: ./results/cnn14/11/train_log.txt
-test_only: false
-# Tensorboard logs
-use_tensorboard: false
-tensorboard_logs_folder: ./results/cnn14/11/tb_logs/
-# Path where data manifest files will be stored
-train_annotation: /data2/ESC-50-master/manifest/train.json
-valid_annotation: /data2/ESC-50-master/manifest/valid.json
-test_annotation: /data2/ESC-50-master/manifest/test.json
-# To standardize results, ESC50 has pre-separated samples into
-# 5 folds for multi-fold validation
-train_fold_nums: [1, 2, 3]
-valid_fold_nums: [4]
-test_fold_nums: [5]
-skip_manifest_creation: false
-ckpt_interval_minutes: 15 # save checkpoint every N min
-# Training parameters
-number_of_epochs: 200
-batch_size: 32
-lr: 0.0002
-base_lr: 0.00000001
-max_lr: 0.0002
-step_size: 65000
 sample_rate: 44100
 device: cpu
-# Feature parameters
-n_mels: 80
-left_frames: 0
-right_frames: 0
-deltas: false
-amp_to_db: true
-normalize: true
-use_melspectra: true
-# Number of classes
-out_n_neurons: 50
-# Note that it's actually important to shuffle the data here
-# (or at the very least, not sort the data by duration)
-# Also note that this does not violate the UrbanSound8k "no-shuffle" policy
-# because this does not mix samples from folds in train to valid/test, only
-# within train or valid, or test
-shuffle: true
-dataloader_options:
-  batch_size: 32
-  shuffle: true
-  num_workers: 0
 # Functions
-compute_features: &id003 !new:speechbrain.lobes.features.Fbank
   n_mels: 80
   left_frames: 0
   right_frames: 0
@@ -96,33 +15,16 @@ compute_features: &id003 !new:speechbrain.lobes.features.Fbank
   hop_length: 10
 use_pretrain: false
-embedding_model: &id009 !new:speechbrain.lobes.models.Cnn14.Cnn14
   mel_bins: 80
   emb_dim: 2048
-classifier: &id010 !new:speechbrain.lobes.models.ECAPA_TDNN.Classifier
   input_size: 2048
   out_neurons: 50
   lin_blocks: 1
-epoch_counter: &id012 !new:speechbrain.utils.epoch_loop.EpochCounter
-# If you do not want to use the pretrained separator, you can simply delete the pretrained_separator field.
-  limit: 200
-# Definition of the augmentation pipeline.
-# If concat_augment = False, the augmentation techniques are applied
-# in sequence. If concat_augment = True, all the augmented signals
-# are concatenated in a single big batch.
-augment_pipeline: []
-concat_augment: true
-mean_var_norm: &id011 !new:speechbrain.processing.features.InputNormalization
   norm_type: sentence
   std_norm: false
@@ -131,55 +33,25 @@ n_fft: 1024
 spec_mag_power: 0.5
 hop_length: 11.6099
 win_length: 23.2199
-compute_stft: &id001 !new:speechbrain.processing.features.STFT
   n_fft: 1024
   hop_length: 11.6099
   win_length: 23.2199
   sample_rate: 44100
-compute_fbank: &id002 !new:speechbrain.processing.features.Filterbank
   n_mels: 80
   n_fft: 1024
   sample_rate: 44100
 modules:
-  compute_stft: *id001
-  compute_fbank: *id002
-  compute_features: *id003
-  embedding_model: *id009
-  classifier: *id010
-  mean_var_norm: *id011
-compute_cost: !new:speechbrain.nnet.losses.LogSoftmaxWrapper
-  loss_fn: !new:speechbrain.nnet.losses.AdditiveAngularMargin
-    margin: 0.2
-    scale: 30
-# compute_error: !name:speechbrain.nnet.losses.classification_error
-opt_class: !name:torch.optim.Adam
-  lr: 0.0002
-  weight_decay: 0.000002
-lr_annealing: !new:speechbrain.nnet.schedulers.CyclicLRScheduler
-  base_lr: 0.00000001
-  max_lr: 0.0002
-  step_size: 65000
-# Logging + checkpoints
-train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
-  save_file: ./results/cnn14/11/train_log.txt
-error_stats: !name:speechbrain.utils.metric_stats.MetricStats
-  metric: !name:speechbrain.nnet.losses.classification_error
-    reduction: batch
-checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
-  checkpoints_dir: ./results/cnn14/11/save
-  recoverables:
-    embedding_model: *id009
-    classifier: *id010
-    normalizer: *id011
-    counter: *id012
 label_encoder: !new:speechbrain.dataio.encoder.CategoricalEncoder

 sample_rate: 44100
 device: cpu
 # Functions
+compute_features: !new:speechbrain.lobes.features.Fbank
   n_mels: 80
   left_frames: 0
   right_frames: 0
   hop_length: 10
 use_pretrain: false
+embedding_model: !new:speechbrain.lobes.models.Cnn14.Cnn14
   mel_bins: 80
   emb_dim: 2048
+classifier: !new:speechbrain.lobes.models.ECAPA_TDNN.Classifier
   input_size: 2048
   out_neurons: 50
   lin_blocks: 1
+mean_var_norm: !new:speechbrain.processing.features.InputNormalization
   norm_type: sentence
   std_norm: false
 spec_mag_power: 0.5
 hop_length: 11.6099
 win_length: 23.2199
+compute_stft: !new:speechbrain.processing.features.STFT
   n_fft: 1024
   hop_length: 11.6099
   win_length: 23.2199
   sample_rate: 44100
+compute_fbank: !new:speechbrain.processing.features.Filterbank
   n_mels: 80
   n_fft: 1024
   sample_rate: 44100
 modules:
+  compute_stft: !ref <compute_stft>
+  compute_fbank: !ref <compute_fbank>
+  compute_features: !ref <compute_features>
+  embedding_model: !ref <embedding_model>
+  classifier: !ref <classifier>
+  mean_var_norm: !ref <mean_var_norm>
 label_encoder: !new:speechbrain.dataio.encoder.CategoricalEncoder