Upload 8 files

Browse files

Files changed (8) hide show

hparams/train.yaml +35 -0
hparams/train_with_wav2vec.yaml +112 -0
model.pth +3 -0
prepare.py +52 -0
results/train_with_wav2vec2/1993/test.json +1 -0
results/train_with_wav2vec2/1993/train.json +1 -0
results/train_with_wav2vec2/1993/valid.json +1 -0
train_with_wav2vec.py +302 -0

hparams/train.yaml ADDED Viewed

	@@ -0,0 +1,35 @@

+seed: 1993
+__set_seed: !apply:torch.manual_seed [!ref <seed>]
+# Root folder of the dataset: one sub-folder per emotion label, each containing .wav files
+data_original: D:/voice-emo/dat/
+output_folder: results/train_with_wav2vec2/1993
+save_folder: !ref <output_folder>/save
+train_log: !ref <output_folder>/train_log.txt
+# Hugging Face Hub identifier of the pretrained wav2vec2 model
+wav2vec2_hub: facebook/wav2vec2-base
+wav2vec2_folder: !ref <save_folder>/wav2vec2_checkpoint
+# Path where data manifest files will be stored
+train_annotation: !ref <output_folder>/train.json
+valid_annotation: !ref <output_folder>/valid.json
+test_annotation: !ref <output_folder>/test.json
+split_ratio: [80, 10, 10]
+skip_prep: False
+number_of_epochs: 5
+batch_size: 4
+lr: 0.0001
+lr_wav2vec2: 0.00001
+dataloader_options:
+    batch_size: !ref <batch_size>
+    shuffle: True
+    num_workers: 0
+    drop_last: False
+encoder_dim: 768
+# Number of emotions
+out_n_neurons: 7  # (angry, disgust, fear, happy, neutral, sad, surprise)

hparams/train_with_wav2vec.yaml ADDED Viewed

	@@ -0,0 +1,112 @@

+seed: 1993
+__set_seed: !apply:torch.manual_seed [!ref <seed>]
+# Root folder of the dataset: one sub-folder per emotion label, each containing .wav files
+data_original: D:/voice-emo/dat/
+output_folder: !ref results/train_with_wav2vec2/<seed>
+save_folder: !ref <output_folder>/save
+train_log: !ref <output_folder>/train_log.txt
+# Hugging Face Hub identifier of the wav2vec2 model; change it to benchmark different models.
+# Important: we use the base wav2vec2 model, not the one fine-tuned on the ASR task.
+# This gives an improvement of roughly 4%.
+wav2vec2_hub: facebook/wav2vec2-base
+wav2vec2_folder: !ref <save_folder>/wav2vec2_checkpoint
+# Path where data manifest files will be stored
+train_annotation: !ref <output_folder>/train.json
+valid_annotation: !ref <output_folder>/valid.json
+test_annotation: !ref <output_folder>/test.json
+split_ratio: [80, 10, 10]
+skip_prep: False
+# The train logger writes training statistics to a file, as well as stdout.
+train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
+    save_file: !ref <train_log>
+ckpt_interval_minutes: 15  # save checkpoint every N min
+####################### Training Parameters ####################################
+number_of_epochs: 30
+batch_size: 4
+lr: 0.0001
+lr_wav2vec2: 0.00001
+# Set to True to freeze the entire wav2vec2 model
+freeze_wav2vec2: False
+# Set to true to freeze the CONV part of the wav2vec2 model
+# We see an improvement of 2% with freezing CNNs
+freeze_wav2vec2_conv: True
+####################### Model Parameters #######################################
+encoder_dim: 768
+# Number of emotions
+out_n_neurons: 7  # (angry, disgust, fear, happy, neutral, sad, surprise)
+dataloader_options:
+    batch_size: !ref <batch_size>
+    shuffle: True
+    num_workers: 2  # use 2 on Linux; set to 0 on Windows
+    drop_last: False
+# Wav2vec2 encoder
+wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2
+    source: !ref <wav2vec2_hub>
+    output_norm: True
+    freeze: !ref <freeze_wav2vec2>
+    freeze_feature_extractor: !ref <freeze_wav2vec2_conv>
+    save_path: !ref <wav2vec2_folder>
+avg_pool: !new:speechbrain.nnet.pooling.StatisticsPooling
+    return_std: False
+output_mlp: !new:speechbrain.nnet.linear.Linear
+    input_size: !ref <encoder_dim>
+    n_neurons: !ref <out_n_neurons>
+    bias: False
+epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
+    limit: !ref <number_of_epochs>
+modules:
+    wav2vec2: !ref <wav2vec2>
+    output_mlp: !ref <output_mlp>
+model: !new:torch.nn.ModuleList
+    - [!ref <output_mlp>]
+log_softmax: !new:speechbrain.nnet.activations.Softmax
+    apply_log: True
+compute_cost: !name:speechbrain.nnet.losses.nll_loss
+error_stats: !name:speechbrain.utils.metric_stats.MetricStats
+    metric: !name:speechbrain.nnet.losses.classification_error
+        reduction: batch
+opt_class: !name:torch.optim.Adam
+    lr: !ref <lr>
+wav2vec2_opt_class: !name:torch.optim.Adam
+    lr: !ref <lr_wav2vec2>
+lr_annealing: !new:speechbrain.nnet.schedulers.NewBobScheduler
+    initial_value: !ref <lr>
+    improvement_threshold: 0.0025
+    annealing_factor: 0.9
+    patient: 0
+lr_annealing_wav2vec2: !new:speechbrain.nnet.schedulers.NewBobScheduler
+    initial_value: !ref <lr_wav2vec2>
+    improvement_threshold: 0.0025
+    annealing_factor: 0.9
+checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
+    checkpoints_dir: !ref <save_folder>
+    recoverables:
+        model: !ref <model>
+        wav2vec2: !ref <wav2vec2>
+        lr_annealing_output: !ref <lr_annealing>
+        lr_annealing_wav2vec2: !ref <lr_annealing_wav2vec2>
+        counter: !ref <epoch_counter>

model.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6dd3e3ab14987cd5124407a58a263504ee5b7540727dadbdb04f6481f3775b1f
+size 755087318

prepare.py ADDED Viewed

	@@ -0,0 +1,52 @@

+import os
+import json
+import random
+import logging
+logger = logging.getLogger(__name__)
def prepare_data(
    data_original,
    save_json_train,
    save_json_valid,
    save_json_test,
    split_ratio=(80, 10, 10),
    seed=12,
):
    """Create train/valid/test JSON manifests from a folder-per-label dataset.

    Every immediate sub-directory of ``data_original`` is treated as one
    label, and every ``.wav`` file directly inside it becomes one example.
    The examples are shuffled with ``seed`` and split by percentage.

    Args:
        data_original: Root folder holding one sub-folder per label.
        save_json_train: Output path of the training manifest.
        save_json_valid: Output path of the validation manifest.
        save_json_test: Output path of the test manifest.
        split_ratio: Percentages for (train, valid, test). Only the first two
            entries are read; the test split receives whatever remains.
        seed: Seed applied to the global ``random`` module before shuffling.

    Returns:
        None. The manifests are written to disk as a side effect. If all
        three manifest files already exist, nothing is written.

    Raises:
        OSError: Propagated from ``os.listdir`` when ``data_original`` (or a
            label folder) cannot be listed.
    """
    # Setting seeds for reproducible code.
    random.seed(seed)

    # Check if data preparation has already been done (skip if files exist)
    manifests = (save_json_train, save_json_valid, save_json_test)
    if all(os.path.exists(path) for path in manifests):
        logger.info("Preparation completed in previous run, skipping.")
        return

    # Collect audio files and labels. os.listdir returns entries in arbitrary
    # order, so sort them: otherwise the same seed could yield different
    # splits on different file systems.
    wav_list = []
    for label in sorted(os.listdir(data_original)):
        label_dir = os.path.join(data_original, label)
        if not os.path.isdir(label_dir):
            continue
        for audio_file in sorted(os.listdir(label_dir)):
            # Case-insensitive so that e.g. "clip.WAV" is not silently dropped.
            if not audio_file.lower().endswith('.wav'):
                continue
            wav_file = os.path.join(label_dir, audio_file)
            if os.path.isfile(wav_file):
                wav_list.append((wav_file, label))
            else:
                logger.warning("Skipping invalid audio file: %s", wav_file)

    if not wav_list:
        # Empty manifests are still written (unchanged behaviour), but make
        # the situation visible because later runs will skip preparation.
        logger.warning(
            "No .wav files found under %s; writing empty manifests.",
            data_original,
        )

    # Shuffle and split the data
    random.shuffle(wav_list)
    n_total = len(wav_list)
    n_train = n_total * split_ratio[0] // 100
    n_valid = n_total * split_ratio[1] // 100
    train_set = wav_list[:n_train]
    valid_set = wav_list[n_train:n_train + n_valid]
    test_set = wav_list[n_train + n_valid:]  # remainder after train + valid

    # Create JSON files for train, valid, and test sets
    create_json(train_set, save_json_train)
    create_json(valid_set, save_json_valid)
    create_json(test_set, save_json_test)
    logger.info(
        "Created %s, %s, and %s",
        save_json_train,
        save_json_valid,
        save_json_test,
    )
def create_json(data, json_file):
    """Write a list of (wav_path, label) pairs to a JSON manifest.

    The manifest maps the stringified position of each pair ("0", "1", ...)
    to a dict with the keys ``'wav'`` and ``'label'``.

    Args:
        data: Iterable of ``(wav, label)`` pairs.
        json_file: Destination path of the JSON file. Missing parent
            directories are created.

    Returns:
        None. The file is written as a side effect.
    """
    data_dict = {
        str(idx): {'wav': wav, 'label': label}
        for idx, (wav, label) in enumerate(data)
    }

    # The output folder may not exist yet on a fresh run; create it instead
    # of failing with FileNotFoundError.
    parent_dir = os.path.dirname(json_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Explicit encoding keeps the output independent of the platform default.
    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(data_dict, f)

results/train_with_wav2vec2/1993/test.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"0": {"wav": "D:/voice-emo/dat/surprise\\JK_su14.wav", "label": "surprise"}, "1": {"wav": "D:/voice-emo/dat/disgust\\1001_IWW_DIS_XX_stretch_augmented.wav", "label": "disgust"}, "2": {"wav": "D:/voice-emo/dat/happy\\1001_IWL_HAP_XX.wav", "label": "happy"}, "3": {"wav": "D:/voice-emo/dat/disgust\\1001_IEO_DIS_HI_noise_augmented.wav", "label": "disgust"}, "4": {"wav": "D:/voice-emo/dat/sad\\1001_IEO_SAD_LO.wav", "label": "sad"}, "5": {"wav": "D:/voice-emo/dat/sad\\1001_IEO_SAD_LO_noise_augmented.wav", "label": "sad"}, "6": {"wav": "D:/voice-emo/dat/fear\\1001_IOM_FEA_XX_pitch_augmented.wav", "label": "fear"}, "7": {"wav": "D:/voice-emo/dat/neutral\\1001_IEO_NEU_XX.wav", "label": "neutral"}, "8": {"wav": "D:/voice-emo/dat/disgust\\1001_TAI_DIS_XX_stretch_augmented.wav", "label": "disgust"}, "9": {"wav": "D:/voice-emo/dat/fear\\1001_WSI_FEA_XX_noise_augmented.wav", "label": "fear"}, "10": {"wav": "D:/voice-emo/dat/angry\\1001_IOM_ANG_XX.wav", "label": "angry"}, "11": {"wav": "D:/voice-emo/dat/angry\\1001_WSI_ANG_XX_stretch_augmented.wav", "label": "angry"}, "12": {"wav": "D:/voice-emo/dat/disgust\\1001_IOM_DIS_XX_stretch_augmented.wav", "label": "disgust"}, "13": {"wav": "D:/voice-emo/dat/angry\\1001_MTI_ANG_XX.wav", "label": "angry"}, "14": {"wav": "D:/voice-emo/dat/surprise\\DC_su01_noise_augmented.wav", "label": "surprise"}, "15": {"wav": "D:/voice-emo/dat/fear\\1001_IEO_FEA_HI_stretch_augmented.wav", "label": "fear"}, "16": {"wav": "D:/voice-emo/dat/neutral\\1001_TIE_NEU_XX_noise_augmented.wav", "label": "neutral"}, "17": {"wav": "D:/voice-emo/dat/disgust\\1001_TAI_DIS_XX_noise_augmented.wav", "label": "disgust"}, "18": {"wav": "D:/voice-emo/dat/angry\\1001_DFA_ANG_XX_stretch_augmented.wav", "label": "angry"}, "19": {"wav": "D:/voice-emo/dat/neutral\\1001_DFA_NEU_XX.wav", "label": "neutral"}, "20": {"wav": "D:/voice-emo/dat/surprise\\DC_su03.wav", "label": "surprise"}, "21": {"wav": "D:/voice-emo/dat/fear\\1001_TIE_FEA_XX_stretch_augmented.wav", "label": 
"fear"}, "22": {"wav": "D:/voice-emo/dat/fear\\1001_IWL_FEA_XX_noise_augmented.wav", "label": "fear"}, "23": {"wav": "D:/voice-emo/dat/fear\\1001_IWL_FEA_XX_stretch_augmented.wav", "label": "fear"}, "24": {"wav": "D:/voice-emo/dat/happy\\1001_MTI_HAP_XX_pitch_augmented.wav", "label": "happy"}, "25": {"wav": "D:/voice-emo/dat/happy\\1001_IEO_HAP_MD_noise_augmented.wav", "label": "happy"}, "26": {"wav": "D:/voice-emo/dat/sad\\1001_ITH_SAD_XX_stretch_augmented.wav", "label": "sad"}, "27": {"wav": "D:/voice-emo/dat/happy\\1001_MTI_HAP_XX.wav", "label": "happy"}, "28": {"wav": "D:/voice-emo/dat/disgust\\1001_ITH_DIS_XX_stretch_augmented.wav", "label": "disgust"}, "29": {"wav": "D:/voice-emo/dat/happy\\1001_IEO_HAP_LO_stretch_augmented.wav", "label": "happy"}, "30": {"wav": "D:/voice-emo/dat/happy\\1001_DFA_HAP_XX_pitch_augmented.wav", "label": "happy"}, "31": {"wav": "D:/voice-emo/dat/disgust\\1001_IWL_DIS_XX_noise_augmented.wav", "label": "disgust"}, "32": {"wav": "D:/voice-emo/dat/disgust\\1001_DFA_DIS_XX_stretch_augmented.wav", "label": "disgust"}, "33": {"wav": "D:/voice-emo/dat/angry\\1001_IOM_ANG_XX_pitch_augmented.wav", "label": "angry"}, "34": {"wav": "D:/voice-emo/dat/neutral\\1001_IEO_NEU_XX_noise_augmented.wav", "label": "neutral"}, "35": {"wav": "D:/voice-emo/dat/fear\\1001_ITH_FEA_XX_pitch_augmented.wav", "label": "fear"}, "36": {"wav": "D:/voice-emo/dat/sad\\1001_TIE_SAD_XX_noise_augmented.wav", "label": "sad"}, "37": {"wav": "D:/voice-emo/dat/neutral\\1001_IWL_NEU_XX_pitch_augmented.wav", "label": "neutral"}}

results/train_with_wav2vec2/1993/train.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"0": {"wav": "D:/voice-emo/dat/angry\\1001_IEO_ANG_HI_noise_augmented.wav", "label": "angry"}, "1": {"wav": "D:/voice-emo/dat/angry\\1001_DFA_ANG_XX_pitch_augmented.wav", "label": "angry"}, "2": {"wav": "D:/voice-emo/dat/angry\\1001_ITH_ANG_XX.wav", "label": "angry"}, "3": {"wav": "D:/voice-emo/dat/fear\\1001_IWW_FEA_XX_noise_augmented.wav", "label": "fear"}, "4": {"wav": "D:/voice-emo/dat/neutral\\1001_MTI_NEU_XX.wav", "label": "neutral"}, "5": {"wav": "D:/voice-emo/dat/surprise\\DC_su08_noise_augmented.wav", "label": "surprise"}, "6": {"wav": "D:/voice-emo/dat/surprise\\DC_su04.wav", "label": "surprise"}, "7": {"wav": "D:/voice-emo/dat/surprise\\DC_su01_pitch_augmented.wav", "label": "surprise"}, "8": {"wav": "D:/voice-emo/dat/angry\\1001_IEO_ANG_MD_pitch_augmented.wav", "label": "angry"}, "9": {"wav": "D:/voice-emo/dat/fear\\1001_DFA_FEA_XX_stretch_augmented.wav", "label": "fear"}, "10": {"wav": "D:/voice-emo/dat/disgust\\1001_IEO_DIS_HI_stretch_augmented.wav", "label": "disgust"}, "11": {"wav": "D:/voice-emo/dat/surprise\\DC_su07_noise_augmented.wav", "label": "surprise"}, "12": {"wav": "D:/voice-emo/dat/neutral\\1001_ITH_NEU_XX_pitch_augmented.wav", "label": "neutral"}, "13": {"wav": "D:/voice-emo/dat/fear\\1001_TIE_FEA_XX.wav", "label": "fear"}, "14": {"wav": "D:/voice-emo/dat/happy\\1001_TAI_HAP_XX_noise_augmented.wav", "label": "happy"}, "15": {"wav": "D:/voice-emo/dat/happy\\1001_IEO_HAP_LO_pitch_augmented.wav", "label": "happy"}, "16": {"wav": "D:/voice-emo/dat/neutral\\1001_ITH_NEU_XX_stretch_augmented.wav", "label": "neutral"}, "17": {"wav": "D:/voice-emo/dat/sad\\1001_ITS_SAD_XX_noise_augmented.wav", "label": "sad"}, "18": {"wav": "D:/voice-emo/dat/neutral\\1001_ITS_NEU_XX_stretch_augmented.wav", "label": "neutral"}, "19": {"wav": "D:/voice-emo/dat/sad\\1001_TAI_SAD_XX_pitch_augmented.wav", "label": "sad"}, "20": {"wav": "D:/voice-emo/dat/sad\\1001_ITH_SAD_XX_pitch_augmented.wav", "label": "sad"}, "21": {"wav": 
"D:/voice-emo/dat/angry\\1001_ITS_ANG_XX.wav", "label": "angry"}, "22": {"wav": "D:/voice-emo/dat/happy\\1001_IEO_HAP_HI.wav", "label": "happy"}, "23": {"wav": "D:/voice-emo/dat/sad\\1001_IOM_SAD_XX_pitch_augmented.wav", "label": "sad"}, "24": {"wav": "D:/voice-emo/dat/sad\\1001_TSI_SAD_XX_stretch_augmented.wav", "label": "sad"}, "25": {"wav": "D:/voice-emo/dat/surprise\\DC_su08.wav", "label": "surprise"}, "26": {"wav": "D:/voice-emo/dat/sad\\1001_TSI_SAD_XX.wav", "label": "sad"}, "27": {"wav": "D:/voice-emo/dat/sad\\1001_TSI_SAD_XX_noise_augmented.wav", "label": "sad"}, "28": {"wav": "D:/voice-emo/dat/neutral\\1001_TSI_NEU_XX_stretch_augmented.wav", "label": "neutral"}, "29": {"wav": "D:/voice-emo/dat/angry\\1001_TSI_ANG_XX_pitch_augmented.wav", "label": "angry"}, "30": {"wav": "D:/voice-emo/dat/fear\\1001_ITH_FEA_XX_noise_augmented.wav", "label": "fear"}, "31": {"wav": "D:/voice-emo/dat/neutral\\1001_ITS_NEU_XX_noise_augmented.wav", "label": "neutral"}, "32": {"wav": "D:/voice-emo/dat/neutral\\1001_TAI_NEU_XX_stretch_augmented.wav", "label": "neutral"}, "33": {"wav": "D:/voice-emo/dat/angry\\1001_TSI_ANG_XX_noise_augmented.wav", "label": "angry"}, "34": {"wav": "D:/voice-emo/dat/disgust\\1001_ITS_DIS_XX_stretch_augmented.wav", "label": "disgust"}, "35": {"wav": "D:/voice-emo/dat/surprise\\DC_su04_noise_augmented.wav", "label": "surprise"}, "36": {"wav": "D:/voice-emo/dat/sad\\1001_IEO_SAD_MD_noise_augmented.wav", "label": "sad"}, "37": {"wav": "D:/voice-emo/dat/surprise\\DC_su08_pitch_augmented.wav", "label": "surprise"}, "38": {"wav": "D:/voice-emo/dat/sad\\1001_IWL_SAD_XX_pitch_augmented.wav", "label": "sad"}, "39": {"wav": "D:/voice-emo/dat/fear\\1001_DFA_FEA_XX.wav", "label": "fear"}, "40": {"wav": "D:/voice-emo/dat/happy\\1001_ITH_HAP_XX.wav", "label": "happy"}, "41": {"wav": "D:/voice-emo/dat/disgust\\1001_MTI_DIS_XX.wav", "label": "disgust"}, "42": {"wav": "D:/voice-emo/dat/happy\\1001_IEO_HAP_LO_noise_augmented.wav", "label": "happy"}, "43": {"wav": 
"D:/voice-emo/dat/surprise\\DC_su03_stretch_augmented.wav", "label": "surprise"}, "44": {"wav": "D:/voice-emo/dat/surprise\\DC_su04_pitch_augmented.wav", "label": "surprise"}, "45": {"wav": "D:/voice-emo/dat/surprise\\DC_su07_pitch_augmented.wav", "label": "surprise"}, "46": {"wav": "D:/voice-emo/dat/happy\\1001_IWW_HAP_XX_stretch_augmented.wav", "label": "happy"}, "47": {"wav": "D:/voice-emo/dat/fear\\1001_IEO_FEA_MD.wav", "label": "fear"}, "48": {"wav": "D:/voice-emo/dat/angry\\1001_TSI_ANG_XX_stretch_augmented.wav", "label": "angry"}, "49": {"wav": "D:/voice-emo/dat/neutral\\1001_IOM_NEU_XX_stretch_augmented.wav", "label": "neutral"}, "50": {"wav": "D:/voice-emo/dat/fear\\1001_IOM_FEA_XX_noise_augmented.wav", "label": "fear"}, "51": {"wav": "D:/voice-emo/dat/disgust\\1001_IEO_DIS_LO_noise_augmented.wav", "label": "disgust"}, "52": {"wav": "D:/voice-emo/dat/disgust\\1001_IEO_DIS_MD_noise_augmented.wav", "label": "disgust"}, "53": {"wav": "D:/voice-emo/dat/disgust\\1001_IEO_DIS_MD_stretch_augmented.wav", "label": "disgust"}, "54": {"wav": "D:/voice-emo/dat/fear\\1001_IEO_FEA_LO_pitch_augmented.wav", "label": "fear"}, "55": {"wav": "D:/voice-emo/dat/disgust\\1001_IEO_DIS_LO_pitch_augmented.wav", "label": "disgust"}, "56": {"wav": "D:/voice-emo/dat/disgust\\1001_IWW_DIS_XX.wav", "label": "disgust"}, "57": {"wav": "D:/voice-emo/dat/angry\\1001_IWL_ANG_XX_noise_augmented.wav", "label": "angry"}, "58": {"wav": "D:/voice-emo/dat/happy\\1001_TAI_HAP_XX_stretch_augmented.wav", "label": "happy"}, "59": {"wav": "D:/voice-emo/dat/neutral\\1001_ITH_NEU_XX_noise_augmented.wav", "label": "neutral"}, "60": {"wav": "D:/voice-emo/dat/happy\\1001_IOM_HAP_XX_stretch_augmented.wav", "label": "happy"}, "61": {"wav": "D:/voice-emo/dat/sad\\1001_IEO_SAD_HI_pitch_augmented.wav", "label": "sad"}, "62": {"wav": "D:/voice-emo/dat/sad\\1001_TIE_SAD_XX.wav", "label": "sad"}, "63": {"wav": "D:/voice-emo/dat/angry\\1001_MTI_ANG_XX_stretch_augmented.wav", "label": "angry"}, "64": {"wav": 
"D:/voice-emo/dat/disgust\\1001_ITH_DIS_XX_noise_augmented.wav", "label": "disgust"}, "65": {"wav": "D:/voice-emo/dat/neutral\\1001_TAI_NEU_XX_pitch_augmented.wav", "label": "neutral"}, "66": {"wav": "D:/voice-emo/dat/fear\\1001_ITH_FEA_XX.wav", "label": "fear"}, "67": {"wav": "D:/voice-emo/dat/surprise\\DC_su09.wav", "label": "surprise"}, "68": {"wav": "D:/voice-emo/dat/sad\\1001_DFA_SAD_XX_pitch_augmented.wav", "label": "sad"}, "69": {"wav": "D:/voice-emo/dat/surprise\\DC_su05_stretch_augmented.wav", "label": "surprise"}, "70": {"wav": "D:/voice-emo/dat/neutral\\1001_IWL_NEU_XX_noise_augmented.wav", "label": "neutral"}, "71": {"wav": "D:/voice-emo/dat/disgust\\1001_IOM_DIS_XX_pitch_augmented.wav", "label": "disgust"}, "72": {"wav": "D:/voice-emo/dat/angry\\1001_IEO_ANG_LO_noise_augmented.wav", "label": "angry"}, "73": {"wav": "D:/voice-emo/dat/sad\\1001_IWW_SAD_XX_pitch_augmented.wav", "label": "sad"}, "74": {"wav": "D:/voice-emo/dat/angry\\1001_ITS_ANG_XX_stretch_augmented.wav", "label": "angry"}, "75": {"wav": "D:/voice-emo/dat/sad\\1001_WSI_SAD_XX_stretch_augmented.wav", "label": "sad"}, "76": {"wav": "D:/voice-emo/dat/neutral\\1001_IEO_NEU_XX_stretch_augmented.wav", "label": "neutral"}, "77": {"wav": "D:/voice-emo/dat/neutral\\1001_WSI_NEU_XX.wav", "label": "neutral"}, "78": {"wav": "D:/voice-emo/dat/disgust\\1001_MTI_DIS_XX_stretch_augmented.wav", "label": "disgust"}, "79": {"wav": "D:/voice-emo/dat/disgust\\1001_WSI_DIS_XX_noise_augmented.wav", "label": "disgust"}, "80": {"wav": "D:/voice-emo/dat/neutral\\1001_IOM_NEU_XX.wav", "label": "neutral"}, "81": {"wav": "D:/voice-emo/dat/surprise\\DC_su04_stretch_augmented.wav", "label": "surprise"}, "82": {"wav": "D:/voice-emo/dat/sad\\1001_IEO_SAD_LO_pitch_augmented.wav", "label": "sad"}, "83": {"wav": "D:/voice-emo/dat/fear\\1001_WSI_FEA_XX.wav", "label": "fear"}, "84": {"wav": "D:/voice-emo/dat/disgust\\1001_IEO_DIS_LO_stretch_augmented.wav", "label": "disgust"}, "85": {"wav": 
"D:/voice-emo/dat/fear\\1001_IEO_FEA_HI_pitch_augmented.wav", "label": "fear"}, "86": {"wav": "D:/voice-emo/dat/neutral\\1001_IWL_NEU_XX.wav", "label": "neutral"}, "87": {"wav": "D:/voice-emo/dat/neutral\\1001_IWW_NEU_XX_stretch_augmented.wav", "label": "neutral"}, "88": {"wav": "D:/voice-emo/dat/angry\\1001_WSI_ANG_XX_pitch_augmented.wav", "label": "angry"}, "89": {"wav": "D:/voice-emo/dat/angry\\1001_ITH_ANG_XX_pitch_augmented.wav", "label": "angry"}, "90": {"wav": "D:/voice-emo/dat/happy\\1001_TIE_HAP_XX_pitch_augmented.wav", "label": "happy"}, "91": {"wav": "D:/voice-emo/dat/neutral\\1001_TAI_NEU_XX.wav", "label": "neutral"}, "92": {"wav": "D:/voice-emo/dat/disgust\\1001_IOM_DIS_XX_noise_augmented.wav", "label": "disgust"}, "93": {"wav": "D:/voice-emo/dat/angry\\1001_IWL_ANG_XX_stretch_augmented.wav", "label": "angry"}, "94": {"wav": "D:/voice-emo/dat/fear\\1001_ITS_FEA_XX_stretch_augmented.wav", "label": "fear"}, "95": {"wav": "D:/voice-emo/dat/surprise\\DC_su06_pitch_augmented.wav", "label": "surprise"}, "96": {"wav": "D:/voice-emo/dat/sad\\1001_TAI_SAD_XX.wav", "label": "sad"}, "97": {"wav": "D:/voice-emo/dat/sad\\1001_IEO_SAD_HI_noise_augmented.wav", "label": "sad"}, "98": {"wav": "D:/voice-emo/dat/surprise\\DC_su06_stretch_augmented.wav", "label": "surprise"}, "99": {"wav": "D:/voice-emo/dat/angry\\1001_TSI_ANG_XX.wav", "label": "angry"}, "100": {"wav": "D:/voice-emo/dat/neutral\\1001_TIE_NEU_XX.wav", "label": "neutral"}, "101": {"wav": "D:/voice-emo/dat/sad\\1001_MTI_SAD_XX_pitch_augmented.wav", "label": "sad"}, "102": {"wav": "D:/voice-emo/dat/angry\\1001_IEO_ANG_MD_stretch_augmented.wav", "label": "angry"}, "103": {"wav": "D:/voice-emo/dat/surprise\\DC_su09_pitch_augmented.wav", "label": "surprise"}, "104": {"wav": "D:/voice-emo/dat/fear\\1001_IEO_FEA_LO_stretch_augmented.wav", "label": "fear"}, "105": {"wav": "D:/voice-emo/dat/fear\\1001_IWL_FEA_XX_pitch_augmented.wav", "label": "fear"}, "106": {"wav": 
"D:/voice-emo/dat/angry\\1001_ITH_ANG_XX_noise_augmented.wav", "label": "angry"}, "107": {"wav": "D:/voice-emo/dat/sad\\1001_IWL_SAD_XX.wav", "label": "sad"}, "108": {"wav": "D:/voice-emo/dat/disgust\\1001_TIE_DIS_XX.wav", "label": "disgust"}, "109": {"wav": "D:/voice-emo/dat/fear\\1001_TAI_FEA_XX_stretch_augmented.wav", "label": "fear"}, "110": {"wav": "D:/voice-emo/dat/angry\\1001_IEO_ANG_MD_noise_augmented.wav", "label": "angry"}, "111": {"wav": "D:/voice-emo/dat/neutral\\1001_MTI_NEU_XX_pitch_augmented.wav", "label": "neutral"}, "112": {"wav": "D:/voice-emo/dat/disgust\\1001_TAI_DIS_XX.wav", "label": "disgust"}, "113": {"wav": "D:/voice-emo/dat/fear\\1001_IEO_FEA_HI_noise_augmented.wav", "label": "fear"}, "114": {"wav": "D:/voice-emo/dat/disgust\\1001_IOM_DIS_XX.wav", "label": "disgust"}, "115": {"wav": "D:/voice-emo/dat/sad\\1001_TIE_SAD_XX_pitch_augmented.wav", "label": "sad"}, "116": {"wav": "D:/voice-emo/dat/disgust\\1001_TSI_DIS_XX_stretch_augmented.wav", "label": "disgust"}, "117": {"wav": "D:/voice-emo/dat/disgust\\1001_TSI_DIS_XX_pitch_augmented.wav", "label": "disgust"}, "118": {"wav": "D:/voice-emo/dat/disgust\\1001_IWW_DIS_XX_pitch_augmented.wav", "label": "disgust"}, "119": {"wav": "D:/voice-emo/dat/angry\\1001_MTI_ANG_XX_pitch_augmented.wav", "label": "angry"}, "120": {"wav": "D:/voice-emo/dat/sad\\1001_TAI_SAD_XX_noise_augmented.wav", "label": "sad"}, "121": {"wav": "D:/voice-emo/dat/disgust\\1001_TIE_DIS_XX_pitch_augmented.wav", "label": "disgust"}, "122": {"wav": "D:/voice-emo/dat/surprise\\DC_su02_stretch_augmented.wav", "label": "surprise"}, "123": {"wav": "D:/voice-emo/dat/surprise\\DC_su10_noise_augmented.wav", "label": "surprise"}, "124": {"wav": "D:/voice-emo/dat/disgust\\1001_WSI_DIS_XX.wav", "label": "disgust"}, "125": {"wav": "D:/voice-emo/dat/happy\\1001_WSI_HAP_XX_noise_augmented.wav", "label": "happy"}, "126": {"wav": "D:/voice-emo/dat/angry\\1001_DFA_ANG_XX_noise_augmented.wav", "label": "angry"}, "127": {"wav": 
"D:/voice-emo/dat/fear\\1001_ITS_FEA_XX_noise_augmented.wav", "label": "fear"}, "128": {"wav": "D:/voice-emo/dat/happy\\1001_TSI_HAP_XX_pitch_augmented.wav", "label": "happy"}, "129": {"wav": "D:/voice-emo/dat/happy\\1001_ITS_HAP_XX_stretch_augmented.wav", "label": "happy"}, "130": {"wav": "D:/voice-emo/dat/sad\\1001_WSI_SAD_XX.wav", "label": "sad"}, "131": {"wav": "D:/voice-emo/dat/fear\\1001_TAI_FEA_XX_noise_augmented.wav", "label": "fear"}, "132": {"wav": "D:/voice-emo/dat/angry\\1001_DFA_ANG_XX.wav", "label": "angry"}, "133": {"wav": "D:/voice-emo/dat/sad\\1001_WSI_SAD_XX_pitch_augmented.wav", "label": "sad"}, "134": {"wav": "D:/voice-emo/dat/angry\\1001_MTI_ANG_XX_noise_augmented.wav", "label": "angry"}, "135": {"wav": "D:/voice-emo/dat/disgust\\1001_IEO_DIS_MD.wav", "label": "disgust"}, "136": {"wav": "D:/voice-emo/dat/sad\\1001_MTI_SAD_XX_noise_augmented.wav", "label": "sad"}, "137": {"wav": "D:/voice-emo/dat/neutral\\1001_DFA_NEU_XX_noise_augmented.wav", "label": "neutral"}, "138": {"wav": "D:/voice-emo/dat/fear\\1001_MTI_FEA_XX.wav", "label": "fear"}, "139": {"wav": "D:/voice-emo/dat/sad\\1001_TSI_SAD_XX_pitch_augmented.wav", "label": "sad"}, "140": {"wav": "D:/voice-emo/dat/disgust\\1001_ITS_DIS_XX.wav", "label": "disgust"}, "141": {"wav": "D:/voice-emo/dat/neutral\\1001_WSI_NEU_XX_pitch_augmented.wav", "label": "neutral"}, "142": {"wav": "D:/voice-emo/dat/fear\\1001_TAI_FEA_XX_pitch_augmented.wav", "label": "fear"}, "143": {"wav": "D:/voice-emo/dat/sad\\1001_IEO_SAD_MD_stretch_augmented.wav", "label": "sad"}, "144": {"wav": "D:/voice-emo/dat/angry\\1001_ITS_ANG_XX_pitch_augmented.wav", "label": "angry"}, "145": {"wav": "D:/voice-emo/dat/sad\\1001_IEO_SAD_HI_stretch_augmented.wav", "label": "sad"}, "146": {"wav": "D:/voice-emo/dat/happy\\1001_ITS_HAP_XX_noise_augmented.wav", "label": "happy"}, "147": {"wav": "D:/voice-emo/dat/angry\\1001_TIE_ANG_XX_stretch_augmented.wav", "label": "angry"}, "148": {"wav": "D:/voice-emo/dat/happy\\1001_TIE_HAP_XX.wav", 
"label": "happy"}, "149": {"wav": "D:/voice-emo/dat/fear\\1001_WSI_FEA_XX_stretch_augmented.wav", "label": "fear"}, "150": {"wav": "D:/voice-emo/dat/disgust\\1001_IEO_DIS_LO.wav", "label": "disgust"}, "151": {"wav": "D:/voice-emo/dat/sad\\1001_IOM_SAD_XX.wav", "label": "sad"}, "152": {"wav": "D:/voice-emo/dat/sad\\1001_MTI_SAD_XX_stretch_augmented.wav", "label": "sad"}, "153": {"wav": "D:/voice-emo/dat/happy\\1001_IOM_HAP_XX_pitch_augmented.wav", "label": "happy"}, "154": {"wav": "D:/voice-emo/dat/happy\\1001_IOM_HAP_XX_noise_augmented.wav", "label": "happy"}, "155": {"wav": "D:/voice-emo/dat/sad\\1001_MTI_SAD_XX.wav", "label": "sad"}, "156": {"wav": "D:/voice-emo/dat/fear\\1001_IEO_FEA_MD_noise_augmented.wav", "label": "fear"}, "157": {"wav": "D:/voice-emo/dat/sad\\1001_ITS_SAD_XX_stretch_augmented.wav", "label": "sad"}, "158": {"wav": "D:/voice-emo/dat/sad\\1001_IWL_SAD_XX_noise_augmented.wav", "label": "sad"}, "159": {"wav": "D:/voice-emo/dat/neutral\\1001_IWW_NEU_XX_pitch_augmented.wav", "label": "neutral"}, "160": {"wav": "D:/voice-emo/dat/angry\\1001_ITH_ANG_XX_stretch_augmented.wav", "label": "angry"}, "161": {"wav": "D:/voice-emo/dat/happy\\1001_MTI_HAP_XX_noise_augmented.wav", "label": "happy"}, "162": {"wav": "D:/voice-emo/dat/angry\\1001_WSI_ANG_XX.wav", "label": "angry"}, "163": {"wav": "D:/voice-emo/dat/neutral\\1001_TIE_NEU_XX_pitch_augmented.wav", "label": "neutral"}, "164": {"wav": "D:/voice-emo/dat/sad\\1001_WSI_SAD_XX_noise_augmented.wav", "label": "sad"}, "165": {"wav": "D:/voice-emo/dat/angry\\1001_IWW_ANG_XX_pitch_augmented.wav", "label": "angry"}, "166": {"wav": "D:/voice-emo/dat/happy\\1001_IWW_HAP_XX.wav", "label": "happy"}, "167": {"wav": "D:/voice-emo/dat/angry\\1001_IEO_ANG_HI_pitch_augmented.wav", "label": "angry"}, "168": {"wav": "D:/voice-emo/dat/happy\\1001_IEO_HAP_MD_stretch_augmented.wav", "label": "happy"}, "169": {"wav": "D:/voice-emo/dat/sad\\1001_ITH_SAD_XX.wav", "label": "sad"}, "170": {"wav": 
"D:/voice-emo/dat/happy\\1001_TAI_HAP_XX.wav", "label": "happy"}, "171": {"wav": "D:/voice-emo/dat/fear\\1001_IWW_FEA_XX_pitch_augmented.wav", "label": "fear"}, "172": {"wav": "D:/voice-emo/dat/sad\\1001_ITS_SAD_XX.wav", "label": "sad"}, "173": {"wav": "D:/voice-emo/dat/angry\\1001_TIE_ANG_XX.wav", "label": "angry"}, "174": {"wav": "D:/voice-emo/dat/disgust\\1001_MTI_DIS_XX_noise_augmented.wav", "label": "disgust"}, "175": {"wav": "D:/voice-emo/dat/surprise\\DC_su06.wav", "label": "surprise"}, "176": {"wav": "D:/voice-emo/dat/angry\\1001_ITS_ANG_XX_noise_augmented.wav", "label": "angry"}, "177": {"wav": "D:/voice-emo/dat/neutral\\1001_IWL_NEU_XX_stretch_augmented.wav", "label": "neutral"}, "178": {"wav": "D:/voice-emo/dat/neutral\\1001_ITS_NEU_XX.wav", "label": "neutral"}, "179": {"wav": "D:/voice-emo/dat/disgust\\1001_IWW_DIS_XX_noise_augmented.wav", "label": "disgust"}, "180": {"wav": "D:/voice-emo/dat/neutral\\1001_MTI_NEU_XX_stretch_augmented.wav", "label": "neutral"}, "181": {"wav": "D:/voice-emo/dat/sad\\1001_IWW_SAD_XX.wav", "label": "sad"}, "182": {"wav": "D:/voice-emo/dat/fear\\1001_TSI_FEA_XX_pitch_augmented.wav", "label": "fear"}, "183": {"wav": "D:/voice-emo/dat/surprise\\DC_su08_stretch_augmented.wav", "label": "surprise"}, "184": {"wav": "D:/voice-emo/dat/fear\\1001_IEO_FEA_HI.wav", "label": "fear"}, "185": {"wav": "D:/voice-emo/dat/happy\\1001_ITS_HAP_XX_pitch_augmented.wav", "label": "happy"}, "186": {"wav": "D:/voice-emo/dat/surprise\\DC_su05_pitch_augmented.wav", "label": "surprise"}, "187": {"wav": "D:/voice-emo/dat/fear\\1001_IWW_FEA_XX.wav", "label": "fear"}, "188": {"wav": "D:/voice-emo/dat/disgust\\1001_TSI_DIS_XX.wav", "label": "disgust"}, "189": {"wav": "D:/voice-emo/dat/neutral\\1001_TSI_NEU_XX.wav", "label": "neutral"}, "190": {"wav": "D:/voice-emo/dat/fear\\1001_IEO_FEA_LO_noise_augmented.wav", "label": "fear"}, "191": {"wav": "D:/voice-emo/dat/happy\\1001_TIE_HAP_XX_noise_augmented.wav", "label": "happy"}, "192": {"wav": 
"D:/voice-emo/dat/happy\\1001_DFA_HAP_XX.wav", "label": "happy"}, "193": {"wav": "D:/voice-emo/dat/sad\\1001_DFA_SAD_XX_noise_augmented.wav", "label": "sad"}, "194": {"wav": "D:/voice-emo/dat/fear\\1001_TAI_FEA_XX.wav", "label": "fear"}, "195": {"wav": "D:/voice-emo/dat/angry\\1001_IOM_ANG_XX_stretch_augmented.wav", "label": "angry"}, "196": {"wav": "D:/voice-emo/dat/disgust\\1001_ITS_DIS_XX_noise_augmented.wav", "label": "disgust"}, "197": {"wav": "D:/voice-emo/dat/disgust\\1001_DFA_DIS_XX_pitch_augmented.wav", "label": "disgust"}, "198": {"wav": "D:/voice-emo/dat/surprise\\DC_su09_noise_augmented.wav", "label": "surprise"}, "199": {"wav": "D:/voice-emo/dat/sad\\1001_IEO_SAD_HI.wav", "label": "sad"}, "200": {"wav": "D:/voice-emo/dat/disgust\\1001_WSI_DIS_XX_pitch_augmented.wav", "label": "disgust"}, "201": {"wav": "D:/voice-emo/dat/fear\\1001_WSI_FEA_XX_pitch_augmented.wav", "label": "fear"}, "202": {"wav": "D:/voice-emo/dat/disgust\\1001_IEO_DIS_HI_pitch_augmented.wav", "label": "disgust"}, "203": {"wav": "D:/voice-emo/dat/surprise\\DC_su02_pitch_augmented.wav", "label": "surprise"}, "204": {"wav": "D:/voice-emo/dat/fear\\1001_ITS_FEA_XX_pitch_augmented.wav", "label": "fear"}, "205": {"wav": "D:/voice-emo/dat/fear\\1001_TSI_FEA_XX_stretch_augmented.wav", "label": "fear"}, "206": {"wav": "D:/voice-emo/dat/happy\\1001_ITS_HAP_XX.wav", "label": "happy"}, "207": {"wav": "D:/voice-emo/dat/disgust\\1001_IWL_DIS_XX_stretch_augmented.wav", "label": "disgust"}, "208": {"wav": "D:/voice-emo/dat/fear\\1001_IWL_FEA_XX.wav", "label": "fear"}, "209": {"wav": "D:/voice-emo/dat/happy\\1001_WSI_HAP_XX.wav", "label": "happy"}, "210": {"wav": "D:/voice-emo/dat/angry\\1001_TAI_ANG_XX_pitch_augmented.wav", "label": "angry"}, "211": {"wav": "D:/voice-emo/dat/disgust\\1001_TSI_DIS_XX_noise_augmented.wav", "label": "disgust"}, "212": {"wav": "D:/voice-emo/dat/happy\\1001_IWL_HAP_XX_noise_augmented.wav", "label": "happy"}, "213": {"wav": 
"D:/voice-emo/dat/happy\\1001_TIE_HAP_XX_stretch_augmented.wav", "label": "happy"}, "214": {"wav": "D:/voice-emo/dat/surprise\\DC_su06_noise_augmented.wav", "label": "surprise"}, "215": {"wav": "D:/voice-emo/dat/fear\\1001_IEO_FEA_MD_pitch_augmented.wav", "label": "fear"}, "216": {"wav": "D:/voice-emo/dat/angry\\1001_IWW_ANG_XX_stretch_augmented.wav", "label": "angry"}, "217": {"wav": "D:/voice-emo/dat/disgust\\1001_DFA_DIS_XX.wav", "label": "disgust"}, "218": {"wav": "D:/voice-emo/dat/fear\\1001_TSI_FEA_XX_noise_augmented.wav", "label": "fear"}, "219": {"wav": "D:/voice-emo/dat/disgust\\1001_TAI_DIS_XX_pitch_augmented.wav", "label": "disgust"}, "220": {"wav": "D:/voice-emo/dat/angry\\1001_IWW_ANG_XX_noise_augmented.wav", "label": "angry"}, "221": {"wav": "D:/voice-emo/dat/sad\\1001_IOM_SAD_XX_stretch_augmented.wav", "label": "sad"}, "222": {"wav": "D:/voice-emo/dat/fear\\1001_DFA_FEA_XX_noise_augmented.wav", "label": "fear"}, "223": {"wav": "D:/voice-emo/dat/fear\\1001_IEO_FEA_LO.wav", "label": "fear"}, "224": {"wav": "D:/voice-emo/dat/surprise\\DC_su07.wav", "label": "surprise"}, "225": {"wav": "D:/voice-emo/dat/sad\\1001_DFA_SAD_XX.wav", "label": "sad"}, "226": {"wav": "D:/voice-emo/dat/fear\\1001_MTI_FEA_XX_pitch_augmented.wav", "label": "fear"}, "227": {"wav": "D:/voice-emo/dat/neutral\\1001_ITH_NEU_XX.wav", "label": "neutral"}, "228": {"wav": "D:/voice-emo/dat/angry\\1001_IEO_ANG_LO_pitch_augmented.wav", "label": "angry"}, "229": {"wav": "D:/voice-emo/dat/surprise\\DC_su10.wav", "label": "surprise"}, "230": {"wav": "D:/voice-emo/dat/disgust\\1001_DFA_DIS_XX_noise_augmented.wav", "label": "disgust"}, "231": {"wav": "D:/voice-emo/dat/disgust\\1001_IWL_DIS_XX_pitch_augmented.wav", "label": "disgust"}, "232": {"wav": "D:/voice-emo/dat/disgust\\1001_ITH_DIS_XX_pitch_augmented.wav", "label": "disgust"}, "233": {"wav": "D:/voice-emo/dat/fear\\1001_IWW_FEA_XX_stretch_augmented.wav", "label": "fear"}, "234": {"wav": 
"D:/voice-emo/dat/neutral\\1001_IWW_NEU_XX_noise_augmented.wav", "label": "neutral"}, "235": {"wav": "D:/voice-emo/dat/sad\\1001_ITS_SAD_XX_pitch_augmented.wav", "label": "sad"}, "236": {"wav": "D:/voice-emo/dat/angry\\1001_IWW_ANG_XX.wav", "label": "angry"}, "237": {"wav": "D:/voice-emo/dat/surprise\\DC_su01_stretch_augmented.wav", "label": "surprise"}, "238": {"wav": "D:/voice-emo/dat/fear\\1001_TSI_FEA_XX.wav", "label": "fear"}, "239": {"wav": "D:/voice-emo/dat/neutral\\1001_IEO_NEU_XX_pitch_augmented.wav", "label": "neutral"}, "240": {"wav": "D:/voice-emo/dat/sad\\1001_IWW_SAD_XX_stretch_augmented.wav", "label": "sad"}, "241": {"wav": "D:/voice-emo/dat/surprise\\DC_su01.wav", "label": "surprise"}, "242": {"wav": "D:/voice-emo/dat/angry\\1001_IEO_ANG_LO_stretch_augmented.wav", "label": "angry"}, "243": {"wav": "D:/voice-emo/dat/disgust\\1001_TIE_DIS_XX_stretch_augmented.wav", "label": "disgust"}, "244": {"wav": "D:/voice-emo/dat/sad\\1001_IOM_SAD_XX_noise_augmented.wav", "label": "sad"}, "245": {"wav": "D:/voice-emo/dat/fear\\1001_IOM_FEA_XX.wav", "label": "fear"}, "246": {"wav": "D:/voice-emo/dat/sad\\1001_TAI_SAD_XX_stretch_augmented.wav", "label": "sad"}, "247": {"wav": "D:/voice-emo/dat/disgust\\1001_TIE_DIS_XX_noise_augmented.wav", "label": "disgust"}, "248": {"wav": "D:/voice-emo/dat/disgust\\1001_WSI_DIS_XX_stretch_augmented.wav", "label": "disgust"}, "249": {"wav": "D:/voice-emo/dat/sad\\1001_IWL_SAD_XX_stretch_augmented.wav", "label": "sad"}, "250": {"wav": "D:/voice-emo/dat/happy\\1001_TSI_HAP_XX.wav", "label": "happy"}, "251": {"wav": "D:/voice-emo/dat/fear\\1001_ITH_FEA_XX_stretch_augmented.wav", "label": "fear"}, "252": {"wav": "D:/voice-emo/dat/fear\\1001_TIE_FEA_XX_pitch_augmented.wav", "label": "fear"}, "253": {"wav": "D:/voice-emo/dat/angry\\1001_WSI_ANG_XX_noise_augmented.wav", "label": "angry"}, "254": {"wav": "D:/voice-emo/dat/angry\\1001_TAI_ANG_XX_noise_augmented.wav", "label": "angry"}, "255": {"wav": 
"D:/voice-emo/dat/happy\\1001_WSI_HAP_XX_stretch_augmented.wav", "label": "happy"}, "256": {"wav": "D:/voice-emo/dat/neutral\\1001_TAI_NEU_XX_noise_augmented.wav", "label": "neutral"}, "257": {"wav": "D:/voice-emo/dat/surprise\\DC_su03_noise_augmented.wav", "label": "surprise"}, "258": {"wav": "D:/voice-emo/dat/happy\\1001_MTI_HAP_XX_stretch_augmented.wav", "label": "happy"}, "259": {"wav": "D:/voice-emo/dat/neutral\\1001_IOM_NEU_XX_pitch_augmented.wav", "label": "neutral"}, "260": {"wav": "D:/voice-emo/dat/happy\\1001_WSI_HAP_XX_pitch_augmented.wav", "label": "happy"}, "261": {"wav": "D:/voice-emo/dat/happy\\1001_ITH_HAP_XX_stretch_augmented.wav", "label": "happy"}, "262": {"wav": "D:/voice-emo/dat/sad\\1001_ITH_SAD_XX_noise_augmented.wav", "label": "sad"}, "263": {"wav": "D:/voice-emo/dat/sad\\1001_IEO_SAD_LO_stretch_augmented.wav", "label": "sad"}, "264": {"wav": "D:/voice-emo/dat/angry\\1001_TIE_ANG_XX_pitch_augmented.wav", "label": "angry"}, "265": {"wav": "D:/voice-emo/dat/happy\\1001_IEO_HAP_HI_stretch_augmented.wav", "label": "happy"}, "266": {"wav": "D:/voice-emo/dat/neutral\\1001_TSI_NEU_XX_noise_augmented.wav", "label": "neutral"}, "267": {"wav": "D:/voice-emo/dat/happy\\1001_IWW_HAP_XX_noise_augmented.wav", "label": "happy"}, "268": {"wav": "D:/voice-emo/dat/angry\\1001_IWL_ANG_XX.wav", "label": "angry"}, "269": {"wav": "D:/voice-emo/dat/surprise\\DC_su09_stretch_augmented.wav", "label": "surprise"}, "270": {"wav": "D:/voice-emo/dat/surprise\\DC_su10_pitch_augmented.wav", "label": "surprise"}, "271": {"wav": "D:/voice-emo/dat/neutral\\1001_WSI_NEU_XX_noise_augmented.wav", "label": "neutral"}, "272": {"wav": "D:/voice-emo/dat/surprise\\DC_su05_noise_augmented.wav", "label": "surprise"}, "273": {"wav": "D:/voice-emo/dat/angry\\1001_TAI_ANG_XX_stretch_augmented.wav", "label": "angry"}, "274": {"wav": "D:/voice-emo/dat/angry\\1001_TAI_ANG_XX.wav", "label": "angry"}, "275": {"wav": "D:/voice-emo/dat/happy\\1001_TAI_HAP_XX_pitch_augmented.wav", "label": 
"happy"}, "276": {"wav": "D:/voice-emo/dat/fear\\1001_ITS_FEA_XX.wav", "label": "fear"}, "277": {"wav": "D:/voice-emo/dat/happy\\1001_IEO_HAP_LO.wav", "label": "happy"}, "278": {"wav": "D:/voice-emo/dat/surprise\\DC_su02_noise_augmented.wav", "label": "surprise"}, "279": {"wav": "D:/voice-emo/dat/neutral\\1001_IWW_NEU_XX.wav", "label": "neutral"}, "280": {"wav": "D:/voice-emo/dat/neutral\\1001_DFA_NEU_XX_stretch_augmented.wav", "label": "neutral"}, "281": {"wav": "D:/voice-emo/dat/happy\\1001_DFA_HAP_XX_stretch_augmented.wav", "label": "happy"}, "282": {"wav": "D:/voice-emo/dat/angry\\1001_IOM_ANG_XX_noise_augmented.wav", "label": "angry"}, "283": {"wav": "D:/voice-emo/dat/fear\\1001_TIE_FEA_XX_noise_augmented.wav", "label": "fear"}, "284": {"wav": "D:/voice-emo/dat/neutral\\1001_WSI_NEU_XX_stretch_augmented.wav", "label": "neutral"}, "285": {"wav": "D:/voice-emo/dat/surprise\\DC_su07_stretch_augmented.wav", "label": "surprise"}, "286": {"wav": "D:/voice-emo/dat/fear\\1001_MTI_FEA_XX_noise_augmented.wav", "label": "fear"}, "287": {"wav": "D:/voice-emo/dat/fear\\1001_DFA_FEA_XX_pitch_augmented.wav", "label": "fear"}, "288": {"wav": "D:/voice-emo/dat/neutral\\1001_TIE_NEU_XX_stretch_augmented.wav", "label": "neutral"}, "289": {"wav": "D:/voice-emo/dat/surprise\\DC_su02.wav", "label": "surprise"}, "290": {"wav": "D:/voice-emo/dat/disgust\\1001_MTI_DIS_XX_pitch_augmented.wav", "label": "disgust"}, "291": {"wav": "D:/voice-emo/dat/disgust\\1001_IEO_DIS_MD_pitch_augmented.wav", "label": "disgust"}, "292": {"wav": "D:/voice-emo/dat/happy\\1001_IEO_HAP_HI_pitch_augmented.wav", "label": "happy"}, "293": {"wav": "D:/voice-emo/dat/sad\\1001_TIE_SAD_XX_stretch_augmented.wav", "label": "sad"}}

results/train_with_wav2vec2/1993/valid.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"0": {"wav": "D:/voice-emo/dat/happy\\1001_IWW_HAP_XX_pitch_augmented.wav", "label": "happy"}, "1": {"wav": "D:/voice-emo/dat/happy\\1001_ITH_HAP_XX_noise_augmented.wav", "label": "happy"}, "2": {"wav": "D:/voice-emo/dat/disgust\\1001_ITS_DIS_XX_pitch_augmented.wav", "label": "disgust"}, "3": {"wav": "D:/voice-emo/dat/happy\\1001_DFA_HAP_XX_noise_augmented.wav", "label": "happy"}, "4": {"wav": "D:/voice-emo/dat/disgust\\1001_IWL_DIS_XX.wav", "label": "disgust"}, "5": {"wav": "D:/voice-emo/dat/disgust\\1001_ITH_DIS_XX.wav", "label": "disgust"}, "6": {"wav": "D:/voice-emo/dat/neutral\\1001_DFA_NEU_XX_pitch_augmented.wav", "label": "neutral"}, "7": {"wav": "D:/voice-emo/dat/happy\\1001_IEO_HAP_MD.wav", "label": "happy"}, "8": {"wav": "D:/voice-emo/dat/angry\\1001_IEO_ANG_HI.wav", "label": "angry"}, "9": {"wav": "D:/voice-emo/dat/happy\\1001_TSI_HAP_XX_noise_augmented.wav", "label": "happy"}, "10": {"wav": "D:/voice-emo/dat/sad\\1001_DFA_SAD_XX_stretch_augmented.wav", "label": "sad"}, "11": {"wav": "D:/voice-emo/dat/neutral\\1001_IOM_NEU_XX_noise_augmented.wav", "label": "neutral"}, "12": {"wav": "D:/voice-emo/dat/sad\\1001_IWW_SAD_XX_noise_augmented.wav", "label": "sad"}, "13": {"wav": "D:/voice-emo/dat/angry\\1001_IEO_ANG_LO.wav", "label": "angry"}, "14": {"wav": "D:/voice-emo/dat/happy\\1001_IWL_HAP_XX_pitch_augmented.wav", "label": "happy"}, "15": {"wav": "D:/voice-emo/dat/sad\\1001_IEO_SAD_MD_pitch_augmented.wav", "label": "sad"}, "16": {"wav": "D:/voice-emo/dat/angry\\1001_TIE_ANG_XX_noise_augmented.wav", "label": "angry"}, "17": {"wav": "D:/voice-emo/dat/surprise\\DC_su03_pitch_augmented.wav", "label": "surprise"}, "18": {"wav": "D:/voice-emo/dat/neutral\\1001_ITS_NEU_XX_pitch_augmented.wav", "label": "neutral"}, "19": {"wav": "D:/voice-emo/dat/fear\\1001_IOM_FEA_XX_stretch_augmented.wav", "label": "fear"}, "20": {"wav": "D:/voice-emo/dat/happy\\1001_IEO_HAP_HI_noise_augmented.wav", "label": "happy"}, "21": {"wav": 
"D:/voice-emo/dat/neutral\\1001_TSI_NEU_XX_pitch_augmented.wav", "label": "neutral"}, "22": {"wav": "D:/voice-emo/dat/sad\\1001_IEO_SAD_MD.wav", "label": "sad"}, "23": {"wav": "D:/voice-emo/dat/fear\\1001_MTI_FEA_XX_stretch_augmented.wav", "label": "fear"}, "24": {"wav": "D:/voice-emo/dat/angry\\1001_IEO_ANG_MD.wav", "label": "angry"}, "25": {"wav": "D:/voice-emo/dat/surprise\\DC_su05.wav", "label": "surprise"}, "26": {"wav": "D:/voice-emo/dat/fear\\1001_IEO_FEA_MD_stretch_augmented.wav", "label": "fear"}, "27": {"wav": "D:/voice-emo/dat/happy\\1001_IEO_HAP_MD_pitch_augmented.wav", "label": "happy"}, "28": {"wav": "D:/voice-emo/dat/happy\\1001_TSI_HAP_XX_stretch_augmented.wav", "label": "happy"}, "29": {"wav": "D:/voice-emo/dat/angry\\1001_IWL_ANG_XX_pitch_augmented.wav", "label": "angry"}, "30": {"wav": "D:/voice-emo/dat/disgust\\1001_IEO_DIS_HI.wav", "label": "disgust"}, "31": {"wav": "D:/voice-emo/dat/happy\\1001_ITH_HAP_XX_pitch_augmented.wav", "label": "happy"}, "32": {"wav": "D:/voice-emo/dat/happy\\1001_IOM_HAP_XX.wav", "label": "happy"}, "33": {"wav": "D:/voice-emo/dat/angry\\1001_IEO_ANG_HI_stretch_augmented.wav", "label": "angry"}, "34": {"wav": "D:/voice-emo/dat/happy\\1001_IWL_HAP_XX_stretch_augmented.wav", "label": "happy"}, "35": {"wav": "D:/voice-emo/dat/neutral\\1001_MTI_NEU_XX_noise_augmented.wav", "label": "neutral"}}

train_with_wav2vec.py ADDED Viewed

	@@ -0,0 +1,302 @@

+import os
+import sys
+import logging
+import speechbrain as sb
+from hyperpyyaml import load_hyperpyyaml
+import json
+import random
+import torch
+from sklearn.preprocessing import LabelEncoder
+# Check if GPU is available (NOTE(review): `device` is only printed in this file, while the model is later moved to run_opts["device"] - confirm the two are meant to agree)
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+print(f"Using device: {device}")
+logger = logging.getLogger(__name__)  # module-level logger used by the data-preparation helpers
+SAMPLERATE = 16000  # Hz; create_json() divides a sample count by this to obtain a duration in seconds
+def prepare_data(data_original, save_json_train, save_json_valid, save_json_test, split_ratio=[80, 10, 10], seed=12):
+    # Setting seeds for reproducible code.
+    random.seed(seed)
+    # Check if data preparation has already been done (skip if files exist)
+    if skip(save_json_train, save_json_valid, save_json_test):
+        logger.info("Preparation completed in previous run, skipping.")
+        return
+    # Collect audio files and labels
+    wav_list = []
+    labels = os.listdir(data_original)
+    for label in labels:
+        label_dir = os.path.join(data_original, label)
+        if os.path.isdir(label_dir):
+            for audio_file in os.listdir(label_dir):
+                if audio_file.endswith('.wav'):
+                    wav_file = os.path.join(label_dir, audio_file)
+                    if os.path.isfile(wav_file):
+                        wav_list.append((wav_file, label))
+                    else:
+                        logger.warning(f"Skipping invalid audio file: {wav_file}")
+    # Shuffle and split the data
+    random.shuffle(wav_list)
+    n_total = len(wav_list)
+    n_train = n_total * split_ratio[0] // 100
+    n_valid = n_total * split_ratio[1] // 100
+    train_set = wav_list[:n_train]
+    valid_set = wav_list[n_train:n_train + n_valid]
+    test_set = wav_list[n_train + n_valid:]
+    # Create JSON files for train, valid, and test sets
+    create_json(train_set, save_json_train)
+    create_json(valid_set, save_json_valid)
+    create_json(test_set, save_json_test)
+    logger.info(f"Created {save_json_train}, {save_json_valid}, and {save_json_test}")
+def create_json(wav_list, json_file):
+    json_dict = {}
+    for wav_file, label in wav_list:
+        signal = sb.dataio.dataio.read_audio(wav_file)
+        duration = signal.shape[0] / SAMPLERATE
+        uttid = os.path.splitext(os.path.basename(wav_file))[0]
+        json_dict[uttid] = {
+            "wav": wav_file,
+            "length": duration,
+            "label": label,
+        }
+    with open(json_file, mode="w") as json_f:
+        json.dump(json_dict, json_f, indent=2)
+    logger.info(f"Created {json_file}")
+def skip(*filenames):
+    for filename in filenames:
+        if not os.path.isfile(filename):
+            return False
+    return True
+class EmoIdBrain(sb.Brain):
+    def compute_forward(self, batch, stage):
+        """Computation pipeline based on an encoder + emotion classifier."""
+        batch = batch.to(self.device)
+        wavs, lens = batch.sig
+        outputs = self.modules.wav2vec2(wavs, lens)
+        # Apply pooling and MLP layers
+        outputs = self.hparams.avg_pool(outputs, lens)
+        outputs = outputs.view(outputs.shape[0], -1)
+        outputs = self.modules.output_mlp(outputs)
+        outputs = self.hparams.log_softmax(outputs)
+        return outputs
+    def compute_objectives(self, predictions, batch, stage):
+      emo_encoded_list = []
+      for sample in batch:
+          # Check if 'emo_encoded' exists in the sample
+          if 'emo_encoded' in sample:
+              emo_encoded_list.append(sample['emo_encoded'])
+          else:
+              # Log a warning and skip this sample if 'emo_encoded' is missing
+              logging.warning(f"'emo_encoded' key not found in sample: {sample}")
+      if not emo_encoded_list:
+          # If no valid 'emo_encoded' values were found in the batch, raise an error
+          raise ValueError("No valid 'emo_encoded' values found in the batch.")
+      # Convert emo_encoded_list to a torch tensor
+      emo_encoded = torch.tensor(emo_encoded_list, dtype=torch.long)
+      # Ensure emo_encoded is a tensor
+      if not isinstance(emo_encoded, torch.Tensor):
+          raise TypeError(f"Unsupported label type encountered: {type(emo_encoded)}")
+      # Perform any necessary operations with emo_encoded here
+      loss = self.hparams.compute_cost(predictions, emo_encoded)
+      if stage != sb.Stage.TRAIN:
+          self.error_metrics.append(batch.id, predictions, emo_encoded)
+      return loss
+    def on_stage_start(self, stage, epoch=None):
+        """Gets called at the beginning of each epoch."""
+        self.loss_metric = sb.utils.metric_stats.MetricStats(metric=sb.nnet.losses.nll_loss)
+        if stage != sb.Stage.TRAIN:
+            self.error_metrics = self.hparams.error_stats()
+    def on_stage_end(self, stage, stage_loss, epoch=None):
+      """Gets called at the end of an epoch."""
+      if stage == sb.Stage.TRAIN:
+          self.train_loss = stage_loss
+      else:
+          stats = {
+              "loss": stage_loss,
+          }
+          if self.error_metrics is not None and len(self.error_metrics.scores) > 0:
+              # Calculate error rate only if there are scores in the error_metrics
+              stats["error_rate"] = self.error_metrics.summarize("average")
+          else:
+              # Handle case where error_metrics are None or empty
+              stats["error_rate"] = float('nan')  # Set error_rate to NaN if no scores available
+          if stage == sb.Stage.VALID:
+              old_lr, new_lr = self.hparams.lr_annealing(stats["error_rate"])
+              sb.nnet.schedulers.update_learning_rate(self.optimizer, new_lr)
+              self.hparams.train_logger.log_stats(
+                  {"Epoch": epoch, "lr": old_lr},
+                  train_stats={"loss": self.train_loss},
+                  valid_stats=stats,
+              )
+              self.checkpointer.save_and_keep_only(meta=stats, min_keys=["error_rate"])
+          elif stage == sb.Stage.TEST:
+              self.hparams.train_logger.log_stats(
+                  {"Epoch loaded": self.hparams.epoch_counter.current},
+                  test_stats=stats,
+              )
+    def init_optimizers(self):
+        """Initializes the optimizer."""
+        self.optimizer = self.hparams.opt_class(self.hparams.model.parameters())
+        if self.checkpointer is not None:
+            self.checkpointer.add_recoverable("optimizer", self.optimizer)
+        self.optimizers_dict = {"model_optimizer": self.optimizer}
+def dataio_prep(hparams):
+    """Prepares the datasets to be used in the brain class."""
+    # Define the audio processing pipeline
+    @sb.utils.data_pipeline.takes("wav")
+    @sb.utils.data_pipeline.provides("sig")
+    def audio_pipeline(wav):
+        """Load the signal from a WAV file."""
+        sig = sb.dataio.dataio.read_audio(wav)
+        return sig
+    # Initialize the label encoder
+    label_encoder = sb.dataio.encoder.CategoricalEncoder()
+    label_encoder.add_unk()
+    label_to_index = {
+        'angry': 0,
+        'happy': 1,
+        'neutral': 2,
+        'sad': 3,
+        'surprise': 4,
+        'disgust': 5,
+        'fear': 6
+    }
+    @sb.utils.data_pipeline.takes("label")
+    @sb.utils.data_pipeline.provides("label", "emo_encoded")
+    def label_pipeline(label):
+        """Encode the emotion label."""
+        if label in label_to_index:
+            emo_encoded = label_to_index[label]
+        else:
+            raise ValueError(f"Unknown label encountered: {label}")
+        yield label, torch.tensor(emo_encoded, dtype=torch.long)
+    # Define datasets dictionary
+    datasets = {}
+    data_info = {
+        "train": hparams["train_annotation"],
+        "valid": hparams["valid_annotation"],
+        "test": hparams["test_annotation"],
+    }
+    # Load datasets and apply pipelines
+    for dataset_name, json_path in data_info.items():
+        datasets[dataset_name] = sb.dataio.dataset.DynamicItemDataset.from_json(
+            json_path=json_path,
+            replacements={"data_root": hparams["data_original"]},
+            dynamic_items=[audio_pipeline, label_pipeline],
+            output_keys=["id", "sig", "label", "emo_encoded"],
+        )
+    lab_enc_file = os.path.join(hparams["save_folder"], "label_encoder.txt")
+    label_encoder.load_or_create(
+        path=lab_enc_file,
+        from_didatasets=[datasets["train"]],
+        output_key="label",
+    )
+    return datasets
+if __name__ == "__main__":
+    hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])
+    sb.utils.distributed.ddp_init_group(run_opts)
+    try:
+        with open(hparams_file) as fin:
+            hparams = load_hyperpyyaml(fin, overrides)
+        data_original = hparams.get("data_original")
+        if data_original is not None:
+            data_original = os.path.normpath(data_original)
+            if not os.path.exists(data_original):
+                raise ValueError(f"data_original path '{data_original}' does not exist.")
+        else:
+            raise ValueError("data_original path is not specified in the YAML configuration.")
+    except Exception as e:
+        print("Error occurred", e)
+        sys.exit(1)
+    sb.create_experiment_directory(
+        experiment_directory=hparams["output_folder"],
+        hyperparams_to_save=hparams_file,
+        overrides=overrides,
+    )
+    if not hparams["skip_prep"]:
+        prepare_kwargs = {
+            "data_original": hparams["data_original"],
+            "save_json_train": hparams["train_annotation"],
+            "save_json_valid": hparams["valid_annotation"],
+            "save_json_test": hparams["test_annotation"],
+            "split_ratio": hparams["split_ratio"],
+            "seed": hparams["seed"],
+        }
+        sb.utils.distributed.run_on_main(prepare_data, kwargs=prepare_kwargs)
+    datasets = dataio_prep(hparams)
+    hparams["wav2vec2"] = hparams["wav2vec2"].to(device=run_opts["device"])
+    if not hparams["freeze_wav2vec2"] and hparams["freeze_wav2vec2_conv"]:
+        hparams["wav2vec2"].model.feature_extractor._freeze_parameters()
+    emo_id_brain = EmoIdBrain(
+        modules=hparams["modules"],
+        opt_class=hparams["opt_class"],
+        hparams=hparams,
+        run_opts=run_opts,
+        checkpointer=hparams["checkpointer"],
+    )
+    emo_id_brain.fit(
+        epoch_counter=emo_id_brain.hparams.epoch_counter,
+        train_set=datasets["train"],
+        valid_set=datasets["valid"],
+        train_loader_kwargs=hparams["dataloader_options"],
+        valid_loader_kwargs=hparams["dataloader_options"],
+    )
+    test_stats = emo_id_brain.evaluate(
+        test_set=datasets["test"],
+        min_key="error_rate",
+        test_loader_kwargs=hparams["dataloader_options"],
+    )