Spaces:
Sleeping
Sleeping
Tanishq committed on
Upload 9 files
Browse files- config.py +13 -0
- dataset.py +68 -0
- model.py +60 -0
- requirements.txt +5 -0
- speech_emotion.pth.tar +3 -0
- speech_emotionbn.pth.tar +3 -0
- train.py +134 -0
- uploads/OAF_bar_fear.wav +0 -0
- utils.py +21 -0
config.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch

# Runtime/training configuration shared by dataset.py, train.py and utils.py.

# DEVICE = "cpu"
# Run on the GPU when one is available, otherwise fall back to CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
TRAIN_DIR = "data/train"  # root folder with one sub-directory per emotion class
VAL_DIR = "data/val"  # validation split, same layout as TRAIN_DIR
LEARNING_RATE = 2e-3  # Adam learning rate used in train.py
BATCH_SIZE = 32
NUM_WORKERS = 2  # DataLoader worker processes
NUM_EPOCHS = 50
LOAD_MODEL = True  # resume from CHECKPOINT before testing (see train.py)
SAVE_MODEL = True  # write CHECKPOINT after every training epoch
CHECKPOINT = "speech_emotionbn.pth.tar"  # batch-norm variant of the model weights
|
dataset.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import librosa
|
| 3 |
+
import numpy as np
|
| 4 |
+
import torch
|
| 5 |
+
from torch.utils.data import Dataset
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def extract_mfcc(filename):
    """Load an audio clip and return its time-averaged 40-coefficient MFCC vector.

    Reads at most 3 seconds of audio starting 0.5 s into the file, computes
    40 MFCCs per frame, then averages over time to a (40,) numpy array.
    """
    signal, sample_rate = librosa.load(filename, duration=3, offset=0.5)
    coefficients = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=40)
    return np.mean(coefficients.T, axis=0)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class SpeechEmotionDataset(Dataset):
    """Folder-per-class speech emotion dataset.

    Expects ``root_dir`` to contain one sub-directory per emotion class;
    every file inside a class directory is treated as one audio sample.
    Items are ``(mfcc, target)`` pairs where ``mfcc`` is the clip's
    time-averaged MFCC feature vector and ``target`` is a one-hot
    ``FloatTensor`` over the discovered classes.
    """

    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        # Sorted so the class -> index mapping is deterministic across runs.
        self.classes = sorted(os.listdir(root_dir))
        self.class_to_idx = {cls: idx for idx, cls in enumerate(self.classes)}
        self.file_list = self.create_file_list()

    def create_file_list(self):
        """Return a list of (file_path, class_index) pairs for every sample."""
        file_list = []
        for cls in self.classes:
            class_path = os.path.join(self.root_dir, cls)
            for file_name in os.listdir(class_path):
                file_path = os.path.join(class_path, file_name)
                file_list.append((file_path, self.class_to_idx[cls]))
        return file_list

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        audio_path, label = self.file_list[idx]

        # One-hot encode the label.  Sized from the discovered classes rather
        # than the previous hard-coded 7, so the dataset works (and fails
        # loudly instead of silently mis-encoding) for any class count.
        target = [0] * len(self.classes)
        target[label] = 1
        target = torch.FloatTensor(target)

        mfcc = extract_mfcc(audio_path)
        mfcc = torch.from_numpy(mfcc)

        if self.transform:
            mfcc = self.transform(mfcc)

        return mfcc, target
|
model.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from torch import nn as nn
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
# class SpeechEmotionModel(nn.Module):
|
| 5 |
+
# def __init__(self):
|
| 6 |
+
# super(SpeechEmotionModel, self).__init__()
|
| 7 |
+
# self.lstm = nn.LSTM(input_size=1, hidden_size=256, num_layers=1, batch_first=True)
|
| 8 |
+
# self.dropout1 = nn.Dropout(0.2)
|
| 9 |
+
# self.fc1 = nn.Linear(256, 128)
|
| 10 |
+
# self.dropout2 = nn.Dropout(0.2)
|
| 11 |
+
# self.relu1 = nn.ReLU()
|
| 12 |
+
# self.fc2 = nn.Linear(128, 64)
|
| 13 |
+
# self.dropout3 = nn.Dropout(0.2)
|
| 14 |
+
# self.relu2 = nn.ReLU()
|
| 15 |
+
# self.fc3 = nn.Linear(64, 7)
|
| 16 |
+
# self.softmax = nn.Softmax(dim=1)
|
| 17 |
+
#
|
| 18 |
+
# def forward(self, x):
|
| 19 |
+
# x, _ = self.lstm(x)
|
| 20 |
+
# x = x[:, -1, :]
|
| 21 |
+
# x = self.dropout1(x)
|
| 22 |
+
# x = self.relu1(self.fc1(x))
|
| 23 |
+
# x = self.dropout2(x)
|
| 24 |
+
# x = self.relu2(self.fc2(x))
|
| 25 |
+
# x = self.dropout3(x)
|
| 26 |
+
# x = self.fc3(x)
|
| 27 |
+
# x = self.softmax(x)
|
| 28 |
+
# return x
|
| 29 |
+
|
| 30 |
+
class SpeechEmotionModel(nn.Module):
    """Bidirectional-LSTM classifier over per-clip MFCC sequences.

    Input:  ``(batch, seq_len, 1)`` float tensor (one MFCC coefficient per step).
    Output: ``(batch, 7)`` raw class logits, one score per emotion class.

    NOTE: the trailing ``nn.Softmax`` was removed.  train.py feeds this
    model's output to ``nn.CrossEntropyLoss``, which applies log-softmax
    internally; softmax-before-CrossEntropy is a double softmax that
    flattens gradients and hurts training.  Softmax has no parameters, so
    existing checkpoints still load unchanged, and ``argmax`` over logits
    selects the same class as ``argmax`` over probabilities.
    """

    def __init__(self):
        super(SpeechEmotionModel, self).__init__()
        # hidden_size=256 per direction -> 512 features after the LSTM.
        self.lstm = nn.LSTM(input_size=1, hidden_size=256, num_layers=1, batch_first=True, bidirectional=True)
        self.batch_norm1 = nn.BatchNorm1d(512)
        self.dropout1 = nn.Dropout(0.2)
        self.fc1 = nn.Linear(512, 128)
        self.batch_norm2 = nn.BatchNorm1d(128)
        self.dropout2 = nn.Dropout(0.2)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(128, 64)
        self.batch_norm3 = nn.BatchNorm1d(64)
        self.dropout3 = nn.Dropout(0.2)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(64, 7)  # 7 emotion classes

    def forward(self, x):
        x, _ = self.lstm(x)
        # Keep only the last time step's (concatenated fwd/bwd) hidden state.
        x = x[:, -1, :]
        x = self.batch_norm1(x)
        x = self.dropout1(x)
        x = self.relu1(self.fc1(x))
        x = self.batch_norm2(x)
        x = self.dropout2(x)
        x = self.relu2(self.fc2(x))
        x = self.batch_norm3(x)
        x = self.dropout3(x)
        return self.fc3(x)  # logits; pair with CrossEntropyLoss
|
requirements.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
torch
|
| 2 |
+
streamlit
|
| 3 |
+
tqdm
|
| 4 |
+
librosa
|
| 5 |
+
numpy
|
speech_emotion.pth.tar
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:776eafaf5ea2e83c0855a00a99b106b4ff09847eccca818838944cfb02a11a5b
|
| 3 |
+
size 3693802
|
speech_emotionbn.pth.tar
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ccb73cf621b50b83c25ba70f84383e40d54aeecb54eafea1a176eee101563afb
|
| 3 |
+
size 7306366
|
train.py
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import torch
|
| 3 |
+
from torch import nn as nn, optim
|
| 4 |
+
from torch.utils.data import DataLoader
|
| 5 |
+
from tqdm import tqdm
|
| 6 |
+
import config
|
| 7 |
+
from dataset import SpeechEmotionDataset, extract_mfcc
|
| 8 |
+
from model import SpeechEmotionModel
|
| 9 |
+
from utils import load_checkpoint, save_checkpoint
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def train_fn(model, loader, opt, criterion, epoch):
    """Run one training epoch, reporting running loss/accuracy via tqdm.

    BUG FIX: ``total_acc``/``total_count`` were re-initialised inside the
    batch loop, so the progress bar showed single-batch accuracy instead of
    the running epoch accuracy their names imply.  They now accumulate
    across batches.
    """
    loop = tqdm(loader, leave=True)
    model.train()
    epoch_loss = 0.0
    total_acc, total_count = 0, 0
    for idx, (feature, label) in enumerate(loop):
        feature = feature.to(config.DEVICE)
        label = label.to(config.DEVICE)
        opt.zero_grad()
        # (batch, 40) MFCC vector -> (batch, 40, 1) sequence for the LSTM.
        feature = torch.unsqueeze(feature, dim=2)
        predicted_label = model(feature)
        loss = criterion(predicted_label, label)
        epoch_loss += loss.item()
        loss.backward()
        # Clip gradients to stabilise LSTM training.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        opt.step()
        # Labels are one-hot, so compare argmax of prediction vs. target.
        total_acc += (predicted_label.argmax(1) == label.argmax(1)).sum().item()
        total_count += label.size(0)
        loop.set_postfix({"epoch": epoch, "loss": epoch_loss / len(loader), "accuracy": total_acc / total_count})
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def main():
    """Train the speech-emotion model for ``config.NUM_EPOCHS`` epochs.

    Builds the model, optimizer and data loaders, trains one epoch at a
    time, evaluates on the validation split, steps the LR scheduler on the
    validation loss, and (when ``config.SAVE_MODEL``) checkpoints after
    every epoch.  (Removed: unused ``total_accu`` local and dead
    commented-out resume/StepLR code.)
    """
    model = SpeechEmotionModel().to(config.DEVICE)
    opt = optim.Adam(model.parameters(), lr=config.LEARNING_RATE, betas=(0.5, 0.999), )
    criterion = nn.CrossEntropyLoss()

    train_dataset = SpeechEmotionDataset(root_dir=config.TRAIN_DIR)
    train_loader = DataLoader(
        train_dataset,
        batch_size=config.BATCH_SIZE,
        shuffle=True,
        num_workers=config.NUM_WORKERS,
    )
    val_dataset = SpeechEmotionDataset(root_dir=config.VAL_DIR)
    val_loader = DataLoader(
        val_dataset,
        batch_size=config.BATCH_SIZE,
        shuffle=True,
        num_workers=config.NUM_WORKERS,
    )
    # Reduce the LR when the validation loss plateaus for `patience` epochs.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, patience=2, verbose=True)
    for epoch in range(config.NUM_EPOCHS):
        epoch_start_time = time.time()
        train_fn(
            model, train_loader, opt, criterion, epoch
        )
        accu_val, loss_val = evaluate(model, criterion, val_loader)
        scheduler.step(loss_val)
        print("+" + "-" * 19 + "+" + "-" * 15 + "+" + "-" * 20 + "+" + "-" * 24 + "+")
        print(
            "| end of epoch: {:3d} | time: {:6.2f}s | val_loss: {:8.3f} | "
            "val_accuracy: {:8.3f} |".format(
                epoch, time.time() - epoch_start_time, loss_val, accu_val
            )
        )
        print("+" + "-" * 19 + "+" + "-" * 15 + "+" + "-" * 20 + "+" + "-" * 24 + "+")
        if config.SAVE_MODEL:
            save_checkpoint(model, opt, filename=config.CHECKPOINT)
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def test():
    """Smoke-test a trained checkpoint.

    Loads the checkpoint (when ``config.LOAD_MODEL``), predicts the class of
    one bundled sample clip, prints the dataset's class -> index mapping for
    reference, and runs a full validation-set evaluation.
    (Removed: unused local ``label`` dict; ``val_dataset.class_to_idx`` is
    the authoritative mapping and is printed below.)
    """
    model = SpeechEmotionModel().to(config.DEVICE)
    opt = optim.Adam(model.parameters(), lr=config.LEARNING_RATE, betas=(0.5, 0.999), )
    criterion = nn.CrossEntropyLoss()
    if config.LOAD_MODEL:
        load_checkpoint(
            config.CHECKPOINT, model, opt, config.LEARNING_RATE,
        )
    val_dataset = SpeechEmotionDataset(root_dir=config.VAL_DIR)
    val_loader = DataLoader(
        val_dataset,
        batch_size=config.BATCH_SIZE,
        shuffle=True,  # ordering is irrelevant for evaluation metrics
        num_workers=config.NUM_WORKERS,
    )

    # (40,) MFCC vector -> (1, 40, 1): add the feature then batch dimension.
    mfcc = extract_mfcc("uploads/OAF_bar_fear.wav")
    mfcc = torch.from_numpy(mfcc)
    mfcc = mfcc.to(config.DEVICE)
    mfcc = torch.unsqueeze(mfcc, dim=1)
    mfcc = torch.unsqueeze(mfcc, dim=0)
    model.eval()
    y_pred = model(mfcc)
    print(torch.argmax(y_pred))
    print(val_dataset.class_to_idx)
    print(evaluate(model, criterion, val_loader))
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
def evaluate(model, criterion, dataloader):
    """Return ``(accuracy, mean_batch_loss)`` of `model` over `dataloader`.

    Targets are one-hot, so accuracy compares the argmax of the model
    output against the argmax of the target. Runs under ``torch.no_grad``
    with the model in eval mode.
    """
    model.eval()
    correct = 0
    seen = 0
    loss_sum = 0.0
    with torch.no_grad():
        for batch_inputs, batch_labels in dataloader:
            batch_inputs = batch_inputs.to(config.DEVICE)
            batch_labels = batch_labels.to(config.DEVICE)
            # (batch, 40) -> (batch, 40, 1) sequence for the LSTM.
            batch_inputs = torch.unsqueeze(batch_inputs, dim=2)
            outputs = model(batch_inputs)
            loss_sum += criterion(outputs, batch_labels).item()
            predictions = outputs.argmax(dim=1)
            correct += (predictions == batch_labels.argmax(1)).sum().item()
            seen += batch_labels.size(0)
    return correct / seen, loss_sum / len(dataloader)
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
if __name__ == "__main__":
|
| 134 |
+
test()
|
uploads/OAF_bar_fear.wav
ADDED
|
Binary file (82.6 kB). View file
|
|
|
utils.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import config
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def save_checkpoint(model, optimizer, filename="my_checkpoint.pth.tar"):
    """Serialize the model and optimizer state dicts to `filename`."""
    print("=> Saving checkpoint")
    torch.save(
        {
            "state_dict": model.state_dict(),
            "optimizer": optimizer.state_dict(),
        },
        filename,
    )
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def load_checkpoint(checkpoint_file, model, optimizer, lr):
    """Restore model/optimizer state from `checkpoint_file` and reset the LR.

    The learning rate is overwritten afterwards because the optimizer state
    stored in the checkpoint carries the LR that was active when it was saved.
    """
    print("=> Loading checkpoint")
    state = torch.load(checkpoint_file, map_location=config.DEVICE)
    model.load_state_dict(state["state_dict"])
    optimizer.load_state_dict(state["optimizer"])

    # Force the requested learning rate onto every parameter group.
    for group in optimizer.param_groups:
        group["lr"] = lr
|