Spaces:
Sleeping
Sleeping
Tanishq committed on
Upload 9 files
Browse files- config.py +13 -0
- dataset.py +68 -0
- model.py +60 -0
- requirements.txt +5 -0
- speech_emotion.pth.tar +3 -0
- speech_emotionbn.pth.tar +3 -0
- train.py +134 -0
- uploads/OAF_bar_fear.wav +0 -0
- utils.py +21 -0
config.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch

# Runtime/training configuration shared by dataset.py, train.py and utils.py.

# DEVICE = "cpu"
# Run on the GPU when one is available, otherwise fall back to CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
TRAIN_DIR = "data/train"  # root folder with one sub-directory per emotion class
VAL_DIR = "data/val"  # validation split, same layout as TRAIN_DIR
LEARNING_RATE = 2e-3  # Adam learning rate used in train.py
BATCH_SIZE = 32
NUM_WORKERS = 2  # DataLoader worker processes
NUM_EPOCHS = 50
LOAD_MODEL = True  # resume from CHECKPOINT before testing (see train.py)
SAVE_MODEL = True  # write CHECKPOINT after every training epoch
CHECKPOINT = "speech_emotionbn.pth.tar"  # batch-norm variant of the model weights
|
dataset.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import librosa
|
| 3 |
+
import numpy as np
|
| 4 |
+
import torch
|
| 5 |
+
from torch.utils.data import Dataset
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def extract_mfcc(filename):
    """Load an audio clip and return its time-averaged 40-coefficient MFCC vector.

    Reads at most 3 seconds of audio starting 0.5 s into the file, computes
    40 MFCCs per frame, then averages over time to a (40,) numpy array.
    """
    signal, sample_rate = librosa.load(filename, duration=3, offset=0.5)
    coefficients = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=40)
    return np.mean(coefficients.T, axis=0)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class SpeechEmotionDataset(Dataset):
    """Folder-per-class speech emotion dataset.

    Expects ``root_dir`` to contain one sub-directory per emotion class;
    every file inside a class directory is treated as one audio sample.
    Items are ``(mfcc, target)`` pairs where ``mfcc`` is the clip's
    time-averaged MFCC feature vector and ``target`` is a one-hot
    ``FloatTensor`` over the discovered classes.
    """

    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        # Sorted so the class -> index mapping is deterministic across runs.
        self.classes = sorted(os.listdir(root_dir))
        self.class_to_idx = {cls: idx for idx, cls in enumerate(self.classes)}
        self.file_list = self.create_file_list()

    def create_file_list(self):
        """Return a list of (file_path, class_index) pairs for every sample."""
        file_list = []
        for cls in self.classes:
            class_path = os.path.join(self.root_dir, cls)
            for file_name in os.listdir(class_path):
                file_path = os.path.join(class_path, file_name)
                file_list.append((file_path, self.class_to_idx[cls]))
        return file_list

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        audio_path, label = self.file_list[idx]

        # One-hot encode the label.  Sized from the discovered classes rather
        # than the previous hard-coded 7, so the dataset works (and fails
        # loudly instead of silently mis-encoding) for any class count.
        target = [0] * len(self.classes)
        target[label] = 1
        target = torch.FloatTensor(target)

        mfcc = extract_mfcc(audio_path)
        mfcc = torch.from_numpy(mfcc)

        if self.transform:
            mfcc = self.transform(mfcc)

        return mfcc, target
|
model.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from torch import nn as nn
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
# class SpeechEmotionModel(nn.Module):
|
| 5 |
+
# def __init__(self):
|
| 6 |
+
# super(SpeechEmotionModel, self).__init__()
|
| 7 |
+
# self.lstm = nn.LSTM(input_size=1, hidden_size=256, num_layers=1, batch_first=True)
|
| 8 |
+
# self.dropout1 = nn.Dropout(0.2)
|
| 9 |
+
# self.fc1 = nn.Linear(256, 128)
|
| 10 |
+
# self.dropout2 = nn.Dropout(0.2)
|
| 11 |
+
# self.relu1 = nn.ReLU()
|
| 12 |
+
# self.fc2 = nn.Linear(128, 64)
|
| 13 |
+
# self.dropout3 = nn.Dropout(0.2)
|
| 14 |
+
# self.relu2 = nn.ReLU()
|
| 15 |
+
# self.fc3 = nn.Linear(64, 7)
|
| 16 |
+
# self.softmax = nn.Softmax(dim=1)
|
| 17 |
+
#
|
| 18 |
+
# def forward(self, x):
|
| 19 |
+
# x, _ = self.lstm(x)
|
| 20 |
+
# x = x[:, -1, :]
|
| 21 |
+
# x = self.dropout1(x)
|
| 22 |
+
# x = self.relu1(self.fc1(x))
|
| 23 |
+
# x = self.dropout2(x)
|
| 24 |
+
# x = self.relu2(self.fc2(x))
|
| 25 |
+
# x = self.dropout3(x)
|
| 26 |
+
# x = self.fc3(x)
|
| 27 |
+
# x = self.softmax(x)
|
| 28 |
+
# return x
|
| 29 |
+
|
| 30 |
+
class SpeechEmotionModel(nn.Module):
    """Bidirectional-LSTM classifier over per-clip MFCC sequences.

    Input:  ``(batch, seq_len, 1)`` float tensor (one MFCC coefficient per step).
    Output: ``(batch, 7)`` raw class logits, one score per emotion class.

    NOTE: the trailing ``nn.Softmax`` was removed.  train.py feeds this
    model's output to ``nn.CrossEntropyLoss``, which applies log-softmax
    internally; softmax-before-CrossEntropy is a double softmax that
    flattens gradients and hurts training.  Softmax has no parameters, so
    existing checkpoints still load unchanged, and ``argmax`` over logits
    selects the same class as ``argmax`` over probabilities.
    """

    def __init__(self):
        super(SpeechEmotionModel, self).__init__()
        # hidden_size=256 per direction -> 512 features after the LSTM.
        self.lstm = nn.LSTM(input_size=1, hidden_size=256, num_layers=1, batch_first=True, bidirectional=True)
        self.batch_norm1 = nn.BatchNorm1d(512)
        self.dropout1 = nn.Dropout(0.2)
        self.fc1 = nn.Linear(512, 128)
        self.batch_norm2 = nn.BatchNorm1d(128)
        self.dropout2 = nn.Dropout(0.2)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(128, 64)
        self.batch_norm3 = nn.BatchNorm1d(64)
        self.dropout3 = nn.Dropout(0.2)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(64, 7)  # 7 emotion classes

    def forward(self, x):
        x, _ = self.lstm(x)
        # Keep only the last time step's (concatenated fwd/bwd) hidden state.
        x = x[:, -1, :]
        x = self.batch_norm1(x)
        x = self.dropout1(x)
        x = self.relu1(self.fc1(x))
        x = self.batch_norm2(x)
        x = self.dropout2(x)
        x = self.relu2(self.fc2(x))
        x = self.batch_norm3(x)
        x = self.dropout3(x)
        return self.fc3(x)  # logits; pair with CrossEntropyLoss
|
requirements.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
torch
|
| 2 |
+
streamlit
|
| 3 |
+
tqdm
|
| 4 |
+
librosa
|
| 5 |
+
numpy
|
speech_emotion.pth.tar
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:776eafaf5ea2e83c0855a00a99b106b4ff09847eccca818838944cfb02a11a5b
|
| 3 |
+
size 3693802
|
speech_emotionbn.pth.tar
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ccb73cf621b50b83c25ba70f84383e40d54aeecb54eafea1a176eee101563afb
|
| 3 |
+
size 7306366
|
train.py
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import torch
|
| 3 |
+
from torch import nn as nn, optim
|
| 4 |
+
from torch.utils.data import DataLoader
|
| 5 |
+
from tqdm import tqdm
|
| 6 |
+
import config
|
| 7 |
+
from dataset import SpeechEmotionDataset, extract_mfcc
|
| 8 |
+
from model import SpeechEmotionModel
|
| 9 |
+
from utils import load_checkpoint, save_checkpoint
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def train_fn(model, loader, opt, criterion, epoch):
    """Run one training epoch, reporting running loss/accuracy via tqdm.

    BUG FIX: ``total_acc``/``total_count`` were re-initialised inside the
    batch loop, so the progress bar showed single-batch accuracy instead of
    the running epoch accuracy their names imply.  They now accumulate
    across batches.
    """
    loop = tqdm(loader, leave=True)
    model.train()
    epoch_loss = 0.0
    total_acc, total_count = 0, 0
    for idx, (feature, label) in enumerate(loop):
        feature = feature.to(config.DEVICE)
        label = label.to(config.DEVICE)
        opt.zero_grad()
        # (batch, 40) MFCC vector -> (batch, 40, 1) sequence for the LSTM.
        feature = torch.unsqueeze(feature, dim=2)
        predicted_label = model(feature)
        loss = criterion(predicted_label, label)
        epoch_loss += loss.item()
        loss.backward()
        # Clip gradients to stabilise LSTM training.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        opt.step()
        # Labels are one-hot, so compare argmax of prediction vs. target.
        total_acc += (predicted_label.argmax(1) == label.argmax(1)).sum().item()
        total_count += label.size(0)
        loop.set_postfix({"epoch": epoch, "loss": epoch_loss / len(loader), "accuracy": total_acc / total_count})
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def main():
    """Train the speech-emotion model for ``config.NUM_EPOCHS`` epochs.

    Builds the model, optimizer and data loaders, trains one epoch at a
    time, evaluates on the validation split, steps the LR scheduler on the
    validation loss, and (when ``config.SAVE_MODEL``) checkpoints after
    every epoch.  (Removed: unused ``total_accu`` local and dead
    commented-out resume/StepLR code.)
    """
    model = SpeechEmotionModel().to(config.DEVICE)
    opt = optim.Adam(model.parameters(), lr=config.LEARNING_RATE, betas=(0.5, 0.999), )
    criterion = nn.CrossEntropyLoss()

    train_dataset = SpeechEmotionDataset(root_dir=config.TRAIN_DIR)
    train_loader = DataLoader(
        train_dataset,
        batch_size=config.BATCH_SIZE,
        shuffle=True,
        num_workers=config.NUM_WORKERS,
    )
    val_dataset = SpeechEmotionDataset(root_dir=config.VAL_DIR)
    val_loader = DataLoader(
        val_dataset,
        batch_size=config.BATCH_SIZE,
        shuffle=True,
        num_workers=config.NUM_WORKERS,
    )
    # Reduce the LR when the validation loss plateaus for `patience` epochs.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, patience=2, verbose=True)
    for epoch in range(config.NUM_EPOCHS):
        epoch_start_time = time.time()
        train_fn(
            model, train_loader, opt, criterion, epoch
        )
        accu_val, loss_val = evaluate(model, criterion, val_loader)
        scheduler.step(loss_val)
        print("+" + "-" * 19 + "+" + "-" * 15 + "+" + "-" * 20 + "+" + "-" * 24 + "+")
        print(
            "| end of epoch: {:3d} | time: {:6.2f}s | val_loss: {:8.3f} | "
            "val_accuracy: {:8.3f} |".format(
                epoch, time.time() - epoch_start_time, loss_val, accu_val
            )
        )
        print("+" + "-" * 19 + "+" + "-" * 15 + "+" + "-" * 20 + "+" + "-" * 24 + "+")
        if config.SAVE_MODEL:
            save_checkpoint(model, opt, filename=config.CHECKPOINT)
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def test():
    """Smoke-test a trained checkpoint.

    Loads the checkpoint (when ``config.LOAD_MODEL``), predicts the class of
    one bundled sample clip, prints the dataset's class -> index mapping for
    reference, and runs a full validation-set evaluation.
    (Removed: unused local ``label`` dict; ``val_dataset.class_to_idx`` is
    the authoritative mapping and is printed below.)
    """
    model = SpeechEmotionModel().to(config.DEVICE)
    opt = optim.Adam(model.parameters(), lr=config.LEARNING_RATE, betas=(0.5, 0.999), )
    criterion = nn.CrossEntropyLoss()
    if config.LOAD_MODEL:
        load_checkpoint(
            config.CHECKPOINT, model, opt, config.LEARNING_RATE,
        )
    val_dataset = SpeechEmotionDataset(root_dir=config.VAL_DIR)
    val_loader = DataLoader(
        val_dataset,
        batch_size=config.BATCH_SIZE,
        shuffle=True,  # ordering is irrelevant for evaluation metrics
        num_workers=config.NUM_WORKERS,
    )

    # (40,) MFCC vector -> (1, 40, 1): add the feature then batch dimension.
    mfcc = extract_mfcc("uploads/OAF_bar_fear.wav")
    mfcc = torch.from_numpy(mfcc)
    mfcc = mfcc.to(config.DEVICE)
    mfcc = torch.unsqueeze(mfcc, dim=1)
    mfcc = torch.unsqueeze(mfcc, dim=0)
    model.eval()
    y_pred = model(mfcc)
    print(torch.argmax(y_pred))
    print(val_dataset.class_to_idx)
    print(evaluate(model, criterion, val_loader))
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
def evaluate(model, criterion, dataloader):
    """Return ``(accuracy, mean_batch_loss)`` of `model` over `dataloader`.

    Targets are one-hot, so accuracy compares the argmax of the model
    output against the argmax of the target. Runs under ``torch.no_grad``
    with the model in eval mode.
    """
    model.eval()
    correct = 0
    seen = 0
    loss_sum = 0.0
    with torch.no_grad():
        for batch_inputs, batch_labels in dataloader:
            batch_inputs = batch_inputs.to(config.DEVICE)
            batch_labels = batch_labels.to(config.DEVICE)
            # (batch, 40) -> (batch, 40, 1) sequence for the LSTM.
            batch_inputs = torch.unsqueeze(batch_inputs, dim=2)
            outputs = model(batch_inputs)
            loss_sum += criterion(outputs, batch_labels).item()
            predictions = outputs.argmax(dim=1)
            correct += (predictions == batch_labels.argmax(1)).sum().item()
            seen += batch_labels.size(0)
    return correct / seen, loss_sum / len(dataloader)
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
if __name__ == "__main__":
|
| 134 |
+
test()
|
uploads/OAF_bar_fear.wav
ADDED
|
Binary file (82.6 kB). View file
|
|
|
utils.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import config
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def save_checkpoint(model, optimizer, filename="my_checkpoint.pth.tar"):
    """Serialize the model and optimizer state dicts to `filename`."""
    print("=> Saving checkpoint")
    torch.save(
        {
            "state_dict": model.state_dict(),
            "optimizer": optimizer.state_dict(),
        },
        filename,
    )
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def load_checkpoint(checkpoint_file, model, optimizer, lr):
    """Restore model/optimizer state from `checkpoint_file` and reset the LR.

    The learning rate is overwritten afterwards because the optimizer state
    stored in the checkpoint carries the LR that was active when it was saved.
    """
    print("=> Loading checkpoint")
    state = torch.load(checkpoint_file, map_location=config.DEVICE)
    model.load_state_dict(state["state_dict"])
    optimizer.load_state_dict(state["optimizer"])

    # Force the requested learning rate onto every parameter group.
    for group in optimizer.param_groups:
        group["lr"] = lr
|