Tanishq committed on
Commit
4dee9c4
·
verified ·
1 Parent(s): f0a9b85

Upload 9 files

Browse files
config.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Global configuration shared by training, evaluation and inference scripts.
import torch

# DEVICE = "cpu"
# Prefer the GPU when one is available; fall back to CPU otherwise.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
TRAIN_DIR = "data/train"  # training audio root, one subfolder per emotion class
VAL_DIR = "data/val"      # validation audio root, same layout as TRAIN_DIR
LEARNING_RATE = 2e-3
BATCH_SIZE = 32
NUM_WORKERS = 2           # DataLoader worker processes
NUM_EPOCHS = 50
LOAD_MODEL = True         # resume from CHECKPOINT before evaluating/inference
SAVE_MODEL = True         # write CHECKPOINT after each training epoch
# Checkpoint path; "bn" presumably marks the batch-norm model variant — TODO confirm.
CHECKPOINT = "speech_emotionbn.pth.tar"
dataset.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import librosa
3
+ import numpy as np
4
+ import torch
5
+ from torch.utils.data import Dataset
6
+
7
+
8
def extract_mfcc(filename):
    """Return the time-averaged 40-coefficient MFCC vector for an audio file.

    Loads at most 3 seconds of audio, skipping the first 0.5 s, then averages
    the MFCC frames over time, yielding a numpy array of shape (40,).
    """
    signal, sample_rate = librosa.load(filename, duration=3, offset=0.5)
    coeffs = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=40)
    return np.mean(coeffs.T, axis=0)
12
+
13
+
14
class SpeechEmotionDataset(Dataset):
    """Dataset of audio files laid out as ``root_dir/<class_name>/<file>``.

    Each item is ``(mfcc, target)`` where ``mfcc`` is the time-averaged MFCC
    feature tensor of shape (40,) and ``target`` is a one-hot FloatTensor over
    the classes discovered in ``root_dir``.
    """

    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        # Sorted so class indices are deterministic across runs/machines.
        self.classes = sorted(os.listdir(root_dir))
        self.class_to_idx = {cls: idx for idx, cls in enumerate(self.classes)}
        self.file_list = self.create_file_list()

    def create_file_list(self):
        """Return ``[(file_path, class_index), ...]`` for every file under root_dir."""
        file_list = []
        for cls in self.classes:
            class_path = os.path.join(self.root_dir, cls)
            for file_name in os.listdir(class_path):
                file_path = os.path.join(class_path, file_name)
                file_list.append((file_path, self.class_to_idx[cls]))
        return file_list

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        audio_path, label = self.file_list[idx]

        # One-hot encode using the discovered class count instead of a
        # hard-coded 7, so the dataset works for any directory layout
        # (identical behavior for the expected 7-class data).
        target = [0] * len(self.classes)
        target[label] = 1
        target = torch.FloatTensor(target)

        mfcc = torch.from_numpy(extract_mfcc(audio_path))
        if self.transform:
            mfcc = self.transform(mfcc)

        return mfcc, target
model.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from torch import nn as nn
2
+
3
+
4
+ # class SpeechEmotionModel(nn.Module):
5
+ # def __init__(self):
6
+ # super(SpeechEmotionModel, self).__init__()
7
+ # self.lstm = nn.LSTM(input_size=1, hidden_size=256, num_layers=1, batch_first=True)
8
+ # self.dropout1 = nn.Dropout(0.2)
9
+ # self.fc1 = nn.Linear(256, 128)
10
+ # self.dropout2 = nn.Dropout(0.2)
11
+ # self.relu1 = nn.ReLU()
12
+ # self.fc2 = nn.Linear(128, 64)
13
+ # self.dropout3 = nn.Dropout(0.2)
14
+ # self.relu2 = nn.ReLU()
15
+ # self.fc3 = nn.Linear(64, 7)
16
+ # self.softmax = nn.Softmax(dim=1)
17
+ #
18
+ # def forward(self, x):
19
+ # x, _ = self.lstm(x)
20
+ # x = x[:, -1, :]
21
+ # x = self.dropout1(x)
22
+ # x = self.relu1(self.fc1(x))
23
+ # x = self.dropout2(x)
24
+ # x = self.relu2(self.fc2(x))
25
+ # x = self.dropout3(x)
26
+ # x = self.fc3(x)
27
+ # x = self.softmax(x)
28
+ # return x
29
+
30
class SpeechEmotionModel(nn.Module):
    """Bidirectional-LSTM emotion classifier.

    Expects input of shape (batch, seq_len, 1) — one MFCC coefficient per
    time step — and returns raw class logits of shape (batch, 7).
    """

    def __init__(self):
        super(SpeechEmotionModel, self).__init__()
        # Bidirectional, so the LSTM feature size is 2 * 256 = 512.
        self.lstm = nn.LSTM(input_size=1, hidden_size=256, num_layers=1,
                            batch_first=True, bidirectional=True)
        self.batch_norm1 = nn.BatchNorm1d(512)
        self.dropout1 = nn.Dropout(0.2)
        self.fc1 = nn.Linear(512, 128)
        self.batch_norm2 = nn.BatchNorm1d(128)
        self.dropout2 = nn.Dropout(0.2)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(128, 64)
        self.batch_norm3 = nn.BatchNorm1d(64)
        self.dropout3 = nn.Dropout(0.2)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(64, 7)

    def forward(self, x):
        x, _ = self.lstm(x)
        x = x[:, -1, :]  # keep only the final time step's features
        x = self.batch_norm1(x)
        x = self.dropout1(x)
        x = self.relu1(self.fc1(x))
        x = self.batch_norm2(x)
        x = self.dropout2(x)
        x = self.relu2(self.fc2(x))
        x = self.batch_norm3(x)
        x = self.dropout3(x)
        # Return raw logits: training uses nn.CrossEntropyLoss, which applies
        # log-softmax internally, so the previous explicit nn.Softmax here
        # double-applied softmax and flattened gradients. Callers only take
        # argmax of the output, which is unchanged by dropping the softmax.
        return self.fc3(x)
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ torch
2
+ streamlit
3
+ tqdm
4
+ librosa
5
+ numpy
speech_emotion.pth.tar ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:776eafaf5ea2e83c0855a00a99b106b4ff09847eccca818838944cfb02a11a5b
3
+ size 3693802
speech_emotionbn.pth.tar ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ccb73cf621b50b83c25ba70f84383e40d54aeecb54eafea1a176eee101563afb
3
+ size 7306366
train.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import torch
3
+ from torch import nn as nn, optim
4
+ from torch.utils.data import DataLoader
5
+ from tqdm import tqdm
6
+ import config
7
+ from dataset import SpeechEmotionDataset, extract_mfcc
8
+ from model import SpeechEmotionModel
9
+ from utils import load_checkpoint, save_checkpoint
10
+
11
+
12
def train_fn(model, loader, opt, criterion, epoch):
    """Run one training epoch, showing running loss/accuracy in the tqdm bar.

    Args:
        model: the network being trained (moved to config.DEVICE by caller).
        loader: DataLoader yielding (feature, one_hot_label) batches.
        opt: optimizer stepping the model parameters.
        criterion: loss function (CrossEntropyLoss in this project).
        epoch: epoch index, displayed in the progress bar only.
    """
    loop = tqdm(loader, leave=True)
    model.train()
    epoch_loss = 0.0
    # Bug fix: these accumulators were reset inside the loop, so the reported
    # "accuracy" was only the last batch's accuracy, not the running epoch accuracy.
    total_acc, total_count = 0, 0
    for idx, (feature, label) in enumerate(loop):
        feature = feature.to(config.DEVICE)
        label = label.to(config.DEVICE)
        opt.zero_grad()
        # (batch, 40) -> (batch, 40, 1): one MFCC coefficient per LSTM time step.
        feature = torch.unsqueeze(feature, dim=2)
        predicted_label = model(feature)
        loss = criterion(predicted_label, label)
        epoch_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        opt.step()
        total_acc += (predicted_label.argmax(1) == label.argmax(1)).sum().item()
        total_count += label.size(0)
        # Divide by batches seen so far (idx + 1), not len(loader), so the
        # running loss is correct mid-epoch as well as at the end.
        loop.set_postfix({"epoch": epoch,
                          "loss": epoch_loss / (idx + 1),
                          "accuracy": total_acc / total_count})
31
+
32
+
33
def main():
    """Train the model on config.TRAIN_DIR, validating on config.VAL_DIR each epoch.

    Saves a checkpoint after every epoch when config.SAVE_MODEL is set and
    drives a reduce-on-plateau LR schedule from the validation loss.
    """
    model = SpeechEmotionModel().to(config.DEVICE)
    opt = optim.Adam(model.parameters(), lr=config.LEARNING_RATE, betas=(0.5, 0.999))
    criterion = nn.CrossEntropyLoss()

    # NOTE(review): checkpoint loading is deliberately disabled here even though
    # config.LOAD_MODEL defaults to True; uncomment to resume training.
    # if config.LOAD_MODEL:
    #     load_checkpoint(config.CHECKPOINT, model, opt, config.LEARNING_RATE)

    train_dataset = SpeechEmotionDataset(root_dir=config.TRAIN_DIR)
    train_loader = DataLoader(
        train_dataset,
        batch_size=config.BATCH_SIZE,
        shuffle=True,
        num_workers=config.NUM_WORKERS,
    )
    val_dataset = SpeechEmotionDataset(root_dir=config.VAL_DIR)
    val_loader = DataLoader(
        val_dataset,
        batch_size=config.BATCH_SIZE,
        shuffle=True,
        num_workers=config.NUM_WORKERS,
    )

    # Halve the LR (default factor 0.1? no — torch default factor=0.1) when the
    # validation loss stops improving for `patience` epochs.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, patience=2, verbose=True)
    for epoch in range(config.NUM_EPOCHS):
        epoch_start_time = time.time()
        train_fn(model, train_loader, opt, criterion, epoch)
        accu_val, loss_val = evaluate(model, criterion, val_loader)
        scheduler.step(loss_val)
        print("+" + "-" * 19 + "+" + "-" * 15 + "+" + "-" * 20 + "+" + "-" * 24 + "+")
        print(
            "| end of epoch: {:3d} | time: {:6.2f}s | val_loss: {:8.3f} | "
            "val_accuracy: {:8.3f} |".format(
                epoch, time.time() - epoch_start_time, loss_val, accu_val
            )
        )
        print("+" + "-" * 19 + "+" + "-" * 15 + "+" + "-" * 20 + "+" + "-" * 24 + "+")
        if config.SAVE_MODEL:
            save_checkpoint(model, opt, filename=config.CHECKPOINT)
81
+
82
+
83
def test():
    """Smoke test: load the checkpoint, classify one sample file, and print
    the predicted class index plus validation accuracy/loss."""
    model = SpeechEmotionModel().to(config.DEVICE)
    opt = optim.Adam(model.parameters(), lr=config.LEARNING_RATE, betas=(0.5, 0.999))
    criterion = nn.CrossEntropyLoss()
    if config.LOAD_MODEL:
        load_checkpoint(
            config.CHECKPOINT, model, opt, config.LEARNING_RATE,
        )
    val_dataset = SpeechEmotionDataset(root_dir=config.VAL_DIR)
    val_loader = DataLoader(
        val_dataset,
        batch_size=config.BATCH_SIZE,
        shuffle=True,
        num_workers=config.NUM_WORKERS,
    )

    # Index -> emotion mapping produced by the dataset's sorted class discovery:
    # {0: 'anger', 1: 'disgust', 2: 'fear', 3: 'happy', 4: 'neutral', 5: 'ps', 6: 'sad'}
    # (previously stored in an unused local variable).

    mfcc = torch.from_numpy(extract_mfcc("uploads/OAF_bar_fear.wav")).to(config.DEVICE)
    # (40,) -> (1, 40, 1): batch of one, 40 time steps, one feature per step.
    mfcc = torch.unsqueeze(mfcc, dim=1)
    mfcc = torch.unsqueeze(mfcc, dim=0)
    model.eval()
    # Inference only: disable autograd bookkeeping.
    with torch.no_grad():
        y_pred = model(mfcc)
    print(torch.argmax(y_pred))
    print(val_dataset.class_to_idx)
    print(evaluate(model, criterion, val_loader))
111
+
112
+
113
def evaluate(model, criterion, dataloader):
    """Evaluate `model` over `dataloader`.

    Returns:
        (accuracy, average_loss): fraction of correct argmax predictions
        against one-hot labels, and the mean per-batch loss.
    """
    model.eval()
    correct = 0
    seen = 0
    loss_sum = 0.0
    with torch.no_grad():
        for batch_inputs, batch_labels in dataloader:
            batch_inputs = batch_inputs.to(config.DEVICE)
            batch_labels = batch_labels.to(config.DEVICE)
            # (batch, 40) -> (batch, 40, 1) to match the LSTM's expected input.
            batch_inputs = torch.unsqueeze(batch_inputs, dim=2)
            outputs = model(batch_inputs)
            loss_sum += criterion(outputs, batch_labels).item()
            predictions = outputs.argmax(1)
            correct += (predictions == batch_labels.argmax(1)).sum().item()
            seen += batch_labels.size(0)
    return correct / seen, loss_sum / len(dataloader)
131
+
132
+
133
# Script entry point: runs the inference/evaluation smoke test rather than
# training (call main() instead to train).
if __name__ == "__main__":
    test()
uploads/OAF_bar_fear.wav ADDED
Binary file (82.6 kB). View file
 
utils.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import config
3
+
4
+
5
def save_checkpoint(model, optimizer, filename="my_checkpoint.pth.tar"):
    """Serialize the model and optimizer state dicts to `filename`."""
    print("=> Saving checkpoint")
    state = {
        "state_dict": model.state_dict(),
        "optimizer": optimizer.state_dict(),
    }
    torch.save(state, filename)
12
+
13
+
14
def load_checkpoint(checkpoint_file, model, optimizer, lr):
    """Restore model/optimizer state from `checkpoint_file`.

    After loading, the learning rate of every optimizer param group is forced
    back to `lr` — otherwise the restored optimizer state would resume with
    whatever (possibly decayed) LR it was saved with.
    """
    print("=> Loading checkpoint")
    state = torch.load(checkpoint_file, map_location=config.DEVICE)
    model.load_state_dict(state["state_dict"])
    optimizer.load_state_dict(state["optimizer"])

    for group in optimizer.param_groups:
        group["lr"] = lr