diegovelilla committed on
Commit
23cb5b5
·
verified ·
1 Parent(s): c809f45

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +114 -0
app.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from pydub import AudioSegment
3
+ import os
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+ import librosa
9
+ import math
10
+ import json
11
+
12
# Class labels for the CNN's 10 outputs; model output index i maps to
# GENRES[i]. NOTE(review): order presumably matches the label encoding
# used at training time — verify against the training script.
GENRES = [
    "Metal",
    "Disco",
    "Pop",
    "Classical",
    "Reggae",
    "Country",
    "Rock",
    "Hiphop",
    "Jazz",
    "Blues",
]
13
+
14
class CNNModel2(nn.Module):
    """CNN classifier over per-segment MFCC matrices.

    Input shape is (batch, channels, frames, n_mfcc); this app feeds
    (N, 1, 130, 13). Three conv -> batchnorm -> pool stages feed two
    fully connected layers producing `num_classes` logits.

    NOTE: attribute names (conv1, bn1, pool1, ...) must stay stable
    because pretrained weights are loaded with load_state_dict.
    """

    def __init__(self, input_shape, num_classes=10):
        super(CNNModel2, self).__init__()
        self.conv1 = nn.Conv2d(input_shape[0], 32, kernel_size=3)
        self.bn1 = nn.BatchNorm2d(32)
        self.pool1 = nn.MaxPool2d(3, stride=2, padding=1)
        self.dropout1 = nn.Dropout(0.2)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3)
        self.bn2 = nn.BatchNorm2d(64)
        self.pool2 = nn.MaxPool2d(3, stride=2, padding=1)
        self.dropout2 = nn.Dropout(0.1)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=2)
        self.bn3 = nn.BatchNorm2d(64)
        self.pool3 = nn.MaxPool2d(2, stride=2, padding=1)
        self.dropout3 = nn.Dropout(0.1)
        # Number of features entering fc1, derived from a dummy forward pass.
        self.flatten_dim = self._calculate_flatten_dim(input_shape)
        self.fc1 = nn.Linear(self.flatten_dim, 128)
        self.dropout4 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(128, num_classes)

    def _calculate_flatten_dim(self, input_shape):
        """Return the flattened feature count after the conv/pool stack.

        Only tensor shape matters here, so the shape-preserving BatchNorm
        and ReLU steps are skipped: the original ran the dummy batch
        through BatchNorm in train mode, which perturbed its running
        statistics (running_var decayed toward 0 on the all-zeros probe).
        torch.no_grad() avoids building an autograd graph for the probe.
        """
        with torch.no_grad():
            x = torch.zeros(1, *input_shape)
            x = self.pool1(self.conv1(x))
            x = self.pool2(self.conv2(x))
            x = self.pool3(self.conv3(x))
        return x.numel()

    def forward(self, x):
        """Compute class logits for a batch of MFCC 'images'.

        Args:
            x: float tensor of shape (batch, channels, frames, n_mfcc)
               matching the `input_shape` the model was built with.

        Returns:
            Tensor of shape (batch, num_classes) with raw logits.
        """
        x = self.pool1(F.relu(self.bn1(self.conv1(x))))
        x = self.dropout1(x)
        x = self.pool2(F.relu(self.bn2(self.conv2(x))))
        x = self.dropout2(x)
        x = self.pool3(F.relu(self.bn3(self.conv3(x))))
        x = self.dropout3(x)
        # Collapse conv features; assumes spatial dims match the build-time
        # input_shape, otherwise the batch dimension would silently change.
        x = x.view(-1, self.flatten_dim)
        x = F.relu(self.fc1(x))
        x = self.dropout4(x)
        x = self.fc2(x)
        return x
53
+
54
def get_mfccs(file, fs=22050, duration=30, n_fft=2048, hop_length=512, n_mfcc=13, num_segments=10):
    """Split an audio file into segments and extract one MFCC matrix each.

    The file is loaded (and resampled) at `fs`, treated as `duration`
    seconds long, split into `num_segments` equal chunks, and a
    (frames, n_mfcc) MFCC matrix is computed per chunk. Chunks whose
    frame count does not match the expected length (e.g. the file is
    shorter than `duration` seconds) are silently dropped.

    Bug fix: the default sample rate was 22500, which produces 132 MFCC
    frames per 3 s segment; the model is built for (1, 130, 13), and
    22050 (the GTZAN rate) yields exactly 130 frames.

    Returns:
        (mfccs, genre_names, genre_nums) as numpy arrays. The latter two
        are always empty here; they are kept so the return signature
        matches the training pipeline's version of this function.

    Side effect: writes the collected features to 'data.json'.
    """
    data = {
        "genre_name": [],
        "genre_num": [],
        "mfcc": []
    }
    samples_per_track = fs * duration
    samps_per_segment = int(samples_per_track / num_segments)
    # Expected frame count per segment; used to filter short/ragged segments.
    mfccs_per_segment = math.ceil(samps_per_segment / hop_length)
    audio, fs = librosa.load(file, sr=fs)
    for seg in range(num_segments):
        start_sample = seg * samps_per_segment
        end_sample = start_sample + samps_per_segment
        mfcc = librosa.feature.mfcc(y=audio[start_sample:end_sample],
                                    sr=fs,
                                    n_fft=n_fft,
                                    hop_length=hop_length,
                                    n_mfcc=n_mfcc)
        mfcc = mfcc.T
        if len(mfcc) == mfccs_per_segment:
            data["mfcc"].append(mfcc.tolist())
    # Dump once, after the loop: the original rewrote data.json on every
    # segment, each time with the then-partial contents.
    with open('data.json', "w") as filepath:
        json.dump(data, filepath, indent=4)

    return np.array(data["mfcc"]), np.array(data["genre_name"]), np.array(data["genre_num"])
79
+
80
def cut_audio(input_file):
    """Trim a WAV file to its first 30 seconds.

    Args:
        input_file: path to the source WAV file.

    Returns:
        Path of the trimmed WAV file written to the working directory.
    """
    # pydub slices by milliseconds: 30 s == 30,000 ms.
    clip = AudioSegment.from_wav(input_file)[:30 * 1000]
    trimmed_path = "output_30_seconds.wav"
    clip.export(trimmed_path, format="wav")
    return trimmed_path
86
+
87
# ---- Model setup ----
# map_location keeps the app working on CPU-only hosts even when the
# checkpoint was saved from a GPU session (torch.load would otherwise
# try to restore tensors onto the saving device).
model = CNNModel2((1, 130, 13))
model.load_state_dict(torch.load("model_cnn2.pth", map_location=torch.device("cpu")))
model.eval()

# ---- Streamlit UI ----
st.title("Audio Genre Classification")
st.write("Upload an audio file to classify its genre.")
uploaded_file = st.file_uploader("Choose an audio file...", type=["wav"])

if uploaded_file is not None:
    # Persist the upload to disk so pydub/librosa can open it by path.
    file_path = "temp.wav"
    with open(file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())

    file_path = cut_audio(file_path)
    mfccs, genre_names, genre_nums = get_mfccs(file_path)

    # (num_segments, frames, n_mfcc) -> (num_segments, 1, frames, n_mfcc):
    # each segment becomes a single-channel "image" for the CNN.
    X_to_pred = torch.tensor(mfccs, dtype=torch.float32).unsqueeze(1)

    # Classify every segment in one batched forward pass under no_grad
    # (the original looped one segment at a time; results are identical
    # in eval mode).
    with torch.no_grad():
        outputs = model(X_to_pred)
    predictions = torch.argmax(outputs, dim=1).tolist()

    st.write("Prediction:")
    # Majority vote across the per-segment predictions.
    st.write(GENRES[max(set(predictions), key=predictions.count)])