File size: 4,211 Bytes
23cb5b5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
baa6140
23cb5b5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import streamlit as st
from pydub import AudioSegment
import os
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import librosa
import math
import json

# Class-index -> genre-name lookup for the model's 10 output logits.
# NOTE(review): assumed to match the label order used at training time
# (model output index i == GENRES[i]) — verify against the training script.
GENRES = ["Metal", "Disco", "Pop", "Classical", "Reggae", "Country", "Rock", "Hiphop", "Jazz", "Blues"]

class CNNModel2(nn.Module):
    """Three-stage CNN classifier over MFCC "images".

    Expects input of shape (batch, channels, frames, n_mfcc) — for this app,
    (batch, 1, 130, 13). Attribute names (conv1, bn1, ...) are part of the
    state_dict contract and must not be renamed, or load_state_dict breaks.
    """

    def __init__(self, input_shape, num_classes=10):
        """input_shape is (channels, height, width) without the batch dim."""
        super(CNNModel2, self).__init__()
        self.conv1 = nn.Conv2d(input_shape[0], 32, kernel_size=3)
        self.bn1 = nn.BatchNorm2d(32)
        self.pool1 = nn.MaxPool2d(3, stride=2, padding=1)
        self.dropout1 = nn.Dropout(0.2)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3)
        self.bn2 = nn.BatchNorm2d(64)
        self.pool2 = nn.MaxPool2d(3, stride=2, padding=1)
        self.dropout2 = nn.Dropout(0.1)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=2)
        self.bn3 = nn.BatchNorm2d(64)
        self.pool3 = nn.MaxPool2d(2, stride=2, padding=1)
        self.dropout3 = nn.Dropout(0.1)
        self.flatten_dim = self._calculate_flatten_dim(input_shape)
        self.fc1 = nn.Linear(self.flatten_dim, 128)
        self.dropout4 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(128, num_classes)

    def _calculate_flatten_dim(self, input_shape):
        """Probe the conv/pool stack with a dummy input to size the FC layer.

        Only conv and pool layers change spatial dimensions, so BatchNorm and
        ReLU are deliberately skipped here: running the dummy pass through the
        BN layers (which are in train mode during __init__) would mutate their
        running statistics. no_grad avoids building a throwaway autograd graph.
        """
        with torch.no_grad():
            x = torch.zeros(1, *input_shape)
            x = self.pool1(self.conv1(x))
            x = self.pool2(self.conv2(x))
            x = self.pool3(self.conv3(x))
            return x.numel()

    def forward(self, x):
        """Return raw class logits of shape (batch, num_classes)."""
        x = self.pool1(F.relu(self.bn1(self.conv1(x))))
        x = self.dropout1(x)
        x = self.pool2(F.relu(self.bn2(self.conv2(x))))
        x = self.dropout2(x)
        x = self.pool3(F.relu(self.bn3(self.conv3(x))))
        x = self.dropout3(x)
        # Flatten per-sample; keeping the batch dim explicit (rather than -1)
        # surfaces shape mismatches as errors instead of silent batch resizing.
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = self.dropout4(x)
        x = self.fc2(x)
        return x

def get_mfccs(file, fs=22500, duration=30, n_fft=2048, hop_length=512, n_mfcc=13, num_segments=10):
    """Extract per-segment MFCC features from an audio file.

    The first `duration` seconds (at sample rate `fs`) are divided into
    `num_segments` equal slices; each slice whose MFCC matrix has the full
    expected number of frames is kept. Results are also dumped to 'data.json'
    as a side effect. The genre_name/genre_num lists are never populated here
    (residue of a training-time pipeline) and come back as empty arrays.

    Returns (mfccs, genre_names, genre_nums) as numpy arrays, where mfccs has
    shape (kept_segments, frames_per_segment, n_mfcc).
    """
    data = {
        "genre_name": [],
        "genre_num": [],
        "mfcc": [],
    }
    segment_samples = int(fs * duration / num_segments)
    expected_frames = math.ceil(segment_samples / hop_length)
    signal, fs = librosa.load(file, sr=fs)
    for start in range(0, num_segments * segment_samples, segment_samples):
        slice_mfcc = librosa.feature.mfcc(y=signal[start:start + segment_samples],
                                          sr=fs,
                                          n_fft=n_fft,
                                          hop_length=hop_length,
                                          n_mfcc=n_mfcc).T
        # Drop short segments (e.g. when the clip ends early) so every kept
        # entry has a uniform (frames, n_mfcc) shape.
        if len(slice_mfcc) == expected_frames:
            data["mfcc"].append(slice_mfcc.tolist())
    with open('data.json', "w") as fp:
        json.dump(data, fp, indent=4)

    return np.array(data["mfcc"]), np.array(data["genre_name"]), np.array(data["genre_num"])

def cut_audio(input_file, seconds=30):
    """Trim a WAV file to its first `seconds` seconds and write it to disk.

    Generalized from a hard-coded 30-second cut; the default preserves the
    original behavior, including the 'output_30_seconds.wav' filename.

    Args:
        input_file: path to a WAV file readable by pydub.
        seconds: length of the leading clip to keep (default 30).

    Returns:
        Path of the exported trimmed WAV file.
    """
    audio = AudioSegment.from_wav(input_file)
    clip = audio[:seconds * 1000]  # pydub slices in milliseconds
    output_file = f"output_{seconds}_seconds.wav"
    clip.export(output_file, format="wav")
    return output_file

# --- Model setup ---
# Input shape (1, 130, 13) must match get_mfccs output: 1 channel,
# 130 MFCC frames per 3-second segment, 13 coefficients.
model = CNNModel2((1, 130, 13))
model.load_state_dict(torch.load("model_cnn2.pth", map_location=torch.device('cpu')))
model.eval()  # disable dropout and use BatchNorm running stats for inference

# --- Streamlit UI ---
st.title("Audio Genre Classification")
st.write("Upload an audio file to classify its genre.")
uploaded_file = st.file_uploader("Choose an audio file...", type=["wav"])

if uploaded_file is not None:
    # Persist the upload to disk so pydub/librosa can open it by path.
    file_path = "temp.wav"
    with open(file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())

    file_path = cut_audio(file_path)
    mfccs, genre_names, genre_nums = get_mfccs(file_path)

    if len(mfccs) == 0:
        # Guard: clips shorter than the expected duration can yield zero
        # full-length segments; without this check the majority vote below
        # raises ValueError on an empty sequence.
        st.write("Could not extract enough audio to classify — please upload a clip of at least 30 seconds.")
    else:
        # (segments, 130, 13) -> (segments, 1, 130, 13): add channel dim.
        X_to_pred = torch.tensor(mfccs, dtype=torch.float32).unsqueeze(1)

        # The model is in eval mode, so one batched forward pass is exactly
        # equivalent to (and cheaper than) a per-segment loop.
        with torch.no_grad():
            outputs = model(X_to_pred)
            predictions = torch.argmax(outputs, dim=1).tolist()

        st.write("Prediction:")
        # Majority vote over the segment-level predictions.
        st.write(GENRES[max(set(predictions), key=predictions.count)])