# MFCC_CNN_16kHz — app.py
# Gradio demo: speech emotion + speaker-gender recognition from 16 kHz audio.
import numpy as np
import pandas as pd
import os
import librosa as lr
import torch
import torch.nn as nn
import pytorch_lightning as pl
import gradio as gr
from models.model import MFCC_CNN
# Class index -> human-readable label (Russian).
# Even indices are female-voice classes, odd indices are male-voice classes;
# each emotion occupies one adjacent (female, male) pair.
EMOTIONS = {
    0: 'Злость, женский голос',        # anger, female
    1: 'Злость, мужской голос',        # anger, male
    2: 'Отвращение, женский голос',    # disgust, female
    3: 'Отвращение, мужской голос',    # disgust, male
    4: 'Страх, женский голос',         # fear, female
    5: 'Страх, мужской голос',         # fear, male
    6: 'Счастье, женский голос',       # happiness, female
    7: 'Счастье, мужской голос',       # happiness, male
    8: 'Нейтральная, женский голос',   # neutral, female
    9: 'Нейтральная, мужской голос',   # neutral, male
    10: 'Грусть, женский голос',       # sadness, female
    11: 'Грусть, мужской голос',       # sadness, male
    12: 'Удивление, женский голос',    # surprise, female
    13: 'Удивление, мужской голос',    # surprise, male
}
# ---- Audio loading parameters ----
SAMPLE_RATE = 16000  # Hz; the model was trained on 16 kHz audio
DURATION = 3         # seconds of audio fed to the model

# ---- MFCC extraction parameters (must match training-time extraction) ----
N_MFCC = 50
WIN_LENGTH = 2048
WINDOW = 'hann'
HOP_LENGTH = 512

# NOTE: 'chekpoint' spelling matches the repository's directory name — do not "fix" it.
PATH = './chekpoint/models-epoch=97-val_loss=2.09.ckpt'
# map_location='cpu' lets a GPU-saved checkpoint load on CPU-only hosts
# (e.g. a free Hugging Face Space); without it torch.load raises when CUDA is absent.
ckpt = torch.load(PATH, map_location='cpu')
pretrained_model = MFCC_CNN(14)  # 14 output classes — see EMOTIONS
pretrained_model.load_state_dict(ckpt['state_dict'])
pretrained_model.eval()
# pytorch_lightning: freeze() disables gradients for inference-only use.
pretrained_model.freeze()
def scaler_params(mean_path='./mean_.txt', scale_path='./scale_.txt'):
    """Load the training-time StandardScaler parameters from text files.

    Each file contains comma-separated floats. Paths default to the files
    shipped with the app, so existing no-argument callers are unaffected.

    Returns:
        tuple[np.ndarray, np.ndarray]: (mean, scale) 1-D float arrays.
    """
    def _read_floats(path):
        # Skip blank tokens (tok.strip() is falsy) so trailing commas or
        # stray whitespace do not crash float(); the previous check only
        # skipped tokens equal to a single space, so '' raised ValueError.
        with open(path, mode='r') as f:
            return np.array([float(tok) for tok in f.read().split(',') if tok.strip()])

    return _read_floats(mean_path), _read_floats(scale_path)
def processAudio(audio_file):
    """Predict the emotion / speaker-gender label for an audio file.

    Args:
        audio_file: path to an audio file (any format librosa can read).

    Returns:
        str: the human-readable label from EMOTIONS for the argmax class.
    """
    # Load at most DURATION seconds, resampled to the model's sample rate.
    audio, sr = lr.load(audio_file, duration=DURATION, sr=SAMPLE_RATE)
    # Zero-pad shorter clips to a fixed length so the MFCC matrix shape is
    # constant.  (Was SAMPLE_RATE*3 — now derived from DURATION.)
    signal = np.zeros(int(SAMPLE_RATE * DURATION))
    signal[:len(audio)] = audio
    mfcc = lr.feature.mfcc(y=signal,
                           sr=sr,
                           n_mfcc=N_MFCC,
                           win_length=WIN_LENGTH,
                           window=WINDOW,
                           hop_length=HOP_LENGTH,
                           )
    # Actual frame count instead of the hard-coded 94 (94 = 3 s @ 16 kHz, hop 512).
    n_frames = mfcc.shape[1]
    # Standardize with the scaler fitted at training time, then shape to the
    # (batch, channel, n_mfcc, frames) layout the CNN expects.
    # (A dead `feature_set.view(1, 1, 50, 94)` no-op was removed here.)
    mean, scale = scaler_params()
    features = (mfcc.reshape(1, -1) - mean) / scale
    features = torch.tensor(features.reshape(1, 1, N_MFCC, n_frames),
                            dtype=torch.float)
    prediction = pretrained_model(features)
    return EMOTIONS[torch.argmax(prediction).item()]
# Bundled example clips (RAVDESS-style file names) resolved relative to app.py.
_EXAMPLE_FILES = [
    "files/03-01-01-01-02-02-01.wav",
    "files/03-01-07-01-02-02-01.wav",
    "files/03-01-08-02-02-02-01.wav",
]
_BASE_DIR = os.path.dirname(__file__)

# Gradio UI: one audio input (passed as a file path) -> predicted label.
demo = gr.Interface(
    fn=processAudio,
    inputs=gr.Audio(type='filepath'),
    outputs=gr.Label(),
    examples=[[os.path.join(_BASE_DIR, name)] for name in _EXAMPLE_FILES],
)

if __name__ == '__main__':
    demo.launch()