# NOTE(review): the lines below are Hugging Face Spaces page-scrape residue
# (UI text, commit hashes, line-number gutter) — not Python. Commented out so
# the module parses; safe to delete entirely.
# Spaces:
# Runtime error
# Runtime error
# File size: 3,399 Bytes
# be0572c 75aea43 502fc28 3e65a1c 75aea43 dd23bdf 75aea43 08373fb ...
import numpy as np
import pandas as pd
import os
import librosa as lr
import torch
import torch.nn as nn
import pytorch_lightning as pl
import gradio as gr
from models.model import MFCC_CNN
# Maps the model's 14 output class indices (argmax over MFCC_CNN logits) to
# human-readable labels (Russian, shown in the Gradio UI).
# NOTE(review): from the labels, odd indices appear to be male voice and even
# indices female — inferred from the label text only; confirm against the
# label encoding used at training time.
EMOTIONS = {
    9: 'Нейтральная, мужской голос',   # neutral, male voice
    7: 'Счастье, мужской голос',       # happiness, male voice
    11: 'Грусть, мужской голос',       # sadness, male voice
    1: 'Злость, мужской голос',        # anger, male voice
    5: 'Страх, мужской голос',         # fear, male voice
    3: 'Отвращение, мужской голос',    # disgust, male voice
    13: 'Удивление, мужской голос',    # surprise, male voice
    8: 'Нейтральная, женский голос',   # neutral, female voice
    6: 'Счастье, женский голос',       # happiness, female voice
    10: 'Грусть, женский голос',       # sadness, female voice
    0: 'Злость, женский голос',        # anger, female voice
    4: 'Страх, женский голос',         # fear, female voice
    2: 'Отвращение, женский голос',    # disgust, female voice
    12: 'Удивление, женский голос'     # surprise, female voice
}
# ---- Audio loading parameters ----
SAMPLE_RATE = 16000  # Hz; every input clip is resampled to this rate
DURATION = 3         # seconds of audio fed to the model

# ---- MFCC extraction parameters (must match training-time preprocessing) ----
N_MFCC = 50
WIN_LENGTH = 2048
WINDOW = 'hann'
HOP_LENGTH = 512

# Path to the trained Lightning checkpoint.
# NOTE(review): 'chekpoint' is presumably the actual (misspelled) directory
# name on disk — do not "fix" the spelling here without renaming the folder.
PATH = './chekpoint/models-epoch=97-val_loss=2.09.ckpt'

# map_location='cpu': a checkpoint saved on a CUDA device would otherwise
# fail to deserialize on a CPU-only host (typical for a Spaces deployment).
ckpt = torch.load(PATH, map_location='cpu')

pretrained_model = MFCC_CNN(14)  # 14 output classes, see EMOTIONS
pretrained_model.load_state_dict(ckpt['state_dict'])
pretrained_model.eval()
pretrained_model.freeze()  # LightningModule helper: disables grads for inference
def _load_vector(path):
    """Read a comma-separated list of floats from *path* into a 1-D ndarray.

    Skips blank or whitespace-only tokens (e.g. produced by a trailing
    comma).  The original code only skipped tokens equal to a single space
    and would crash with ``float('')`` on an empty token; stripping and
    testing truthiness covers both cases.
    """
    with open(path, mode='r') as f:
        raw = f.read()
    return np.array([float(tok) for tok in raw.split(',') if tok.strip()])


def scaler_params():
    """Return the ``(mean, scale)`` arrays of the scaler fitted at train time.

    The flattened MFCC feature vector is standardized as
    ``(x - mean) / scale`` before being fed to the model (see processAudio).

    Returns
    -------
    tuple[np.ndarray, np.ndarray]
        Per-feature mean and scale, read from ./mean_.txt and ./scale_.txt.
    """
    return _load_vector('./mean_.txt'), _load_vector('./scale_.txt')
def processAudio(audio_file):
    """Predict an emotion/gender label for an audio file.

    Parameters
    ----------
    audio_file : str
        Path to the audio clip (supplied by the Gradio Audio component).

    Returns
    -------
    str
        Human-readable label from EMOTIONS.
    """
    # Load at the training sample rate, truncated to DURATION seconds.
    audio, sr = lr.load(audio_file, duration=DURATION, sr=SAMPLE_RATE)

    # Zero-pad shorter clips to a fixed-length signal.  The pad length was
    # hard-coded as "3"; it is now tied to the DURATION constant used above.
    signal = np.zeros(int(SAMPLE_RATE * DURATION))
    signal[:len(audio)] = audio

    mfcc = lr.feature.mfcc(y=signal,
                           sr=sr,
                           n_mfcc=N_MFCC,
                           win_length=WIN_LENGTH,
                           window=WINDOW,
                           hop_length=HOP_LENGTH,
                           )

    # Standardize the flattened features with the training-set scaler.
    # (The original converted to a torch tensor first and had a dead
    # `feature_set.view(1, 1, 50, 94)` whose result was discarded; all
    # preprocessing now stays in numpy until the final tensor conversion.)
    mean, scale = scaler_params()
    features = (mfcc.reshape(1, -1) - mean) / scale

    # Reshape to (batch=1, channels=1, n_mfcc, time_frames) for the CNN;
    # -1 replaces the hard-coded 94 time frames.
    features = features.reshape(1, 1, N_MFCC, -1)
    features = torch.tensor(features, dtype=torch.float)

    prediction = pretrained_model(features)
    return EMOTIONS[torch.argmax(prediction).item()]
# Gradio UI: a single Audio input (passed to processAudio as a file path)
# and a Label output showing the predicted emotion.
demo = gr.Interface(
    fn=processAudio,
    inputs=gr.Audio(type='filepath'),
    outputs=gr.Label(),
    # Bundled sample clips, resolved relative to this file so the examples
    # work regardless of the working directory.
    examples=[
        [os.path.join(os.path.dirname(__file__), "files/03-01-01-01-02-02-01.wav")],
        [os.path.join(os.path.dirname(__file__), "files/03-01-07-01-02-02-01.wav")],
        [os.path.join(os.path.dirname(__file__), "files/03-01-08-02-02-02-01.wav")],
    ],
)

if __name__ == '__main__':
    demo.launch()