# NOTE: the original capture contained web-scrape residue above this point
# (a file-size banner, commit hashes, and a line-number gutter column from a
# code-hosting UI). It was never part of the program and has been removed.
import numpy as np
import pandas as pd
import os
import librosa as lr
import torch
import torch.nn as nn
import pytorch_lightning as pl
import gradio as gr
from models.model import MFCC_CNN

# Class index -> human-readable label (Russian).
# 14 classes = 7 emotions x 2 speaker genders; insertion order kept as in
# the original training label mapping.
EMOTIONS = {
    9: 'Нейтральная, мужской голос',
    7: 'Счастье, мужской голос',
    11: 'Грусть, мужской голос',
    1: 'Злость, мужской голос',
    5: 'Страх, мужской голос',
    3: 'Отвращение, мужской голос',
    13: 'Удивление, мужской голос',
    8: 'Нейтральная, женский голос',
    6: 'Счастье, женский голос',
    10: 'Грусть, женский голос',
    0: 'Злость, женский голос',
    4: 'Страх, женский голос',
    2: 'Отвращение, женский голос',
    12: 'Удивление, женский голос',
}

# --- Audio loading parameters ---
SAMPLE_RATE = 16000  # Hz
DURATION = 3         # seconds of audio fed to the model

# --- MFCC extraction parameters (must match training-time settings) ---
N_MFCC = 50
WIN_LENGTH = 2048
WINDOW = 'hann'
HOP_LENGTH = 512

# Path to the trained Lightning checkpoint. NOTE(review): "chekpoint" is
# spelled as the directory is named on disk — do not correct the spelling
# without also renaming the folder.
PATH = './chekpoint/models-epoch=97-val_loss=2.09.ckpt'
# map_location='cpu' lets the checkpoint load on machines without the GPU
# it was saved from; previously this raised on CPU-only hosts.
ckpt = torch.load(PATH, map_location='cpu')

# 14 output classes: 7 emotions x 2 genders (see EMOTIONS mapping).
pretrained_model = MFCC_CNN(14)
pretrained_model.load_state_dict(ckpt['state_dict'])
pretrained_model.eval()    # inference mode: disable dropout/BN updates
pretrained_model.freeze()  # Lightning helper: disable grad tracking

def scaler_params():
    """Load the StandardScaler parameters saved at training time.

    Reads comma-separated floats from './mean_.txt' and './scale_.txt'
    (one flat vector each, matching the flattened MFCC feature matrix).

    Returns:
        tuple[np.ndarray, np.ndarray]: the (mean, scale) vectors.
    """
    def _read_vector(path):
        # Skip blank/whitespace-only tokens: a trailing comma leaves an
        # empty last element, which previously crashed float() unless the
        # token happened to be exactly one space.
        with open(path, mode='r') as f:
            tokens = f.read().split(',')
        return np.array([float(tok) for tok in tokens if tok.strip()])

    return _read_vector('./mean_.txt'), _read_vector('./scale_.txt')

def processAudio(audio_file):
    """Classify the emotion (and speaker gender) in an audio file.

    Loads up to DURATION seconds at SAMPLE_RATE, zero-pads to a fixed
    length, extracts MFCC features, standardizes them with the
    training-time scaler, and runs the pretrained CNN.

    Args:
        audio_file: path to an audio file readable by librosa.

    Returns:
        str: human-readable label from EMOTIONS.
    """
    audio, sr = lr.load(audio_file,
                        duration=DURATION,
                        sr=SAMPLE_RATE)

    # Zero-pad short clips to exactly DURATION seconds (lr.load already
    # truncates longer ones). Previously the length used a literal 3.
    signal = np.zeros(int(SAMPLE_RATE * DURATION))
    signal[:len(audio)] = audio

    mfcc = lr.feature.mfcc(y=signal,
                           sr=sr,
                           n_mfcc=N_MFCC,
                           win_length=WIN_LENGTH,
                           window=WINDOW,
                           hop_length=HOP_LENGTH)

    # Standardize with the scaler that was fit on the flattened
    # (N_MFCC x n_frames) feature matrix during training.
    mean, scale = scaler_params()
    features = (mfcc.reshape(1, -1) - mean) / scale

    # Shape (batch=1, channels=1, N_MFCC, n_frames) for the CNN; the
    # frame count is taken from the MFCC matrix instead of the previous
    # hard-coded (50, 94).
    features = torch.tensor(features.reshape(1, 1, *mfcc.shape),
                            dtype=torch.float)

    prediction = torch.argmax(pretrained_model(features))
    return EMOTIONS[prediction.item()]
    
# Example clips shipped with the app, resolved relative to this file.
_APP_DIR = os.path.dirname(__file__)
_EXAMPLE_FILES = [
    "files/03-01-01-01-02-02-01.wav",
    "files/03-01-07-01-02-02-01.wav",
    "files/03-01-08-02-02-02-01.wav",
]

# Gradio UI: upload or record audio -> predicted emotion label.
demo = gr.Interface(
    fn=processAudio,
    inputs=gr.Audio(type='filepath'),
    outputs=gr.Label(),
    examples=[[os.path.join(_APP_DIR, name)] for name in _EXAMPLE_FILES],
)

if __name__ == '__main__':
    demo.launch()