import os
from functools import lru_cache

import gradio as gr
import librosa as lr
import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch
import torch.nn as nn

from models.model import MFCC_CNN

# Class index -> human-readable label (emotion + speaker gender).
# Even/odd indices alternate female/male voices for each emotion.
EMOTIONS = {
    9: 'Нейтральная, мужской голос',
    7: 'Счастье, мужской голос',
    11: 'Грусть, мужской голос',
    1: 'Злость, мужской голос',
    5: 'Страх, мужской голос',
    3: 'Отвращение, мужской голос',
    13: 'Удивление, мужской голос',
    8: 'Нейтральная, женский голос',
    6: 'Счастье, женский голос',
    10: 'Грусть, женский голос',
    0: 'Злость, женский голос',
    4: 'Страх, женский голос',
    2: 'Отвращение, женский голос',
    12: 'Удивление, женский голос',
}

# --- Audio loading parameters ---
SAMPLE_RATE = 16000   # Hz; clips are resampled to this rate
DURATION = 3          # seconds; clips are truncated/zero-padded to this length

# --- MFCC extraction parameters (must match training) ---
N_MFCC = 50
WIN_LENGTH = 2048
WINDOW = 'hann'
HOP_LENGTH = 512

# Number of MFCC frames for a DURATION-second clip (was hard-coded as 94).
N_FRAMES = 1 + (SAMPLE_RATE * DURATION) // HOP_LENGTH
N_CLASSES = len(EMOTIONS)

PATH = './chekpoint/models-epoch=97-val_loss=2.09.ckpt'

# map_location='cpu' so a GPU-saved checkpoint also loads on CPU-only hosts.
ckpt = torch.load(PATH, map_location='cpu')
pretrained_model = MFCC_CNN(N_CLASSES)
pretrained_model.load_state_dict(ckpt['state_dict'])
pretrained_model.eval()
pretrained_model.freeze()


def _load_vector(path):
    """Read a comma-separated list of floats (StandardScaler dump) into a 1-D array.

    Blank/whitespace-only tokens (e.g. from a trailing comma) are skipped.
    """
    with open(path, mode='r') as f:
        raw = f.read()
    return np.array([float(tok) for tok in raw.split(',') if tok.strip()])


@lru_cache(maxsize=1)
def scaler_params():
    """Return the (mean, scale) vectors of the training-time StandardScaler.

    Cached so the two text files are parsed once, not on every prediction.
    Returns:
        tuple[np.ndarray, np.ndarray]: per-feature mean and scale, each of
        length N_MFCC * N_FRAMES.
    """
    return _load_vector('./mean_.txt'), _load_vector('./scale_.txt')


def processAudio(audio_file):
    """Classify the emotion (and speaker gender) of an audio file.

    Args:
        audio_file: path to an audio file readable by librosa.

    Returns:
        str: human-readable label from EMOTIONS.
    """
    audio, sr = lr.load(audio_file, duration=DURATION, sr=SAMPLE_RATE)

    # Zero-pad (or rely on the duration cap above) to exactly DURATION
    # seconds so the MFCC grid always has N_FRAMES columns.
    signal = np.zeros(SAMPLE_RATE * DURATION)
    signal[:len(audio)] = audio

    mfcc = lr.feature.mfcc(
        y=signal,
        sr=sr,
        n_mfcc=N_MFCC,
        win_length=WIN_LENGTH,
        window=WINDOW,
        hop_length=HOP_LENGTH,
    )

    # Standardize with the training-time scaler: flatten to one feature row,
    # scale, then restore the (batch, channel, n_mfcc, n_frames) layout.
    mean, scale = scaler_params()
    features = (mfcc.reshape(1, -1) - mean) / scale
    features = torch.tensor(
        features.reshape(1, 1, *mfcc.shape), dtype=torch.float
    )

    # Model is frozen, but no_grad() makes inference intent explicit.
    with torch.no_grad():
        logits = pretrained_model(features)
    return EMOTIONS[torch.argmax(logits).item()]


demo = gr.Interface(
    fn=processAudio,
    inputs=gr.Audio(type='filepath'),
    outputs=gr.Label(),
    examples=[
        [os.path.join(os.path.dirname(__file__), "files/03-01-01-01-02-02-01.wav")],
        [os.path.join(os.path.dirname(__file__), "files/03-01-07-01-02-02-01.wav")],
        [os.path.join(os.path.dirname(__file__), "files/03-01-08-02-02-02-01.wav")],
    ],
)

if __name__ == '__main__':
    demo.launch()