Spaces:
Runtime error
Runtime error
Init commit
Browse files- app.py +197 -0
- model8723.json +1 -0
- model8723_weights.h5 +3 -0
- requirements.txt +7 -0
app.py
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import os
|
| 3 |
+
from json_tricks import load
|
| 4 |
+
|
| 5 |
+
import numpy as np
|
| 6 |
+
|
| 7 |
+
import librosa
|
| 8 |
+
from pydub import AudioSegment, effects
|
| 9 |
+
import noisereduce as nr
|
| 10 |
+
|
| 11 |
+
import tensorflow as tf
|
| 12 |
+
import keras
|
| 13 |
+
from keras.models import model_from_json
|
| 14 |
+
from keras.models import load_model
|
| 15 |
+
|
| 16 |
+
import matplotlib.pyplot as plt
|
| 17 |
+
import warnings
|
| 18 |
+
warnings.filterwarnings('ignore')
|
| 19 |
+
|
| 20 |
+
saved_model_path = r'./model8723.json'
saved_weights_path = r'./model8723_weights.h5'

# Read the serialized model architecture (Keras JSON) from disk.
with open(saved_model_path, 'r') as json_file:
    json_savedModel = json_file.read()

# Rebuild the architecture from JSON, then restore the trained weights.
model = tf.keras.models.model_from_json(json_savedModel)
model.load_weights(saved_weights_path)

# Compile the restored model; per the original note these settings are meant
# to mirror the ones the model was trained with.
model.compile(loss='categorical_crossentropy',
              optimizer='RMSProp',
              metrics=['categorical_accuracy'])

# summary() prints the layer table itself and returns None, so it must not be
# wrapped in print() (that only adds a stray "None" line to the log).
model.summary()
|
| 37 |
+
|
| 38 |
+
def convert(y, sr):
    """Wrap a mono float waveform in a pydub ``AudioSegment`` of 16-bit PCM.

    Arguments:
    - y - 1-D array of float samples. Assumed to lie in [-1.0, 1.0]; values
      outside that range are clipped rather than wrapped.
    - sr - Sample rate in Hz, used as the segment's frame rate.

    Return:
    A single-channel ``AudioSegment`` with 2-byte (int16) samples.
    """
    # Scale floats to the signed 16-bit range. A sample of exactly +1.0 maps
    # to 32768, one past the int16 maximum, and would wrap around to -32768
    # on the cast, so clip to the representable range before converting.
    scaled = np.clip(y * (1 << 15), -32768, 32767)
    pcm = scaled.astype(np.int16)

    audio_segment = AudioSegment(
        pcm.tobytes(),
        frame_rate=sr,
        sample_width=pcm.dtype.itemsize,  # 2 bytes per int16 sample
        channels=1
    )
    return audio_segment
|
| 48 |
+
|
| 49 |
+
def preprocess(y, sr):
    '''
    Turn a raw waveform into the fixed-size feature tensor fed to the model.

    The audio is peak-normalized with pydub, denoised with noisereduce and then
    described frame by frame with RMS energy, zero-crossing rate and 13 MFCCs
    (15 features per frame).

    Arguments:
    - y - Mono waveform as float samples; it is passed to convert(), which
      expects values in [-1.0, 1.0].
    - sr - Sample rate of y in Hz.

    Return:
    'X_3D' variable, containing a shape of: (batch, timesteps, feature) =
    (1, 448, 15) for a single recording (batch = 1).
    '''
    frame_length = 2048  # samples per analysis frame
    hop_length = 512     # samples advanced between consecutive frames
    timesteps = 448      # number of frames the model input is padded/cut to

    # Build a pydub segment from the raw samples.
    rawsound = convert(y, sr)
    # Normalize the peak level, leaving 5 dB of headroom below full scale.
    normalizedsound = effects.normalize(rawsound, headroom=5.0)
    # Transform the normalized segment back to an np.array of samples.
    normal_x = np.array(normalizedsound.get_array_of_samples(), dtype='float32')

    final_x = nr.reduce_noise(normal_x, sr=sr)  # updated 03/03/22

    # Features extraction
    f1 = librosa.feature.rms(y=final_x, frame_length=frame_length, hop_length=hop_length, center=True, pad_mode='reflect').T  # Energy - Root Mean Square
    f2 = librosa.feature.zero_crossing_rate(final_x, frame_length=frame_length, hop_length=hop_length, center=True).T  # ZCR
    f3 = librosa.feature.mfcc(y=final_x, sr=sr, n_mfcc=13, hop_length=hop_length).T  # MFCC

    X = np.concatenate((f1, f2, f3), axis=1)

    # Fit the frame axis to exactly `timesteps` rows: cut recordings that are
    # too long (a negative padding size would make np.zeros raise) and
    # zero-pad the ones that are too short.
    X = X[:timesteps]
    padding_rows = timesteps - len(X)
    X = np.vstack((X, np.zeros((padding_rows, X.shape[1]))))

    # Add the leading batch axis.
    X_3D = np.expand_dims(X, axis=0)

    return X_3D
|
| 92 |
+
|
| 93 |
+
# Maps a class index (position in the model's prediction vector) to the
# emotion label shown to the user.
emotions = {
    0: 'neutral',
    1: 'calm',
    2: 'happy',
    3: 'sad',
    4: 'angry',
    5: 'fearful',
    6: 'disgust',
    7: 'surprised',  # was misspelled 'suprised' in the displayed label
}
# Labels in class-index order, for pairing with a probability vector.
emo_list = list(emotions.values())
|
| 104 |
+
|
| 105 |
+
def is_silent(data):
    """Return True if the buffer's largest sample is below the 'silent' threshold.

    Only the maximum value is compared against the threshold (100), so
    negative samples never count as sound. An empty buffer is treated as
    silent instead of raising ValueError.
    """
    return max(data, default=0) < 100
|
| 108 |
+
import pyaudio
|
| 109 |
+
import wave
|
| 110 |
+
from array import array
|
| 111 |
+
import struct
|
| 112 |
+
import time
|
| 113 |
+
|
| 114 |
+
# Initialize variables
|
| 115 |
+
# Recording settings.
# RATE * RECORD_SECONDS is the sample count of one analysis window, so RATE is
# in samples per second and RECORD_SECONDS in seconds.
# NOTE(review): CHUNK and CHANNELS look like capture-buffer size and channel
# count for the recorder - confirm against their users.
RATE, CHUNK, CHANNELS = 24414, 512, 1
RECORD_SECONDS = 7.1

WAVE_OUTPUT_FILE = "./output.wav"
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
def EmotionRecogniser(stream, new_chunk):
    '''
    Streaming callback: classify the emotion in the most recent audio.

    Arguments:
    - stream - State carried between calls: the float32 samples buffered so
      far, or None on the first call.
    - new_chunk - (sample_rate, samples) tuple for the newly captured audio.

    Return:
    (stream, emotions_prob) - the updated buffer to keep as state, and a dict
    mapping every emotion label to its predicted probability.
    '''
    sr, y = new_chunk

    y = y.astype(np.float32)
    # Down-mix multi-channel chunks; convert() builds a single-channel segment.
    if y.ndim > 1:
        y = y.mean(axis=1)
    # Peak-normalize the chunk. An all-zero (silent) chunk is left untouched,
    # because dividing by a zero peak would fill the buffer with NaNs.
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak

    # SESSION START
    print("** session started")
    if stream is not None:
        stream = np.concatenate([stream, y])
    else:
        stream = y

    # Keep a sliding window holding the newest RECORD_SECONDS of audio.
    # (Dropping len(y) samples after each prediction emptied the buffer on
    # every call, so no history was ever kept.) The window is also capped at
    # 447 * 512 samples: preprocess() pads its features to 448 frames with a
    # hop of 512 samples and cannot accept more frames than that.
    max_samples = min(int(sr * RECORD_SECONDS), 447 * 512)
    stream = stream[-max_samples:]

    x = preprocess(y=stream, sr=sr)  # Feature tensor for the buffered audio.
    print('x shape:', x.shape)

    # Model's prediction => an 8 emotion probabilities array.
    # `use_multiprocessing` is not passed: it only ever applied to
    # generator/Sequence inputs and Keras 3's predict() rejects it.
    predictions = model.predict(x)
    pred_np = np.asarray(predictions)[0]  # Drop the batch axis.

    # Dict of emotions with their respective probabilities, as plain floats.
    emotions_prob = {label: float(prob) for label, prob in zip(emo_list, pred_np)}
    max_emo = int(np.argmax(pred_np))
    print('max emotion:', emotions.get(max_emo, -1))

    return stream, emotions_prob
|
| 158 |
+
|
| 159 |
+
# Present emotion distribution for the whole session.
|
| 160 |
+
# total_predictions_np = np.mean(np.array(total_predictions).tolist(), axis=0)
|
| 161 |
+
# fig = plt.figure(figsize = (10, 5))
|
| 162 |
+
# plt.bar(emo_list, total_predictions_np, color = 'indigo')
|
| 163 |
+
# plt.ylabel("Mean probabilty (%)")
|
| 164 |
+
# plt.title("Session Summary")
|
| 165 |
+
# plt.show()
|
| 166 |
+
|
| 167 |
+
# print(f"Emotions analyzed for: {(toc - tic):0.4f} seconds")
|
| 168 |
+
# return str(emotions.get(np.argmax(total_predictions_np),-1))
|
| 169 |
+
|
| 170 |
+
##################################################
|
| 171 |
+
|
| 172 |
+
import gradio as gr
|
| 173 |
+
from transformers import pipeline
|
| 174 |
+
import numpy as np
|
| 175 |
+
|
| 176 |
+
# transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")
|
| 177 |
+
|
| 178 |
+
# def transcribe(stream, new_chunk):
|
| 179 |
+
# sr, y = new_chunk
|
| 180 |
+
# y = y.astype(np.float32)
|
| 181 |
+
# y /= np.max(np.abs(y))
|
| 182 |
+
|
| 183 |
+
# if stream is not None:
|
| 184 |
+
# stream = np.concatenate([stream, y])
|
| 185 |
+
# else:
|
| 186 |
+
# stream = y
|
| 187 |
+
# return stream, transcriber({"sampling_rate": sr, "raw": stream})["text"]
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
# Live demo: microphone audio is streamed into EmotionRecogniser, whose first
# input/output is the "state" slot carrying the sample buffer between calls
# and whose second output is rendered as a label.
demo = gr.Interface(
    fn=EmotionRecogniser,
    inputs=[
        "state",
        gr.Audio(sources=["microphone"], streaming=True, every=1.0),
    ],
    outputs=["state", 'label'],
    live=True,
)

# Start the web app.
demo.launch()
|
model8723.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"class_name": "Sequential", "config": {"name": "sequential", "layers": [{"module": "keras.layers", "class_name": "InputLayer", "config": {"batch_input_shape": [null, 448, 15], "dtype": "float32", "sparse": false, "ragged": false, "name": "lstm_input"}, "registered_name": null}, {"module": "keras.layers", "class_name": "LSTM", "config": {"name": "lstm", "trainable": true, "dtype": "float32", "batch_input_shape": [null, 448, 15], "return_sequences": true, "return_state": false, "go_backwards": false, "stateful": false, "unroll": false, "time_major": false, "units": 64, "activation": "tanh", "recurrent_activation": "sigmoid", "use_bias": true, "kernel_initializer": {"module": "keras.initializers", "class_name": "GlorotUniform", "config": {"seed": null}, "registered_name": null}, "recurrent_initializer": {"module": "keras.initializers", "class_name": "Orthogonal", "config": {"gain": 1.0, "seed": null}, "registered_name": null}, "bias_initializer": {"module": "keras.initializers", "class_name": "Zeros", "config": {}, "registered_name": null}, "unit_forget_bias": true, "kernel_regularizer": null, "recurrent_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.0, "recurrent_dropout": 0.0, "implementation": 2}, "registered_name": null, "build_config": {"input_shape": [null, 448, 15]}}, {"module": "keras.layers", "class_name": "LSTM", "config": {"name": "lstm_1", "trainable": true, "dtype": "float32", "return_sequences": false, "return_state": false, "go_backwards": false, "stateful": false, "unroll": false, "time_major": false, "units": 64, "activation": "tanh", "recurrent_activation": "sigmoid", "use_bias": true, "kernel_initializer": {"module": "keras.initializers", "class_name": "GlorotUniform", "config": {"seed": null}, "registered_name": null}, "recurrent_initializer": {"module": "keras.initializers", "class_name": "Orthogonal", "config": {"gain": 
1.0, "seed": null}, "registered_name": null}, "bias_initializer": {"module": "keras.initializers", "class_name": "Zeros", "config": {}, "registered_name": null}, "unit_forget_bias": true, "kernel_regularizer": null, "recurrent_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "recurrent_constraint": null, "bias_constraint": null, "dropout": 0.0, "recurrent_dropout": 0.0, "implementation": 2}, "registered_name": null, "build_config": {"input_shape": [null, 448, 64]}}, {"module": "keras.layers", "class_name": "Dense", "config": {"name": "dense", "trainable": true, "dtype": "float32", "units": 8, "activation": "softmax", "use_bias": true, "kernel_initializer": {"module": "keras.initializers", "class_name": "GlorotUniform", "config": {"seed": null}, "registered_name": null}, "bias_initializer": {"module": "keras.initializers", "class_name": "Zeros", "config": {}, "registered_name": null}, "kernel_regularizer": null, "bias_regularizer": null, "activity_regularizer": null, "kernel_constraint": null, "bias_constraint": null}, "registered_name": null, "build_config": {"input_shape": [null, 64]}}]}, "keras_version": "2.15.0", "backend": "tensorflow"}
|
model8723_weights.h5
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c5acd498600e161e247956e57b46d051444c65fba7a9799e393e36386b2bb7b6
|
| 3 |
+
size 235056
|
requirements.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pydub
|
| 2 |
+
noisereduce
|
| 3 |
+
pyaudio
|
| 4 |
+
json-tricks
|
| 5 |
+
tensorflow
|
| 6 |
+
keras
|
| 7 |
+
librosa
|