import os
import numpy as np
import librosa
from pydub import AudioSegment, effects
import noisereduce as nr
import tensorflow as tf
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

saved_model_path = r'./model8723.json'
saved_weights_path = r'./model8723_weights.h5'

# Read the model architecture from the JSON file.
with open(saved_model_path, 'r') as json_file:
    json_savedModel = json_file.read()

# Rebuild the model from the JSON architecture and load the trained weights.
model = tf.keras.models.model_from_json(json_savedModel)
model.load_weights(saved_weights_path)

# Compile with the same parameters as the original training run.
model.compile(loss='categorical_crossentropy',
              optimizer='RMSProp',
              metrics=['categorical_accuracy'])
model.summary()
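
# A small optional check, written as a sketch: given the preprocessing below, the
# network is assumed to expect inputs of shape (batch, 448, 15); printing the loaded
# input shape makes a mismatch with the padding constants easy to spot.
# print("Model expects input of shape:", model.input_shape)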

def convert(y, sr):
    # Convert float samples in [-1, 1] to 16-bit signed PCM and wrap them in a
    # mono pydub AudioSegment.
    y = np.array(y * (1 << 15), dtype=np.int16)
    audio_segment = AudioSegment(
        y.tobytes(),
        frame_rate=sr,
        sample_width=y.dtype.itemsize,
        channels=1
    )
    return audio_segment

def preprocess(y, sr):
    '''
    Preprocess an audio signal before executing a prediction.
    Arguments:
    - y  - Audio samples as a float numpy array.
    - sr - Sample rate of the audio.
    Return:
    'X_3D' variable with shape (batch, timesteps, feature) for a single clip (batch = 1).
    '''
    frame_length = 2048  # Length of the frame over which to compute the speech features.
    hop_length = 512     # Number of samples to advance between frames.

    # Wrap the raw samples in an AudioSegment.
    rawsound = convert(y, sr)
    # Normalize so the loudest sample sits 5 dB below full scale (-5 dBFS).
    normalizedsound = effects.normalize(rawsound, headroom=5.0)
    # Transform the normalized audio back into a float np.array of samples.
    normal_x = np.array(normalizedsound.get_array_of_samples(), dtype='float32')
    # Noise reduction (updated 03/03/22).
    final_x = nr.reduce_noise(y=normal_x, sr=sr)

    # Feature extraction: 1 RMS energy + 1 zero-crossing rate + 13 MFCCs = 15 features per frame.
    f1 = librosa.feature.rms(y=final_x, frame_length=frame_length, hop_length=hop_length,
                             center=True, pad_mode='reflect').T  # Energy - Root Mean Square
    f2 = librosa.feature.zero_crossing_rate(final_x, frame_length=frame_length,
                                            hop_length=hop_length, center=True).T  # ZCR
    f3 = librosa.feature.mfcc(y=final_x, sr=sr, n_mfcc=13, hop_length=hop_length).T  # MFCC
    X = np.concatenate((f1, f2, f3), axis=1)

    # Trim or zero-pad to exactly 448 frames so every clip becomes (448, 15).
    padding_rows = 448 - len(X)
    if padding_rows < 0:
        X = X[:448, :15]
    elif padding_rows > 0:
        X = np.vstack((X, np.zeros((padding_rows, 15))))

    X_3D = np.expand_dims(X, axis=0)
    return X_3D
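
# A minimal usage sketch, assuming a local file named 'sample.wav' (hypothetical,
# not part of this Space), for running a one-off prediction outside the Gradio loop:
# y_file, sr_file = librosa.load('sample.wav', sr=None)
# features = preprocess(y_file, sr_file)   # -> shape (1, 448, 15)
# probs = model.predict(features)          # -> shape (1, 8), one probability per emotion
# print('Predicted class index:', int(np.argmax(probs)))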

emotions = {
    0: 'neutral',
    1: 'calm',
    2: 'happy',
    3: 'sad',
    4: 'angry',
    5: 'fearful',
    6: 'disgust',
    7: 'surprised'
}
emo_list = list(emotions.values())

def is_silent(data):
    # Returns True if the chunk is below the 'silent' threshold.
    return max(data) < 100

# Streaming parameters.
RATE = 24414            # Expected sample rate (Hz).
CHUNK = 512             # Samples per audio chunk.
RECORD_SECONDS = 7.1    # Length of one analysis session (seconds).
CHANNELS = 1
WAVE_OUTPUT_FILE = "./output.wav"

def EmotionRecogniser(stream, new_chunk):
    # Buffer the incoming audio and update the emotion prediction for each new chunk.
    sr, y = new_chunk
    y = y.astype(np.float32)
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak  # Scale to [-1, 1]; skip when the chunk is pure silence.

    # SESSION START
    print("** session started")
    total_predictions = []  # A list for all predictions in the session.

    if stream is not None:
        stream = np.concatenate([stream, y])
    else:
        stream = y

    # Process only once the stream reaches 7.1 seconds; the gate is currently disabled,
    # so a prediction is made on every chunk.
    # if len(stream) < int(RATE * RECORD_SECONDS):
    #     return stream, 'neutral'

    x = preprocess(y=stream, sr=sr)  # Preprocess the buffered audio.
    print('x shape:', x.shape)

    # Model's prediction => an array of 8 emotion probabilities.
    predictions = model.predict(x)
    pred_np = np.squeeze(np.array(predictions), axis=0)  # Drop the batch dimension.
    total_predictions.append(pred_np)

    # Dict of emotions with their respective probabilities.
    emotions_prob = dict(zip(emo_list, pred_np.tolist()))
    max_emo = np.argmax(predictions)
    print('max emotion:', emotions.get(int(max_emo), -1))

    stream = stream[len(y):]  # Reset the stream for the next session.
    return stream, emotions_prob

# Present the emotion distribution for the whole session.
# total_predictions_np = np.mean(np.array(total_predictions).tolist(), axis=0)
# fig = plt.figure(figsize=(10, 5))
# plt.bar(emo_list, total_predictions_np, color='indigo')
# plt.ylabel("Mean probability (%)")
# plt.title("Session Summary")
# plt.show()
# print(f"Emotions analyzed for: {(toc - tic):0.4f} seconds")
# return str(emotions.get(np.argmax(total_predictions_np), -1))

##################################################
import gradio as gr
from transformers import pipeline

# transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")

# def transcribe(stream, new_chunk):
#     sr, y = new_chunk
#     y = y.astype(np.float32)
#     y /= np.max(np.abs(y))
#     if stream is not None:
#         stream = np.concatenate([stream, y])
#     else:
#         stream = y
#     return stream, transcriber({"sampling_rate": sr, "raw": stream})["text"]

demo = gr.Interface(
    EmotionRecogniser,
    ["state", gr.Audio(sources=["microphone"], streaming=True)],
    ["state", "label"],
    live=True,
)

demo.launch()
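
# A local smoke-test sketch: Gradio's streaming gr.Audio delivers each chunk as a
# (sample_rate, numpy array) tuple and threads the "state" value between calls.
# The synthetic 220 Hz sine chunk below is an assumption, not part of the app.
# t = np.linspace(0, 1.0, RATE, endpoint=False)
# fake_chunk = (RATE, (0.1 * np.sin(2 * np.pi * 220 * t)).astype(np.float32))
# state, probs = EmotionRecogniser(None, fake_chunk)
# print(probs)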