| import pandas as pd |
| import numpy as np |
|
|
| import librosa |
|
|
| import sklearn |
| from sklearn.preprocessing import StandardScaler, OneHotEncoder |
| from sklearn.model_selection import train_test_split |
|
|
| import tensorflow as tf |
| from keras.models import load_model |
|
|
| import pickle |
|
|
# Target audio sampling rate in Hz (matches librosa.load's default of 22050).
# NOTE(review): this module-level value is shadowed by locals of the same name
# in extract_process/export_process and appears unused — confirm before removal.
sample_rate = 22050
|
|
| def noise(data): |
| noise_value = 0.015 * np.random.uniform() * np.amax(data) |
| data = data + noise_value * np.random.normal(size=data.shape[0]) |
| return data |
|
|
def stretch(data, rate=0.8):
    """Time-stretch the signal by *rate* (values < 1 slow it down) without changing pitch."""
    return librosa.effects.time_stretch(y=data, rate=rate)
|
|
| def shift(data): |
| shift_range = int(np.random.uniform(low=-5, high=5) * 1000) |
| return np.roll(data, shift_range) |
|
|
def pitch(data, sampling_rate, pitch_factor=0.7):
    """Shift the signal's pitch by *pitch_factor* semitone steps, preserving duration."""
    return librosa.effects.pitch_shift(y=data, sr=sampling_rate, n_steps=pitch_factor)
|
|
def extract_process(data, sample_rate=22050):
    """Extract a flat feature vector from a mono audio signal.

    The vector is the concatenation of the time-averaged (mean over frames)
    zero-crossing rate, chroma STFT, MFCC, RMS energy, and mel-spectrogram
    features, in that order.

    Parameters
    ----------
    data : np.ndarray
        Mono audio time series.
    sample_rate : int, optional
        Sampling rate of *data* in Hz. Defaults to 22050, which was
        previously hard-coded and matches librosa.load's default.

    Returns
    -------
    np.ndarray
        1-D array of concatenated per-feature means.
    """
    zcr = np.mean(librosa.feature.zero_crossing_rate(y=data).T, axis=0)

    # Magnitude spectrogram is reused for the chroma feature.
    stft_mag = np.abs(librosa.stft(data))
    chroma = np.mean(librosa.feature.chroma_stft(S=stft_mag, sr=sample_rate).T, axis=0)

    mfcc = np.mean(librosa.feature.mfcc(y=data, sr=sample_rate).T, axis=0)
    rms = np.mean(librosa.feature.rms(y=data).T, axis=0)
    mel = np.mean(librosa.feature.melspectrogram(y=data, sr=sample_rate).T, axis=0)

    # Single concatenation replaces the original chain of incremental hstacks;
    # the resulting vector is identical.
    return np.hstack((zcr, chroma, mfcc, rms, mel))
|
|
def export_process(path):
    """Load an audio file and build a feature matrix for it.

    Returns a 2-D array with one row of extract_process features per
    variant: the raw signal, a noise-augmented copy, and a
    time-stretched + pitch-shifted copy.
    """
    # First 5 seconds, skipping the first second (librosa resamples to 22050 Hz).
    data, sample_rate = librosa.load(path, duration=5, offset=1)

    rows = [extract_process(data)]

    # Augmentation 1: additive white noise.
    rows.append(extract_process(noise(data)))

    # Augmentation 2: time stretch followed by pitch shift.
    stretched = stretch(data)
    rows.append(extract_process(pitch(stretched, sample_rate)))

    return np.vstack(rows)
|
|
| |
# --- Module-level training-data preparation -------------------------------
# Builds the fitted OneHotEncoder (`encoder_label`) and StandardScaler
# (`scaler_data`) that the inference helpers below depend on.
# NOTE(review): pickle.load executes arbitrary code from the file — only
# load these .pkl files from a trusted source.

# Pre-extracted feature matrix.
with open('X_train.pkl', 'rb') as f:
    X_train = pickle.load(f)


# Corresponding emotion labels.
with open('Y_train.pkl', 'rb') as f:
    Y_train = pickle.load(f)


# Combine features and labels into one DataFrame.
Features = pd.DataFrame(X_train)
Features['labels'] = Y_train


# Split back into feature matrix X and label vector Y.
X = Features.iloc[: ,:-1].values
Y = Features['labels'].values


# One-hot encode the string labels; the fitted encoder is reused later by
# predict_emotion to map model probabilities back to label strings.
encoder_label = OneHotEncoder()
Y = encoder_label.fit_transform(np.array(Y).reshape(-1,1)).toarray()


# 90/10 train/test split with a fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.9, random_state=42, shuffle=True)


# Standardize features; the scaler is fitted on the training split only and
# reused by preprocess_audio for inference inputs.
scaler_data = StandardScaler()
x_train = scaler_data.fit_transform(x_train)
x_test = scaler_data.transform(x_test)
|
|
def preprocess_audio(audio):
    """Convert an audio file path into a scaled 3-D feature tensor for the model."""
    raw_features = export_process(audio)
    # Apply the scaler fitted on the training data above.
    scaled = scaler_data.transform(raw_features)
    # Add a trailing channel axis expected by the Conv1D-style model input.
    return np.expand_dims(scaled, axis=2)
|
|
| |
def predict_emotion(preprocessed_audio):
    """Predict the emotion label for a batch of preprocessed audio features.

    Parameters
    ----------
    preprocessed_audio : np.ndarray
        Output of preprocess_audio (scaled features with a channel axis).

    Returns
    -------
    The decoded label(s) for the first sample in the batch, obtained by
    inverting the one-hot encoding fitted at module load time.
    """
    # Fix: the original reloaded the HDF5 model from disk on every call.
    # Cache it on the function object so repeated predictions are cheap.
    model = getattr(predict_emotion, "_model", None)
    if model is None:
        model = load_model('speech-emotion-recognition.hdf5')
        predict_emotion._model = model
    prediction = model.predict(preprocessed_audio)
    # Map one-hot probability rows back to their original string labels.
    predicted_emotion = encoder_label.inverse_transform(prediction)
    return predicted_emotion[0]
|
|
| |
def live_emotion_recognition(audio_path):
    """End-to-end inference: audio file path -> predicted emotion value."""
    features = preprocess_audio(audio_path)
    # predict_emotion returns the first decoded row; take its first element.
    return predict_emotion(features)[0]