| """ |
| Speech Emotion Recognition Inference Script |
| """ |
|
|
| import librosa |
| import numpy as np |
| import tensorflow as tf |
| from tensorflow.keras.models import load_model |
| from sklearn.preprocessing import LabelEncoder |
| import argparse |
|
|
def extract_feature(data, sr, mfcc=True, chroma=True, mel=True):
    """Build one flat feature vector from an audio signal.

    Each enabled feature is computed with librosa and then averaged with
    ``np.mean(feature.T, axis=0)`` (for a mono signal: the mean over time
    frames), so every feature contributes one value per coefficient/band.

    Args:
        data: Audio time series, passed to librosa as ``y``.
        sr: Sampling rate of ``data``.
        mfcc: Include the mean of 40 MFCCs.
        chroma: Include the mean chromagram computed from the magnitude
            STFT of ``data``.
        mel: Include the mean mel spectrogram bands.

    Returns:
        A 1-D numpy array with the enabled features concatenated in the
        order MFCC, chroma, mel. It is empty when every flag is false.
    """
    # Seed with an empty (float64) array so that the result is an empty
    # array when nothing is enabled and the concatenated dtype is promoted
    # exactly as it would be when stacking onto ``np.array([])``.
    parts = [np.array([])]

    if mfcc:
        mfccs = librosa.feature.mfcc(y=data, sr=sr, n_mfcc=40)
        parts.append(np.mean(mfccs.T, axis=0))

    if chroma:
        # chroma_stft is fed the magnitude spectrogram; keep this as-is so
        # the features match whatever the model was trained on.
        magnitude = np.abs(librosa.stft(data))
        chromagram = librosa.feature.chroma_stft(S=magnitude, sr=sr)
        parts.append(np.mean(chromagram.T, axis=0))

    if mel:
        mel_spec = librosa.feature.melspectrogram(y=data, sr=sr)
        parts.append(np.mean(mel_spec.T, axis=0))

    return np.hstack(parts)
|
|
# Emotion codes mapped to display names.
# NOTE(review): the two-digit codes look like dataset emotion identifiers;
# only the names are used below -- confirm the codes against the training data.
_EMOTIONS = {
    '01': 'Neutral',
    '02': 'Calm',
    '03': 'Happy',
    '04': 'Sad',
    '05': 'Angry',
    '06': 'Fearful',
    '07': 'Disgust',
    '08': 'Surprised',
}

# Emoji shown next to each emotion name. Written as Unicode escapes so the
# source survives being re-encoded. The previous literals were mis-decoded
# UTF-8; the last five entries are recovered exactly from the leftover bytes,
# while Neutral/Calm/Happy had lost their final byte and are best-guess faces.
_EMOJIS = {
    'Neutral': '\U0001F610',    # neutral face
    'Calm': '\U0001F60C',       # relieved face
    'Happy': '\U0001F60A',      # smiling face with smiling eyes
    'Sad': '\U0001F622',        # crying face
    'Angry': '\U0001F620',      # angry face
    'Fearful': '\U0001F628',    # fearful face
    'Disgust': '\U0001F922',    # nauseated face
    'Surprised': '\U0001F632',  # astonished face
}


def predict_emotion(audio_path, model_path='trained_model.h5'):
    """Predict the emotion expressed in an audio file.

    The file is loaded with librosa at 22050 Hz, turned into a feature
    vector by ``extract_feature`` (MFCC, chroma and mel all enabled),
    reshaped to ``(1, n_features, 1)`` and passed to the Keras model stored
    at ``model_path``.

    Args:
        audio_path: Path to the audio file to classify.
        model_path: Path to the saved Keras model.

    Returns:
        A tuple ``(emotion, emoji, scores)`` where ``emotion`` is the
        predicted emotion name, ``emoji`` is the matching emoji string and
        ``scores`` is the first row of the model output.

    Raises:
        Errors from librosa, Keras or scikit-learn are not caught here and
        propagate to the caller (for example an unreadable audio file or a
        class index outside the eight known labels).
    """
    data, sr = librosa.load(audio_path, sr=22050)

    feature = extract_feature(data, sr, mfcc=True, chroma=True, mel=True)
    # (n_features,) -> (1, n_features, 1): add a leading and a trailing axis.
    feature = np.expand_dims(feature, axis=0)
    feature = np.expand_dims(feature, axis=2)

    # The model is loaded on every call; callers classifying many files may
    # want to load it once themselves.
    model = load_model(model_path)
    prediction = model.predict(feature)
    predicted_class = np.argmax(prediction, axis=1)

    # LabelEncoder sorts its classes, so index 0 is 'Angry', 1 is 'Calm', ...
    # (alphabetical order), NOT the order of the codes in _EMOTIONS.
    # NOTE(review): this decodes correctly only if training encoded the
    # labels the same way -- confirm against the training script.
    label_encoder = LabelEncoder()
    label_encoder.fit(list(_EMOTIONS.values()))
    predicted_emotion = label_encoder.inverse_transform(predicted_class)[0]

    return predicted_emotion, _EMOJIS[predicted_emotion], prediction[0]
|
|
def main(argv=None):
    """Command-line entry point: classify one audio file and print the result.

    Args:
        argv: Argument list to parse. ``None`` makes argparse read the
            process command line.

    Raises:
        SystemExit: On bad arguments (raised by argparse) or when the
            prediction fails. In the latter case the message
            ``Error: <reason>`` is written to stderr and the exit status
            is 1, so callers and shell scripts can detect the failure.
    """
    parser = argparse.ArgumentParser(description='Predict emotion from audio file')
    parser.add_argument('audio_path', help='Path to audio file')
    parser.add_argument('--model', default='trained_model.h5', help='Path to model file')

    args = parser.parse_args(argv)

    try:
        emotion, emoji, confidence = predict_emotion(args.audio_path, args.model)
        # Printing stays inside the try: writing the emoji can itself fail
        # on consoles that cannot encode it.
        print(f"Predicted Emotion: {emotion} {emoji}")
        print(f"Confidence scores: {confidence}")
    except Exception as exc:
        # Top-level boundary: report briefly instead of dumping a traceback.
        # SystemExit with a string prints it to stderr and exits with status 1.
        raise SystemExit(f"Error: {exc}") from exc


if __name__ == "__main__":
    main()
|
|