Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import numpy as np | |
| import librosa | |
| import requests | |
| from io import BytesIO | |
| from PIL import Image | |
| import os | |
| from tensorflow.keras.models import load_model | |
| from faster_whisper import WhisperModel | |
def load_emotion_model(model_path):
    """Load the emotion prediction model stored at *model_path*.

    Loading is best-effort: any exception raised while loading is printed
    and swallowed so that the rest of the module can still be imported.

    Args:
        model_path: Path handed unchanged to ``load_model``.

    Returns:
        The object returned by ``load_model``, or ``None`` if loading
        raised an exception.
    """
    try:
        return load_model(model_path)
    except Exception as load_error:
        print("Error loading emotion prediction model:", load_error)
    return None
# Emotion classifier, loaded once at import time.
# `model` is None when loading failed (see load_emotion_model).
model_path = 'mymodel_SER_LSTM_RAVDESS.h5'
model = load_emotion_model(model_path)

# Speech-to-text model (faster-whisper), also created once at import time.
model_size = "small"
model2 = WhisperModel(
    model_size,
    device="cpu",
    compute_type="int8",
)
def transcribe(wav_filepath):
    """Transcribe the audio file at *wav_filepath* to text.

    Runs the module-level Whisper model (``model2``) with a beam size of 5
    and concatenates the text of every returned segment, in order, with no
    separator.

    Args:
        wav_filepath: Path of the audio file, passed straight to
            ``model2.transcribe``.

    Returns:
        The concatenated segment texts as a single string.
    """
    # transcribe() returns a pair; the second element is not used here.
    result = model2.transcribe(wav_filepath, beam_size=5)
    segments = result[0]
    return "".join(piece.text for piece in segments)
def extract_mfcc(wav_file_name):
    """Compute a time-averaged MFCC feature vector for an audio file.

    The file is loaded with librosa's default settings, 40 MFCCs are
    computed, and the coefficient matrix is averaged along axis 0 of its
    transpose, giving one mean value per coefficient.

    Args:
        wav_file_name: Path of the audio file to analyse.

    Returns:
        The averaged MFCC array, or ``None`` if loading or feature
        extraction raised an exception (the error is printed).
    """
    try:
        signal, sample_rate = librosa.load(wav_file_name)
        coefficients = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=40)
        averaged = np.mean(coefficients.T, axis=0)
    except Exception as feature_error:
        print("Error extracting MFCC features:", feature_error)
        return None
    return averaged
# Emotion names keyed by integer label; the keys run from 1 to 8 in the
# order listed below.
emotions = dict(enumerate(
    (
        'neutral',
        'calm',
        'happy',
        'sad',
        'angry',
        'fearful',
        'disgust',
        'surprised',
    ),
    start=1,
))

# Text-to-image prompt for each emotion name found in `emotions`.
emotion_prompts = dict(
    neutral="Generate a texture with neutral colors, balanced illumination, and simple shapes.",
    calm="Generate a geometric texture with soft colors and tranquil illumination, using round and calming shapes.",
    happy="Generate a geometric texture with vibrant colors and bright, sunny illumination with simple, round shapes.",
    sad="Generate a geometric texture with muted colors, somber illumination, and dark and gloomy shapes.",
    angry="Create a geometric texture with bold, dark colors, intense illumination, and sharp, irregular shapes. ",
    fearful="Generate a scary geometric texture using dark, muted colors with harsh, dim illumination and irregular shapes.",
    disgust="Generate a geometric texture with murky, sickly colors, distorted illumination effects, and irregular shapes.",
    surprised="Create a geometric texture with vibrant electric and striking colors, using high-contrast lighting and sharp, angular shapes.",
)
def predict_emotion_from_audio(wav_filepath):
    """Predict the emotion expressed in an audio file.

    MFCC features are extracted with ``extract_mfcc``, reshaped to
    ``(1, 40, 1)`` and fed to the module-level ``model``. The index of the
    highest-scoring output is mapped to an emotion name via ``emotions``.

    Args:
        wav_filepath: Path of the audio file to classify.

    Returns:
        One of the emotion names in ``emotions`` on success.
        The string ``"Error: Unable to extract features"`` when feature
        extraction returned ``None``.
        ``None`` when anything else went wrong (for example the model
        failed to load, or the features do not have 40 values); the error
        is printed.
    """
    try:
        test_point = extract_mfcc(wav_filepath)
        if test_point is None:
            return "Error: Unable to extract features"

        # The shape is passed positionally: the `newshape=` keyword is
        # deprecated in recent NumPy releases, while the positional form
        # works on every version.
        test_point = np.reshape(test_point, (1, 40, 1))
        predictions = model.predict(test_point)

        # argmax is 0-based, whereas the keys of `emotions` start at 1.
        predicted_emotion_label = np.argmax(predictions[0]) + 1
        return emotions[predicted_emotion_label]
    except Exception as e:
        print("Error predicting emotion:", e)
        return None
# DeepAI API key, read from the environment; None when the variable is unset.
api_key = os.environ.get("DeepAI_api_key")
def generate_image(api_key, text, timeout=60):
    """Generate an image from *text* with the DeepAI text2img API.

    A 512x512 image is requested using the ``'hd'`` generator version, and
    the image referenced by the response's ``output_url`` is downloaded and
    opened with PIL.

    Args:
        api_key: DeepAI API key, sent in the ``api-key`` header.
        text: Prompt describing the image to generate.
        timeout: Seconds to wait for each HTTP request before giving up.
            Without a timeout a stalled connection would block the caller
            indefinitely.

    Returns:
        A ``PIL.Image.Image`` on success, or ``None`` when the API key is
        missing, the response carries no ``output_url``, or a network,
        JSON-decoding or image-decoding error occurs (the error is printed).
    """
    if not api_key:
        # Without a key the request cannot succeed; skip the network call.
        print("Error generating image: DeepAI API key is not configured")
        return None

    url = "https://api.deepai.org/api/text2img"
    headers = {'api-key': api_key}
    payload = {
        'text': text,
        'width': 512,
        'height': 512,
        'image_generator_version': 'hd',
    }

    try:
        response = requests.post(url, data=payload, headers=headers, timeout=timeout)
        response_data = response.json()
        if not isinstance(response_data, dict) or 'output_url' not in response_data:
            return None

        image_response = requests.get(response_data['output_url'], timeout=timeout)
        # Fail here rather than handing an HTTP error page to PIL.
        image_response.raise_for_status()
        return Image.open(BytesIO(image_response.content))
    except (requests.RequestException, ValueError, OSError) as e:
        # ValueError covers a non-JSON response body; OSError covers data
        # that PIL cannot identify as an image.
        print("Error generating image:", e)
        return None
def get_predictions(audio_input):
    """Run the full pipeline for one recording.

    The recording is classified with ``predict_emotion_from_audio`` and
    transcribed with ``transcribe``; the prompt matching the predicted
    emotion is then sent to ``generate_image``.

    Args:
        audio_input: Path of the recorded audio file, or a falsy value
            (``None`` / empty string) when nothing was recorded.

    Returns:
        A tuple ``(emotion_prediction, transcribed_text, image)``. All
        three are ``None`` when *audio_input* is empty. ``image`` is
        whatever ``generate_image`` returns.
    """
    if not audio_input:
        # Nothing was recorded: passing None on to the models would raise,
        # so return empty outputs instead.
        return None, None, None

    emotion_prediction = predict_emotion_from_audio(audio_input)
    transcribed_text = transcribe(audio_input)

    # A failed prediction (None or an error string) is not a key of
    # emotion_prompts, so it falls back to the generic prompt.
    prompt_text = emotion_prompts.get(
        emotion_prediction,
        "Generate an image that represents an ambiguous emotional state.",
    )
    image = generate_image(api_key, prompt_text)
    return emotion_prediction, transcribed_text, image
# Build the Gradio UI: one microphone input feeding get_predictions, whose
# three return values map, in order, onto the three output components.
_audio_input = gr.Audio(label="Input Audio", type="filepath", sources=["microphone"])

# The two label outputs are created with visible=False; only the image is shown.
_output_components = [
    gr.Label("Acoustic Prediction", label="Acoustic Prediction", visible=False),
    gr.Label("Transcribed Text", label="Transcribed Text", visible=False),
    gr.Image(type='pil', label="Generated Image"),
]

_description = (
    "So What? is a multimedia work-in-progress project that leverages speech emotion recognition to create textured images for the What XR space.\n\n"
    "Record yourself saying the expression \"What?\" \n\n"
    "Try saying \"What?\" with different intonations to convey various emotions.\n\n"
    "Press STOP when you finish, and SUBMIT your audio to generate a texture image.\n\n"
    "Use the X icon to clear the recording instead of the button. :) "
)

interface = gr.Interface(
    fn=get_predictions,
    inputs=_audio_input,
    outputs=_output_components,
    title="So What?",
    description=_description,
)

# Start the web app as soon as the module is executed.
interface.launch()