import gradio as gr
import numpy as np
import librosa
import requests
from io import BytesIO
from PIL import Image
import os
from tensorflow.keras.models import load_model
from faster_whisper import WhisperModel
# Load the emotion prediction model
def load_emotion_model(model_path):
    try:
        model = load_model(model_path)
        return model
    except Exception as e:
        print("Error loading emotion prediction model:", e)
        return None
model_path = 'mymodel_SER_LSTM_RAVDESS.h5'
model = load_emotion_model(model_path)
# Initialize WhisperModel
model_size = "small"
model2 = WhisperModel(model_size, device="cpu", compute_type="int8")
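# A minimal GPU alternative (an assumption, not part of this setup;
# requires a CUDA-capable device):
# model2 = WhisperModel(model_size, device="cuda", compute_type="float16")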
# Function to transcribe audio
def transcribe(wav_filepath):
    segments, _ = model2.transcribe(wav_filepath, beam_size=5)
    return "".join([segment.text for segment in segments])
# Function to extract MFCC features from audio
def extract_mfcc(wav_file_name):
    try:
        y, sr = librosa.load(wav_file_name)
        mfccs = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40).T, axis=0)
        return mfccs
    except Exception as e:
        print("Error extracting MFCC features:", e)
        return None
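# Averaging the MFCCs over time yields a single 40-dimensional vector, which
# matches the (1, 40, 1) input shape the LSTM model expects (see
# predict_emotion_from_audio below).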
# Emotions dictionary
emotions = {1: 'neutral', 2: 'calm', 3: 'happy', 4: 'sad', 5: 'angry', 6: 'fearful', 7: 'disgust', 8: 'surprised'}
# Prompts for each emotion
emotion_prompts = {
    'neutral': "Generate a texture with neutral colors, balanced illumination, and simple shapes.",
    'calm': "Generate a geometric texture with soft colors and tranquil illumination, using round and calming shapes.",
    'happy': "Generate a geometric texture with vibrant colors and bright, sunny illumination with simple, round shapes.",
    'sad': "Generate a geometric texture with muted colors, somber illumination, and dark and gloomy shapes.",
    'angry': "Create a geometric texture with bold, dark colors, intense illumination, and sharp, irregular shapes.",
    'fearful': "Generate a scary geometric texture using dark, muted colors with harsh, dim illumination and irregular shapes.",
    'disgust': "Generate a geometric texture with murky, sickly colors, distorted illumination effects, and irregular shapes.",
    'surprised': "Create a geometric texture with vibrant electric and striking colors, using high-contrast lighting and sharp, angular shapes."
}
# Function to predict emotion from audio
def predict_emotion_from_audio(wav_filepath):
    try:
        test_point = extract_mfcc(wav_filepath)
        if test_point is not None:
            test_point = np.reshape(test_point, newshape=(1, 40, 1))
            predictions = model.predict(test_point)
            # Class indices are 0-based; add 1 to map to the RAVDESS emotion codes.
            predicted_emotion_label = np.argmax(predictions[0]) + 1
            return emotions[predicted_emotion_label]
        else:
            return "Error: Unable to extract features"
    except Exception as e:
        print("Error predicting emotion:", e)
        return None
api_key = os.getenv("DeepAI_api_key")
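# Assumption: in a Hugging Face Space the key would be stored as a repository
# secret named "DeepAI_api_key", which is exposed as an environment variable.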
# Function to generate an image using DeepAI Text to Image API
def generate_image(api_key, text):
    url = "https://api.deepai.org/api/text2img"
    headers = {'api-key': api_key}
    response = requests.post(
        url,
        data={'text': text, 'width': 512, 'height': 512, 'image_generator_version': 'hd'},
        headers=headers
    )
    response_data = response.json()
    if 'output_url' in response_data:
        image_url = response_data['output_url']
        image_response = requests.get(image_url)
        image = Image.open(BytesIO(image_response.content))
        return image
    else:
        return None
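# Example (hypothetical, for a quick check outside the app):
# img = generate_image(api_key, emotion_prompts['calm'])
# if img is not None:
#     img.save("calm_texture.png")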
# Function to get predictions
def get_predictions(audio_input):
    emotion_prediction = predict_emotion_from_audio(audio_input)
    transcribed_text = transcribe(audio_input)
    # Get the corresponding prompt for the predicted emotion
    if emotion_prediction in emotion_prompts:
        prompt_text = emotion_prompts[emotion_prediction]
    else:
        prompt_text = "Generate an image that represents an ambiguous emotional state."
    image = generate_image(api_key, prompt_text)
    return emotion_prediction, transcribed_text, image
# Create the Gradio interface
interface = gr.Interface(
    fn=get_predictions,
    inputs=gr.Audio(label="Input Audio", type="filepath", sources=["microphone"]),
    outputs=[
        gr.Label("Acoustic Prediction", label="Acoustic Prediction", visible=False),
        gr.Label("Transcribed Text", label="Transcribed Text", visible=False),
        gr.Image(type='pil', label="Generated Image")
    ],
    title="So What?",
    description=(
        "So What? is a multimedia work-in-progress project that leverages speech emotion recognition "
        "to create textured images for the What XR space.\n\n"
        "Record yourself saying the expression \"What?\"\n\n"
        "Try saying \"What?\" with different intonations to convey various emotions.\n\n"
        "Press STOP when you finish, and SUBMIT your audio to generate a texture image.\n\n"
        "To clear a recording, use the X icon rather than the button. :)"
    )
)
interface.launch()
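# When running outside Spaces, a temporary public link could be created with:
# interface.launch(share=True)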