jfforero committed on
Commit 325f2d7 · verified · 1 Parent(s): f47982e

Update app.py

Files changed (1)
  1. app.py +28 -8
app.py CHANGED
@@ -42,6 +42,18 @@ def extract_mfcc(wav_file_name):
 # Emotions dictionary
 emotions = {1: 'neutral', 2: 'calm', 3: 'happy', 4: 'sad', 5: 'angry', 6: 'fearful', 7: 'disgust', 8: 'surprised'}
 
+# Prompts for each emotion
+emotion_prompts = {
+    'neutral': "Generate an image that represents a balanced and neutral environment with a calm, serene setting.",
+    'calm': "Create a peaceful landscape with soft colors and a tranquil atmosphere.",
+    'happy': "Design a bright, sunny environment full of vibrant colors and cheerful elements.",
+    'sad': "Generate a dark, gloomy scene with muted colors to convey a somber mood.",
+    'angry': "Create an intense, fiery scene with bold colors and dynamic elements.",
+    'fearful': "Generate an eerie, ominous environment with shadows and unsettling details.",
+    'disgust': "Design a scene that feels unsettling, with distorted, unnatural elements.",
+    'surprised': "Create a scene with unexpected contrasts, bright lights, and elements that convey a sense of surprise."
+}
+
 # Function to predict emotion from audio
 def predict_emotion_from_audio(wav_filepath):
     try:
@@ -81,23 +93,31 @@ def generate_image(api_key, text):
 def get_predictions(audio_input):
     emotion_prediction = predict_emotion_from_audio(audio_input)
     transcribed_text = transcribe(audio_input)
-    texto_imagen = "Generate an equirectangular texture image' representing the concept of: [ " + emotion_prediction + "]ness. Illustrate this idea using a " + emotion_prediction + " graphic style."
-    image = generate_image(api_key, texto_imagen)
+
+    # Get the corresponding prompt for the predicted emotion
+    if emotion_prediction in emotion_prompts:
+        prompt_text = emotion_prompts[emotion_prediction]
+    else:
+        prompt_text = "Generate an image that represents an ambiguous emotional state."
+
+    image = generate_image(api_key, prompt_text)
     return emotion_prediction, transcribed_text, image
 
 # Create the Gradio interface
 interface = gr.Interface(
     fn=get_predictions,
-    inputs=gr.Audio(label="Input Audio", type="filepath", sources=["microphone"]), # gr.Audio(label="Input Audio", type="filepath")
+    inputs=gr.Audio(label="Input Audio", type="filepath", sources=["microphone"]),
     outputs=[
         gr.Label("Acoustic Prediction", label="Acoustic Prediction", visible=False),
         gr.Label("Transcribed Text", label="Transcribed Text", visible=False),
         gr.Image(type='pil', label="Generated Image")
     ],
-    title = "So What?",
-    description = "So What? is a multimedia work-in-progress project that leverages speech emotion recognition to create textured images for the What XR space.\n\n Record yourself saying the expression \"What?\" \n\n Try saying \"What?\" with different intonations to convey various emotions.\n\n Press STOP when you finish, and SUBMIT your audio to generate a texture image.\n\n Use the X icon to clear the recording instead of the button. :) "
-
-
+    title="So What?",
+    description=("So What? is a multimedia work-in-progress project that leverages speech emotion recognition to create textured images for the What XR space.\n\n"
+                 "Record yourself saying the expression \"What?\" \n\n"
+                 "Try saying \"What?\" with different intonations to convey various emotions.\n\n"
+                 "Press STOP when you finish, and SUBMIT your audio to generate a texture image.\n\n"
+                 "Use the X icon to clear the recording instead of the button. :) ")
 )
 
-interface.launch()
+interface.launch()
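For quick reference, the fallback lookup that the new get_predictions performs can be exercised in isolation. A minimal sketch, assuming the same labels as the emotion_prompts dictionary above; prompt_for is a hypothetical helper (not in app.py), and dict.get is an equivalent one-call form of the commit's if/else:

# Sketch: map a predicted emotion label to its prompt, falling back to the
# commit's "ambiguous" default for any label without an entry.
emotion_prompts = {
    'happy': "Design a bright, sunny environment full of vibrant colors and cheerful elements.",
    'sad': "Generate a dark, gloomy scene with muted colors to convey a somber mood.",
}

def prompt_for(emotion_prediction):
    # dict.get collapses the if/else fallback into a single call.
    return emotion_prompts.get(
        emotion_prediction,
        "Generate an image that represents an ambiguous emotional state.")

print(prompt_for('happy'))    # mapped prompt
print(prompt_for('unknown'))  # fallback prompt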
 
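Below is a runnable sketch of the interface wiring after this commit, with the emotion model, transcription, and image generation stubbed out so the Gradio layout can be tried without model weights or an API key. The stub bodies and the solid-color placeholder image are illustrative assumptions, not code from this repo; the component arguments match the diff (Gradio 4.x style).

import gradio as gr
from PIL import Image

def get_predictions(audio_input):
    # Stubs standing in for predict_emotion_from_audio, transcribe,
    # and generate_image from app.py.
    emotion_prediction = "happy"
    transcribed_text = "What?"
    image = Image.new("RGB", (512, 512), "gold")  # placeholder texture
    return emotion_prediction, transcribed_text, image

interface = gr.Interface(
    fn=get_predictions,
    inputs=gr.Audio(label="Input Audio", type="filepath", sources=["microphone"]),
    outputs=[
        gr.Label(label="Acoustic Prediction", visible=False),
        gr.Label(label="Transcribed Text", visible=False),
        gr.Image(type="pil", label="Generated Image"),
    ],
    title="So What?",
)

interface.launch()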