jfforero committed on
Commit 325f2d7 · verified · 1 Parent(s): f47982e

Update app.py

Files changed (1)
  1. app.py +28 -8
app.py CHANGED
@@ -42,6 +42,18 @@ def extract_mfcc(wav_file_name):
 # Emotions dictionary
 emotions = {1: 'neutral', 2: 'calm', 3: 'happy', 4: 'sad', 5: 'angry', 6: 'fearful', 7: 'disgust', 8: 'surprised'}
 
+# Prompts for each emotion
+emotion_prompts = {
+    'neutral': "Generate an image that represents a balanced and neutral environment with a calm, serene setting.",
+    'calm': "Create a peaceful landscape with soft colors and a tranquil atmosphere.",
+    'happy': "Design a bright, sunny environment full of vibrant colors and cheerful elements.",
+    'sad': "Generate a dark, gloomy scene with muted colors to convey a somber mood.",
+    'angry': "Create an intense, fiery scene with bold colors and dynamic elements.",
+    'fearful': "Generate an eerie, ominous environment with shadows and unsettling details.",
+    'disgust': "Design a scene that feels unsettling, with distorted, unnatural elements.",
+    'surprised': "Create a scene with unexpected contrasts, bright lights, and elements that convey a sense of surprise."
+}
+
 # Function to predict emotion from audio
 def predict_emotion_from_audio(wav_filepath):
     try:
@@ -81,23 +93,31 @@ def generate_image(api_key, text):
 def get_predictions(audio_input):
     emotion_prediction = predict_emotion_from_audio(audio_input)
     transcribed_text = transcribe(audio_input)
-    texto_imagen = "Generate an equirectangular texture image' representing the concept of: [ " + emotion_prediction + "]ness. Illustrate this idea using a " + emotion_prediction + " graphic style."
-    image = generate_image(api_key, texto_imagen)
+
+    # Get the corresponding prompt for the predicted emotion
+    if emotion_prediction in emotion_prompts:
+        prompt_text = emotion_prompts[emotion_prediction]
+    else:
+        prompt_text = "Generate an image that represents an ambiguous emotional state."
+
+    image = generate_image(api_key, prompt_text)
     return emotion_prediction, transcribed_text, image
 
 # Create the Gradio interface
 interface = gr.Interface(
     fn=get_predictions,
-    inputs=gr.Audio(label="Input Audio", type="filepath", sources=["microphone"]), # gr.Audio(label="Input Audio", type="filepath")
+    inputs=gr.Audio(label="Input Audio", type="filepath", sources=["microphone"]),
     outputs=[
         gr.Label("Acoustic Prediction", label="Acoustic Prediction", visible=False),
         gr.Label("Transcribed Text", label="Transcribed Text", visible=False),
         gr.Image(type='pil', label="Generated Image")
     ],
-    title = "So What?",
-    description = "So What? is a multimedia work-in-progress project that leverages speech emotion recognition to create textured images for the What XR space.\n\n Record yourself saying the expression \"What?\" \n\n Try saying \"What?\" with different intonations to convey various emotions.\n\n Press STOP when you finish, and SUBMIT your audio to generate a texture image.\n\n Use the X icon to clear the recording instead of the button. :) "
-
-
+    title="So What?",
+    description=("So What? is a multimedia work-in-progress project that leverages speech emotion recognition to create textured images for the What XR space.\n\n"
+                 "Record yourself saying the expression \"What?\" \n\n"
+                 "Try saying \"What?\" with different intonations to convey various emotions.\n\n"
+                 "Press STOP when you finish, and SUBMIT your audio to generate a texture image.\n\n"
+                 "Use the X icon to clear the recording instead of the button. :) ")
 )
 
-interface.launch()
+interface.launch()
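For quick reference, the fallback lookup that the new get_predictions performs can be exercised in isolation. A minimal sketch, assuming the same labels as the emotion_prompts dictionary above; prompt_for is a hypothetical helper (not in app.py), and dict.get is an equivalent one-call form of the commit's if/else:

# Sketch: map a predicted emotion label to its prompt, falling back to the
# commit's "ambiguous" default for any label without an entry.
emotion_prompts = {
    'happy': "Design a bright, sunny environment full of vibrant colors and cheerful elements.",
    'sad': "Generate a dark, gloomy scene with muted colors to convey a somber mood.",
}

def prompt_for(emotion_prediction):
    # dict.get collapses the if/else fallback into a single call.
    return emotion_prompts.get(
        emotion_prediction,
        "Generate an image that represents an ambiguous emotional state.")

print(prompt_for('happy'))    # mapped prompt
print(prompt_for('unknown'))  # fallback prompt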
 
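Below is a runnable sketch of the interface wiring after this commit, with the emotion model, transcription, and image generation stubbed out so the Gradio layout can be tried without model weights or an API key. The stub bodies and the solid-color placeholder image are illustrative assumptions, not code from this repo; the component arguments match the diff (Gradio 4.x style).

import gradio as gr
from PIL import Image

def get_predictions(audio_input):
    # Stubs standing in for predict_emotion_from_audio, transcribe,
    # and generate_image from app.py.
    emotion_prediction = "happy"
    transcribed_text = "What?"
    image = Image.new("RGB", (512, 512), "gold")  # placeholder texture
    return emotion_prediction, transcribed_text, image

interface = gr.Interface(
    fn=get_predictions,
    inputs=gr.Audio(label="Input Audio", type="filepath", sources=["microphone"]),
    outputs=[
        gr.Label(label="Acoustic Prediction", visible=False),
        gr.Label(label="Transcribed Text", visible=False),
        gr.Image(type="pil", label="Generated Image"),
    ],
    title="So What?",
)

interface.launch()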