Update app.py
app.py CHANGED
@@ -81,7 +81,7 @@ def predict_emotion_from_audio(wav_filepath):
     if test_point is not None:
         test_point = np.reshape(test_point, newshape=(1, 40, 1))
         predictions = model.predict(test_point)
-        predicted_emotion_label = np.argmax(predictions[0])
+        predicted_emotion_label = np.argmax(predictions[0])
         return emotions.get(predicted_emotion_label, "Unknown emotion")
     else:
         return "Error: Unable to extract features"
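The reshape to (1, 40, 1) above implies one 40-dimensional feature vector per clip. The commit never shows how test_point is produced; the sketch below is a common recipe (librosa MFCC means), offered as an assumption rather than the author's confirmed code. The function name extract_features and the duration/offset values are hypothetical.

import librosa
import numpy as np

# Hypothetical helper: the diff never shows how test_point is built.
# 40 MFCC means per clip is a common choice that matches the (1, 40, 1) reshape.
def extract_features(wav_filepath, n_mfcc=40):
    y, sr = librosa.load(wav_filepath, duration=3, offset=0.5)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    # Average over time to get a single 40-dim vector
    return np.mean(mfcc.T, axis=0)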
@@ -93,7 +93,7 @@ def predict_emotion_from_audio(wav_filepath):
 def analyze_sentiment(text):
     try:
         if not text or text.strip() == "":
-            return "
+            return "neutral", 0.0
 
         analysis = TextBlob(text)
         polarity = analysis.sentiment.polarity
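The mapping from polarity to a sentiment label (file lines 100-107) falls between this hunk and the next. For context, a minimal sketch of the whole function as it reads after the change; the cutoff values 0.1 and -0.1 are assumptions, since the diff elides the actual thresholds.

from textblob import TextBlob

def analyze_sentiment(text):
    try:
        if not text or text.strip() == "":
            return "neutral", 0.0
        polarity = TextBlob(text).sentiment.polarity  # float in [-1.0, 1.0]
        # Threshold values are illustrative; the commit elides them
        if polarity > 0.1:
            sentiment = "positive"
        elif polarity < -0.1:
            sentiment = "negative"
        else:
            sentiment = "neutral"
        return sentiment, polarity
    except Exception as e:
        print("Error analyzing sentiment:", e)
        return "neutral", 0.0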
@@ -108,15 +108,15 @@ def analyze_sentiment(text):
         return sentiment, polarity
     except Exception as e:
         print("Error analyzing sentiment:", e)
-        return "
+        return "neutral", 0.0
 
-# Function to generate music with MusicGen
+# Function to generate music with MusicGen (using acoustic emotion prediction)
 def generate_music(transcribed_text, emotion_prediction):
     try:
         if processor is None or music_model is None:
             return None
 
-        # Create a prompt that combines the emotion and transcription
+        # Create a prompt that combines the acoustic emotion and transcription
         prompt = f"Background music that is {emotion_prediction} and represents: {transcribed_text}"
 
         # Limit prompt length to avoid model issues
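The body of generate_music (file lines 123-150) is elided between this hunk and the next. The names processor and music_model suggest the documented transformers MusicGen usage; a sketch follows, in which the checkpoint name, the max_new_tokens budget, and the output path are assumptions.

import scipy.io.wavfile
from transformers import AutoProcessor, MusicgenForConditionalGeneration

# Checkpoint is an assumption; the diff only shows the names processor/music_model
processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
music_model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")

def generate_music_sketch(prompt):
    inputs = processor(text=[prompt], padding=True, return_tensors="pt")
    # max_new_tokens=256 yields roughly five seconds of audio
    audio_values = music_model.generate(**inputs, max_new_tokens=256)
    rate = music_model.config.audio_encoder.sampling_rate
    out_path = "musicgen_out.wav"  # hypothetical path
    scipy.io.wavfile.write(out_path, rate=rate, data=audio_values[0, 0].cpu().numpy())
    return out_path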
@@ -151,14 +151,14 @@ def generate_music(transcribed_text, emotion_prediction):
 # --- DeepAI Image Generation (Text2Img) ---
 api_key = os.getenv("DeepAI_api_key")
 
-def generate_image(
+def generate_image(sentiment_prediction, transcribed_text):
     try:
         if not api_key:
             # fallback white image if no API key
             return Image.new('RGB', (1024, 512), color='white')
 
-        # Create the prompt for text2img
-        prompt = f"Generate an equirectangular 360 image texture {
+        # Create the prompt for text2img using SENTIMENT analysis instead of acoustic emotion
+        prompt = f"Generate an equirectangular 360 image texture with {sentiment_prediction} sentiment, representing the idea of: [{transcribed_text}]."
 
         # Make request to DeepAI text2img API
         response = requests.post(
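The hunk cuts off at requests.post(. DeepAI's text2img endpoint takes the prompt as form data and returns JSON containing an output_url; the download-and-resize steps below are assumptions about the elided lines after 164, and fetch_deepai_image is a hypothetical name.

from io import BytesIO

import requests
from PIL import Image

def fetch_deepai_image(prompt, api_key):
    # Endpoint and payload follow DeepAI's documented text2img API
    response = requests.post(
        "https://api.deepai.org/api/text2img",
        data={"text": prompt},
        headers={"api-key": api_key},
        timeout=60,
    )
    response.raise_for_status()
    image_url = response.json()["output_url"]
    # Resizing to 2:1 matches the equirectangular fallback above (an assumption)
    image_bytes = requests.get(image_url, timeout=60).content
    return Image.open(BytesIO(image_bytes)).convert("RGB").resize((1024, 512))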
@@ -204,7 +204,6 @@ def create_texture_and_sphere_preview(image):
     fig.add_trace(go.Image(z=img_array), row=1, col=1)
 
     # Create a 3D sphere for the second subplot
-    # Since we can't directly apply the texture, we'll create a colored sphere
     u = np.linspace(0, 2 * np.pi, 50)
     v = np.linspace(0, np.pi, 25)
     u, v = np.meshgrid(u, v)
@@ -218,7 +217,7 @@ def create_texture_and_sphere_preview(image):
 
     fig.add_trace(go.Surface(
         x=x, y=y, z=z,
-        surfacecolor=z,
+        surfacecolor=z,
         colorscale='Viridis',
         showscale=False,
         opacity=0.8
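The x/y/z computation sits in the lines elided between the two hunks above; given the u/v grid, it is almost certainly the standard unit-sphere parameterization. A self-contained sketch of the two-panel preview follows; the subplot specs and figure assembly are assumptions, since the diff shows only fragments of the function.

import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def sphere_preview_sketch(image):
    fig = make_subplots(rows=1, cols=2,
                        specs=[[{"type": "xy"}, {"type": "surface"}]])
    fig.add_trace(go.Image(z=np.asarray(image)), row=1, col=1)

    # Parametric unit sphere, matching the u/v grid in the hunks above
    u = np.linspace(0, 2 * np.pi, 50)
    v = np.linspace(0, np.pi, 25)
    u, v = np.meshgrid(u, v)
    x, y, z = np.cos(u) * np.sin(v), np.sin(u) * np.sin(v), np.cos(v)

    # Color by height as a stand-in, since the texture is not applied directly
    fig.add_trace(go.Surface(x=x, y=y, z=z, surfacecolor=z,
                             colorscale='Viridis', showscale=False,
                             opacity=0.8), row=1, col=2)
    return fig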
@@ -249,20 +248,19 @@ def create_texture_and_sphere_preview(image):
 
 # Function to get predictions
 def get_predictions(audio_input):
+    # Get acoustic emotion prediction (for music)
     emotion_prediction = predict_emotion_from_audio(audio_input)
-    transcribed_text = transcribe(audio_input)
 
-    #
-
-    emotion_prediction = "Unknown"
+    # Get transcribed text
+    transcribed_text = transcribe(audio_input)
 
-    # Analyze sentiment of transcribed text
+    # Analyze sentiment of transcribed text (for image)
     sentiment, polarity = analyze_sentiment(transcribed_text)
 
-    # Generate image using
-    image = generate_image(
+    # Generate image using SENTIMENT analysis
+    image = generate_image(sentiment, transcribed_text)
 
-    # Generate music
+    # Generate music using ACOUSTIC EMOTION prediction
     music_path = generate_music(transcribed_text, emotion_prediction)
 
     # Create visualization with both texture and sphere
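get_predictions calls a transcribe helper that never appears in the diff. One common way to supply it on a Space is a transformers speech-recognition pipeline; the sketch below assumes that approach, and the Whisper checkpoint name is hypothetical.

from transformers import pipeline

# Hypothetical: the commit never shows how transcribe is implemented
asr = pipeline("automatic-speech-recognition", model="openai/whisper-base")

def transcribe(audio_filepath):
    try:
        return asr(audio_filepath)["text"]
    except Exception as e:
        print("Error transcribing audio:", e)
        return ""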
@@ -275,15 +273,15 @@ interface = gr.Interface(
     fn=get_predictions,
     inputs=gr.Audio(label="Input Audio", type="filepath", sources=["microphone"]),
     outputs=[
-        gr.Label(label="Acoustic Prediction"),
+        gr.Label(label="Acoustic Emotion Prediction (for music)"),
         gr.Label(label="Transcribed Text"),
-        gr.Label(label="Sentiment Analysis"),
+        gr.Label(label="Sentiment Analysis (for image)"),
         gr.Image(type='pil', label="Generated Equirectangular Image"),
         gr.Audio(label="Generated Music", type="filepath"),
         gr.Plot(label="Texture and Sphere Preview")
     ],
     title="Affective Virtual Environments",
-    description="Create an AVE using your voice. Get emotion prediction, transcription, sentiment analysis, a generated equirectangular image, music, and a preview of how it would look as a texture on a sphere."
+    description="Create an AVE using your voice. Get emotion prediction (for music), transcription, sentiment analysis (for image), a generated equirectangular image, music, and a preview of how it would look as a texture on a sphere."
 )
 
 interface.launch()