jfforero committed
Commit 90faaef · verified · 1 Parent(s): a8d1b32

Update app.py

Files changed (1)
  1. app.py +884 -162
app.py CHANGED
@@ -18,11 +18,18 @@ import tempfile
18
  import base64
19
  import plotly.graph_objects as go
20
  from plotly.subplots import make_subplots
21
-
22
-
23
-
24
-
25
-
26
 
27
  # Load the emotion prediction model
28
  def load_emotion_model(model_path):
@@ -56,6 +63,49 @@ def load_musicgen_model():
56
 
57
  processor, music_model, device = load_musicgen_model()
58
 
59
  # Function to transcribe audio
60
  def transcribe(wav_filepath):
61
  try:
@@ -118,40 +168,46 @@ def analyze_sentiment(text):
118
  return "neutral", 0.0
119
 
120
  # Function to get image prompt based on sentiment
121
- def get_image_prompt(sentiment, transcribed_text):
122
  if sentiment == "positive":
123
- return f"Generate a vibrant, uplifting equirectangular 360 image texture with bright colors, joyful atmosphere, and optimistic vibes representing: [{transcribed_text}]. The scene should evoke happiness and positivity."
124
 
125
  elif sentiment == "negative":
126
- return f"Generate a moody, dramatic equirectangular 360 image texture with dark tones, intense atmosphere, and emotional depth representing: [{transcribed_text}]. The scene should convey melancholy and intensity."
127
 
128
  else: # neutral
129
- return f"Generate a balanced, serene equirectangular 360 image texture with harmonious colors, peaceful atmosphere, and calm vibes representing: [{transcribed_text}]. The scene should evoke tranquility and balance."
130
 
131
  # Function to get music prompt based on emotion
132
- def get_music_prompt(emotion, transcribed_text):
133
  emotion_prompts = {
134
- 'neutral': f"Create ambient, background music with neutral tones, subtle melodies, and unobtrusive atmosphere that complements: {transcribed_text}. The music should be calm and balanced.",
135
- 'calm': f"Generate soothing, peaceful music with gentle melodies, soft instrumentation, and relaxing vibes that represents: {transcribed_text}. The music should evoke tranquility and serenity.",
136
- 'happy': f"Create joyful, upbeat music with cheerful melodies, bright instrumentation, and energetic rhythms that celebrates: {transcribed_text}. The music should evoke happiness and positivity.",
137
- 'sad': f"Generate emotional, melancholic music with poignant melodies, soft strings, and heartfelt atmosphere that reflects: {transcribed_text}. The music should evoke sadness and reflection.",
138
- 'angry': f"Create intense, powerful music with driving rhythms, aggressive instrumentation, and strong dynamics that expresses: {transcribed_text}. The music should evoke anger and intensity.",
139
- 'fearful': f"Generate suspenseful, tense music with eerie melodies, atmospheric sounds, and unsettling vibes that represents: {transcribed_text}. The music should evoke fear and anticipation.",
140
- 'disgust': f"Create dark, unsettling music with dissonant harmonies, unusual sounds, and uncomfortable atmosphere that reflects: {transcribed_text}. The music should evoke discomfort and unease.",
141
- 'surprised': f"Generate dynamic, unexpected music with sudden changes, playful melodies, and surprising elements that represents: {transcribed_text}. The music should evoke surprise and wonder."
142
  }
143
 
144
- return emotion_prompts.get(emotion.lower(),
145
- f"Create background music with {emotion} atmosphere that represents: {transcribed_text}")
146
-
147
  # Function to generate music with MusicGen (using acoustic emotion prediction)
148
- def generate_music(transcribed_text, emotion_prediction):
149
  try:
150
  if processor is None or music_model is None:
151
  return None
152
 
153
  # Get specific prompt based on emotion
154
- prompt = get_music_prompt(emotion_prediction, transcribed_text)
155
 
156
  # Limit prompt length to avoid model issues
157
  if len(prompt) > 200:
@@ -185,167 +241,833 @@ def generate_music(transcribed_text, emotion_prediction):
185
  # --- DeepAI Image Generation (Text2Img) ---
186
  api_key = os.getenv("DeepAI_api_key")
187
 
188
- def generate_image(sentiment_prediction, transcribed_text):
189
  try:
190
  if not api_key:
191
  # fallback white image if no API key
192
- return Image.new('RGB', (1024, 512), color='white')
193
 
194
- # Get specific prompt based on sentiment
195
- prompt = get_image_prompt(sentiment_prediction, transcribed_text)
196
 
197
- # Make request to DeepAI text2img API
198
- response = requests.post(
199
- "https://api.deepai.org/api/text2img",
200
- data={
201
- 'text': prompt,
202
- 'width': 1024,
203
- 'height': 512,
204
- 'image_generator_version': 'hd'
205
- },
206
- headers={'api-key': api_key}
207
- )
208
 
209
- data = response.json()
210
- if 'output_url' in data:
211
- # Download the generated image
212
- img_resp = requests.get(data['output_url'])
213
- return Image.open(BytesIO(img_resp.content))
214
- else:
215
- print("Error in DeepAI response:", data)
216
- # Return a fallback image
217
- return Image.new('RGB', (1024, 512), color='white')
218
  except Exception as e:
219
  print("Error generating image:", e)
220
  # Return a fallback image
221
- return Image.new('RGB', (1024, 512), color='white')
222
 
223
- # Function to create a visualization with both the equirectangular image and a 3D sphere
224
- # Function to create a visualization with both the equirectangular image and a 3D sphere
225
- def create_texture_and_sphere_preview(image):
226
  try:
227
- # Convert PIL image to numpy array
228
- img_array = np.array(image)
229
- height, width = img_array.shape[0], img_array.shape[1]
230
 
231
- # Create a subplot with the equirectangular image and a 3D sphere
232
- fig = make_subplots(
233
- rows=1, cols=2,
234
- subplot_titles=("Equirectangular Texture", "3D Sphere with Texture Mapping"),
235
- specs=[[{"type": "image"}, {"type": "scatter3d"}]],
236
- horizontal_spacing=0.1
237
- )
238
 
239
- # Add the equirectangular image to the first subplot
240
- fig.add_trace(go.Image(z=img_array), row=1, col=1)
241
 
242
- # Create sphere coordinates
243
- u_res, v_res = 50, 25
244
- u = np.linspace(0, 2 * np.pi, u_res)
245
- v = np.linspace(0, np.pi, v_res)
246
- u, v = np.meshgrid(u, v)
247
 
248
- # Convert spherical coordinates to Cartesian coordinates
249
- x = np.sin(v) * np.cos(u)
250
- y = np.sin(v) * np.sin(u)
251
- z = np.cos(v)
252
 
253
- # Sample colors from the equirectangular image based on UV coordinates
254
- # This approximates texture mapping by sampling the image at the correct UV coordinates
255
- texture_colors = np.zeros((v_res, u_res, 3), dtype=np.uint8)
256
 
257
- for i in range(v_res):
258
- for j in range(u_res):
259
- # Convert spherical coordinates to image coordinates
260
- img_x = int((u[i, j] / (2 * np.pi)) * (width - 1))
261
- img_y = int((v[i, j] / np.pi) * (height - 1))
262
 
263
- # Ensure coordinates are within bounds
264
- img_x = max(0, min(img_x, width - 1))
265
- img_y = max(0, min(img_y, height - 1))
266
 
267
- # Get color from image
268
- if len(img_array.shape) == 3: # RGB image
269
- texture_colors[i, j] = img_array[img_y, img_x, :3]
270
- else: # Grayscale image
271
- texture_colors[i, j] = [img_array[img_y, img_x]] * 3
272
 
273
- # Convert colors to Plotly format (normalized to [0,1])
274
- surface_colors = texture_colors.astype(float) / 255.0
275
 
276
- # Create surface with sampled colors
277
- fig.add_trace(go.Surface(
278
- x=x, y=y, z=z,
279
- surfacecolor=surface_colors,
280
- showscale=False,
281
- opacity=1.0,
282
- lighting=dict(ambient=0.8, diffuse=0.8, specular=0.1, roughness=0.5),
283
- lightposition=dict(x=100, y=100, z=100)
284
- ), row=1, col=2)
285
 
286
- # Update layout
287
- fig.update_layout(
288
- height=500,
289
- title_text="Equirectangular Texture and 3D Sphere Preview",
290
- showlegend=False,
291
- scene2=dict(
292
- xaxis=dict(visible=False, showticklabels=False),
293
- yaxis=dict(visible=False, showticklabels=False),
294
- zaxis=dict(visible=False, showticklabels=False),
295
- aspectmode='data',
296
- camera=dict(
297
- eye=dict(x=1.8, y=1.8, z=1.8)
298
- ),
299
- bgcolor='rgba(0,0,0,0)'
300
- )
301
- )
302
 
303
- # Update axes for the image subplot
304
- fig.update_xaxes(visible=False, row=1, col=1)
305
- fig.update_yaxes(visible=False, row=1, col=1)
306
 
307
- return fig
 
308
 
309
- except Exception as e:
310
- print("Error creating texture and sphere preview:", e)
311
- return go.Figure()
312
-
313
- # Function to get predictions
314
- def get_predictions(audio_input):
315
- # Get acoustic emotion prediction (for music)
316
- emotion_prediction = predict_emotion_from_audio(audio_input)
317
-
318
- # Get transcribed text
319
- transcribed_text = transcribe(audio_input)
320
-
321
- # Analyze sentiment of transcribed text (for image)
322
- sentiment, polarity = analyze_sentiment(transcribed_text)
323
-
324
- # Generate image using SENTIMENT analysis with specific prompt
325
- image = generate_image(sentiment, transcribed_text)
326
-
327
- # Generate music using ACOUSTIC EMOTION prediction with specific prompt
328
- music_path = generate_music(transcribed_text, emotion_prediction)
329
-
330
- # Create visualization with both texture and sphere
331
- preview_fig = create_texture_and_sphere_preview(image)
332
-
333
- return emotion_prediction, transcribed_text, f"Sentiment: {sentiment} (Polarity: {polarity:.2f})", image, music_path, preview_fig
334
-
335
- # Create the Gradio interface
336
- interface = gr.Interface(
337
- fn=get_predictions,
338
- inputs=gr.Audio(label="Input Audio", type="filepath", sources=["microphone"]),
339
- outputs=[
340
- gr.Label(label="Acoustic Emotion Prediction (for music)"),
341
- gr.Label(label="Transcribed Text"),
342
- gr.Label(label="Sentiment Analysis (for image)"),
343
- gr.Image(type='pil', label="Generated Equirectangular Image"),
344
- gr.Audio(label="Generated Music", type="filepath"),
345
- gr.Plot(label="Texture and Sphere Preview")
346
- ],
347
- title="Affective Virtual Environments",
348
- description="Create an AVE using your voice. Get emotion prediction (for music), transcription, sentiment analysis (for image), a generated equirectangular image, music, and a preview of how it would look as a texture on a sphere."
349
- )
350
-
351
- interface.launch()
 
18
  import base64
19
  import plotly.graph_objects as go
20
  from plotly.subplots import make_subplots
21
+ import soundfile as sf
22
+ from pydub import AudioSegment
23
+ import math
24
+ import json
25
+ import imageio
26
+ from PIL import Image, ImageFilter
27
+ import matplotlib.pyplot as plt
28
+ from matplotlib.animation import FuncAnimation
29
+ import base64
30
+ from io import BytesIO
31
+ import struct
32
+ import cv2
33
 
34
  # Load the emotion prediction model
35
  def load_emotion_model(model_path):
 
63
 
64
  processor, music_model, device = load_musicgen_model()
65
 
66
+ # Function to chunk audio into segments
67
+ def chunk_audio(audio_path, chunk_duration=10):
68
+ """Split audio into chunks and return list of chunk file paths"""
69
+ try:
70
+ # Load audio file
71
+ audio = AudioSegment.from_file(audio_path)
72
+ duration_ms = len(audio)
73
+ chunk_ms = chunk_duration * 1000
74
+
75
+ # Validate chunk duration
76
+ if chunk_duration <= 0:
77
+ raise ValueError("Chunk duration must be positive")
78
+
79
+ if chunk_duration > duration_ms / 1000:
80
+ # If chunk duration is longer than audio, return the whole audio
81
+ return [audio_path], 1
82
+
83
+ chunks = []
84
+ chunk_files = []
85
+
86
+ # Calculate number of chunks
87
+ num_chunks = math.ceil(duration_ms / chunk_ms)
88
+
89
+ for i in range(num_chunks):
90
+ start_ms = i * chunk_ms
91
+ end_ms = min((i + 1) * chunk_ms, duration_ms)
92
+
93
+ # Extract chunk
94
+ chunk = audio[start_ms:end_ms]
95
+ chunks.append(chunk)
96
+
97
+ # Save chunk to temporary file
98
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
99
+ chunk.export(tmp_file.name, format="wav")
100
+ chunk_files.append(tmp_file.name)
101
+
102
+ return chunk_files, num_chunks
103
+
104
+ except Exception as e:
105
+ print("Error chunking audio:", e)
106
+ # Return original file as single chunk if chunking fails
107
+ return [audio_path], 1
108
+
109
  # Function to transcribe audio
110
  def transcribe(wav_filepath):
111
  try:
 
168
  return "neutral", 0.0
169
 
170
  # Function to get image prompt based on sentiment
171
+ def get_image_prompt(sentiment, transcribed_text, chunk_idx, total_chunks):
172
+ base_prompt = f"Chunk {chunk_idx+1}/{total_chunks}: "
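+ # NOTE: base_prompt is defined here but not prepended to the prompts returned below, so the chunk label is currently unused in the image prompt.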
173
+
174
  if sentiment == "positive":
175
+ return f"Generate a non-figurative equirectangular 360 abstract texture of:{transcribed_text}. Use low histogram frequency in bright bins, dominant color in high RGB range, and high brightness and color variance. Apply high-frequency texture with strong filter energy, pronounced gradient magnitude, and strong local contrast. Use high spatial complexity, increased horizontal and vertical symmetry, high edge density, bright gray levels, and high contrast. Emphasize rich visual structure, color variation, and texture intensity across spatial composition."
176
 
177
  elif sentiment == "negative":
178
+ return f"Generate a non-figurative equirectangular 360 abstract texture of:{transcribed_text}. Use high histogram frequency in dark bins, dominant color in low RGB range, and low brightness and color variance. Apply low-frequency texture with low filter energy, weak gradient magnitude, and low local contrast. Use low spatial complexity, reduced horizontal and vertical symmetry, low edge density, dark gray levels, and moderate contrast. Emphasize coarse structure and limited variation in color, texture, and spatial distribution."
179
 
180
  else: # neutral
181
+ return f"Generate a non-figurative abstract equirectangular 360 abstract texture of:{transcribed_text}. Use a balanced histogram frequency across bins, dominant color in a mid RGB range, and moderate brightness and color variance. Apply medium-frequency texture with moderate filter energy, standard gradient magnitude, and average local contrast. Use medium spatial complexity, balanced horizontal and vertical symmetry, medium edge density, mid-range gray levels, and standard contrast. Emphasize naturalistic structure and typical variation in color, texture, and spatial distribution."
182
 
183
  # Function to get music prompt based on emotion
184
+ def get_music_prompt(emotion, transcribed_text, chunk_idx, total_chunks):
185
+ base_prompt = f"Chunk {chunk_idx+1}/{total_chunks}: "
186
+
187
  emotion_prompts = {
188
+ 'neutral': f"Generate a neutral soundtrack with balanced energy and smooth spectral profile. Use moderate tempo (~100 BPM), onset rate around 2.8/sec, spectral centroid near 1000 Hz, and low dissonance. Keep pitch salience moderate (0.50) and loudness stable (~0.70 dB). Maintain low harmonic change rate (~0.05/sec) and tonal entropy 1.5 for equilibrium. Emphasize tonal balance, steady dynamics, and calm tonal centers. The music should feel even, ambient, and unobtrusive, complementing: {transcribed_text}.",
189
+ 'calm': f"Generate a calm soundtrack with a slow tempo (~85 BPM), low onset rate (~2.2/sec), soft spectral centroid (~850 Hz), and smooth timbral evolution. Use low dissonance, high spectral flatness, and gentle pitch salience (~0.48). Keep loudness low (~0.65 dB) with infrequent harmonic changes (~0.04/sec) and stable tonality (Krumhansl value 0.80, major mode). The music should evoke tranquility and serenity through warm timbres, sustained harmonies, and flowing textures inspired by: {transcribed_text}.",
190
+ 'happy': f"Generate a happy soundtrack with fast tempo (~127 BPM), dense rhythmic activity (~4.2 onsets/sec), and bright timbre (spectral centroid ~1321 Hz). Use variable dissonance and peaked spectral kurtosis to create vivid texture. Maintain pitch salience (~0.54), loudness (~0.90 dB), and chord change rate (~0.07/sec). Keep tonal entropy moderate (1.95) and Krumhansl value (0.83, major mode). The music should convey joy and positivity through energetic rhythms, ornamented melodic contours, and harmonically grounded progressions inspired by: {transcribed_text}.",
191
+ 'sad': f"Generate a sad soundtrack with slow tempo (~72 BPM), sparse onset rate (~2.0/sec), and dark timbre (spectral centroid ~720 Hz). Use moderate dissonance, low spectral kurtosis, and soft pitch salience (~0.45). Keep loudness subdued (~0.60 dB) with rare harmonic changes (~0.05/sec) and low tonal entropy (~1.4). Emphasize minor mode with gentle phrasing and sustained harmonic textures. The music should evoke sadness, intimacy, and reflection in relation to: {transcribed_text}.",
192
+ 'angry': f"Generate an angry soundtrack with moderately fast tempo (~120 BPM), onset rate (~3.4/sec), and bright, sharp timbre (spectral centroid ~2002 Hz). Use flat spectral kurtosis and stable dissonance. Maintain clear pitch salience (~0.58), high loudness (~0.96 dB), and frequent chord changes (~0.10/sec). Set tonal entropy to 2.57 and Krumhansl key profile (~0.54, minor mode). The music should express anger through strong rhythmic drive, aggressive articulation, and harmonically unstable progressions that reflect: {transcribed_text}.",
193
+ 'fearful': f"Generate a fearful soundtrack with irregular tempo (~95 BPM), fluctuating onset rate (~3.0/sec), and high spectral variability (centroid ~1750 Hz). Use unstable dissonance, low pitch salience (~0.42), and dynamic loudness (~0.80 dB). Increase chord change irregularity (~0.09/sec) and tonal entropy (2.4, minor mode). Emphasize eerie textures, spatial tension, and spectral modulation. The music should evoke suspense, fear, and anticipation inspired by: {transcribed_text}.",
194
+ 'disgust': f"Generate a disgusted soundtrack with moderate tempo (~90 BPM), irregular onset rate (~2.5/sec), and dark, rough timbre (spectral centroid ~950 Hz). Use dissonant harmonies, unstable spectral kurtosis, and low pitch salience (~0.40). Keep loudness (~0.75 dB) and tonal entropy (~2.2, minor mode). The music should evoke discomfort and unease through distorted textures, rough intervals, and unstable harmonic motion reflecting: {transcribed_text}.",
195
+ 'surprised': f"Generate a surprised soundtrack with variable tempo (~110 BPM), fluctuating onset rate (~3.8/sec), and dynamic spectral centroid (~1500 Hz). Use high spectral kurtosis and pitch salience (~0.57) to accent sudden contrasts. Loudness should vary (~0.85 dB) with irregular chord changes (~0.11/sec) and moderate tonal entropy (~2.0, major mode). The music should evoke surprise and wonder through abrupt transitions, playful motifs, and expressive timbral changes inspired by: {transcribed_text}."
196
  }
197
 
198
+ return emotion_prompts.get(
199
+ emotion.lower(),
200
+ f"Create background music with {emotion} atmosphere that represents: {transcribed_text}"
201
+ )
202
+
203
  # Function to generate music with MusicGen (using acoustic emotion prediction)
204
+ def generate_music(transcribed_text, emotion_prediction, chunk_idx, total_chunks):
205
  try:
206
  if processor is None or music_model is None:
207
  return None
208
 
209
  # Get specific prompt based on emotion
210
+ prompt = get_music_prompt(emotion_prediction, transcribed_text, chunk_idx, total_chunks)
211
 
212
  # Limit prompt length to avoid model issues
213
  if len(prompt) > 200:
 
241
  # --- DeepAI Image Generation (Text2Img) ---
242
  api_key = os.getenv("DeepAI_api_key")
243
 
244
+ # Function to upscale image using DeepAI super resolution (Torch-SRGAN), with an OpenCV Lanczos fallback
245
+ def upscale_image(image, target_width=4096, target_height=2048):
246
+ """
247
+ Upscale image using DeepAI's Torch-SRGAN API for super resolution
248
+ """
249
+ try:
250
+ if not api_key:
251
+ print("No API key available for upscaling")
252
+ # Fallback to OpenCV if no API key
253
+ img_array = np.array(image)
254
+ upscaled = cv2.resize(
255
+ img_array,
256
+ (target_width, target_height),
257
+ interpolation=cv2.INTER_LANCZOS4
258
+ )
259
+ return Image.fromarray(upscaled)
260
+
261
+ # Save the image to a temporary file
262
+ with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_input:
263
+ image.save(tmp_input.name, "JPEG", quality=95)
264
+
265
+ # Make request to DeepAI torch-srgan API
266
+ response = requests.post(
267
+ "https://api.deepai.org/api/torch-srgan",
268
+ files={'image': open(tmp_input.name, 'rb')},
269
+ headers={'api-key': api_key}
270
+ )
271
+
272
+ data = response.json()
273
+
274
+ if 'output_url' in data:
275
+ # Download the upscaled image
276
+ img_resp = requests.get(data['output_url'])
277
+ upscaled_image = Image.open(BytesIO(img_resp.content))
278
+
279
+ # Ensure the image meets our target dimensions
280
+ if upscaled_image.size != (target_width, target_height):
281
+ upscaled_image = upscaled_image.resize(
282
+ (target_width, target_height),
283
+ Image.Resampling.LANCZOS
284
+ )
285
+
286
+ # Clean up temporary file
287
+ os.unlink(tmp_input.name)
288
+ return upscaled_image
289
+ else:
290
+ print("Error in DeepAI upscaling response:", data)
291
+ # Fallback to OpenCV if API fails
292
+ img_array = np.array(image)
293
+ upscaled = cv2.resize(
294
+ img_array,
295
+ (target_width, target_height),
296
+ interpolation=cv2.INTER_LANCZOS4
297
+ )
298
+ return Image.fromarray(upscaled)
299
+
300
+ except Exception as e:
301
+ print(f"Error upscaling image with DeepAI: {e}")
302
+ # Fallback to OpenCV if any error occurs
303
+ img_array = np.array(image)
304
+ upscaled = cv2.resize(
305
+ img_array,
306
+ (target_width, target_height),
307
+ interpolation=cv2.INTER_LANCZOS4
308
+ )
309
+ return Image.fromarray(upscaled)
310
+
311
+ # Function to generate the equirectangular image with DeepAI text2img (using sentiment analysis)
312
+ def generate_image(sentiment_prediction, transcribed_text, chunk_idx, total_chunks):
313
  try:
314
  if not api_key:
315
  # fallback white image if no API key
316
+ base_image = Image.new('RGB', (1024,512), color='white')
317
+ else:
318
+ # Get specific prompt based on sentiment
319
+ prompt = get_image_prompt(sentiment_prediction, transcribed_text, chunk_idx, total_chunks)
320
+
321
+ # Make request to DeepAI text2img API
322
+ response = requests.post(
323
+ "https://api.deepai.org/api/text2img",
324
+ data={
325
+ 'text': prompt,
326
+ 'width': 1024,
327
+ 'height': 512,
328
+ 'image_generator_version': 'hd'
329
+ },
330
+ headers={'api-key': api_key}
331
+ )
332
 
333
+ data = response.json()
334
+ if 'output_url' in data:
335
+ # Download the generated image
336
+ img_resp = requests.get(data['output_url'])
337
+ base_image = Image.open(BytesIO(img_resp.content))
338
+ else:
339
+ print("Error in DeepAI response:", data)
340
+ # Return a fallback image
341
+ base_image = Image.new('RGB', (1024,512), color='white')
342
 
343
+ # Upscale the image for better quality in 360 viewer
344
+ upscaled_image = upscale_image(base_image)
345
+ return upscaled_image
346
 
347
  except Exception as e:
348
  print("Error generating image:", e)
349
  # Return a fallback image
350
+ return Image.new('RGB', (1024,512), color='white')
351
 
352
+ # Function to process a single chunk
353
+ def process_chunk(chunk_path, chunk_idx, total_chunks, generate_audio=True):
 
354
  try:
355
+ # Get acoustic emotion prediction (for music)
356
+ emotion_prediction = predict_emotion_from_audio(chunk_path)
 
357
 
358
+ # Get transcribed text
359
+ transcribed_text = transcribe(chunk_path)
360
 
361
+ # Analyze sentiment of transcribed text (for image)
362
+ sentiment, polarity = analyze_sentiment(transcribed_text)
363
 
364
+ # Generate image using SENTIMENT analysis with specific prompt
365
+ image = generate_image(sentiment, transcribed_text, chunk_idx, total_chunks)
366
+
367
+ # Add 360 metadata to the image
368
+ image_with_360_path = add_360_metadata(image)
369
+
370
+ # Generate music only if audio generation is enabled
371
+ music_path = None
372
+ if generate_audio:
373
+ music_path = generate_music(transcribed_text, emotion_prediction, chunk_idx, total_chunks)
374
+
375
+ return {
376
+ 'chunk_index': chunk_idx + 1,
377
+ 'emotion': emotion_prediction,
378
+ 'transcription': transcribed_text,
379
+ 'sentiment': sentiment,
380
+ 'image': image, # Original image for display in Gradio
381
+ 'image_360': image_with_360_path, # Image with 360 metadata
382
+ 'music': music_path
383
+ }
384
+ except Exception as e:
385
+ print(f"Error processing chunk {chunk_idx + 1}:", e)
386
+ # Return a fallback result with all required keys
387
+ return {
388
+ 'chunk_index': chunk_idx + 1,
389
+ 'emotion': "Error",
390
+ 'transcription': "Transcription failed",
391
+ 'sentiment': "Sentiment: error",
392
+ 'image': Image.new('RGB', (1440, 770), color='white'),
393
+ 'image_360': None,
394
+ 'music': None
395
+ }
396
+
397
+ # Function to get predictions for all chunks
398
+ def get_predictions(audio_input, generate_audio=True, chunk_duration=10):
399
+ # Chunk the audio into segments
400
+ chunk_files, total_chunks = chunk_audio(audio_input, chunk_duration)
401
+
402
+ results = []
403
+
404
+ # Process each chunk
405
+ for i, chunk_path in enumerate(chunk_files):
406
+ print(f"Processing chunk {i+1}/{total_chunks} ({chunk_duration}s each)")
407
+ result = process_chunk(chunk_path, i, total_chunks, generate_audio)
408
+ results.append(result)
409
+
410
+ # Clean up temporary chunk files
411
+ for chunk_path in chunk_files:
412
+ try:
413
+ if chunk_path != audio_input: # Don't delete original input file
414
+ os.unlink(chunk_path)
415
+ except:
416
+ pass
417
+
418
+ return results
419
+
420
+ def create_xmp_block(width, height):
421
+ """Create XMP metadata block following ExifTool's exact format."""
422
+ xmp = (
423
+ f'<?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?>\n'
424
+ f'<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="ExifTool">\n'
425
+ f'<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">\n'
426
+ f'<rdf:Description rdf:about=""\n'
427
+ f'xmlns:GPano="http://ns.google.com/photos/1.0/panorama/"\n'
428
+ f'GPano:ProjectionType="equirectangular"\n'
429
+ f'GPano:UsePanoramaViewer="True"\n'
430
+ f'GPano:FullPanoWidthPixels="{width}"\n'
431
+ f'GPano:FullPanoHeightPixels="{height}"\n'
432
+ f'GPano:CroppedAreaImageWidthPixels="{width}"\n'
433
+ f'GPano:CroppedAreaImageHeightPixels="{height}"\n'
434
+ f'GPano:CroppedAreaLeftPixels="0"\n'
435
+ f'GPano:CroppedAreaTopPixels="0"/>\n'
436
+ f'</rdf:RDF>\n'
437
+ f'</x:xmpmeta>\n'
438
+ f'<?xpacket end="w"?>'
439
+ )
440
+ return xmp
441
+
442
+ def write_xmp_to_jpg(input_path, output_path, width, height):
443
+ """Write XMP metadata to JPEG file following ExifTool's method."""
444
+ # Read the original JPEG
445
+ with open(input_path, 'rb') as f:
446
+ data = f.read()
447
+
448
+ # Find the start of image marker
449
+ if data[0:2] != b'\xFF\xD8':
450
+ raise ValueError("Not a valid JPEG file")
451
+
452
+ # Create XMP data
453
+ xmp_data = create_xmp_block(width, height)
454
+
455
+ # Create APP1 segment for XMP
456
+ app1_marker = b'\xFF\xE1'
457
+ xmp_header = b'http://ns.adobe.com/xap/1.0/\x00'
458
+ xmp_bytes = xmp_data.encode('utf-8')
459
+ length = len(xmp_header) + len(xmp_bytes) + 2 # +2 for length bytes
460
+ length_bytes = struct.pack('>H', length)
461
+
462
+ # Construct new file content
463
+ output = bytearray()
464
+ output.extend(data[0:2]) # SOI marker
465
+ output.extend(app1_marker)
466
+ output.extend(length_bytes)
467
+ output.extend(xmp_header)
468
+ output.extend(xmp_bytes)
469
+ output.extend(data[2:]) # Rest of the original file
470
+
471
+ # Write the new file
472
+ with open(output_path, 'wb') as f:
473
+ f.write(output)
474
+
475
+ def add_360_metadata(img):
476
+ """Add 360 photo metadata to a PIL Image and return the path to the processed image."""
477
+ try:
478
+ # First, ensure the image is upscaled to 4096x2048
479
+ target_width, target_height = 4096, 2048
480
+ if img.width != target_width or img.height != target_height:
481
+ img = img.resize((target_width, target_height), Image.Resampling.LANCZOS)
482
 
483
+ # Create a temporary file
484
+ with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_file:
485
+ # First save as high-quality JPEG
486
+ img.save(tmp_file.name, "JPEG", quality=95)
487
+
488
+ # Then inject XMP metadata directly into JPEG file
489
+ write_xmp_to_jpg(tmp_file.name, tmp_file.name, img.width, img.height)
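+ # Writing in place is safe here: write_xmp_to_jpg reads the entire source file into memory before rewriting it.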
490
+
491
+ return tmp_file.name
492
+
493
+ except Exception as e:
494
+ print(f"Error adding 360 metadata: {str(e)}")
495
+ # Fallback: return the original image path
496
+ with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_file:
497
+ img.save(tmp_file.name, "JPEG", quality=95)
498
+ return tmp_file.name
499
+
500
+ def create_360_viewer_html(image_paths, audio_paths, output_path):
501
+ """Create an HTML file with a 360 viewer and audio player for the given images and audio."""
502
+ # Create a list of image data URIs
503
+ image_data_list = []
504
+ for img_path in image_paths:
505
+ with open(img_path, "rb") as f:
506
+ img_data = base64.b64encode(f.read()).decode("utf-8")
507
+ image_data_list.append(f"data:image/jpeg;base64,{img_data}")
508
+
509
+ # Create a list of audio data URIs
510
+ audio_data_list = []
511
+ for audio_path in audio_paths:
512
+ if audio_path: # Only process if audio exists
513
+ with open(audio_path, "rb") as f:
514
+ audio_data = base64.b64encode(f.read()).decode("utf-8")
515
+ audio_data_list.append(f"data:audio/wav;base64,{audio_data}")
516
+ else:
517
+ audio_data_list.append(None) # Placeholder for chunks without audio
518
+
519
+ # Create the HTML content
520
+ html_content = f"""
521
+ <!DOCTYPE html>
522
+ <html lang="en">
523
+ <head>
524
+ <meta charset="UTF-8">
525
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
526
+ <title>360 Panorama Viewer with Audio</title>
527
+ <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/pannellum@2.5.6/build/pannellum.css"/>
528
+ <style>
529
+ body {{
530
+ margin: 0;
531
+ overflow: hidden;
532
+ font-family: Arial, sans-serif;
533
+ }}
534
+ #panorama {{
535
+ width: 100vw;
536
+ height: 80vh;
537
+ }}
538
+ .pnlm-hotspot.pnlm-info-hotspot {{
539
+ background-color: rgba(0, 150, 255, 0.8);
540
+ border-radius: 50%;
541
+ width: 30px;
542
+ height: 30px;
543
+ }}
544
+ .pnlm-hotspot.pnlm-info-hotspot .pnlm-sprite {{
545
+ filter: brightness(0) invert(1);
546
+ }}
547
+ .pnlm-tooltip {{
548
+ background-color: rgba(0, 0, 0, 0.7);
549
+ color: white;
550
+ border-radius: 3px;
551
+ padding: 5px 10px;
552
+ }}
553
+ #controls {{
554
+ position: absolute;
555
+ top: 10px;
556
+ right: 10px;
557
+ z-index: 1000;
558
+ background: rgba(0, 0, 0, 0.7);
559
+ color: white;
560
+ padding: 10px;
561
+ border-radius: 5px;
562
+ display: flex;
563
+ flex-direction: column;
564
+ gap: 10px;
565
+ }}
566
+ #audio-controls {{
567
+ position: fixed;
568
+ bottom: 0;
569
+ left: 0;
570
+ width: 100%;
571
+ background: rgba(0, 0, 0, 0.8);
572
+ color: white;
573
+ padding: 15px;
574
+ display: flex;
575
+ flex-direction: column;
576
+ align-items: center;
577
+ z-index: 1000;
578
+ }}
579
+ #audio-player {{
580
+ width: 80%;
581
+ margin-bottom: 10px;
582
+ }}
583
+ #audio-info {{
584
+ text-align: center;
585
+ font-size: 14px;
586
+ }}
587
+ button {{
588
+ background: #3498db;
589
+ color: white;
590
+ border: none;
591
+ padding: 8px 15px;
592
+ border-radius: 3px;
593
+ cursor: pointer;
594
+ margin: 5px;
595
+ }}
596
+ button:hover {{
597
+ background: #2980b9;
598
+ }}
599
+ select {{
600
+ padding: 5px;
601
+ border-radius: 3px;
602
+ border: 1px solid #ccc;
603
+ }}
604
+ </style>
605
+ </head>
606
+ <body>
607
+ <div id="controls">
608
+ <select id="image-selector">
609
+ {"".join([f'<option value="{i}">Chunk {i+1}</option>' for i in range(len(image_data_list))])}
610
+ </select>
611
+ </div>
612
 
613
+ <div id="panorama"></div>
614
 
615
+ <div id="audio-controls">
616
+ <audio id="audio-player" controls></audio>
617
+ <div id="audio-info">No audio available for this chunk</div>
618
+ </div>
619
+
620
+ <script type="text/javascript" src="https://cdn.jsdelivr.net/npm/pannellum@2.5.6/build/pannellum.js"></script>
621
+ <script>
622
+ const images = {json.dumps(image_data_list)};
623
+ const audioFiles = {json.dumps(audio_data_list)};
624
+ let currentViewer = null;
625
+
626
+ function loadPanorama(index) {{
627
+ if (currentViewer) {{
628
+ currentViewer.destroy();
629
+ }}
630
 
631
+ currentViewer = pannellum.viewer('panorama', {{
632
+ "type": "equirectangular",
633
+ "panorama": images[index],
634
+ "autoLoad": true,
635
+ "autoRotate": -2,
636
+ "showZoomCtrl": true,
637
+ "showFullscreenCtrl": true,
638
+ "hfov": 100
639
+ }});
640
+
641
+ // Update audio player
642
+ updateAudioPlayer(index);
643
+ }}
644
+
645
+ function updateAudioPlayer(index) {{
646
+ const audioPlayer = document.getElementById('audio-player');
647
+ const audioInfo = document.getElementById('audio-info');
648
 
649
+ if (audioFiles[index]) {{
650
+ audioPlayer.src = audioFiles[index];
651
+ audioInfo.textContent = 'Playing audio for Chunk ' + (index + 1);
652
+ // Try to play automatically (may be blocked by browser policies)
653
+ audioPlayer.play().catch(e => {{
654
+ audioInfo.textContent = 'Click play to listen to audio for Chunk ' + (index + 1);
655
+ }});
656
+ }} else {{
657
+ audioPlayer.src = '';
658
+ audioInfo.textContent = 'No audio available for this chunk';
659
+ }}
660
+ }}
661
+
662
+ // Load the first image initially
663
+ loadPanorama(0);
664
+
665
+ // Handle image selection changes
666
+ document.getElementById('image-selector').addEventListener('change', function(e) {{
667
+ const selectedIndex = parseInt(e.target.value);
668
+ loadPanorama(selectedIndex);
669
+ }});
670
+ </script>
671
+ </body>
672
+ </html>
673
+ """
674
+
675
+ # Write the HTML to a file
676
+ with open(output_path, 'w') as f:
677
+ f.write(html_content)
678
+
679
+ return output_path
680
+
681
+ # Update the process_and_display function
682
+ def process_and_display(audio_input, generate_audio, chunk_duration):
683
+ # Validate chunk duration
684
+ if chunk_duration is None or chunk_duration <= 0:
685
+ chunk_duration = 10
686
+
687
+ # Show loading indicator
688
+ yield [gr.HTML(f"""
689
+ <div style="text-align: center; margin: 20px;">
690
+ <p style="font-size: 18px; color: #4a4a4a;">Processing audio in {chunk_duration}-second chunks...</p>
691
+ <div style="border: 4px solid #f3f3f3; border-top: 4px solid #3498db; border-radius: 50%; width: 30px; height: 30px; animation: spin 2s linear infinite; margin: 0 auto;"></div>
692
+ <style>@keyframes spin {{ 0% {{ transform: rotate(0deg); }} 100% {{ transform: rotate(360deg); }} }}</style>
693
+ <p style="font-size: 14px; color: #4a4a4a;">This may take several minutes depending on the audio length...</p>
694
+ </div>
695
+ """)] + [gr.Group(visible=False)] * len(group_components) + [None] * (len(output_containers) * 6) + [None, ""]
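+ # This yielded list must line up with the click handler's outputs: [loading_indicator] + group_components + six values per chunk container + [viewer_html_output, js_output].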
696
+
697
+ results = get_predictions(audio_input, generate_audio, chunk_duration)
698
+
699
+ # Initialize outputs list
700
+ outputs = []
701
+ group_visibility = []
702
+ all_360_images = [] # Collect all 360 images for the viewer
703
+ all_music_paths = [] # Collect all music paths for the viewer
704
+
705
+ # Process each result
706
+ for i, result in enumerate(results):
707
+ if i < len(output_containers):
708
+ group_visibility.append(gr.Group(visible=True))
709
+ outputs.extend([
710
+ result['emotion'],
711
+ result['transcription'],
712
+ result['sentiment'],
713
+ result['image'],
714
+ result['image_360'],
715
+ result['music']
716
+ ])
717
+ # Collect the 360-processed images and music
718
+ if result['image_360']:
719
+ all_360_images.append(result['image_360']) # Use the 360-processed image
720
+ all_music_paths.append(result['music']) # Can be None if no music generated
721
+ else:
722
+ # If we have more results than containers, just extend with None
723
+ group_visibility.append(gr.Group(visible=False))
724
+ outputs.extend([None] * 6)
725
+
726
+ # Hide remaining containers
727
+ for i in range(len(results), len(output_containers)):
728
+ group_visibility.append(gr.Group(visible=False))
729
+ outputs.extend([None] * 6)
730
+
731
+ # Create 360 viewer HTML if we have 360 images
732
+ viewer_html_path = None
733
+ if all_360_images:
734
+ with tempfile.NamedTemporaryFile(suffix=".html", delete=False) as tmp_file:
735
+ viewer_html_path = create_360_viewer_html(all_360_images, all_music_paths, tmp_file.name)
736
+
737
+ # Hide loading indicator and show results
738
+ yield [gr.HTML("")] + group_visibility + outputs + [viewer_html_path, ""]
739
+
740
+ # Update the clear_all function to handle the new outputs
741
+ def clear_all():
742
+ # Create a list with None for all outputs
743
+ outputs = [None] # For audio input
744
+
745
+ # For group components (set to invisible)
746
+ outputs.extend([gr.Group(visible=False)] * len(group_components))
747
+
748
+ # For all output containers (set to None)
749
+ outputs.extend([None] * (len(output_containers) * 6))
750
+
751
+ # For loading indicator (empty HTML)
752
+ outputs.append(gr.HTML(""))
753
+
754
+ # For chunk duration (reset to 10)
755
+ outputs.append(10)
756
+
757
+ # For example selector (reset to None)
758
+ outputs.append(None)
759
+
760
+ # For viewer (set to None)
761
+ outputs.append(None)
762
+
763
+ # For JavaScript output (empty)
764
+ outputs.append("")
765
+
766
+ return outputs
767
+
768
+ # Function to load example audio (placeholder - you need to implement this)
769
+ def load_example_audio(example_name):
770
+ # This is a placeholder - you need to implement this function
771
+ # Return the path to the example audio file based on the example_name
772
+ return None
773
+
774
+ # Custom CSS for enhanced styling
775
+ custom_css = """
776
+ .download-section {
777
+ background: rgba(255,255,255,255);
778
+ padding: 25px;
779
+ border-radius: 15px;
780
+ border: 3px solid #764ba2;
781
+ text-align: left;
782
+ margin: 25px 0;
783
+ box-shadow: 0 10px 30px rgba(0,0,0,0.15);
784
+ position: relative;
785
+ overflow: hidden;
786
+ }
787
+
788
+ .download-section::before {
789
+ content: "";
790
+ position: absolute;
791
+ top: -50%;
792
+ left: -50%;
793
+ width: 200%;
794
+ height: 200%;
795
+ background: radial-gradient(circle, rgba(255,255,255,0.1) 0%, transparent 70%);
796
+ animation: shimmer 3s infinite linear;
797
+ pointer-events: none;
798
+ }
799
+
800
+ @keyframes shimmer {
801
+ 0% { transform: rotate(0deg); }
802
+ 100% { transform: rotate(360deg); }
803
+ }
804
+
805
+ .download-section h2 {
806
+ color: white;
807
+ font-size: 16px;
808
+ margin-bottom: 15px;
809
+ text-shadow: 1px 1px 3px rgba(0,0,0,0.3);
810
+ }
811
+
812
+ .download-section p {
813
+ color: rgba(255,255,255,0.9);
814
+ font-size: 16px;
815
+ margin-bottom: 20px;
816
+ line-height: 3.5;
817
+ }
818
+
819
+ .download-button {
820
+ background: rgba(155,155,155,255) !important;
821
+ color: white !important;
822
+ border: none !important;
823
+ padding: 12px 30px !important;
824
+ border-radius: 0px !important;
825
+ font-weight: bold !important;
826
+ font-size: 16px !important;
827
+ margin-top: 15px !important;
828
+ transition: all 0.3s ease !important;
829
+ cursor: pointer !important;
830
+ display: inline-block !important;
831
+ }
832
+
833
+ .download-button:hover {
834
+ transform: translateY(-3px) !important;
835
+ box-shadow: 0 8px 20px rgba(0,0,0,0.6) !important;
836
+ }
837
+
838
+ .download-button:active {
839
+ transform: translateY(1px) !important;
840
+ }
841
+
842
+ .download-icon {
843
+ margin-right: 8px;
844
+ font-size: 28px;
845
+ }
846
+
847
+ .feature-list {
848
+ display: flex;
849
+ justify-content: center;
850
+ flex-wrap: wrap;
851
+ gap: 15px;
852
+ margin: 20px 0;
853
+ }
854
+
855
+ .feature-item {
856
+ background: rgba(255,255,255,0.15);
857
+ padding: 10px 15px;
858
+ border-radius: 8px;
859
+ display: flex;
860
+ align-items: center;
861
+ gap: 8px;
862
+ color: white;
863
+ font-size: 14px;
864
+ }
865
+
866
+ .feature-icon {
867
+ font-size: 26px;
868
+ }
869
+
870
+ .viewer-preview {
871
+ margin-top: 20px;
872
+ border-radius: 10px;
873
+ overflow: hidden;
874
+ box-shadow: 0 5px 15px rgba(0,0,0,0.2);
875
+ max-width: 400px;
876
+ margin-left: auto;
877
+ margin-right: auto;
878
+ }
879
+
880
+ .viewer-preview img {
881
+ width: 100%;
882
+ display: block;
883
+ }
884
+
885
+ .instructions {
886
+ background: rgba(255,255,255,0.1);
887
+ padding: 15px;
888
+ border-radius: 8px;
889
+ margin-top: 20px;
890
+ text-align: left;
891
+ }
892
+
893
+ .instructions h3 {
894
+ color: white;
895
+ margin-top: 0;
896
+ font-size: 16px;
897
+ }
898
+
899
+ .instructions ol {
900
+ color: rgba(255,255,255,0.9);
901
+ padding-left: 20px;
902
+ margin-bottom: 0;
903
+ }
904
+
905
+ .instructions li {
906
+ margin-bottom: 8px;
907
+ }
908
+ """
909
+ # Create the Gradio interface with proper output handling
910
+ with gr.Blocks(title="Affective Virtual Environments - Chunked Processing", css=custom_css) as interface:
911
+ gr.Markdown("# The Emotional Machine")
912
+ gr.Markdown(
913
+ """
914
+ **The Emotional Machine** is a digital media project that generates virtual environments using multimodal speech emotion recognition as its main mode of interaction.
915
+
916
+ ### How to interact
917
+ 1. Record your voice or upload an audio file.
918
+ 2. Define the length to chunk your sample.
919
+ 3. Use the checkbox if you want to generate audio for each chunk.
920
+ 4. Generate your Affective Virtual Environment and wait for the results.
921
+ 5. Download the HTML file.
922
+ 6. Open your creation using any web browser.
923
+ ---
924
+ **Learn more:**
925
+ • Video Tutorial: [How to Use this Space](https://youtu.be/eVD1lzwVhi8)
926
+
927
+ • For more information about the project, visit: [www.emotional-machines.com](https://www.emotional-machines.com)
928
+
929
+ """
930
+ )
931
+
932
+
933
+ with gr.Row():
934
+ with gr.Column(scale=2):
935
+ audio_input = gr.Audio(label="Input Audio", type="filepath", sources=["microphone", "upload"])
936
+
937
+ # Add example audio selection
938
+ # example_selector = gr.Dropdown(
939
+ # label="Select Example Audio",
940
+ # choices=["Happy Speech", "Sad Story", "Neutral News"],
941
+ # value=None,
942
+ # info="Choose from pre-recorded example speeches"
943
+ # )
944
+
945
+ # Add button to load selected example
946
+ #load_example_btn = gr.Button("Load Example", variant="secondary")
947
+
948
+ with gr.Column(scale=1):
949
+ # Add chunk duration input
950
+ chunk_duration_input = gr.Number(
951
+ label="Chunk Duration (seconds)",
952
+ value=10,
953
+ minimum=1,
954
+ maximum=60,
955
+ step=1,
956
+ info="Duration of each audio segment to process (1-60 seconds)"
957
+ )
958
+ # Add checkbox for audio generation
959
+ generate_audio_checkbox = gr.Checkbox(
960
+ label="Generate Audio (may take longer)",
961
+ value=False,
962
+ info="Uncheck to skip music generation and speed up processing"
963
+ )
964
+ with gr.Row():
965
+ process_btn = gr.Button("Generate", variant="primary")
966
+ clear_btn = gr.Button("Clear All", variant="secondary")
967
+
968
+ # Add a loading indicator
969
+ loading_indicator = gr.HTML("""
970
+ <div id="loading" style="display: none; text-align: center; margin: 20px;">
971
+ <p style="font-size: 18px; color: #4a4a4a;">Processing audio chunks...</p>
972
+ <div style="border: 4px solid #f3f3f3; border-top: 4px solid #3498db; border-radius: 50%; width: 30px; height: 30px; animation: spin 2s linear infinite; margin: 0 auto;"></div>
973
+ <style>@keyframes spin { 0% { transform: rotate(0deg); } 100% { transform: rotate(360deg); } }</style>
974
+ </div>
975
+ """)
976
+
977
+ # Create output components for each chunk type
978
+ output_containers = []
979
+ group_components = [] # Store group components separately
980
+
981
+ # We'll create up to 20 chunk slots to accommodate different chunk durations
982
+ for i in range(20):
983
+ with gr.Group(visible=False) as chunk_group:
984
+ gr.Markdown(f"### Chunk {i+1} Results")
985
+ with gr.Row():
986
+ emotion_output = gr.Label(label="Acoustic Emotion Prediction")
987
+ transcription_output = gr.Label(label="Transcribed Text")
988
+ sentiment_output = gr.Label(label="Sentiment Analysis")
989
+ with gr.Row():
990
+ image_output = gr.Image(label="Generated Equirectangular Image")
991
+ image_360_output = gr.File(label="Download 360 Image", type="filepath")
992
+ with gr.Row():
993
+ audio_output = gr.Audio(label="Generated Music")
994
+ gr.HTML("<hr style='margin: 20px 0; border: 1px solid #ccc;'>")
995
 
996
+ group_components.append(chunk_group)
997
+ output_containers.append({
998
+ 'emotion': emotion_output,
999
+ 'transcription': transcription_output,
1000
+ 'sentiment': sentiment_output,
1001
+ 'image': image_output,
1002
+ 'image_360': image_360_output,
1003
+ 'music': audio_output
1004
+ })
1005
+
1006
+ # Enhanced Download 360 Viewer Section
1007
+ with gr.Group(visible=True, elem_classes="download-section") as download_group:
1008
+ gr.Markdown("""
1009
+
1010
 
1011
 
1012
+ """)
1013
 
1014
+ # Enhanced download button
1015
+ viewer_html_output = gr.File(
1016
+ label=" Once processing is complete, download your AVE from here 🚀",
1017
+ type="filepath",
1018
+ interactive=False,
1019
+ elem_classes="download-button"
1020
+ )
1021
+
1022
+ # Add a hidden HTML component for JavaScript execution
1023
+ js_output = gr.HTML(visible=False)
1024
+
1025
+ # Function to handle example selection
1026
+ def load_example(example_name):
1027
+ if not example_name:
1028
+ return None, None
1029
 
1030
+ # Get the path to the example audio file
1031
+ example_path = load_example_audio(example_name)
1032
 
1033
+ # Return the example path to update the audio component
1034
+ return example_path, example_name
1035
+
1036
+ # Set up the button clicks
1037
+ process_btn.click(
1038
+ fn=process_and_display,
1039
+ inputs=[audio_input, generate_audio_checkbox, chunk_duration_input],
1040
+ outputs=[loading_indicator] + group_components + [comp for container in output_containers for comp in [
1041
+ container['emotion'],
1042
+ container['transcription'],
1043
+ container['sentiment'],
1044
+ container['image'],
1045
+ container['image_360'],
1046
+ container['music']
1047
+ ]] + [viewer_html_output, js_output]
1048
+ )
1049
+
1050
+ clear_btn.click(
1051
+ fn=clear_all,
1052
+ inputs=[],
1053
+ outputs=[audio_input] + group_components + [comp for container in output_containers for comp in [
1054
+ container['emotion'],
1055
+ container['transcription'],
1056
+ container['sentiment'],
1057
+ container['image'],
1058
+ container['image_360'],
1059
+ container['music']
1060
+ ]] + [loading_indicator, chunk_duration_input, viewer_html_output, js_output]
1061
+ )
1062
+
1063
+ #load_example_btn.click(
1064
+ # fn=load_example,
1065
+ # inputs=[example_selector],
1066
+ # outputs=[audio_input, example_selector]
1067
+ #)
1068
+
1069
+ # Check if we're running on Hugging Face Spaces
1070
+ is_spaces = os.getenv('SPACE_ID') is not None
1071
+
1072
+ # Launch with appropriate settings
1073
+ interface.launch(share=True)  # note: is_spaces is computed above but unused, so share=True is applied unconditionally