jfforero commited on
Commit
28f6618
·
verified ·
1 Parent(s): af92dcb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +445 -681
app.py CHANGED
@@ -1,6 +1,4 @@
1
  import gradio as gr
2
- import pyvista as pv
3
- from pyvista import examples
4
  import numpy as np
5
  import librosa
6
  import requests
@@ -9,46 +7,39 @@ from PIL import Image
9
  import os
10
  from tensorflow.keras.models import load_model
11
  from faster_whisper import WhisperModel
12
- import random
13
  from textblob import TextBlob
14
  import torch
15
  import scipy.io.wavfile
16
  from transformers import AutoProcessor, MusicgenForConditionalGeneration
17
  import tempfile
18
  import base64
19
- import plotly.graph_objects as go
20
- from plotly.subplots import make_subplots
21
- import soundfile as sf
22
  from pydub import AudioSegment
23
  import math
24
  import json
25
- import imageio
26
- from PIL import Image, ImageFilter
27
- import matplotlib.pyplot as plt
28
- from matplotlib.animation import FuncAnimation
29
- import base64
30
- from io import BytesIO
31
  import struct
32
  import cv2
33
 
34
- # Load the emotion prediction model
 
 
 
35
  def load_emotion_model(model_path):
36
  try:
37
- model = load_model(model_path)
38
  print("Emotion model loaded successfully")
39
- return model
40
  except Exception as e:
41
  print("Error loading emotion prediction model:", e)
42
  return None
43
 
44
- model_path = 'mymodel_SER_LSTM_RAVDESS.h5'
45
  model = load_emotion_model(model_path)
46
 
47
- # Initialize WhisperModel
48
  model_size = "small"
49
  model2 = WhisperModel(model_size, device="cpu", compute_type="int8")
50
 
51
- # Load MusicGen model
52
  def load_musicgen_model():
53
  try:
54
  device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -63,50 +54,41 @@ def load_musicgen_model():
63
 
64
  processor, music_model, device = load_musicgen_model()
65
 
66
- # Function to chunk audio into segments
 
 
 
67
  def chunk_audio(audio_path, chunk_duration=10):
68
  """Split audio into chunks and return list of chunk file paths"""
69
  try:
70
- # Load audio file
71
  audio = AudioSegment.from_file(audio_path)
72
  duration_ms = len(audio)
73
  chunk_ms = chunk_duration * 1000
74
-
75
- # Validate chunk duration
76
  if chunk_duration <= 0:
77
  raise ValueError("Chunk duration must be positive")
78
-
79
  if chunk_duration > duration_ms / 1000:
80
- # If chunk duration is longer than audio, return the whole audio
81
  return [audio_path], 1
82
-
83
- chunks = []
84
  chunk_files = []
85
-
86
- # Calculate number of chunks
87
  num_chunks = math.ceil(duration_ms / chunk_ms)
88
-
89
  for i in range(num_chunks):
90
  start_ms = i * chunk_ms
91
  end_ms = min((i + 1) * chunk_ms, duration_ms)
92
-
93
- # Extract chunk
94
  chunk = audio[start_ms:end_ms]
95
- chunks.append(chunk)
96
-
97
- # Save chunk to temporary file
98
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
99
  chunk.export(tmp_file.name, format="wav")
100
  chunk_files.append(tmp_file.name)
101
-
102
  return chunk_files, num_chunks
103
-
104
  except Exception as e:
105
  print("Error chunking audio:", e)
106
- # Return original file as single chunk if chunking fails
107
  return [audio_path], 1
108
 
109
- # Function to transcribe audio
110
  def transcribe(wav_filepath):
111
  try:
112
  segments, _ = model2.transcribe(wav_filepath, beam_size=5)
@@ -115,7 +97,6 @@ def transcribe(wav_filepath):
115
  print("Error transcribing audio:", e)
116
  return "Transcription failed"
117
 
118
- # Function to extract MFCC features from audio
119
  def extract_mfcc(wav_file_name):
120
  try:
121
  y, sr = librosa.load(wav_file_name)
@@ -125,306 +106,218 @@ def extract_mfcc(wav_file_name):
125
  print("Error extracting MFCC features:", e)
126
  return None
127
 
128
- # Emotions dictionary
129
- emotions = {0: 'neutral', 1: 'calm', 2: 'happy', 3: 'sad', 4: 'angry', 5: 'fearful', 6: 'disgust', 7: 'surprised'}
 
 
 
 
 
 
 
 
130
 
131
- # Function to predict emotion from audio
132
  def predict_emotion_from_audio(wav_filepath):
133
  try:
134
  if model is None:
135
  return "Model not loaded"
136
-
137
  test_point = extract_mfcc(wav_filepath)
138
  if test_point is not None:
139
  test_point = np.reshape(test_point, newshape=(1, 40, 1))
140
  predictions = model.predict(test_point)
141
  predicted_emotion_label = np.argmax(predictions[0])
142
  return emotions.get(predicted_emotion_label, "Unknown emotion")
143
- else:
144
- return "Error: Unable to extract features"
145
  except Exception as e:
146
  print("Error predicting emotion:", e)
147
  return "Prediction error"
148
 
149
- # Function to analyze sentiment from text
150
  def analyze_sentiment(text):
151
  try:
152
  if not text or text.strip() == "":
153
  return "neutral", 0.0
154
-
155
  analysis = TextBlob(text)
156
  polarity = analysis.sentiment.polarity
157
-
158
  if polarity > 0.1:
159
  sentiment = "positive"
160
  elif polarity < -0.1:
161
  sentiment = "negative"
162
  else:
163
  sentiment = "neutral"
164
-
165
  return sentiment, polarity
166
  except Exception as e:
167
  print("Error analyzing sentiment:", e)
168
  return "neutral", 0.0
169
 
170
- # Function to get image prompt based on sentiment
 
 
 
171
  def get_image_prompt(sentiment, transcribed_text, chunk_idx, total_chunks):
172
- base_prompt = f"Chunk {chunk_idx+1}/{total_chunks}: "
173
-
174
  if sentiment == "positive":
175
- return f"Generate an equirectangular 360° panoramic graphite sketch drawing, detailed pencil texture with faint neon glows, cinematic lighting of:{transcribed_text}. Use low histogram frequency in bright bins, dominant color in high RGB range, and high brightness and color variance. Apply high-frequency texture with strong filter energy, pronounced gradient magnitude, and strong local contrast. Use high spatial complexity, increased horizontal and vertical symmetry, high edge density, bright gray levels, and high contrast. Emphasize rich visual structure, color variation, and texture intensity across spatial composition."
176
-
 
 
 
 
 
 
 
177
  elif sentiment == "negative":
178
- return f"Generate an equirectangular 360° panoramic graphite sketch drawing, detailed pencil texture with faint neon glows, cinematic lighting of:{transcribed_text}. Use high histogram frequency in dark bins, dominant color in low RGB range, and low brightness and color variance. Apply low-frequency texture with low filter energy, weak gradient magnitude, and low local contrast. Use low spatial complexity, reduced horizontal and vertical symmetry, low edge density, dark gray levels, and moderate contrast. Emphasize coarse structure and limited variation in color, texture, and spatial distribution."
179
-
180
- else: # neutral
181
- return f"Generate an equirectangular 360° panoramic graphite sketch drawing, detailed pencil texture with faint neon glows, cinematic lighting of:{transcribed_text}. Use a balanced histogram frequency across bins, dominant color in a mid RGB range, and moderate brightness and color variance. Apply medium-frequency texture with moderate filter energy, standard gradient magnitude, and average local contrast. Use medium spatial complexity, balanced horizontal and vertical symmetry, medium edge density, mid-range gray levels, and standard contrast. Emphasize naturalistic structure and typical variation in color, texture, and spatial distribution."
 
 
 
 
 
 
 
 
 
 
 
 
 
182
 
183
- # Function to get music prompt based on emotion
184
  def get_music_prompt(emotion, transcribed_text, chunk_idx, total_chunks):
185
- base_prompt = f"Chunk {chunk_idx+1}/{total_chunks}: "
186
-
187
  emotion_prompts = {
188
- 'neutral': f"Generate a neutral orchestral soundtrack with balanced energy and smooth spectral character. Use steady tempo, even rhythmic density, and low dissonance. Keep pitch clarity moderate and loudness stable. Maintain slow harmonic motion and tonal equilibrium. Emphasize balance, consistency, and calm tonal centers. The music should feel even, ambient, and unobtrusive, gently complementing: {transcribed_text}.",
189
-
190
- 'calm': f"Generate a calm orchestral soundtrack with slowed motion, sparse rhythmic activity, and warm timbral shading. Use minimal dissonance, smooth spectral texture, and gentle pitch presence. Keep dynamics restrained with rare harmonic shifts and stable tonality. Emphasize warmth, sustained harmonies, and flowing textures that evoke tranquility and serenity inspired by: {transcribed_text}.",
191
-
192
- 'happy': f"Generate a happy orchestral soundtrack with lively motion, energetic rhythmic density, and bright timbral color. Use controlled dissonance, vivid spectral texture, and clear melodic focus. Maintain dynamic expressiveness with active harmonic movement and stable tonal grounding. Emphasize joy through playful rhythms, ornamented melodies, and uplifting harmonic progressions inspired by: {transcribed_text}.",
193
-
194
- 'sad': f"Generate a sad orchestral soundtrack with reduced motion, sparse rhythmic events, and dark timbral color. Use gentle dissonance, softened spectral texture, and subdued pitch clarity. Keep dynamics restrained with minimal harmonic change and low tonal uncertainty. Emphasize minor coloration, sustained harmonies, and fragile phrasing in response to: {transcribed_text}.",
195
-
196
- 'angry': f"Generate an angry orchestral soundtrack with driving motion, dense rhythmic attack, and sharp timbral brightness. Use persistent dissonance, assertive pitch presence, and heightened dynamics. Maintain frequent harmonic shifts and unstable tonal grounding. Emphasize aggressive articulation, rhythmic force, and tension-laden progressions that amplify: {transcribed_text}.",
197
-
198
- 'fearful': f"Generate a fearful orchestral soundtrack with unstable motion, fluctuating rhythmic density, and highly variable timbre. Use shifting dissonance, blurred pitch focus, and volatile dynamics. Increase harmonic unpredictability and tonal instability. Emphasize eerie textures, spatial tension, and spectral motion to evoke suspense and anticipation inspired by: {transcribed_text}.",
199
-
200
- 'disgust': f"Generate a disgusted orchestral soundtrack with uneven motion, irregular rhythm, and dark, rough timbral texture. Use abrasive dissonance, unstable spectral character, and weakened pitch focus. Maintain uneasy dynamics and unsettled harmonic motion. Emphasize distorted textures, harsh intervals, and tonal ambiguity reflecting: {transcribed_text}.",
201
-
202
- 'surprised': f"Generate a surprised orchestral soundtrack with shifting motion, sudden rhythmic variation, and dynamically changing timbre. Use sharp contrasts, heightened pitch clarity, and expressive dynamic swings. Maintain irregular harmonic motion with agile tonal pivots. Emphasize abrupt transitions, playful gestures, and expressive color changes inspired by: {transcribed_text}."
203
  }
204
- return emotion_prompts.get(
205
- emotion.lower(),
206
- f"Create background music with {emotion} atmosphere that represents: {transcribed_text}"
207
- )
208
-
209
- # Function to generate music with MusicGen (using acoustic emotion prediction)
210
  def generate_music(transcribed_text, emotion_prediction, chunk_idx, total_chunks):
211
  try:
212
  if processor is None or music_model is None:
213
  return None
214
-
215
- # Get specific prompt based on emotion
216
  prompt = get_music_prompt(emotion_prediction, transcribed_text, chunk_idx, total_chunks)
217
-
218
- # Limit prompt length to avoid model issues
219
  if len(prompt) > 200:
220
  prompt = prompt[:200] + "..."
221
-
222
- inputs = processor(
223
- text=[prompt],
224
- padding=True,
225
- return_tensors="pt",
226
- ).to(device)
227
-
228
- # Generate audio
229
  audio_values = music_model.generate(**inputs, max_new_tokens=512)
230
-
231
- # Convert to numpy array and sample rate
232
  sampling_rate = music_model.config.audio_encoder.sampling_rate
233
  audio_data = audio_values[0, 0].cpu().numpy()
234
-
235
- # Normalize audio data
236
- audio_data = audio_data / np.max(np.abs(audio_data))
237
-
238
- # Create a temporary file to save the audio
239
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
240
  scipy.io.wavfile.write(tmp_file.name, rate=sampling_rate, data=audio_data)
241
  return tmp_file.name
242
-
243
  except Exception as e:
244
  print("Error generating music:", e)
245
  return None
246
 
247
- # --- DeepAI Image Generation (Text2Img) ---
 
 
 
248
  api_key = os.getenv("DeepAI_api_key")
249
 
250
- # Function to upscale image using Lanczos interpolation
251
  def upscale_image(image, target_width=4096, target_height=2048):
252
  """
253
- Upscale image using DeepAI's Torch-SRGAN API for super resolution
 
254
  """
255
  try:
256
  if not api_key:
257
- print("No API key available for upscaling")
258
- # Fallback to OpenCV if no API key
259
  img_array = np.array(image)
260
- upscaled = cv2.resize(
261
- img_array,
262
- (target_width, target_height),
263
- interpolation=cv2.INTER_LANCZOS4
264
- )
265
  return Image.fromarray(upscaled)
266
-
267
- # Save the image to a temporary file
268
  with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_input:
269
  image.save(tmp_input.name, "JPEG", quality=95)
270
-
271
- # Make request to DeepAI torch-srgan API
272
  response = requests.post(
273
  "https://api.deepai.org/api/torch-srgan",
274
- files={'image': open(tmp_input.name, 'rb')},
275
- headers={'api-key': api_key}
276
  )
277
-
278
  data = response.json()
279
-
280
- if 'output_url' in data:
281
- # Download the upscaled image
282
- img_resp = requests.get(data['output_url'])
283
  upscaled_image = Image.open(BytesIO(img_resp.content))
284
-
285
- # Ensure the image meets our target dimensions
286
  if upscaled_image.size != (target_width, target_height):
287
- upscaled_image = upscaled_image.resize(
288
- (target_width, target_height),
289
- Image.Resampling.LANCZOS
290
- )
291
-
292
- # Clean up temporary file
293
- os.unlink(tmp_input.name)
294
  return upscaled_image
295
- else:
296
- print("Error in DeepAI upscaling response:", data)
297
- # Fallback to OpenCV if API fails
298
- img_array = np.array(image)
299
- upscaled = cv2.resize(
300
- img_array,
301
- (target_width, target_height),
302
- interpolation=cv2.INTER_LANCZOS4
303
- )
304
- return Image.fromarray(upscaled)
305
-
306
  except Exception as e:
307
  print(f"Error upscaling image with DeepAI: {e}")
308
- # Fallback to OpenCV if any error occurs
309
  img_array = np.array(image)
310
- upscaled = cv2.resize(
311
- img_array,
312
- (target_width, target_height),
313
- interpolation=cv2.INTER_LANCZOS4
314
- )
315
  return Image.fromarray(upscaled)
316
 
317
- # ADD THE MISSING generate_image FUNCTION HERE
318
  def generate_image(sentiment_prediction, transcribed_text, chunk_idx, total_chunks):
319
  try:
320
  if not api_key:
321
- # fallback white image if no API key
322
- base_image = Image.new('RGB', (1024,512), color='white')
323
  else:
324
- # Get specific prompt based on sentiment
325
  prompt = get_image_prompt(sentiment_prediction, transcribed_text, chunk_idx, total_chunks)
326
-
327
- # Make request to DeepAI text2img API
328
  response = requests.post(
329
  "https://api.deepai.org/api/text2img",
330
- data={
331
- 'text': prompt,
332
- 'width': 1024,
333
- 'height': 512,
334
- 'image_generator_version': 'hd'
335
- },
336
- headers={'api-key': api_key}
337
  )
338
-
339
  data = response.json()
340
- if 'output_url' in data:
341
- # Download the generated image
342
- img_resp = requests.get(data['output_url'])
343
  base_image = Image.open(BytesIO(img_resp.content))
344
  else:
345
  print("Error in DeepAI response:", data)
346
- # Return a fallback image
347
- base_image = Image.new('RGB', (1024,512), color='white')
348
-
349
- # Upscale the image for better quality in 360 viewer
350
  upscaled_image = upscale_image(base_image)
351
  return upscaled_image
352
-
353
- except Exception as e:
354
- print("Error generating image:", e)
355
- # Return a fallback image
356
- return Image.new('RGB', (1024,512), color='white')
357
 
358
- # Function to process a single chunk
359
- def process_chunk(chunk_path, chunk_idx, total_chunks, generate_audio=True):
360
- try:
361
- # Get acoustic emotion prediction (for music)
362
- emotion_prediction = predict_emotion_from_audio(chunk_path)
363
-
364
- # Get transcribed text
365
- transcribed_text = transcribe(chunk_path)
366
-
367
- # Analyze sentiment of transcribed text (for image)
368
- sentiment, polarity = analyze_sentiment(transcribed_text)
369
-
370
- # Generate image using SENTIMENT analysis with specific prompt
371
- image = generate_image(sentiment, transcribed_text, chunk_idx, total_chunks)
372
-
373
- # Add 360 metadata to the image
374
- image_with_360_path = add_360_metadata(image)
375
-
376
- # Generate music only if audio generation is enabled
377
- music_path = None
378
- if generate_audio:
379
- music_path = generate_music(transcribed_text, emotion_prediction, chunk_idx, total_chunks)
380
-
381
- return {
382
- 'chunk_index': chunk_idx + 1,
383
- 'emotion': emotion_prediction,
384
- 'transcription': transcribed_text,
385
- 'sentiment': sentiment,
386
- 'image': image, # Original image for display in Gradio
387
- 'image_360': image_with_360_path, # Image with 360 metadata
388
- 'music': music_path
389
- }
390
  except Exception as e:
391
- print(f"Error processing chunk {chunk_idx + 1}:", e)
392
- # Return a fallback result with all required keys
393
- return {
394
- 'chunk_index': chunk_idx + 1,
395
- 'emotion': "Error",
396
- 'transcription': "Transcription failed",
397
- 'sentiment': "Sentiment: error",
398
- 'image': Image.new('RGB', (1440, 770), color='white'),
399
- 'image_360': None,
400
- 'music': None
401
- }
402
 
403
- # Function to get predictions for all chunks
404
- def get_predictions(audio_input, generate_audio=True, chunk_duration=10):
405
- # Chunk the audio into segments
406
- chunk_files, total_chunks = chunk_audio(audio_input, chunk_duration)
407
-
408
- results = []
409
-
410
- # Process each chunk
411
- for i, chunk_path in enumerate(chunk_files):
412
- print(f"Processing chunk {i+1}/{total_chunks} ({chunk_duration}s each)")
413
- result = process_chunk(chunk_path, i, total_chunks, generate_audio)
414
- results.append(result)
415
-
416
- # Clean up temporary chunk files
417
- for chunk_path in chunk_files:
418
- try:
419
- if chunk_path != audio_input: # Don't delete original input file
420
- os.unlink(chunk_path)
421
- except:
422
- pass
423
-
424
- return results
425
 
426
  def create_xmp_block(width, height):
427
- """Create XMP metadata block following ExifTool's exact format."""
428
  xmp = (
429
  f'<?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?>\n'
430
  f'<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="ExifTool">\n'
@@ -446,338 +339,299 @@ def create_xmp_block(width, height):
446
  return xmp
447
 
448
  def write_xmp_to_jpg(input_path, output_path, width, height):
449
- """Write XMP metadata to JPEG file following ExifTool's method."""
450
- # Read the original JPEG
451
- with open(input_path, 'rb') as f:
452
  data = f.read()
453
-
454
- # Find the start of image marker
455
- if data[0:2] != b'\xFF\xD8':
456
  raise ValueError("Not a valid JPEG file")
457
-
458
- # Create XMP data
459
  xmp_data = create_xmp_block(width, height)
460
-
461
- # Create APP1 segment for XMP
462
- app1_marker = b'\xFF\xE1'
463
- xmp_header = b'http://ns.adobe.com/xap/1.0/\x00'
464
- xmp_bytes = xmp_data.encode('utf-8')
465
- length = len(xmp_header) + len(xmp_bytes) + 2 # +2 for length bytes
466
- length_bytes = struct.pack('>H', length)
467
-
468
- # Construct new file content
469
  output = bytearray()
470
- output.extend(data[0:2]) # SOI marker
471
  output.extend(app1_marker)
472
  output.extend(length_bytes)
473
  output.extend(xmp_header)
474
  output.extend(xmp_bytes)
475
- output.extend(data[2:]) # Rest of the original file
476
-
477
- # Write the new file
478
- with open(output_path, 'wb') as f:
479
  f.write(output)
480
 
481
  def add_360_metadata(img):
482
- """Add 360 photo metadata to a PIL Image and return the path to the processed image."""
483
  try:
484
- # First, ensure the image is upscaled to 4096x2048
485
  target_width, target_height = 4096, 2048
486
  if img.width != target_width or img.height != target_height:
487
  img = img.resize((target_width, target_height), Image.Resampling.LANCZOS)
488
-
489
- # Create a temporary file
490
  with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_file:
491
- # First save as high-quality JPEG
492
  img.save(tmp_file.name, "JPEG", quality=95)
493
-
494
- # Then inject XMP metadata directly into JPEG file
495
  write_xmp_to_jpg(tmp_file.name, tmp_file.name, img.width, img.height)
496
-
497
  return tmp_file.name
498
-
499
  except Exception as e:
500
  print(f"Error adding 360 metadata: {str(e)}")
501
- # Fallback: return the original image path
502
  with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_file:
503
  img.save(tmp_file.name, "JPEG", quality=95)
504
  return tmp_file.name
505
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
506
  def create_360_viewer_html(image_paths, audio_paths, output_path):
507
- """Create an HTML file with a 360 viewer and audio player for the given images and audio."""
508
- # Create a list of image data URIs
509
  image_data_list = []
510
  for img_path in image_paths:
511
  with open(img_path, "rb") as f:
512
  img_data = base64.b64encode(f.read()).decode("utf-8")
513
  image_data_list.append(f"data:image/jpeg;base64,{img_data}")
514
-
515
- # Create a list of audio data URIs
516
  audio_data_list = []
517
  for audio_path in audio_paths:
518
- if audio_path: # Only process if audio exists
519
  with open(audio_path, "rb") as f:
520
  audio_data = base64.b64encode(f.read()).decode("utf-8")
521
  audio_data_list.append(f"data:audio/wav;base64,{audio_data}")
522
  else:
523
- audio_data_list.append(None) # Placeholder for chunks without audio
524
-
525
- # Create the HTML content
526
- html_content = f"""
527
- <!DOCTYPE html>
528
- <html lang="en">
529
- <head>
530
- <meta charset="UTF-8">
531
- <meta name="viewport" content="width=device-width, initial-scale=1.0">
532
- <title>360 Panorama Viewer with Audio</title>
533
- <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/pannellum@2.5.6/build/pannellum.css"/>
534
- <style>
535
- body {{
536
- margin: 0;
537
- overflow: hidden;
538
- font-family: Arial, sans-serif;
539
- }}
540
- #panorama {{
541
- width: 100vw;
542
- height: 80vh;
543
- }}
544
- .pnlm-hotspot.pnlm-info-hotspot {{
545
- background-color: rgba(0, 150, 255, 0.8);
546
- border-radius: 50%;
547
- width: 30px;
548
- height: 30px;
549
- }}
550
- .pnlm-hotspot.pnlm-info-hotspot .pnlm-sprite {{
551
- filter: brightness(0) invert(1);
552
- }}
553
- .pnlm-tooltip {{
554
- background-color: rgba(0, 0, 0, 0.7);
555
- color: white;
556
- border-radius: 3px;
557
- padding: 5px 10px;
558
- }}
559
- #controls {{
560
- position: absolute;
561
- top: 10px;
562
- right: 10px;
563
- z-index: 1000;
564
- background: rgba(0, 0, 0, 0.7);
565
- color: white;
566
- padding: 10px;
567
- border-radius: 5px;
568
- display: flex;
569
- flex-direction: column;
570
- gap: 10px;
571
- }}
572
- #audio-controls {{
573
- position: fixed;
574
- bottom: 0;
575
- left: 0;
576
- width: 100%;
577
- background: rgba(0, 0, 0, 0.8);
578
- color: white;
579
- padding: 15px;
580
- display: flex;
581
- flex-direction: column;
582
- align-items: center;
583
- z-index: 1000;
584
- }}
585
- #audio-player {{
586
- width: 80%;
587
- margin-bottom: 10px;
588
- }}
589
- #audio-info {{
590
- text-align: center;
591
- font-size: 14px;
592
- }}
593
- button {{
594
- background: #3498db;
595
- color: white;
596
- border: none;
597
- padding: 8px 15px;
598
- border-radius: 3px;
599
- cursor: pointer;
600
- margin: 5px;
601
- }}
602
- button:hover {{
603
- background: #2980b9;
604
- }}
605
- select {{
606
- padding: 5px;
607
- border-radius: 3px;
608
- border: 1px solid #ccc;
609
- }}
610
- </style>
611
- </head>
612
- <body>
613
- <div id="controls">
614
- <select id="image-selector">
615
- {"".join([f'<option value="{i}">Chunk {i+1}</option>' for i in range(len(image_data_list))])}
616
- </select>
617
- </div>
618
-
619
- <div id="panorama"></div>
620
-
621
- <div id="audio-controls">
622
- <audio id="audio-player" controls></audio>
623
- <div id="audio-info">No audio available for this chunk</div>
624
- </div>
625
 
626
- <script type="text/javascript" src="https://cdn.jsdelivr.net/npm/pannellum@2.5.6/build/pannellum.js"></script>
627
- <script>
628
- const images = {json.dumps(image_data_list)};
629
- const audioFiles = {json.dumps(audio_data_list)};
630
- let currentViewer = null;
631
-
632
- function loadPanorama(index) {{
633
- if (currentViewer) {{
634
- currentViewer.destroy();
635
- }}
636
-
637
- currentViewer = pannellum.viewer('panorama', {{
638
- "type": "equirectangular",
639
- "panorama": images[index],
640
- "autoLoad": true,
641
- "autoRotate": -2,
642
- "showZoomCtrl": true,
643
- "showFullscreenCtrl": true,
644
- "hfov": 100
645
- }});
646
-
647
- // Update audio player
648
- updateAudioPlayer(index);
649
- }}
650
-
651
- function updateAudioPlayer(index) {{
652
- const audioPlayer = document.getElementById('audio-player');
653
- const audioInfo = document.getElementById('audio-info');
654
-
655
- if (audioFiles[index]) {{
656
- audioPlayer.src = audioFiles[index];
657
- audioInfo.textContent = 'Playing audio for Chunk ' + (index + 1);
658
- // Try to play automatically (may be blocked by browser policies)
659
- audioPlayer.play().catch(e => {{
660
- audioInfo.textContent = 'Click play to listen to audio for Chunk ' + (index + 1);
661
- }});
662
- }} else {{
663
- audioPlayer.src = '';
664
- audioInfo.textContent = 'No audio available for this chunk';
665
- }}
666
- }}
667
-
668
- // Load the first image initially
669
- loadPanorama(0);
670
-
671
- // Handle image selection changes
672
- document.getElementById('image-selector').addEventListener('change', function(e) {{
673
- const selectedIndex = parseInt(e.target.value);
674
- loadPanorama(selectedIndex);
675
- }});
676
- </script>
677
- </body>
678
- </html>
679
- """
680
-
681
- # Write the HTML to a file
682
- with open(output_path, 'w') as f:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
683
  f.write(html_content)
684
-
685
  return output_path
686
 
687
- # Update the process_and_display function
 
 
 
 
 
 
 
688
  def process_and_display(audio_input, generate_audio, chunk_duration):
689
- # Validate chunk duration
690
  if chunk_duration is None or chunk_duration <= 0:
691
  chunk_duration = 10
692
-
693
- # Show loading indicator
694
- yield [gr.HTML(f"""
 
695
  <div style="text-align: center; margin: 20px;">
696
- <p style="font-size: 18px; color: #4a4a4a;">Processing audio in {chunk_duration}-second chunks...</p>
697
- <div style="border: 4px solid #f3f3f3; border-top: 4px solid #3498db; border-radius: 50%; width: 30px; height: 30px; animation: spin 2s linear infinite; margin: 0 auto;"></div>
698
- <style>@keyframes spin {{ 0% {{ transform: rotate(0deg); }} 100% {{ transform: rotate(360deg); }} }}</style>
699
- <p style="font-size: 14px; color: #4a4a4a;">This may take several minutes depending on the audio length...</p>
700
  </div>
701
- """)] + [gr.Group(visible=False)] * len(group_components) + [None] * (len(output_containers) * 6) + [None, ""]
702
-
 
 
 
 
 
703
  results = get_predictions(audio_input, generate_audio, chunk_duration)
704
-
705
- # Initialize outputs list
706
  outputs = []
707
  group_visibility = []
708
- all_360_images = [] # Collect all 360 images for the viewer
709
- all_music_paths = [] # Collect all music paths for the viewer
710
-
711
- # Process each result
712
  for i, result in enumerate(results):
713
  if i < len(output_containers):
714
- group_visibility.append(gr.Group(visible=True))
715
- outputs.extend([
716
- result['emotion'],
717
- result['transcription'],
718
- result['sentiment'],
719
- result['image'],
720
- result['image_360'],
721
- result['music']
722
- ])
723
- # Collect the 360-processed images and music
724
- if result['image_360']:
725
- all_360_images.append(result['image_360']) # Use the 360-processed image
726
- all_music_paths.append(result['music']) # Can be None if no music generated
 
 
727
  else:
728
- # If we have more results than containers, just extend with None
729
- group_visibility.append(gr.Group(visible=False))
730
- outputs.extend([None] * 6)
731
-
732
- # Hide remaining containers
733
- for i in range(len(results), len(output_containers)):
734
- group_visibility.append(gr.Group(visible=False))
735
- outputs.extend([None] * 6)
736
-
737
- # Create 360 viewer HTML if we have 360 images
738
  viewer_html_path = None
739
  if all_360_images:
740
  with tempfile.NamedTemporaryFile(suffix=".html", delete=False) as tmp_file:
741
  viewer_html_path = create_360_viewer_html(all_360_images, all_music_paths, tmp_file.name)
742
-
743
- # Hide loading indicator and show results
744
  yield [gr.HTML("")] + group_visibility + outputs + [viewer_html_path, ""]
745
 
746
- # Update the clear_all function to handle the new outputs
747
  def clear_all():
748
- # Create a list with None for all outputs
749
- outputs = [None] # For audio input
750
-
751
- # For group components (set to invisible)
752
- outputs.extend([gr.Group(visible=False)] * len(group_components))
753
-
754
- # For all output containers (set to None)
755
- outputs.extend([None] * (len(output_containers) * 6))
756
-
757
- # For loading indicator (empty HTML)
758
- outputs.append(gr.HTML(""))
759
-
760
- # For chunk duration (reset to 10)
761
- outputs.append(10)
762
-
763
- # For example selector (reset to None)
764
- outputs.append(None)
765
-
766
- # For viewer (set to None)
767
- outputs.append(None)
768
-
769
- # For JavaScript output (empty)
770
- outputs.append("")
771
-
772
  return outputs
773
 
774
- # Function to load example audio (placeholder - you need to implement this)
775
- def load_example_audio(example_name):
776
- # This is a placeholder - you need to implement this function
777
- # Return the path to the example audio file based on the example_name
778
- return None
779
 
780
- # Custom CSS for enhanced styling
781
  custom_css = """
782
  .download-section {
783
  background: rgba(255,255,255,255);
@@ -791,37 +645,6 @@ custom_css = """
791
  overflow: hidden;
792
  }
793
 
794
- .download-section::before {
795
- content: "";
796
- position: absolute;
797
- top: -50%;
798
- left: -50%;
799
- width: 200%;
800
- height: 200%;
801
- background: radial-gradient(circle, rgba(255,255,255,0.1) 0%, transparent 70%);
802
- animation: shimmer 3s infinite linear;
803
- pointer-events: none;
804
- }
805
-
806
- @keyframes shimmer {
807
- 0% { transform: rotate(0deg); }
808
- 100% { transform: rotate(360deg); }
809
- }
810
-
811
- .download-section h2 {
812
- color: white;
813
- font-size: 16px;
814
- margin-bottom: 15px;
815
- text-shadow: 1px 1px 3px rgba(0,0,0,0.3);
816
- }
817
-
818
- .download-section p {
819
- color: rgba(255,255,255,0.9);
820
- font-size: 16px;
821
- margin-bottom: 20px;
822
- line-height: 3.5;
823
- }
824
-
825
  .download-button {
826
  background: rgba(155,155,155,255) !important;
827
  color: white !important;
@@ -831,92 +654,19 @@ custom_css = """
831
  font-weight: bold !important;
832
  font-size: 16px !important;
833
  margin-top: 15px !important;
834
- transition: all 0.3s ease !important;
835
  cursor: pointer !important;
836
  display: inline-block !important;
837
  }
 
838
 
839
- .download-button:hover {
840
- transform: translateY(-3px) !important;
841
- box-shadow: 0 8px 20px rgba(0,0,0,0.6) !important;
842
- }
843
-
844
- .download-button:active {
845
- transform: translateY(1px) !important;
846
- }
847
-
848
- .download-icon {
849
- margin-right: 8px;
850
- font-size: 28px;
851
- }
852
-
853
- .feature-list {
854
- display: flex;
855
- justify-content: center;
856
- flex-wrap: wrap;
857
- gap: 15px;
858
- margin: 20px 0;
859
- }
860
-
861
- .feature-item {
862
- background: rgba(255,255,255,0.15);
863
- padding: 10px 15px;
864
- border-radius: 8px;
865
- display: flex;
866
- align-items: center;
867
- gap: 8px;
868
- color: white;
869
- font-size: 14px;
870
- }
871
-
872
- .feature-icon {
873
- font-size: 26px;
874
- }
875
-
876
- .viewer-preview {
877
- margin-top: 20px;
878
- border-radius: 10px;
879
- overflow: hidden;
880
- box-shadow: 0 5px 15px rgba(0,0,0,0.2);
881
- max-width: 400px;
882
- margin-left: auto;
883
- margin-right: auto;
884
- }
885
-
886
- .viewer-preview img {
887
- width: 100%;
888
- display: block;
889
- }
890
-
891
- .instructions {
892
- background: rgba(255,255,255,0.1);
893
- padding: 15px;
894
- border-radius: 8px;
895
- margin-top: 20px;
896
- text-align: left;
897
- }
898
-
899
- .instructions h3 {
900
- color: white;
901
- margin-top: 0;
902
- font-size: 16px;
903
- }
904
-
905
- .instructions ol {
906
- color: rgba(255,255,255,0.9);
907
- padding-left: 20px;
908
- margin-bottom: 0;
909
- }
910
 
911
- .instructions li {
912
- margin-bottom: 8px;
913
- }
914
- """
915
- # Create the Gradio interface with proper output handling
916
  with gr.Blocks(title="Entornos Virtuales Afectivos - Procesamiento por Segmentos", css=custom_css) as interface:
917
  gr.Markdown("# Bello")
918
  gr.Markdown(
919
- """
920
  ***Bello*** explora las sutilezas afectivas de la voz humana a través de la figura del **Teniente Bello**,
921
  el piloto chileno que desapareció misteriosamente en 1914 durante un vuelo de entrenamiento sobre la costa del Pacífico.
922
 
@@ -939,24 +689,13 @@ y semánticos del lenguaje hablado para generar entornos virtuales inmersivos en
939
  • Video Tutorial: [Cómo usar este espacio](https://youtu.be/eVD1lzwVhi8)
940
 
941
  • Para más detalles del proyecto, visita: [www.emotional-machines.com](https://www.emotional-machines.com)
942
- """
943
  )
944
 
945
-
946
  with gr.Row():
947
  with gr.Column(scale=2):
948
  audio_input = gr.Audio(label="Audio de Entrada", type="filepath", sources=["microphone", "upload"])
949
-
950
- # Ejemplos de audio (opcional)
951
- # example_selector = gr.Dropdown(
952
- # label="Seleccionar Audio de Ejemplo",
953
- # choices=["Discurso Feliz", "Historia Triste", "Noticias Neutrales"],
954
- # value=None,
955
- # info="Elige entre audios pregrabados de ejemplo"
956
- # )
957
-
958
- #load_example_btn = gr.Button("Cargar Ejemplo", variant="secondary")
959
-
960
  with gr.Column(scale=1):
961
  chunk_duration_input = gr.Number(
962
  label="Duración de Segmento (segundos)",
@@ -964,86 +703,111 @@ y semánticos del lenguaje hablado para generar entornos virtuales inmersivos en
964
  minimum=1,
965
  maximum=60,
966
  step=1,
967
- info="Duración de cada segmento de audio a procesar (1-60 segundos)"
968
  )
969
  generate_audio_checkbox = gr.Checkbox(
970
- label="Generar Audio (puede tardar más)",
971
  value=False,
972
- info="Desmarca para omitir la generación de música y acelerar el procesamiento"
973
  )
974
  with gr.Row():
975
  process_btn = gr.Button("Generar", variant="primary")
976
  clear_btn = gr.Button("Borrar Todo", variant="secondary")
977
-
978
- loading_indicator = gr.HTML("""
979
- <div id="loading" style="display: none; text-align: center; margin: 20px;">
980
- <p style="font-size: 18px; color: #4a4a4a;">Procesando segmentos de audio...</p>
981
- <div style="border: 4px solid #f3f3f3; border-top: 4px solid #3498db; border-radius: 50%; width: 30px; height: 30px; animation: spin 2s linear infinite; margin: 0 auto;"></div>
982
- <style>@keyframes spin { 0% { transform: rotate(0deg); } 100% { transform: rotate(360deg); } }</style>
983
- </div>
984
- """)
985
-
986
- output_containers = []
987
- group_components = [] # Contenedores de grupos
988
-
989
  for i in range(20):
990
  with gr.Group(visible=False) as chunk_group:
991
  gr.Markdown(f"### Resultados del Segmento {i+1}")
 
992
  with gr.Row():
993
  emotion_output = gr.Label(label="Predicción de Emoción Acústica")
994
  transcription_output = gr.Label(label="Texto Transcrito")
995
  sentiment_output = gr.Label(label="Análisis Sentimental")
 
996
  with gr.Row():
997
  image_output = gr.Image(label="Imagen Equirectangular Generada")
998
  image_360_output = gr.File(label="Descargar Imagen 360", type="filepath")
 
999
  with gr.Row():
1000
  audio_output = gr.Audio(label="Música Generada")
 
 
1001
  gr.HTML("<hr style='margin: 20px 0; border: 1px solid #ccc;'>")
1002
-
1003
  group_components.append(chunk_group)
1004
- output_containers.append({
1005
- 'emotion': emotion_output,
1006
- 'transcription': transcription_output,
1007
- 'sentiment': sentiment_output,
1008
- 'image': image_output,
1009
- 'image_360': image_360_output,
1010
- 'music': audio_output
1011
- })
1012
-
 
 
 
1013
  with gr.Group(visible=True, elem_classes="download-section") as download_group:
1014
  viewer_html_output = gr.File(
1015
- label="Una vez finalizado el procesamiento, descarga tu EVA aquí 🚀",
1016
  type="filepath",
1017
  interactive=False,
1018
- elem_classes="download-button"
1019
  )
1020
-
1021
  js_output = gr.HTML(visible=False)
1022
-
 
1023
  process_btn.click(
1024
  fn=process_and_display,
1025
  inputs=[audio_input, generate_audio_checkbox, chunk_duration_input],
1026
- outputs=[loading_indicator] + group_components + [comp for container in output_containers for comp in [
1027
- container['emotion'],
1028
- container['transcription'],
1029
- container['sentiment'],
1030
- container['image'],
1031
- container['image_360'],
1032
- container['music']
1033
- ]] + [viewer_html_output, js_output]
 
 
 
 
 
 
 
 
1034
  )
1035
-
1036
  clear_btn.click(
1037
  fn=clear_all,
1038
  inputs=[],
1039
- outputs=[audio_input] + group_components + [comp for container in output_containers for comp in [
1040
- container['emotion'],
1041
- container['transcription'],
1042
- container['sentiment'],
1043
- container['image'],
1044
- container['image_360'],
1045
- container['music']
1046
- ]] + [loading_indicator, chunk_duration_input, viewer_html_output, js_output]
 
 
 
 
 
 
 
 
1047
  )
1048
-
1049
- interface.launch(share=True)
 
1
  import gradio as gr
 
 
2
  import numpy as np
3
  import librosa
4
  import requests
 
7
  import os
8
  from tensorflow.keras.models import load_model
9
  from faster_whisper import WhisperModel
 
10
  from textblob import TextBlob
11
  import torch
12
  import scipy.io.wavfile
13
  from transformers import AutoProcessor, MusicgenForConditionalGeneration
14
  import tempfile
15
  import base64
 
 
 
16
  from pydub import AudioSegment
17
  import math
18
  import json
 
 
 
 
 
 
19
  import struct
20
  import cv2
21
 
22
+ # =========================
23
+ # Models
24
+ # =========================
25
+
26
def load_emotion_model(model_path):
    """Load the Keras speech-emotion-recognition model from *model_path*.

    Returns the loaded model, or None when loading fails so the rest of
    the app can keep running (predictions then report the missing model).
    """
    try:
        loaded = load_model(model_path)
    except Exception as exc:
        print("Error loading emotion prediction model:", exc)
        return None
    print("Emotion model loaded successfully")
    return loaded
34
 
35
# Pre-trained LSTM speech-emotion model (RAVDESS); `model` is None if loading failed.
model_path = "mymodel_SER_LSTM_RAVDESS.h5"
model = load_emotion_model(model_path)

# Whisper speech-to-text, forced to CPU with int8 quantization for Space hardware.
model_size = "small"
model2 = WhisperModel(model_size, device="cpu", compute_type="int8")
41
 
42
+ # MusicGen
43
  def load_musicgen_model():
44
  try:
45
  device = "cuda" if torch.cuda.is_available() else "cpu"
 
54
 
55
  processor, music_model, device = load_musicgen_model()
56
 
57
+ # =========================
58
+ # Audio utilities
59
+ # =========================
60
+
61
def chunk_audio(audio_path, chunk_duration=10):
    """Split audio into chunks and return list of chunk file paths"""
    try:
        segment = AudioSegment.from_file(audio_path)
        total_ms = len(segment)
        step_ms = chunk_duration * 1000

        if chunk_duration <= 0:
            raise ValueError("Chunk duration must be positive")

        # Requested chunk longer than the whole clip: process it unsplit.
        if chunk_duration > total_ms / 1000:
            return [audio_path], 1

        num_chunks = math.ceil(total_ms / step_ms)
        chunk_files = []
        for idx in range(num_chunks):
            begin = idx * step_ms
            end = min(begin + step_ms, total_ms)
            piece = segment[begin:end]
            # delete=False: the caller (get_predictions) removes these later.
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
                piece.export(tmp.name, format="wav")
                chunk_files.append(tmp.name)

        return chunk_files, num_chunks
    except Exception as exc:
        # Any failure (bad duration, decode error) degrades to a single chunk.
        print("Error chunking audio:", exc)
        return [audio_path], 1
91
 
 
92
  def transcribe(wav_filepath):
93
  try:
94
  segments, _ = model2.transcribe(wav_filepath, beam_size=5)
 
97
  print("Error transcribing audio:", e)
98
  return "Transcription failed"
99
 
 
100
  def extract_mfcc(wav_file_name):
101
  try:
102
  y, sr = librosa.load(wav_file_name)
 
106
  print("Error extracting MFCC features:", e)
107
  return None
108
 
109
# Class-index → emotion-name mapping used to decode the argmax of the
# LSTM model's prediction vector in predict_emotion_from_audio.
emotions = {
    0: "neutral",
    1: "calm",
    2: "happy",
    3: "sad",
    4: "angry",
    5: "fearful",
    6: "disgust",
    7: "surprised",
}
 
 
120
def predict_emotion_from_audio(wav_filepath):
    """Classify the dominant emotion of a WAV file with the LSTM model.

    Returns an emotion name from `emotions`, or a human-readable error
    string when the model is missing or feature extraction fails.
    """
    try:
        if model is None:
            return "Model not loaded"

        features = extract_mfcc(wav_filepath)
        if features is None:
            return "Error: Unable to extract features"

        # Model expects shape (batch=1, 40 MFCC coefficients, 1 channel).
        features = np.reshape(features, newshape=(1, 40, 1))
        scores = model.predict(features)
        best_label = np.argmax(scores[0])
        return emotions.get(best_label, "Unknown emotion")
    except Exception as exc:
        print("Error predicting emotion:", exc)
        return "Prediction error"
135
 
 
136
def analyze_sentiment(text):
    """Return (label, polarity) for *text* via TextBlob.

    The label is "positive"/"negative"/"neutral" with a ±0.1 dead zone
    around zero polarity; blank input or errors yield ("neutral", 0.0).
    """
    try:
        if not text or text.strip() == "":
            return "neutral", 0.0

        polarity = TextBlob(text).sentiment.polarity

        if polarity > 0.1:
            return "positive", polarity
        if polarity < -0.1:
            return "negative", polarity
        return "neutral", polarity
    except Exception as exc:
        print("Error analyzing sentiment:", exc)
        return "neutral", 0.0
155
 
156
+ # =========================
157
+ # Prompts
158
+ # =========================
159
+
160
def get_image_prompt(sentiment, transcribed_text, chunk_idx, total_chunks):
    """Build the DeepAI text2img prompt for one audio chunk.

    The prompt always asks for an equirectangular 360° graphite sketch of
    the transcription; the visual-statistics clauses (histogram, texture,
    symmetry, contrast) are selected by sentiment, with the neutral wording
    as the fallback. chunk_idx / total_chunks are accepted but currently
    unused (kept in the signature for future per-chunk prompt variants).
    """
    prefix = (
        f"Generate an equirectangular 360° panoramic graphite sketch drawing, detailed pencil texture "
        f"with faint neon glows, cinematic lighting of:{transcribed_text}. "
    )

    if sentiment == "positive":
        style_clause = (
            "Use low histogram frequency in bright bins, dominant color in high RGB range, and high "
            "brightness and color variance. Apply high-frequency texture with strong filter energy, "
            "pronounced gradient magnitude, and strong local contrast. Use high spatial complexity, "
            "increased horizontal and vertical symmetry, high edge density, bright gray levels, and "
            "high contrast. Emphasize rich visual structure, color variation, and texture intensity "
            "across spatial composition."
        )
    elif sentiment == "negative":
        style_clause = (
            "Use high histogram frequency in dark bins, dominant color in low RGB range, and low "
            "brightness and color variance. Apply low-frequency texture with low filter energy, weak "
            "gradient magnitude, and low local contrast. Use low spatial complexity, reduced horizontal "
            "and vertical symmetry, low edge density, dark gray levels, and moderate contrast. Emphasize "
            "coarse structure and limited variation in color, texture, and spatial distribution."
        )
    else:
        style_clause = (
            "Use a balanced histogram frequency across bins, dominant color in a mid RGB range, and "
            "moderate brightness and color variance. Apply medium-frequency texture with moderate filter "
            "energy, standard gradient magnitude, and average local contrast. Use medium spatial "
            "complexity, balanced horizontal and vertical symmetry, medium edge density, mid-range gray "
            "levels, and standard contrast. Emphasize naturalistic structure and typical variation in "
            "color, texture, and spatial distribution."
        )

    return prefix + style_clause
191
 
 
192
def get_music_prompt(emotion, transcribed_text, chunk_idx, total_chunks):
    # Builds the MusicGen text prompt for one chunk: a fixed per-emotion
    # orchestral description with the transcription interpolated at the end.
    # Unknown emotion labels fall back to a generic prompt.
    _ = f"Chunk {chunk_idx+1}/{total_chunks}: "  # kept for future use

    # One ready-made prompt per emotion class produced by predict_emotion_from_audio.
    emotion_prompts = {
        "neutral": f"Generate a neutral orchestral soundtrack with balanced energy and smooth spectral character. Use steady tempo, even rhythmic density, and low dissonance. Keep pitch clarity moderate and loudness stable. Maintain slow harmonic motion and tonal equilibrium. Emphasize balance, consistency, and calm tonal centers. The music should feel even, ambient, and unobtrusive, gently complementing: {transcribed_text}.",
        "calm": f"Generate a calm orchestral soundtrack with slowed motion, sparse rhythmic activity, and warm timbral shading. Use minimal dissonance, smooth spectral texture, and gentle pitch presence. Keep dynamics restrained with rare harmonic shifts and stable tonality. Emphasize warmth, sustained harmonies, and flowing textures that evoke tranquility and serenity inspired by: {transcribed_text}.",
        "happy": f"Generate a happy orchestral soundtrack with lively motion, energetic rhythmic density, and bright timbral color. Use controlled dissonance, vivid spectral texture, and clear melodic focus. Maintain dynamic expressiveness with active harmonic movement and stable tonal grounding. Emphasize joy through playful rhythms, ornamented melodies, and uplifting harmonic progressions inspired by: {transcribed_text}.",
        "sad": f"Generate a sad orchestral soundtrack with reduced motion, sparse rhythmic events, and dark timbral color. Use gentle dissonance, softened spectral texture, and subdued pitch clarity. Keep dynamics restrained with minimal harmonic change and low tonal uncertainty. Emphasize minor coloration, sustained harmonies, and fragile phrasing in response to: {transcribed_text}.",
        "angry": f"Generate an angry orchestral soundtrack with driving motion, dense rhythmic attack, and sharp timbral brightness. Use persistent dissonance, assertive pitch presence, and heightened dynamics. Maintain frequent harmonic shifts and unstable tonal grounding. Emphasize aggressive articulation, rhythmic force, and tension-laden progressions that amplify: {transcribed_text}.",
        "fearful": f"Generate a fearful orchestral soundtrack with unstable motion, fluctuating rhythmic density, and highly variable timbre. Use shifting dissonance, blurred pitch focus, and volatile dynamics. Increase harmonic unpredictability and tonal instability. Emphasize eerie textures, spatial tension, and spectral motion to evoke suspense and anticipation inspired by: {transcribed_text}.",
        "disgust": f"Generate a disgusted orchestral soundtrack with uneven motion, irregular rhythm, and dark, rough timbral texture. Use abrasive dissonance, unstable spectral character, and weakened pitch focus. Maintain uneasy dynamics and unsettled harmonic motion. Emphasize distorted textures, harsh intervals, and tonal ambiguity reflecting: {transcribed_text}.",
        "surprised": f"Generate a surprised orchestral soundtrack with shifting motion, sudden rhythmic variation, and dynamically changing timbre. Use sharp contrasts, heightened pitch clarity, and expressive dynamic swings. Maintain irregular harmonic motion with agile tonal pivots. Emphasize abrupt transitions, playful gestures, and expressive color changes inspired by: {transcribed_text}.",
    }
    # Case-insensitive lookup; unknown labels get the generic fallback prompt.
    return emotion_prompts.get(emotion.lower(), f"Create background music with {emotion} atmosphere that represents: {transcribed_text}")
206
+
207
+ # =========================
208
+ # Music generation
209
+ # =========================
210
+
211
def generate_music(transcribed_text, emotion_prediction, chunk_idx, total_chunks):
    """Generate a short MusicGen clip for one chunk.

    Returns the path of a temp WAV file, or None when the MusicGen model
    failed to load or generation raises.
    """
    try:
        if processor is None or music_model is None:
            return None

        prompt = get_music_prompt(emotion_prediction, transcribed_text, chunk_idx, total_chunks)
        # Keep the conditioning text short; long prompts are truncated to ~200 chars.
        if len(prompt) > 200:
            prompt = prompt[:200] + "..."

        inputs = processor(text=[prompt], padding=True, return_tensors="pt").to(device)
        audio_values = music_model.generate(**inputs, max_new_tokens=512)

        sampling_rate = music_model.config.audio_encoder.sampling_rate
        audio_data = audio_values[0, 0].cpu().numpy()
        # Peak-normalize to [-1, 1]; the 1e-9 floor avoids division by zero on silence.
        audio_data = audio_data / max(1e-9, np.max(np.abs(audio_data)))

        # delete=False so the Gradio audio component can read the file later.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            scipy.io.wavfile.write(tmp_file.name, rate=sampling_rate, data=audio_data)
        return tmp_file.name

    except Exception as e:
        print("Error generating music:", e)
        return None
234
 
235
+ # =========================
236
+ # Image generation (DeepAI)
237
+ # =========================
238
+
239
# DeepAI API key (optional). When unset, generate_image uses a blank canvas
# and upscale_image falls back to a local OpenCV resize instead of the API.
api_key = os.getenv("DeepAI_api_key")
240
 
 
241
def upscale_image(image, target_width=4096, target_height=2048):
    """
    Upscale *image* to target_width x target_height.

    Uses DeepAI's Torch-SRGAN API when an API key is configured; on a missing
    key, a bad API response, or any exception it falls back to a local OpenCV
    Lanczos resize, so the caller always receives a PIL image of the requested
    size.

    Fixes over the previous version: the upload file handle is now closed
    (it was leaked via a bare open()), the temp JPEG is removed on every
    path (previously only on the success path), and the bare `except:` around
    unlink is narrowed to OSError.
    """
    def _local_resize(img):
        # Fallback: plain high-quality Lanczos resize via OpenCV.
        arr = np.array(img)
        resized = cv2.resize(arr, (target_width, target_height), interpolation=cv2.INTER_LANCZOS4)
        return Image.fromarray(resized)

    if not api_key:
        return _local_resize(image)

    tmp_name = None
    try:
        # The API upload needs a file on disk; write the source image to a temp JPEG.
        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_input:
            tmp_name = tmp_input.name
        image.save(tmp_name, "JPEG", quality=95)

        # Context manager closes the upload handle deterministically.
        with open(tmp_name, "rb") as upload:
            response = requests.post(
                "https://api.deepai.org/api/torch-srgan",
                files={"image": upload},
                headers={"api-key": api_key},
            )
        data = response.json()

        if "output_url" in data:
            img_resp = requests.get(data["output_url"])
            upscaled_image = Image.open(BytesIO(img_resp.content))
            # The API does not guarantee exact output dimensions; normalize them.
            if upscaled_image.size != (target_width, target_height):
                upscaled_image = upscaled_image.resize((target_width, target_height), Image.Resampling.LANCZOS)
            return upscaled_image

        print("Error in DeepAI upscaling response:", data)
        return _local_resize(image)

    except Exception as e:
        print(f"Error upscaling image with DeepAI: {e}")
        return _local_resize(image)
    finally:
        # Best-effort temp-file cleanup on success, API error, and exception paths.
        if tmp_name:
            try:
                os.unlink(tmp_name)
            except OSError:
                pass
287
 
 
288
def generate_image(sentiment_prediction, transcribed_text, chunk_idx, total_chunks):
    """Create the 1024x512 panorama for a chunk and upscale it to 4096x2048.

    Without an API key the base image is a plain white canvas; on a bad API
    response or any exception a white canvas is likewise used so the pipeline
    keeps running end-to-end.
    """
    try:
        if api_key:
            prompt = get_image_prompt(sentiment_prediction, transcribed_text, chunk_idx, total_chunks)

            response = requests.post(
                "https://api.deepai.org/api/text2img",
                data={"text": prompt, "width": 1024, "height": 512, "image_generator_version": "hd"},
                headers={"api-key": api_key},
            )
            data = response.json()

            if "output_url" in data:
                img_resp = requests.get(data["output_url"])
                base_image = Image.open(BytesIO(img_resp.content))
            else:
                print("Error in DeepAI response:", data)
                base_image = Image.new("RGB", (1024, 512), color="white")
        else:
            # No API key configured: placeholder canvas keeps the pipeline alive.
            base_image = Image.new("RGB", (1024, 512), color="white")

        return upscale_image(base_image)

    except Exception as e:
        print("Error generating image:", e)
        return Image.new("RGB", (1024, 512), color="white")
 
 
 
 
 
 
 
 
 
315
 
316
+ # =========================
317
+ # 360 metadata injection (XMP)
318
+ # =========================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
319
 
320
  def create_xmp_block(width, height):
 
321
  xmp = (
322
  f'<?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?>\n'
323
  f'<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="ExifTool">\n'
 
339
  return xmp
340
 
341
def write_xmp_to_jpg(input_path, output_path, width, height):
    """Inject a GPano XMP APP1 segment immediately after the JPEG SOI marker.

    Raises ValueError when *input_path* is not a JPEG. input_path and
    output_path may be the same file (read fully before writing).
    """
    with open(input_path, "rb") as src:
        data = src.read()

    if data[0:2] != b"\xFF\xD8":
        raise ValueError("Not a valid JPEG file")

    xmp_payload = create_xmp_block(width, height).encode("utf-8")
    namespace = b"http://ns.adobe.com/xap/1.0/\x00"
    # APP1 length field counts the two length bytes plus namespace + payload.
    segment_length = struct.pack(">H", len(namespace) + len(xmp_payload) + 2)

    rebuilt = b"".join([
        data[0:2],        # SOI
        b"\xFF\xE1",      # APP1 marker
        segment_length,
        namespace,
        xmp_payload,
        data[2:],         # remainder of the original JPEG stream
    ])

    with open(output_path, "wb") as dst:
        dst.write(rebuilt)
366
 
367
def add_360_metadata(img):
    """Save *img* as a 4096x2048 JPEG carrying 360° (GPano) XMP metadata.

    Returns the temp-file path. On failure a plain JPEG without the XMP
    block is returned so the UI still has a downloadable file.
    """
    try:
        target_size = (4096, 2048)
        if (img.width, img.height) != target_size:
            img = img.resize(target_size, Image.Resampling.LANCZOS)

        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as out:
            img.save(out.name, "JPEG", quality=95)

        # Rewrite the file in place with the XMP APP1 segment injected.
        write_xmp_to_jpg(out.name, out.name, img.width, img.height)
        return out.name

    except Exception as exc:
        print(f"Error adding 360 metadata: {str(exc)}")
        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as out:
            img.save(out.name, "JPEG", quality=95)
        return out.name
383
 
384
+ # =========================
385
+ # Chunk processing
386
+ # =========================
387
+
388
def process_chunk(chunk_path, chunk_idx, total_chunks, generate_audio=True):
    """Run the full per-chunk pipeline for one audio segment.

    Produces emotion, transcript, sentiment, the panorama image plus its
    360°-tagged download file, and (optionally) generated music. Always
    returns a result dict; failures yield placeholder values so one bad
    chunk does not abort the remaining chunks.
    """
    try:
        emotion = predict_emotion_from_audio(chunk_path)
        transcript = transcribe(chunk_path)
        sentiment, _polarity = analyze_sentiment(transcript)

        panorama = generate_image(sentiment, transcript, chunk_idx, total_chunks)
        panorama_360_path = add_360_metadata(panorama)

        music_path = generate_music(transcript, emotion, chunk_idx, total_chunks) if generate_audio else None

        return {
            "chunk_index": chunk_idx + 1,
            "emotion": emotion,
            "transcription": transcript,
            "sentiment": sentiment,
            "image": panorama,
            "image_360": panorama_360_path,
            "music": music_path,
        }

    except Exception as exc:
        print(f"Error processing chunk {chunk_idx + 1}:", exc)
        return {
            "chunk_index": chunk_idx + 1,
            "emotion": "Error",
            "transcription": "Transcription failed",
            "sentiment": "error",
            "image": Image.new("RGB", (1024, 512), color="white"),
            "image_360": None,
            "music": None,
        }
422
+
423
def get_predictions(audio_input, generate_audio=True, chunk_duration=10):
    """Split *audio_input* into chunks, process each, then clean up temp files.

    Returns the ordered list of per-chunk result dicts from process_chunk.
    The original audio file is never deleted; only the temporary chunk WAVs
    are removed, and only OSError is swallowed during cleanup (the previous
    bare `except:` could hide real bugs).
    """
    chunk_files, total_chunks = chunk_audio(audio_input, chunk_duration)

    results = []
    for i, chunk_path in enumerate(chunk_files):
        print(f"Processing chunk {i+1}/{total_chunks} ({chunk_duration}s each)")
        results.append(process_chunk(chunk_path, i, total_chunks, generate_audio))

    # Best-effort cleanup of the temporary chunk files.
    for chunk_path in chunk_files:
        if chunk_path != audio_input:
            try:
                os.unlink(chunk_path)
            except OSError:
                pass

    return results
440
+
441
+ # =========================
442
+ # 360 viewer HTML (with audio)
443
+ # =========================
444
+
445
def create_360_viewer_html(image_paths, audio_paths, output_path):
    """Write a self-contained Pannellum 360° viewer HTML file to *output_path*.

    Every panorama image and per-chunk audio track is inlined as a base64
    data URI, so the downloaded file works offline (apart from the Pannellum
    CDN assets). Entries of *audio_paths* may be None when music generation
    was skipped; the viewer's JS shows "No audio available" for those.

    Returns output_path. Fix: the file is now written with an explicit
    encoding="utf-8" to match the page's <meta charset="UTF-8"> regardless of
    the platform's default encoding.
    """
    # Inline every panorama as a JPEG data URI.
    image_data_list = []
    for img_path in image_paths:
        with open(img_path, "rb") as f:
            img_data = base64.b64encode(f.read()).decode("utf-8")
        image_data_list.append(f"data:image/jpeg;base64,{img_data}")

    # Inline audio where available; keep None so the JS can detect "no audio".
    audio_data_list = []
    for audio_path in audio_paths:
        if audio_path:
            with open(audio_path, "rb") as f:
                audio_data = base64.b64encode(f.read()).decode("utf-8")
            audio_data_list.append(f"data:audio/wav;base64,{audio_data}")
        else:
            audio_data_list.append(None)

    html_content = f"""
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>360 Panorama Viewer with Audio</title>
        <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/pannellum@2.5.6/build/pannellum.css"/>
        <style>
            body {{ margin:0; overflow:hidden; font-family:Arial,sans-serif; }}
            #panorama {{ width:100vw; height:80vh; }}
            #controls {{
                position:absolute; top:10px; right:10px; z-index:1000;
                background:rgba(0,0,0,0.7); color:white; padding:10px; border-radius:5px;
                display:flex; flex-direction:column; gap:10px;
            }}
            #audio-controls {{
                position:fixed; bottom:0; left:0; width:100%;
                background:rgba(0,0,0,0.8); color:white; padding:15px;
                display:flex; flex-direction:column; align-items:center; z-index:1000;
            }}
            #audio-player {{ width:80%; margin-bottom:10px; }}
            #audio-info {{ text-align:center; font-size:14px; }}
            select {{ padding:5px; border-radius:3px; border:1px solid #ccc; }}
        </style>
    </head>
    <body>
        <div id="controls">
            <select id="image-selector">
                {"".join([f'<option value="{i}">Chunk {i+1}</option>' for i in range(len(image_data_list))])}
            </select>
        </div>

        <div id="panorama"></div>

        <div id="audio-controls">
            <audio id="audio-player" controls></audio>
            <div id="audio-info">No audio available for this chunk</div>
        </div>

        <script type="text/javascript" src="https://cdn.jsdelivr.net/npm/pannellum@2.5.6/build/pannellum.js"></script>
        <script>
            const images = {json.dumps(image_data_list)};
            const audioFiles = {json.dumps(audio_data_list)};
            let currentViewer = null;

            function loadPanorama(index) {{
                if (currentViewer) currentViewer.destroy();

                currentViewer = pannellum.viewer('panorama', {{
                    type: "equirectangular",
                    panorama: images[index],
                    autoLoad: true,
                    autoRotate: -2,
                    showZoomCtrl: true,
                    showFullscreenCtrl: true,
                    hfov: 100
                }});

                updateAudioPlayer(index);
            }}

            function updateAudioPlayer(index) {{
                const audioPlayer = document.getElementById('audio-player');
                const audioInfo = document.getElementById('audio-info');

                if (audioFiles[index]) {{
                    audioPlayer.src = audioFiles[index];
                    audioInfo.textContent = 'Playing audio for Chunk ' + (index + 1);
                    audioPlayer.play().catch(e => {{
                        audioInfo.textContent = 'Click play to listen to audio for Chunk ' + (index + 1);
                    }});
                }} else {{
                    audioPlayer.src = '';
                    audioInfo.textContent = 'No audio available for this chunk';
                }}
            }}

            loadPanorama(0);

            document.getElementById('image-selector').addEventListener('change', function(e) {{
                const selectedIndex = parseInt(e.target.value);
                loadPanorama(selectedIndex);
            }});
        </script>
    </body>
    </html>
    """
    # Explicit UTF-8 matches the <meta charset="UTF-8"> declared in the page.
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(html_content)
    return output_path
552
 
553
+ # =========================
554
+ # Gradio streaming function
555
+ # =========================
556
+
557
# NOTE: We create these globals before defining process_and_display so the generator can reference them.
# Both lists are populated later inside the gr.Blocks UI loop: one entry per
# chunk slot (the UI builds 20 fixed groups of output components).
output_containers = []
group_components = []
560
+
561
def process_and_display(audio_input, generate_audio, chunk_duration):
    """Gradio streaming callback: yields UI updates for the whole pipeline.

    Yields twice: first a loading screen (spinner HTML, all chunk groups
    hidden, all outputs cleared), then the final state with per-chunk
    results and the downloadable 360° viewer HTML. The flat yield order
    MUST match the outputs wiring of process_btn.click:
    [loading_indicator] + group visibilities + 7 outputs per chunk slot +
    [viewer_html_output, js_output].
    """
    if chunk_duration is None or chunk_duration <= 0:
        chunk_duration = 10

    # Loading screen
    yield (
        [gr.HTML(f"""
    <div style="text-align: center; margin: 20px;">
        <p style="font-size: 18px; color: #4a4a4a;">Processing audio in {chunk_duration}-second chunks...</p>
        <div style="border: 4px solid #f3f3f3; border-top: 4px solid #3498db; border-radius: 50%; width: 30px; height: 30px; animation: spin 2s linear infinite; margin: 0 auto;"></div>
        <style>@keyframes spin {{ 0% {{ transform: rotate(0deg); }} 100% {{ transform: rotate(360deg); }} }}</style>
        <p style="font-size: 14px; color: #4a4a4a;">This may take several minutes depending on the audio length...</p>
    </div>
    """)]
        + [gr.update(visible=False)] * len(group_components)
        # 7 outputs per chunk: emotion, transcription, sentiment, image, image_360, music_player, music_file_download
        + [None] * (len(output_containers) * 7)
        + [None, ""]
    )

    results = get_predictions(audio_input, generate_audio, chunk_duration)

    outputs = []
    group_visibility = []
    all_360_images = []
    all_music_paths = []

    # Fill one group per processed chunk; extra results beyond the fixed
    # number of UI slots are dropped (visibility stays False for them).
    for i, result in enumerate(results):
        if i < len(output_containers):
            group_visibility.append(gr.update(visible=True))
            outputs.extend(
                [
                    result["emotion"],
                    result["transcription"],
                    result["sentiment"],
                    result["image"],
                    result["image_360"],
                    result["music"],  # gr.Audio
                    result["music"],  # gr.File download
                ]
            )
            if result["image_360"]:
                all_360_images.append(result["image_360"])
                all_music_paths.append(result["music"])
        else:
            group_visibility.append(gr.update(visible=False))
            outputs.extend([None] * 7)

    # Hide and clear any remaining unused chunk slots.
    for _ in range(len(results), len(output_containers)):
        group_visibility.append(gr.update(visible=False))
        outputs.extend([None] * 7)

    # Bundle all chunks into a single downloadable 360° viewer page.
    viewer_html_path = None
    if all_360_images:
        with tempfile.NamedTemporaryFile(suffix=".html", delete=False) as tmp_file:
            viewer_html_path = create_360_viewer_html(all_360_images, all_music_paths, tmp_file.name)

    yield [gr.HTML("")] + group_visibility + outputs + [viewer_html_path, ""]
619
 
 
620
def clear_all():
    """Reset every component wired in clear_btn.click (order must match)."""
    hidden_groups = [gr.update(visible=False)] * len(group_components)
    cleared_chunk_outputs = [None] * (len(output_containers) * 7)
    # Order: audio_input, chunk groups, per-chunk outputs, loading HTML,
    # chunk-duration reset, viewer download file, JS slot.
    outputs = (
        [None]
        + hidden_groups
        + cleared_chunk_outputs
        + [gr.HTML(""), 10, None, ""]
    )
    return outputs
630
 
631
+ # =========================
632
+ # UI styling
633
+ # =========================
 
 
634
 
 
635
  custom_css = """
636
  .download-section {
637
  background: rgba(255,255,255,255);
 
645
  overflow: hidden;
646
  }
647
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
648
  .download-button {
649
  background: rgba(155,155,155,255) !important;
650
  color: white !important;
 
654
  font-weight: bold !important;
655
  font-size: 16px !important;
656
  margin-top: 15px !important;
 
657
  cursor: pointer !important;
658
  display: inline-block !important;
659
  }
660
+ """
661
 
662
+ # =========================
663
+ # Gradio app
664
+ # =========================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
665
 
 
 
 
 
 
666
  with gr.Blocks(title="Entornos Virtuales Afectivos - Procesamiento por Segmentos", css=custom_css) as interface:
667
  gr.Markdown("# Bello")
668
  gr.Markdown(
669
+ """
670
  ***Bello*** explora las sutilezas afectivas de la voz humana a través de la figura del **Teniente Bello**,
671
  el piloto chileno que desapareció misteriosamente en 1914 durante un vuelo de entrenamiento sobre la costa del Pacífico.
672
 
 
689
  • Video Tutorial: [Cómo usar este espacio](https://youtu.be/eVD1lzwVhi8)
690
 
691
  • Para más detalles del proyecto, visita: [www.emotional-machines.com](https://www.emotional-machines.com)
692
+ """
693
  )
694
 
 
695
  with gr.Row():
696
  with gr.Column(scale=2):
697
  audio_input = gr.Audio(label="Audio de Entrada", type="filepath", sources=["microphone", "upload"])
698
+
 
 
 
 
 
 
 
 
 
 
699
  with gr.Column(scale=1):
700
  chunk_duration_input = gr.Number(
701
  label="Duración de Segmento (segundos)",
 
703
  minimum=1,
704
  maximum=60,
705
  step=1,
706
+ info="Duración de cada segmento de audio a procesar (1-60 segundos)",
707
  )
708
  generate_audio_checkbox = gr.Checkbox(
709
+ label="Generar Audio (puede tardar más)",
710
  value=False,
711
+ info="Desmarca para omitir la generación de música y acelerar el procesamiento",
712
  )
713
  with gr.Row():
714
  process_btn = gr.Button("Generar", variant="primary")
715
  clear_btn = gr.Button("Borrar Todo", variant="secondary")
716
+
717
+ loading_indicator = gr.HTML(
718
+ """
719
+ <div id="loading" style="display: none; text-align: center; margin: 20px;">
720
+ <p style="font-size: 18px; color: #4a4a4a;">Procesando segmentos de audio...</p>
721
+ <div style="border: 4px solid #f3f3f3; border-top: 4px solid #3498db; border-radius: 50%; width: 30px; height: 30px; animation: spin 2s linear infinite; margin: 0 auto;"></div>
722
+ <style>@keyframes spin { 0% { transform: rotate(0deg); } 100% { transform: rotate(360deg); } }</style>
723
+ </div>
724
+ """
725
+ )
726
+
727
+ # Build chunk outputs (20 slots)
728
  for i in range(20):
729
  with gr.Group(visible=False) as chunk_group:
730
  gr.Markdown(f"### Resultados del Segmento {i+1}")
731
+
732
  with gr.Row():
733
  emotion_output = gr.Label(label="Predicción de Emoción Acústica")
734
  transcription_output = gr.Label(label="Texto Transcrito")
735
  sentiment_output = gr.Label(label="Análisis Sentimental")
736
+
737
  with gr.Row():
738
  image_output = gr.Image(label="Imagen Equirectangular Generada")
739
  image_360_output = gr.File(label="Descargar Imagen 360", type="filepath")
740
+
741
  with gr.Row():
742
  audio_output = gr.Audio(label="Música Generada")
743
+ audio_file_output = gr.File(label="Descargar Música", type="filepath") # ✅ DOWNLOAD
744
+
745
  gr.HTML("<hr style='margin: 20px 0; border: 1px solid #ccc;'>")
746
+
747
  group_components.append(chunk_group)
748
+ output_containers.append(
749
+ {
750
+ "emotion": emotion_output,
751
+ "transcription": transcription_output,
752
+ "sentiment": sentiment_output,
753
+ "image": image_output,
754
+ "image_360": image_360_output,
755
+ "music": audio_output,
756
+ "music_file": audio_file_output,
757
+ }
758
+ )
759
+
760
  with gr.Group(visible=True, elem_classes="download-section") as download_group:
761
  viewer_html_output = gr.File(
762
+ label="Una vez finalizado el procesamiento, descarga tu EVA aquí 🚀",
763
  type="filepath",
764
  interactive=False,
765
+ elem_classes="download-button",
766
  )
767
+
768
  js_output = gr.HTML(visible=False)
769
+
770
+ # IMPORTANT: outputs order must match yields/returns exactly
771
  process_btn.click(
772
  fn=process_and_display,
773
  inputs=[audio_input, generate_audio_checkbox, chunk_duration_input],
774
+ outputs=[loading_indicator]
775
+ + group_components
776
+ + [
777
+ comp
778
+ for container in output_containers
779
+ for comp in [
780
+ container["emotion"],
781
+ container["transcription"],
782
+ container["sentiment"],
783
+ container["image"],
784
+ container["image_360"],
785
+ container["music"],
786
+ container["music_file"], # ✅ ADD
787
+ ]
788
+ ]
789
+ + [viewer_html_output, js_output],
790
  )
791
+
792
  clear_btn.click(
793
  fn=clear_all,
794
  inputs=[],
795
+ outputs=[audio_input]
796
+ + group_components
797
+ + [
798
+ comp
799
+ for container in output_containers
800
+ for comp in [
801
+ container["emotion"],
802
+ container["transcription"],
803
+ container["sentiment"],
804
+ container["image"],
805
+ container["image_360"],
806
+ container["music"],
807
+ container["music_file"], # ✅ ADD
808
+ ]
809
+ ]
810
+ + [loading_indicator, chunk_duration_input, viewer_html_output, js_output],
811
  )
812
+
813
+ interface.launch(share=True)