jfforero committed
Commit 1b5ce3a · verified · 1 Parent(s): c936d9b

Update app.py

Files changed (1)
  1. app.py +72 -52
app.py CHANGED
@@ -55,9 +55,9 @@ def load_musicgen_model():
 
 processor, music_model, device = load_musicgen_model()
 
-# Function to chunk audio into 15-second segments
-def chunk_audio(audio_path, chunk_duration=15):
-    """Split audio into 15-second chunks and return list of chunk file paths"""
+# Function to chunk audio into 5-second segments
+def chunk_audio(audio_path, chunk_duration=5):
+    """Split audio into 5-second chunks and return list of chunk file paths"""
     try:
         # Load audio file
         audio = AudioSegment.from_file(audio_path)
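The hunk only shows the head of `chunk_audio`; for reference, a minimal sketch of what a pydub-based body could look like (the slicing loop and the temp-file naming are assumptions, not the committed code):

```python
from pydub import AudioSegment

def chunk_audio(audio_path, chunk_duration=5):
    """Split audio into chunk_duration-second chunks; return (paths, count)."""
    audio = AudioSegment.from_file(audio_path)
    chunk_ms = chunk_duration * 1000                # pydub indexes in milliseconds
    chunk_paths = []
    for idx, start in enumerate(range(0, len(audio), chunk_ms)):
        chunk = audio[start:start + chunk_ms]       # final chunk may be shorter
        path = f"chunk_{idx}.wav"                   # hypothetical naming scheme
        chunk.export(path, format="wav")
        chunk_paths.append(path)
    return chunk_paths, len(chunk_paths)
```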
@@ -275,14 +275,12 @@ def process_chunk(chunk_path, chunk_idx, total_chunks):
 
     # Generate music using ACOUSTIC EMOTION prediction with specific prompt
     music_path = generate_music(transcribed_text, emotion_prediction, chunk_idx, total_chunks)
-
-    #'sentiment': f"Sentiment: {sentiment} (Polarity: {polarity:.2f})",
-
+
     return {
         'chunk_index': chunk_idx + 1,
         'emotion': emotion_prediction,
         'transcription': transcribed_text,
-        'sentiment': f"{sentiment}",
+        'sentiment': f"Sentiment: {sentiment} (Polarity: {polarity:.2f})",
         'image': image,
         'music': music_path
     }
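The restored `'sentiment'` field interpolates `sentiment` and `polarity`, but the hunk doesn't show how they are computed. A TextBlob-style computation would produce values in that shape (the library choice and helper name here are assumptions):

```python
from textblob import TextBlob  # assumed; the diff does not name the sentiment library

def analyze_sentiment(text):
    """Return a coarse label plus the raw polarity in [-1.0, 1.0]."""
    polarity = TextBlob(text).sentiment.polarity
    sentiment = "Positive" if polarity > 0 else "Negative" if polarity < 0 else "Neutral"
    return sentiment, polarity

sentiment, polarity = analyze_sentiment("what a calm, beautiful morning")
print(f"Sentiment: {sentiment} (Polarity: {polarity:.2f})")  # same format as process_chunk
```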
@@ -300,8 +298,8 @@ def process_chunk(chunk_path, chunk_idx, total_chunks):
 
 # Function to get predictions for all chunks
 def get_predictions(audio_input):
-    # Chunk the audio into 15-second segments
-    chunk_files, total_chunks = chunk_audio(audio_input, chunk_duration=15)
+    # Chunk the audio into 5-second segments
+    chunk_files, total_chunks = chunk_audio(audio_input, chunk_duration=5)
 
     results = []
 
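The hunk elides the loop that fills `results`; given the `process_chunk(chunk_path, chunk_idx, total_chunks)` signature above, it plausibly continues like this (a sketch, not the committed body):

```python
    # Hypothetical continuation of get_predictions; the hunk cuts off
    # after `results = []`.
    for idx, chunk_path in enumerate(chunk_files):
        result = process_chunk(chunk_path, idx, total_chunks)
        if result is not None:        # skip chunks that failed to process
            results.append(result)
    return results
```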
@@ -321,8 +319,6 @@ def get_predictions(audio_input):
 
     return results
 
-
-
 # Create the Gradio interface with proper output handling
 with gr.Blocks(title="Affective Virtual Environments - Chunked Processing") as interface:
     gr.Markdown("# Affective Virtual Environments")
@@ -333,67 +329,91 @@ with gr.Blocks(title="Affective Virtual Environments - Chunked Processing") as i
     process_btn = gr.Button("Process Audio", variant="primary")
 
     # Add a loading indicator
-    loading_indicator = gr.HTML("")
+    loading_indicator = gr.HTML("""
+        <div id="loading" style="display: none; text-align: center; margin: 20px;">
+            <p style="font-size: 18px; color: #4a4a4a;">Processing audio chunks...</p>
+            <div style="border: 4px solid #f3f3f3; border-top: 4px solid #3498db; border-radius: 50%; width: 30px; height: 30px; animation: spin 2s linear infinite; margin: 0 auto;"></div>
+            <style>@keyframes spin {0% { transform: rotate(0deg); } 100% { transform: rotate(360deg); }}</style>
+        </div>
+    """)
+
+    # Create output components for each chunk type
+    output_containers = []
+    group_components = []  # Store group components separately
 
-    # Create a container for results
-    results_container = gr.Column()
+    # We'll create up to 10 chunk slots (adjust as needed)
+    for i in range(10):
+        with gr.Group(visible=False) as chunk_group:
+            gr.Markdown(f"### Chunk {i+1} Results")
+            with gr.Row():
+                emotion_output = gr.Label(label="Acoustic Emotion Prediction")
+                transcription_output = gr.Label(label="Transcribed Text")
+                sentiment_output = gr.Label(label="Sentiment Analysis")
+            with gr.Row():
+                image_output = gr.Image(label="Generated Equirectangular Image")
+                audio_output = gr.Audio(label="Generated Music")
+            gr.HTML("<hr style='margin: 20px 0; border: 1px solid #ccc;'>")
+
+        group_components.append(chunk_group)  # Store the group component
+        output_containers.append({
+            'emotion': emotion_output,
+            'transcription': transcription_output,
+            'sentiment': sentiment_output,
+            'image': image_output,
+            'music': audio_output
+        })
 
     def process_and_display(audio_input):
         # Show loading indicator
-        yield gr.HTML("""
+        yield [gr.HTML("""
             <div style="text-align: center; margin: 20px;">
                 <p style="font-size: 18px; color: #4a4a4a;">Processing audio chunks...</p>
                 <div style="border: 4px solid #f3f3f3; border-top: 4px solid #3498db; border-radius: 50%; width: 30px; height: 30px; animation: spin 2s linear infinite; margin: 0 auto;"></div>
                 <style>@keyframes spin {0% { transform: rotate(0deg); } 100% { transform: rotate(360deg); }}</style>
             </div>
-        """) + gr.Column(visible=False)
+        """)] + [gr.Group(visible=False)] * len(group_components) + [None] * (len(output_containers) * 5)
 
         results = get_predictions(audio_input)
 
-        # Create HTML content for all results
-        results_html = ""
+        # Initialize outputs list
+        outputs = []
+        group_visibility = []
+
+        # Process each result
         for i, result in enumerate(results):
-            results_html += f"""
-            <div style="margin-bottom: 30px; padding: 20px; border: 1px solid #ddd; border-radius: 10px;">
-                <h3>Chunk {i+1} Results</h3>
-                <div style="display: flex; justify-content: space-between; margin-bottom: 20px;">
-                    <div style="flex: 1; margin-right: 20px;">
-                        <p><strong>Acoustic Emotion Prediction:</strong> {result['emotion']}</p>
-                        <p><strong>Transcribed Text:</strong> {result['transcription']}</p>
-                        <p><strong>Sentiment Analysis:</strong> {result['sentiment']}</p>
-                    </div>
-                    <div style="flex: 1;">
-                        <img src="data:image/png;base64,{image_to_base64(result['image'])}" style="width: 100%; max-width: 500px; height: auto;">
-                    </div>
-                </div>
-                <div>
-                    <p><strong>Generated Music:</strong></p>
-                    <audio controls style="width: 100%;">
-                        <source src="{result['music']}" type="audio/wav">
-                        Your browser does not support the audio element.
-                    </audio>
-                </div>
-            </div>
-            <hr style="margin: 20px 0; border: 1px solid #ccc;">
-            """
+            if i < len(output_containers):
+                group_visibility.append(gr.Group(visible=True))
+                outputs.extend([
+                    result['emotion'],
+                    result['transcription'],
+                    result['sentiment'],
+                    result['image'],
+                    result['music']
+                ])
+            else:
+                # If we have more results than containers, just extend with None
+                group_visibility.append(gr.Group(visible=False))
+                outputs.extend([None] * 5)
 
-        # Hide loading indicator and show results
-        yield gr.HTML("") + gr.Column(gr.HTML(results_html), visible=True)
-
-    # Helper function to convert image to base64
-    def image_to_base64(image):
-        import base64
-        from io import BytesIO
+        # Hide remaining containers
+        for i in range(len(results), len(output_containers)):
+            group_visibility.append(gr.Group(visible=False))
+            outputs.extend([None] * 5)
 
-        buffered = BytesIO()
-        image.save(buffered, format="PNG")
-        return base64.b64encode(buffered.getvalue()).decode()
+        # Hide loading indicator and show results
+        yield [gr.HTML("")] + group_visibility + outputs
 
     # Set up the button click
     process_btn.click(
         fn=process_and_display,
         inputs=audio_input,
-        outputs=[loading_indicator, results_container]
+        outputs=[loading_indicator] + group_components + [comp for container in output_containers for comp in [
+            container['emotion'],
+            container['transcription'],
+            container['sentiment'],
+            container['image'],
+            container['music']
+        ]]
     )
 
     interface.launch()
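The core change in this hunk swaps the dynamically built HTML blob for a fixed number of pre-created, hidden `gr.Group` slots whose visibility and values the generator updates, yielding once to show the spinner and once with the results. A self-contained sketch of that pattern, using the `gr.update(visible=...)` idiom (slot count and labels are illustrative; the commit instead yields fresh `gr.Group(...)`/`gr.HTML(...)` instances, which newer Gradio versions also accept as updates):

```python
import gradio as gr

MAX_SLOTS = 3  # the commit uses 10

with gr.Blocks() as demo:
    text_in = gr.Textbox(label="Comma-separated items")
    btn = gr.Button("Show")

    groups, labels = [], []
    for i in range(MAX_SLOTS):
        with gr.Group(visible=False) as g:       # pre-create hidden slots
            lbl = gr.Label(label=f"Slot {i+1}")
        groups.append(g)
        labels.append(lbl)

    def show(text):
        items = [s.strip() for s in text.split(",") if s.strip()]
        updates = []
        for i in range(MAX_SLOTS):
            visible = i < len(items)
            updates.append(gr.update(visible=visible))     # toggle the group
            updates.append(items[i] if visible else None)  # fill or clear the label
        return updates

    # Output order must match the order updates are built in: (group, label) per slot.
    btn.click(fn=show, inputs=text_in,
              outputs=[c for pair in zip(groups, labels) for c in pair])

demo.launch()
```

The flattened outputs list in `process_btn.click(...)` follows the same rule: the values yielded by `process_and_display` line up positionally with `[loading_indicator] + group_components + per-chunk components`, which is why the handler pads unused slots with `None`.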