jfforero committed
Commit 1b5ce3a · verified · 1 Parent(s): c936d9b

Update app.py

Files changed (1)
  1. app.py +72 -52
app.py CHANGED
@@ -55,9 +55,9 @@ def load_musicgen_model():
 
 processor, music_model, device = load_musicgen_model()
 
-# Function to chunk audio into 15-second segments
-def chunk_audio(audio_path, chunk_duration=15):
-    """Split audio into 15-second chunks and return list of chunk file paths"""
+# Function to chunk audio into 5-second segments
+def chunk_audio(audio_path, chunk_duration=5):
+    """Split audio into 5-second chunks and return list of chunk file paths"""
     try:
         # Load audio file
         audio = AudioSegment.from_file(audio_path)
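The hunk only shows the head of `chunk_audio`; for reference, a minimal sketch of what a pydub-based body could look like (the slicing loop and the temp-file naming are assumptions, not the committed code):

```python
from pydub import AudioSegment

def chunk_audio(audio_path, chunk_duration=5):
    """Split audio into chunk_duration-second chunks; return (paths, count)."""
    audio = AudioSegment.from_file(audio_path)
    chunk_ms = chunk_duration * 1000                # pydub indexes in milliseconds
    chunk_paths = []
    for idx, start in enumerate(range(0, len(audio), chunk_ms)):
        chunk = audio[start:start + chunk_ms]       # final chunk may be shorter
        path = f"chunk_{idx}.wav"                   # hypothetical naming scheme
        chunk.export(path, format="wav")
        chunk_paths.append(path)
    return chunk_paths, len(chunk_paths)
```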
@@ -275,14 +275,12 @@ def process_chunk(chunk_path, chunk_idx, total_chunks):
 
     # Generate music using ACOUSTIC EMOTION prediction with specific prompt
     music_path = generate_music(transcribed_text, emotion_prediction, chunk_idx, total_chunks)
-
-    #'sentiment': f"Sentiment: {sentiment} (Polarity: {polarity:.2f})",
-
+
     return {
         'chunk_index': chunk_idx + 1,
         'emotion': emotion_prediction,
         'transcription': transcribed_text,
-        'sentiment': f"{sentiment}",
+        'sentiment': f"Sentiment: {sentiment} (Polarity: {polarity:.2f})",
         'image': image,
         'music': music_path
     }
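The restored `'sentiment'` field interpolates `sentiment` and `polarity`, but the hunk doesn't show how they are computed. A TextBlob-style computation would produce values in that shape (the library choice and helper name here are assumptions):

```python
from textblob import TextBlob  # assumed; the diff does not name the sentiment library

def analyze_sentiment(text):
    """Return a coarse label plus the raw polarity in [-1.0, 1.0]."""
    polarity = TextBlob(text).sentiment.polarity
    sentiment = "Positive" if polarity > 0 else "Negative" if polarity < 0 else "Neutral"
    return sentiment, polarity

sentiment, polarity = analyze_sentiment("what a calm, beautiful morning")
print(f"Sentiment: {sentiment} (Polarity: {polarity:.2f})")  # same format as process_chunk
```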
@@ -300,8 +298,8 @@ def process_chunk(chunk_path, chunk_idx, total_chunks):
 
 # Function to get predictions for all chunks
 def get_predictions(audio_input):
-    # Chunk the audio into 15-second segments
-    chunk_files, total_chunks = chunk_audio(audio_input, chunk_duration=15)
+    # Chunk the audio into 5-second segments
+    chunk_files, total_chunks = chunk_audio(audio_input, chunk_duration=5)
 
     results = []
 
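The hunk elides the loop that fills `results`; given the `process_chunk(chunk_path, chunk_idx, total_chunks)` signature above, it plausibly continues like this (a sketch, not the committed body):

```python
    # Hypothetical continuation of get_predictions; the hunk cuts off
    # after `results = []`.
    for idx, chunk_path in enumerate(chunk_files):
        result = process_chunk(chunk_path, idx, total_chunks)
        if result is not None:        # skip chunks that failed to process
            results.append(result)
    return results
```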
@@ -321,8 +319,6 @@ def get_predictions(audio_input):
 
     return results
 
-
-
 # Create the Gradio interface with proper output handling
 with gr.Blocks(title="Affective Virtual Environments - Chunked Processing") as interface:
     gr.Markdown("# Affective Virtual Environments")
@@ -333,67 +329,91 @@ with gr.Blocks(title="Affective Virtual Environments - Chunked Processing") as i
     process_btn = gr.Button("Process Audio", variant="primary")
 
     # Add a loading indicator
-    loading_indicator = gr.HTML("")
+    loading_indicator = gr.HTML("""
+        <div id="loading" style="display: none; text-align: center; margin: 20px;">
+            <p style="font-size: 18px; color: #4a4a4a;">Processing audio chunks...</p>
+            <div style="border: 4px solid #f3f3f3; border-top: 4px solid #3498db; border-radius: 50%; width: 30px; height: 30px; animation: spin 2s linear infinite; margin: 0 auto;"></div>
+            <style>@keyframes spin {0% { transform: rotate(0deg); } 100% { transform: rotate(360deg); }}</style>
+        </div>
+    """)
+
+    # Create output components for each chunk type
+    output_containers = []
+    group_components = []  # Store group components separately
 
-    # Create a container for results
-    results_container = gr.Column()
+    # We'll create up to 10 chunk slots (adjust as needed)
+    for i in range(10):
+        with gr.Group(visible=False) as chunk_group:
+            gr.Markdown(f"### Chunk {i+1} Results")
+            with gr.Row():
+                emotion_output = gr.Label(label="Acoustic Emotion Prediction")
+                transcription_output = gr.Label(label="Transcribed Text")
+                sentiment_output = gr.Label(label="Sentiment Analysis")
+            with gr.Row():
+                image_output = gr.Image(label="Generated Equirectangular Image")
+                audio_output = gr.Audio(label="Generated Music")
+            gr.HTML("<hr style='margin: 20px 0; border: 1px solid #ccc;'>")
+
+        group_components.append(chunk_group)  # Store the group component
+        output_containers.append({
+            'emotion': emotion_output,
+            'transcription': transcription_output,
+            'sentiment': sentiment_output,
+            'image': image_output,
+            'music': audio_output
+        })
 
     def process_and_display(audio_input):
         # Show loading indicator
-        yield gr.HTML("""
+        yield [gr.HTML("""
             <div style="text-align: center; margin: 20px;">
                 <p style="font-size: 18px; color: #4a4a4a;">Processing audio chunks...</p>
                 <div style="border: 4px solid #f3f3f3; border-top: 4px solid #3498db; border-radius: 50%; width: 30px; height: 30px; animation: spin 2s linear infinite; margin: 0 auto;"></div>
                 <style>@keyframes spin {0% { transform: rotate(0deg); } 100% { transform: rotate(360deg); }}</style>
             </div>
-        """) + gr.Column(visible=False)
+        """)] + [gr.Group(visible=False)] * len(group_components) + [None] * (len(output_containers) * 5)
 
         results = get_predictions(audio_input)
 
-        # Create HTML content for all results
-        results_html = ""
+        # Initialize outputs list
+        outputs = []
+        group_visibility = []
+
+        # Process each result
         for i, result in enumerate(results):
-            results_html += f"""
-            <div style="margin-bottom: 30px; padding: 20px; border: 1px solid #ddd; border-radius: 10px;">
-                <h3>Chunk {i+1} Results</h3>
-                <div style="display: flex; justify-content: space-between; margin-bottom: 20px;">
-                    <div style="flex: 1; margin-right: 20px;">
-                        <p><strong>Acoustic Emotion Prediction:</strong> {result['emotion']}</p>
-                        <p><strong>Transcribed Text:</strong> {result['transcription']}</p>
-                        <p><strong>Sentiment Analysis:</strong> {result['sentiment']}</p>
-                    </div>
-                    <div style="flex: 1;">
-                        <img src="data:image/png;base64,{image_to_base64(result['image'])}" style="width: 100%; max-width: 500px; height: auto;">
-                    </div>
-                </div>
-                <div>
-                    <p><strong>Generated Music:</strong></p>
-                    <audio controls style="width: 100%;">
-                        <source src="{result['music']}" type="audio/wav">
-                        Your browser does not support the audio element.
-                    </audio>
-                </div>
-            </div>
-            <hr style="margin: 20px 0; border: 1px solid #ccc;">
-            """
+            if i < len(output_containers):
+                group_visibility.append(gr.Group(visible=True))
+                outputs.extend([
+                    result['emotion'],
+                    result['transcription'],
+                    result['sentiment'],
+                    result['image'],
+                    result['music']
+                ])
+            else:
+                # If we have more results than containers, just extend with None
+                group_visibility.append(gr.Group(visible=False))
+                outputs.extend([None] * 5)
 
-        # Hide loading indicator and show results
-        yield gr.HTML("") + gr.Column(gr.HTML(results_html), visible=True)
-
-    # Helper function to convert image to base64
-    def image_to_base64(image):
-        import base64
-        from io import BytesIO
+        # Hide remaining containers
+        for i in range(len(results), len(output_containers)):
+            group_visibility.append(gr.Group(visible=False))
+            outputs.extend([None] * 5)
 
-        buffered = BytesIO()
-        image.save(buffered, format="PNG")
-        return base64.b64encode(buffered.getvalue()).decode()
+        # Hide loading indicator and show results
+        yield [gr.HTML("")] + group_visibility + outputs
 
     # Set up the button click
     process_btn.click(
         fn=process_and_display,
         inputs=audio_input,
-        outputs=[loading_indicator, results_container]
+        outputs=[loading_indicator] + group_components + [comp for container in output_containers for comp in [
+            container['emotion'],
+            container['transcription'],
+            container['sentiment'],
+            container['image'],
+            container['music']
+        ]]
     )
 
     interface.launch()
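The core change in this hunk swaps the dynamically built HTML blob for a fixed number of pre-created, hidden `gr.Group` slots whose visibility and values the generator updates, yielding once to show the spinner and once with the results. A self-contained sketch of that pattern, using the `gr.update(visible=...)` idiom (slot count and labels are illustrative; the commit instead yields fresh `gr.Group(...)`/`gr.HTML(...)` instances, which newer Gradio versions also accept as updates):

```python
import gradio as gr

MAX_SLOTS = 3  # the commit uses 10

with gr.Blocks() as demo:
    text_in = gr.Textbox(label="Comma-separated items")
    btn = gr.Button("Show")

    groups, labels = [], []
    for i in range(MAX_SLOTS):
        with gr.Group(visible=False) as g:       # pre-create hidden slots
            lbl = gr.Label(label=f"Slot {i+1}")
        groups.append(g)
        labels.append(lbl)

    def show(text):
        items = [s.strip() for s in text.split(",") if s.strip()]
        updates = []
        for i in range(MAX_SLOTS):
            visible = i < len(items)
            updates.append(gr.update(visible=visible))     # toggle the group
            updates.append(items[i] if visible else None)  # fill or clear the label
        return updates

    # Output order must match the order updates are built in: (group, label) per slot.
    btn.click(fn=show, inputs=text_in,
              outputs=[c for pair in zip(groups, labels) for c in pair])

demo.launch()
```

The flattened outputs list in `process_btn.click(...)` follows the same rule: the values yielded by `process_and_display` line up positionally with `[loading_indicator] + group_components + per-chunk components`, which is why the handler pads unused slots with `None`.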