jfforero committed
Commit fd5a39d · verified · 1 parent: 145a57f

Update app.py

Files changed (1):
  1. app.py (+242 −11)
app.py CHANGED
 
@@ -287,6 +287,9 @@ def process_chunk(chunk_path, chunk_idx, total_chunks, generate_audio=True):
         # Generate image using SENTIMENT analysis with specific prompt
         image = generate_image(sentiment, transcribed_text, chunk_idx, total_chunks)
 
+        # Add 360 metadata to the image
+        image_with_360_path = add_360_metadata(image)
+
         # Generate music only if audio generation is enabled
         music_path = None
         if generate_audio:
 
@@ -297,7 +300,8 @@ def process_chunk(chunk_path, chunk_idx, total_chunks, generate_audio=True):
             'emotion': emotion_prediction,
             'transcription': transcribed_text,
             'sentiment': sentiment,
-            'image': image,
+            'image': image,  # Original image for display in Gradio
+            'image_360': image_with_360_path,  # Image with 360 metadata
             'music': music_path
         }
     except Exception as e:
 
@@ -309,6 +313,7 @@ def process_chunk(chunk_path, chunk_idx, total_chunks, generate_audio=True):
             'transcription': "Transcription failed",
             'sentiment': "Sentiment: error",
             'image': Image.new('RGB', (1024, 512), color='white'),
+            'image_360': None,
             'music': None
         }
 
 
@@ -335,8 +340,192 @@ def get_predictions(audio_input, generate_audio=True, chunk_duration=10):
 
     return results
 
+#
+def create_xmp_block(width, height):
+    """Create XMP metadata block following ExifTool's exact format."""
+    xmp = (
+        f'<?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?>\n'
+        f'<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="ExifTool">\n'
+        f'<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">\n'
+        f'<rdf:Description rdf:about=""\n'
+        f'xmlns:GPano="http://ns.google.com/photos/1.0/panorama/"\n'
+        f'GPano:ProjectionType="equirectangular"\n'
+        f'GPano:UsePanoramaViewer="True"\n'
+        f'GPano:FullPanoWidthPixels="{width}"\n'
+        f'GPano:FullPanoHeightPixels="{height}"\n'
+        f'GPano:CroppedAreaImageWidthPixels="{width}"\n'
+        f'GPano:CroppedAreaImageHeightPixels="{height}"\n'
+        f'GPano:CroppedAreaLeftPixels="0"\n'
+        f'GPano:CroppedAreaTopPixels="0"/>\n'
+        f'</rdf:RDF>\n'
+        f'</x:xmpmeta>\n'
+        f'<?xpacket end="w"?>'
+    )
+    return xmp
+
+def write_xmp_to_jpg(input_path, output_path, width, height):
+    """Write XMP metadata to JPEG file following ExifTool's method."""
+    # Read the original JPEG
+    with open(input_path, 'rb') as f:
+        data = f.read()
+
+    # Find the start of image marker
+    if data[0:2] != b'\xFF\xD8':
+        raise ValueError("Not a valid JPEG file")
+
+    # Create XMP data
+    xmp_data = create_xmp_block(width, height)
+
+    # Create APP1 segment for XMP
+    app1_marker = b'\xFF\xE1'
+    xmp_header = b'http://ns.adobe.com/xap/1.0/\x00'
+    xmp_bytes = xmp_data.encode('utf-8')
+    length = len(xmp_header) + len(xmp_bytes) + 2  # +2 for length bytes
+    length_bytes = struct.pack('>H', length)
+
+    # Construct new file content
+    output = bytearray()
+    output.extend(data[0:2])  # SOI marker
+    output.extend(app1_marker)
+    output.extend(length_bytes)
+    output.extend(xmp_header)
+    output.extend(xmp_bytes)
+    output.extend(data[2:])  # Rest of the original file
+
+    # Write the new file
+    with open(output_path, 'wb') as f:
+        f.write(output)
+
+def add_360_metadata(img):
+    """Add 360 photo metadata to a PIL Image and return the path to the processed image."""
+    try:
+        # Verify the image
+        if img.width != 2 * img.height:
+            # Resize to 2:1 aspect ratio if needed
+            new_width = 2 * img.height
+            img = img.resize((new_width, img.height), Image.Resampling.LANCZOS)
+
+        # Create a temporary file
+        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_file:
+            # First save as high-quality JPEG
+            img.save(tmp_file.name, "JPEG", quality=95)
+
+            # Then inject XMP metadata directly into JPEG file
+            write_xmp_to_jpg(tmp_file.name, tmp_file.name, img.width, img.height)
+
+            return tmp_file.name
+
+    except Exception as e:
+        print(f"Error adding 360 metadata: {str(e)}")
+        # Fallback: return the original image path
+        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_file:
+            img.save(tmp_file.name, "JPEG", quality=95)
+            return tmp_file.name
+
 
 
+
+
+def create_360_viewer_html(image_paths, output_path):
+    """Create an HTML file with a 360 viewer for the given images."""
+    # Create a list of image data URIs
+    image_data_list = []
+    for img_path in image_paths:
+        with open(img_path, "rb") as f:
+            img_data = base64.b64encode(f.read()).decode("utf-8")
+        image_data_list.append(f"data:image/jpeg;base64,{img_data}")
+
+    # Create the HTML content
+    html_content = f"""
+    <!DOCTYPE html>
+    <html lang="en">
+    <head>
+        <meta charset="UTF-8">
+        <meta name="viewport" content="width=device-width, initial-scale=1.0">
+        <title>360 Panorama Viewer</title>
+        <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/pannellum@2.5.6/build/pannellum.css"/>
+        <style>
+            body {{
+                margin: 0;
+                overflow: hidden;
+            }}
+            #panorama {{
+                width: 100vw;
+                height: 100vh;
+            }}
+            .pnlm-hotspot.pnlm-info-hotspot {{
+                background-color: rgba(0, 150, 255, 0.8);
+                border-radius: 50%;
+                width: 30px;
+                height: 30px;
+            }}
+            .pnlm-hotspot.pnlm-info-hotspot .pnlm-sprite {{
+                filter: brightness(0) invert(1);
+            }}
+            .pnlm-tooltip {{
+                background-color: rgba(0, 0, 0, 0.7);
+                color: white;
+                border-radius: 3px;
+                padding: 5px 10px;
+            }}
+            #image-selector {{
+                position: absolute;
+                top: 10px;
+                left: 10px;
+                z-index: 1000;
+                background: rgba(0, 0, 0, 0.7);
+                color: white;
+                padding: 5px 10px;
+                border-radius: 3px;
+            }}
+        </style>
+    </head>
+    <body>
+        <select id="image-selector">
+            {"".join([f'<option value="{i}">Chunk {i+1}</option>' for i in range(len(image_data_list))])}
+        </select>
+        <div id="panorama"></div>
+
+        <script type="text/javascript" src="https://cdn.jsdelivr.net/npm/pannellum@2.5.6/build/pannellum.js"></script>
+        <script>
+            const images = {json.dumps(image_data_list)};
+            let currentViewer = null;
+
+            function loadPanorama(index) {{
+                if (currentViewer) {{
+                    currentViewer.destroy();
+                }}
+
+                currentViewer = pannellum.viewer('panorama', {{
+                    "type": "equirectangular",
+                    "panorama": images[index],
+                    "autoLoad": true,
+                    "autoRotate": -2,
+                    "showZoomCtrl": true,
+                    "showFullscreenCtrl": true,
+                    "hfov": 100
+                }});
+            }}
+
+            // Load the first image initially
+            loadPanorama(0);
+
+            // Handle image selection changes
+            document.getElementById('image-selector').addEventListener('change', function(e) {{
+                loadPanorama(parseInt(e.target.value));
+            }});
+        </script>
+    </body>
+    </html>
+    """
+
+    # Write the HTML to a file
+    with open(output_path, 'w') as f:
+        f.write(html_content)
+
+    return output_path
+
 
 # Replace the create_fade_transition function with this updated version
 def create_fade_transition(images, fade_duration=1.0, fps=24):
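Note on the new write_xmp_to_jpg: it splices an APP1 segment directly after the SOI marker — 0xFFE1, a big-endian length that counts the two length bytes plus the payload, the Adobe XMP namespace header, then the XMP packet — ahead of the JFIF/APP0 segment Pillow writes, an ordering most readers accept. A minimal sketch for sanity-checking a file produced this way (check_xmp_segment is an illustrative helper, not part of the commit):

import struct

XMP_NS = b'http://ns.adobe.com/xap/1.0/\x00'

def check_xmp_segment(path):
    """Sketch: verify the first segment after SOI is a GPano XMP APP1 block."""
    with open(path, 'rb') as f:
        data = f.read()
    assert data[0:2] == b'\xFF\xD8', "missing SOI marker"
    assert data[2:4] == b'\xFF\xE1', "first segment is not APP1"
    # The length field is big-endian and counts the two length bytes themselves
    (length,) = struct.unpack('>H', data[4:6])
    payload = data[6:4 + length]
    assert payload.startswith(XMP_NS), "not an XMP segment"
    xmp = payload[len(XMP_NS):].decode('utf-8')
    return 'GPano:ProjectionType="equirectangular"' in xmp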
 
@@ -419,7 +608,7 @@ def process_and_display(audio_input, generate_audio, chunk_duration):
         <style>@keyframes spin {{ 0% {{ transform: rotate(0deg); }} 100% {{ transform: rotate(360deg); }} }}</style>
         <p style="font-size: 14px; color: #4a4a4a;">This may take several minutes depending on the audio length...</p>
     </div>
-    """)] + [gr.Group(visible=False)] * len(group_components) + [None] * (len(output_containers) * 5) + [None, None]
+    """)] + [gr.Group(visible=False)] * len(group_components) + [None] * (len(output_containers) * 6) + [None, None, None]
 
     results = get_predictions(audio_input, generate_audio, chunk_duration)
 
 
@@ -427,6 +616,7 @@ def process_and_display(audio_input, generate_audio, chunk_duration):
     outputs = []
     group_visibility = []
     all_images = []  # Collect all generated images for the fade animation
+    all_360_images = []  # Collect all 360 images for the viewer
 
     # Process each result
     for i, result in enumerate(results):
 
@@ -437,19 +627,22 @@ def process_and_display(audio_input, generate_audio, chunk_duration):
                 result['transcription'],
                 result['sentiment'],
                 result['image'],
+                result['image_360'],
                 result['music']
             ])
-            # Collect the image for the fade animation
+            # Collect the images
             all_images.append(result['image'])
+            if result['image_360']:
+                all_360_images.append(result['image_360'])
         else:
             # If we have more results than containers, just extend with None
             group_visibility.append(gr.Group(visible=False))
-            outputs.extend([None] * 5)
+            outputs.extend([None] * 6)
 
     # Hide remaining containers
     for i in range(len(results), len(output_containers)):
         group_visibility.append(gr.Group(visible=False))
-        outputs.extend([None] * 5)
+        outputs.extend([None] * 6)
 
     # Create fade animation if we have multiple images
     fade_preview = None
 
@@ -458,8 +651,14 @@ def process_and_display(audio_input, generate_audio, chunk_duration):
         # Create a fade animation (GIF)
         fade_preview, fade_animation_path = create_fade_transition(all_images, fade_duration=1.5, fps=15)
 
+    # Create 360 viewer HTML if we have 360 images
+    viewer_html_path = None
+    if all_360_images:
+        with tempfile.NamedTemporaryFile(suffix=".html", delete=False) as tmp_file:
+            viewer_html_path = create_360_viewer_html(all_360_images, tmp_file.name)
+
     # Hide loading indicator and show results
-    yield [gr.HTML("")] + group_visibility + outputs + [fade_preview, fade_animation_path]
+    yield [gr.HTML("")] + group_visibility + outputs + [fade_preview, fade_animation_path, viewer_html_path]
 
 # Update the clear_all function to handle the new outputs
 def clear_all():
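Since create_360_viewer_html inlines each panorama as a base64 data URI, the HTML written to the temp file is self-contained apart from the Pannellum CDN assets, at the cost of growing to roughly 4/3 of the combined JPEG size. A quick usage sketch (the file paths are illustrative):

# Sketch: build a standalone viewer from two already-exported 360 JPEGs.
paths = ["/tmp/chunk1_360.jpg", "/tmp/chunk2_360.jpg"]  # illustrative paths
viewer = create_360_viewer_html(paths, "/tmp/viewer.html")
print(f"Open {viewer} in a browser to pan through each chunk")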
 
@@ -470,7 +669,7 @@ def clear_all():
     outputs.extend([gr.Group(visible=False)] * len(group_components))
 
     # For all output containers (set to None)
-    outputs.extend([None] * (len(output_containers) * 5))
+    outputs.extend([None] * (len(output_containers) * 6))  # Changed from 5 to 6
 
     # For loading indicator (empty HTML)
     outputs.append(gr.HTML(""))
 
@@ -481,9 +680,10 @@ def clear_all():
     # For example selector (reset to None)
     outputs.append(None)
 
-    # For fade preview and animation (set to None)
+    # For fade preview, animation, and viewer (set to None)
     outputs.append(None)
     outputs.append(None)
+    outputs.append(None)  # New output for viewer
 
     return outputs
 
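The 5 → 6 and [None, None] → [None, None, None] edits keep every list yielded by process_and_display aligned, value for value, with the component list wired into Gradio: the loading indicator, one visibility update per chunk group, six values per output container (emotion, transcription, sentiment, image, image_360, music), then the fade preview, fade file, and viewer file. A sketch of that invariant, assuming this app's group_components and output_containers lists:

# Sketch: expected length of each list yielded by process_and_display.
def expected_yield_len(group_components, output_containers):
    # 1 loading indicator + one visibility flag per group
    # + 6 values per container + fade preview, fade file, 360 viewer file
    return 1 + len(group_components) + 6 * len(output_containers) + 3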
 
 
@@ -574,7 +774,36 @@ with gr.Blocks(title="Affective Virtual Environments - Chunked Processing") as i
             type="binary",
             interactive=False
         )
-
+    # In your output_containers, add the image_360 output
+    with gr.Group(visible=False) as chunk_group:
+        gr.Markdown(f"### Chunk {i+1} Results")
+        with gr.Row():
+            emotion_output = gr.Label(label="Acoustic Emotion Prediction")
+            transcription_output = gr.Label(label="Transcribed Text")
+            sentiment_output = gr.Label(label="Sentiment Analysis")
+        with gr.Row():
+            image_output = gr.Image(label="Generated Equirectangular Image")
+            image_360_output = gr.File(label="Download 360 Image", type="filepath")  # New component
+        with gr.Row():
+            audio_output = gr.Audio(label="Generated Music")
+        gr.HTML("<hr style='margin: 20px 0; border: 1px solid #ccc;'>")
+
+    # Add the 360 viewer component
+    with gr.Row():
+        fade_preview_output = gr.Image(
+            label="Fade Animation Preview",
+            interactive=False
+        )
+        fade_animation_output = gr.File(
+            label="Download Fade Animation",
+            type="binary",
+            interactive=False
+        )
+        viewer_html_output = gr.File(  # New component
+            label="Download 360 Viewer",
+            type="filepath",
+            interactive=False
+        )
 
 
     # Function to handle example selection
 
@@ -624,8 +853,9 @@ with gr.Blocks(title="Affective Virtual Environments - Chunked Processing") as i
             container['transcription'],
             container['sentiment'],
             container['image'],
+            container['image_360'],  # New output
             container['music']
-        ]] + [fade_preview_output, fade_animation_output]
+        ]] + [fade_preview_output, fade_animation_output, viewer_html_output]  # Added viewer_html_output
     )
 
     # Set up the clear button
 
@@ -637,8 +867,9 @@ with gr.Blocks(title="Affective Virtual Environments - Chunked Processing") as i
             container['transcription'],
             container['sentiment'],
             container['image'],
+            container['image_360'],  # New output
             container['music']
-        ]] + [loading_indicator] + [chunk_duration_input] + [example_selector] + [fade_preview_output, fade_animation_output]
+        ]] + [loading_indicator] + [chunk_duration_input] + [example_selector] + [fade_preview_output, fade_animation_output, viewer_html_output]  # Added viewer_html_output
     )
 
     # Set up the example loading button
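For container['image_360'] to resolve in the submit and clear wiring above, each chunk's components must be registered under that key when the UI is built; the commit shows the new image_360_output component but not the surrounding bookkeeping. A hedged sketch of one way output_containers could be assembled (MAX_CHUNKS and the dict layout are assumptions, not code from this commit; the component labels are taken from the diff):

# Hedged sketch, not from the commit: register per-chunk components,
# including the new image_360 file output, so event wiring can look
# them up by key. MAX_CHUNKS is an assumed constant.
import gradio as gr

MAX_CHUNKS = 6  # assumed; the real app defines its own limit

output_containers = []
with gr.Blocks() as demo:
    for i in range(MAX_CHUNKS):
        with gr.Group(visible=False) as chunk_group:
            emotion_output = gr.Label(label="Acoustic Emotion Prediction")
            transcription_output = gr.Label(label="Transcribed Text")
            sentiment_output = gr.Label(label="Sentiment Analysis")
            image_output = gr.Image(label="Generated Equirectangular Image")
            image_360_output = gr.File(label="Download 360 Image", type="filepath")
            audio_output = gr.Audio(label="Generated Music")
        output_containers.append({
            'group': chunk_group,
            'emotion': emotion_output,
            'transcription': transcription_output,
            'sentiment': sentiment_output,
            'image': image_output,
            'image_360': image_360_output,  # new key this commit relies on
            'music': audio_output,
        })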