jfforero commited on
Commit
28f6618
·
verified ·
1 Parent(s): af92dcb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +445 -681
app.py CHANGED
@@ -1,6 +1,4 @@
1
  import gradio as gr
2
- import pyvista as pv
3
- from pyvista import examples
4
  import numpy as np
5
  import librosa
6
  import requests
@@ -9,46 +7,39 @@ from PIL import Image
9
  import os
10
  from tensorflow.keras.models import load_model
11
  from faster_whisper import WhisperModel
12
- import random
13
  from textblob import TextBlob
14
  import torch
15
  import scipy.io.wavfile
16
  from transformers import AutoProcessor, MusicgenForConditionalGeneration
17
  import tempfile
18
  import base64
19
- import plotly.graph_objects as go
20
- from plotly.subplots import make_subplots
21
- import soundfile as sf
22
  from pydub import AudioSegment
23
  import math
24
  import json
25
- import imageio
26
- from PIL import Image, ImageFilter
27
- import matplotlib.pyplot as plt
28
- from matplotlib.animation import FuncAnimation
29
- import base64
30
- from io import BytesIO
31
  import struct
32
  import cv2
33
 
34
- # Load the emotion prediction model
 
 
 
35
  def load_emotion_model(model_path):
36
  try:
37
- model = load_model(model_path)
38
  print("Emotion model loaded successfully")
39
- return model
40
  except Exception as e:
41
  print("Error loading emotion prediction model:", e)
42
  return None
43
 
44
- model_path = 'mymodel_SER_LSTM_RAVDESS.h5'
45
  model = load_emotion_model(model_path)
46
 
47
- # Initialize WhisperModel
48
  model_size = "small"
49
  model2 = WhisperModel(model_size, device="cpu", compute_type="int8")
50
 
51
- # Load MusicGen model
52
  def load_musicgen_model():
53
  try:
54
  device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -63,50 +54,41 @@ def load_musicgen_model():
63
 
64
  processor, music_model, device = load_musicgen_model()
65
 
66
- # Function to chunk audio into segments
 
 
 
67
  def chunk_audio(audio_path, chunk_duration=10):
68
  """Split audio into chunks and return list of chunk file paths"""
69
  try:
70
- # Load audio file
71
  audio = AudioSegment.from_file(audio_path)
72
  duration_ms = len(audio)
73
  chunk_ms = chunk_duration * 1000
74
-
75
- # Validate chunk duration
76
  if chunk_duration <= 0:
77
  raise ValueError("Chunk duration must be positive")
78
-
79
  if chunk_duration > duration_ms / 1000:
80
- # If chunk duration is longer than audio, return the whole audio
81
  return [audio_path], 1
82
-
83
- chunks = []
84
  chunk_files = []
85
-
86
- # Calculate number of chunks
87
  num_chunks = math.ceil(duration_ms / chunk_ms)
88
-
89
  for i in range(num_chunks):
90
  start_ms = i * chunk_ms
91
  end_ms = min((i + 1) * chunk_ms, duration_ms)
92
-
93
- # Extract chunk
94
  chunk = audio[start_ms:end_ms]
95
- chunks.append(chunk)
96
-
97
- # Save chunk to temporary file
98
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
99
  chunk.export(tmp_file.name, format="wav")
100
  chunk_files.append(tmp_file.name)
101
-
102
  return chunk_files, num_chunks
103
-
104
  except Exception as e:
105
  print("Error chunking audio:", e)
106
- # Return original file as single chunk if chunking fails
107
  return [audio_path], 1
108
 
109
- # Function to transcribe audio
110
  def transcribe(wav_filepath):
111
  try:
112
  segments, _ = model2.transcribe(wav_filepath, beam_size=5)
@@ -115,7 +97,6 @@ def transcribe(wav_filepath):
115
  print("Error transcribing audio:", e)
116
  return "Transcription failed"
117
 
118
- # Function to extract MFCC features from audio
119
  def extract_mfcc(wav_file_name):
120
  try:
121
  y, sr = librosa.load(wav_file_name)
@@ -125,306 +106,218 @@ def extract_mfcc(wav_file_name):
125
  print("Error extracting MFCC features:", e)
126
  return None
127
 
128
- # Emotions dictionary
129
- emotions = {0: 'neutral', 1: 'calm', 2: 'happy', 3: 'sad', 4: 'angry', 5: 'fearful', 6: 'disgust', 7: 'surprised'}
 
 
 
 
 
 
 
 
130
 
131
- # Function to predict emotion from audio
132
  def predict_emotion_from_audio(wav_filepath):
133
  try:
134
  if model is None:
135
  return "Model not loaded"
136
-
137
  test_point = extract_mfcc(wav_filepath)
138
  if test_point is not None:
139
  test_point = np.reshape(test_point, newshape=(1, 40, 1))
140
  predictions = model.predict(test_point)
141
  predicted_emotion_label = np.argmax(predictions[0])
142
  return emotions.get(predicted_emotion_label, "Unknown emotion")
143
- else:
144
- return "Error: Unable to extract features"
145
  except Exception as e:
146
  print("Error predicting emotion:", e)
147
  return "Prediction error"
148
 
149
- # Function to analyze sentiment from text
150
  def analyze_sentiment(text):
151
  try:
152
  if not text or text.strip() == "":
153
  return "neutral", 0.0
154
-
155
  analysis = TextBlob(text)
156
  polarity = analysis.sentiment.polarity
157
-
158
  if polarity > 0.1:
159
  sentiment = "positive"
160
  elif polarity < -0.1:
161
  sentiment = "negative"
162
  else:
163
  sentiment = "neutral"
164
-
165
  return sentiment, polarity
166
  except Exception as e:
167
  print("Error analyzing sentiment:", e)
168
  return "neutral", 0.0
169
 
170
- # Function to get image prompt based on sentiment
 
 
 
171
  def get_image_prompt(sentiment, transcribed_text, chunk_idx, total_chunks):
172
- base_prompt = f"Chunk {chunk_idx+1}/{total_chunks}: "
173
-
174
  if sentiment == "positive":
175
- return f"Generate an equirectangular 360° panoramic graphite sketch drawing, detailed pencil texture with faint neon glows, cinematic lighting of:{transcribed_text}. Use low histogram frequency in bright bins, dominant color in high RGB range, and high brightness and color variance. Apply high-frequency texture with strong filter energy, pronounced gradient magnitude, and strong local contrast. Use high spatial complexity, increased horizontal and vertical symmetry, high edge density, bright gray levels, and high contrast. Emphasize rich visual structure, color variation, and texture intensity across spatial composition."
176
-
 
 
 
 
 
 
 
177
  elif sentiment == "negative":
178
- return f"Generate an equirectangular 360° panoramic graphite sketch drawing, detailed pencil texture with faint neon glows, cinematic lighting of:{transcribed_text}. Use high histogram frequency in dark bins, dominant color in low RGB range, and low brightness and color variance. Apply low-frequency texture with low filter energy, weak gradient magnitude, and low local contrast. Use low spatial complexity, reduced horizontal and vertical symmetry, low edge density, dark gray levels, and moderate contrast. Emphasize coarse structure and limited variation in color, texture, and spatial distribution."
179
-
180
- else: # neutral
181
- return f"Generate an equirectangular 360° panoramic graphite sketch drawing, detailed pencil texture with faint neon glows, cinematic lighting of:{transcribed_text}. Use a balanced histogram frequency across bins, dominant color in a mid RGB range, and moderate brightness and color variance. Apply medium-frequency texture with moderate filter energy, standard gradient magnitude, and average local contrast. Use medium spatial complexity, balanced horizontal and vertical symmetry, medium edge density, mid-range gray levels, and standard contrast. Emphasize naturalistic structure and typical variation in color, texture, and spatial distribution."
 
 
 
 
 
 
 
 
 
 
 
 
 
182
 
183
- # Function to get music prompt based on emotion
184
  def get_music_prompt(emotion, transcribed_text, chunk_idx, total_chunks):
185
- base_prompt = f"Chunk {chunk_idx+1}/{total_chunks}: "
186
-
187
  emotion_prompts = {
188
- 'neutral': f"Generate a neutral orchestral soundtrack with balanced energy and smooth spectral character. Use steady tempo, even rhythmic density, and low dissonance. Keep pitch clarity moderate and loudness stable. Maintain slow harmonic motion and tonal equilibrium. Emphasize balance, consistency, and calm tonal centers. The music should feel even, ambient, and unobtrusive, gently complementing: {transcribed_text}.",
189
-
190
- 'calm': f"Generate a calm orchestral soundtrack with slowed motion, sparse rhythmic activity, and warm timbral shading. Use minimal dissonance, smooth spectral texture, and gentle pitch presence. Keep dynamics restrained with rare harmonic shifts and stable tonality. Emphasize warmth, sustained harmonies, and flowing textures that evoke tranquility and serenity inspired by: {transcribed_text}.",
191
-
192
- 'happy': f"Generate a happy orchestral soundtrack with lively motion, energetic rhythmic density, and bright timbral color. Use controlled dissonance, vivid spectral texture, and clear melodic focus. Maintain dynamic expressiveness with active harmonic movement and stable tonal grounding. Emphasize joy through playful rhythms, ornamented melodies, and uplifting harmonic progressions inspired by: {transcribed_text}.",
193
-
194
- 'sad': f"Generate a sad orchestral soundtrack with reduced motion, sparse rhythmic events, and dark timbral color. Use gentle dissonance, softened spectral texture, and subdued pitch clarity. Keep dynamics restrained with minimal harmonic change and low tonal uncertainty. Emphasize minor coloration, sustained harmonies, and fragile phrasing in response to: {transcribed_text}.",
195
-
196
- 'angry': f"Generate an angry orchestral soundtrack with driving motion, dense rhythmic attack, and sharp timbral brightness. Use persistent dissonance, assertive pitch presence, and heightened dynamics. Maintain frequent harmonic shifts and unstable tonal grounding. Emphasize aggressive articulation, rhythmic force, and tension-laden progressions that amplify: {transcribed_text}.",
197
-
198
- 'fearful': f"Generate a fearful orchestral soundtrack with unstable motion, fluctuating rhythmic density, and highly variable timbre. Use shifting dissonance, blurred pitch focus, and volatile dynamics. Increase harmonic unpredictability and tonal instability. Emphasize eerie textures, spatial tension, and spectral motion to evoke suspense and anticipation inspired by: {transcribed_text}.",
199
-
200
- 'disgust': f"Generate a disgusted orchestral soundtrack with uneven motion, irregular rhythm, and dark, rough timbral texture. Use abrasive dissonance, unstable spectral character, and weakened pitch focus. Maintain uneasy dynamics and unsettled harmonic motion. Emphasize distorted textures, harsh intervals, and tonal ambiguity reflecting: {transcribed_text}.",
201
-
202
- 'surprised': f"Generate a surprised orchestral soundtrack with shifting motion, sudden rhythmic variation, and dynamically changing timbre. Use sharp contrasts, heightened pitch clarity, and expressive dynamic swings. Maintain irregular harmonic motion with agile tonal pivots. Emphasize abrupt transitions, playful gestures, and expressive color changes inspired by: {transcribed_text}."
203
  }
204
- return emotion_prompts.get(
205
- emotion.lower(),
206
- f"Create background music with {emotion} atmosphere that represents: {transcribed_text}"
207
- )
208
-
209
- # Function to generate music with MusicGen (using acoustic emotion prediction)
210
  def generate_music(transcribed_text, emotion_prediction, chunk_idx, total_chunks):
211
  try:
212
  if processor is None or music_model is None:
213
  return None
214
-
215
- # Get specific prompt based on emotion
216
  prompt = get_music_prompt(emotion_prediction, transcribed_text, chunk_idx, total_chunks)
217
-
218
- # Limit prompt length to avoid model issues
219
  if len(prompt) > 200:
220
  prompt = prompt[:200] + "..."
221
-
222
- inputs = processor(
223
- text=[prompt],
224
- padding=True,
225
- return_tensors="pt",
226
- ).to(device)
227
-
228
- # Generate audio
229
  audio_values = music_model.generate(**inputs, max_new_tokens=512)
230
-
231
- # Convert to numpy array and sample rate
232
  sampling_rate = music_model.config.audio_encoder.sampling_rate
233
  audio_data = audio_values[0, 0].cpu().numpy()
234
-
235
- # Normalize audio data
236
- audio_data = audio_data / np.max(np.abs(audio_data))
237
-
238
- # Create a temporary file to save the audio
239
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
240
  scipy.io.wavfile.write(tmp_file.name, rate=sampling_rate, data=audio_data)
241
  return tmp_file.name
242
-
243
  except Exception as e:
244
  print("Error generating music:", e)
245
  return None
246
 
247
- # --- DeepAI Image Generation (Text2Img) ---
 
 
 
248
  api_key = os.getenv("DeepAI_api_key")
249
 
250
- # Function to upscale image using Lanczos interpolation
251
  def upscale_image(image, target_width=4096, target_height=2048):
252
  """
253
- Upscale image using DeepAI's Torch-SRGAN API for super resolution
 
254
  """
255
  try:
256
  if not api_key:
257
- print("No API key available for upscaling")
258
- # Fallback to OpenCV if no API key
259
  img_array = np.array(image)
260
- upscaled = cv2.resize(
261
- img_array,
262
- (target_width, target_height),
263
- interpolation=cv2.INTER_LANCZOS4
264
- )
265
  return Image.fromarray(upscaled)
266
-
267
- # Save the image to a temporary file
268
  with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_input:
269
  image.save(tmp_input.name, "JPEG", quality=95)
270
-
271
- # Make request to DeepAI torch-srgan API
272
  response = requests.post(
273
  "https://api.deepai.org/api/torch-srgan",
274
- files={'image': open(tmp_input.name, 'rb')},
275
- headers={'api-key': api_key}
276
  )
277
-
278
  data = response.json()
279
-
280
- if 'output_url' in data:
281
- # Download the upscaled image
282
- img_resp = requests.get(data['output_url'])
283
  upscaled_image = Image.open(BytesIO(img_resp.content))
284
-
285
- # Ensure the image meets our target dimensions
286
  if upscaled_image.size != (target_width, target_height):
287
- upscaled_image = upscaled_image.resize(
288
- (target_width, target_height),
289
- Image.Resampling.LANCZOS
290
- )
291
-
292
- # Clean up temporary file
293
- os.unlink(tmp_input.name)
294
  return upscaled_image
295
- else:
296
- print("Error in DeepAI upscaling response:", data)
297
- # Fallback to OpenCV if API fails
298
- img_array = np.array(image)
299
- upscaled = cv2.resize(
300
- img_array,
301
- (target_width, target_height),
302
- interpolation=cv2.INTER_LANCZOS4
303
- )
304
- return Image.fromarray(upscaled)
305
-
306
  except Exception as e:
307
  print(f"Error upscaling image with DeepAI: {e}")
308
- # Fallback to OpenCV if any error occurs
309
  img_array = np.array(image)
310
- upscaled = cv2.resize(
311
- img_array,
312
- (target_width, target_height),
313
- interpolation=cv2.INTER_LANCZOS4
314
- )
315
  return Image.fromarray(upscaled)
316
 
317
- # ADD THE MISSING generate_image FUNCTION HERE
318
  def generate_image(sentiment_prediction, transcribed_text, chunk_idx, total_chunks):
319
  try:
320
  if not api_key:
321
- # fallback white image if no API key
322
- base_image = Image.new('RGB', (1024,512), color='white')
323
  else:
324
- # Get specific prompt based on sentiment
325
  prompt = get_image_prompt(sentiment_prediction, transcribed_text, chunk_idx, total_chunks)
326
-
327
- # Make request to DeepAI text2img API
328
  response = requests.post(
329
  "https://api.deepai.org/api/text2img",
330
- data={
331
- 'text': prompt,
332
- 'width': 1024,
333
- 'height': 512,
334
- 'image_generator_version': 'hd'
335
- },
336
- headers={'api-key': api_key}
337
  )
338
-
339
  data = response.json()
340
- if 'output_url' in data:
341
- # Download the generated image
342
- img_resp = requests.get(data['output_url'])
343
  base_image = Image.open(BytesIO(img_resp.content))
344
  else:
345
  print("Error in DeepAI response:", data)
346
- # Return a fallback image
347
- base_image = Image.new('RGB', (1024,512), color='white')
348
-
349
- # Upscale the image for better quality in 360 viewer
350
  upscaled_image = upscale_image(base_image)
351
  return upscaled_image
352
-
353
- except Exception as e:
354
- print("Error generating image:", e)
355
- # Return a fallback image
356
- return Image.new('RGB', (1024,512), color='white')
357
 
358
- # Function to process a single chunk
359
- def process_chunk(chunk_path, chunk_idx, total_chunks, generate_audio=True):
360
- try:
361
- # Get acoustic emotion prediction (for music)
362
- emotion_prediction = predict_emotion_from_audio(chunk_path)
363
-
364
- # Get transcribed text
365
- transcribed_text = transcribe(chunk_path)
366
-
367
- # Analyze sentiment of transcribed text (for image)
368
- sentiment, polarity = analyze_sentiment(transcribed_text)
369
-
370
- # Generate image using SENTIMENT analysis with specific prompt
371
- image = generate_image(sentiment, transcribed_text, chunk_idx, total_chunks)
372
-
373
- # Add 360 metadata to the image
374
- image_with_360_path = add_360_metadata(image)
375
-
376
- # Generate music only if audio generation is enabled
377
- music_path = None
378
- if generate_audio:
379
- music_path = generate_music(transcribed_text, emotion_prediction, chunk_idx, total_chunks)
380
-
381
- return {
382
- 'chunk_index': chunk_idx + 1,
383
- 'emotion': emotion_prediction,
384
- 'transcription': transcribed_text,
385
- 'sentiment': sentiment,
386
- 'image': image, # Original image for display in Gradio
387
- 'image_360': image_with_360_path, # Image with 360 metadata
388
- 'music': music_path
389
- }
390
  except Exception as e:
391
- print(f"Error processing chunk {chunk_idx + 1}:", e)
392
- # Return a fallback result with all required keys
393
- return {
394
- 'chunk_index': chunk_idx + 1,
395
- 'emotion': "Error",
396
- 'transcription': "Transcription failed",
397
- 'sentiment': "Sentiment: error",
398
- 'image': Image.new('RGB', (1440, 770), color='white'),
399
- 'image_360': None,
400
- 'music': None
401
- }
402
 
403
- # Function to get predictions for all chunks
404
- def get_predictions(audio_input, generate_audio=True, chunk_duration=10):
405
- # Chunk the audio into segments
406
- chunk_files, total_chunks = chunk_audio(audio_input, chunk_duration)
407
-
408
- results = []
409
-
410
- # Process each chunk
411
- for i, chunk_path in enumerate(chunk_files):
412
- print(f"Processing chunk {i+1}/{total_chunks} ({chunk_duration}s each)")
413
- result = process_chunk(chunk_path, i, total_chunks, generate_audio)
414
- results.append(result)
415
-
416
- # Clean up temporary chunk files
417
- for chunk_path in chunk_files:
418
- try:
419
- if chunk_path != audio_input: # Don't delete original input file
420
- os.unlink(chunk_path)
421
- except:
422
- pass
423
-
424
- return results
425
 
426
  def create_xmp_block(width, height):
427
- """Create XMP metadata block following ExifTool's exact format."""
428
  xmp = (
429
  f'<?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?>\n'
430
  f'<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="ExifTool">\n'
@@ -446,338 +339,299 @@ def create_xmp_block(width, height):
446
  return xmp
447
 
448
  def write_xmp_to_jpg(input_path, output_path, width, height):
449
- """Write XMP metadata to JPEG file following ExifTool's method."""
450
- # Read the original JPEG
451
- with open(input_path, 'rb') as f:
452
  data = f.read()
453
-
454
- # Find the start of image marker
455
- if data[0:2] != b'\xFF\xD8':
456
  raise ValueError("Not a valid JPEG file")
457
-
458
- # Create XMP data
459
  xmp_data = create_xmp_block(width, height)
460
-
461
- # Create APP1 segment for XMP
462
- app1_marker = b'\xFF\xE1'
463
- xmp_header = b'http://ns.adobe.com/xap/1.0/\x00'
464
- xmp_bytes = xmp_data.encode('utf-8')
465
- length = len(xmp_header) + len(xmp_bytes) + 2 # +2 for length bytes
466
- length_bytes = struct.pack('>H', length)
467
-
468
- # Construct new file content
469
  output = bytearray()
470
- output.extend(data[0:2]) # SOI marker
471
  output.extend(app1_marker)
472
  output.extend(length_bytes)
473
  output.extend(xmp_header)
474
  output.extend(xmp_bytes)
475
- output.extend(data[2:]) # Rest of the original file
476
-
477
- # Write the new file
478
- with open(output_path, 'wb') as f:
479
  f.write(output)
480
 
481
  def add_360_metadata(img):
482
- """Add 360 photo metadata to a PIL Image and return the path to the processed image."""
483
  try:
484
- # First, ensure the image is upscaled to 4096x2048
485
  target_width, target_height = 4096, 2048
486
  if img.width != target_width or img.height != target_height:
487
  img = img.resize((target_width, target_height), Image.Resampling.LANCZOS)
488
-
489
- # Create a temporary file
490
  with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_file:
491
- # First save as high-quality JPEG
492
  img.save(tmp_file.name, "JPEG", quality=95)
493
-
494
- # Then inject XMP metadata directly into JPEG file
495
  write_xmp_to_jpg(tmp_file.name, tmp_file.name, img.width, img.height)
496
-
497
  return tmp_file.name
498
-
499
  except Exception as e:
500
  print(f"Error adding 360 metadata: {str(e)}")
501
- # Fallback: return the original image path
502
  with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_file:
503
  img.save(tmp_file.name, "JPEG", quality=95)
504
  return tmp_file.name
505
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
506
  def create_360_viewer_html(image_paths, audio_paths, output_path):
507
- """Create an HTML file with a 360 viewer and audio player for the given images and audio."""
508
- # Create a list of image data URIs
509
  image_data_list = []
510
  for img_path in image_paths:
511
  with open(img_path, "rb") as f:
512
  img_data = base64.b64encode(f.read()).decode("utf-8")
513
  image_data_list.append(f"data:image/jpeg;base64,{img_data}")
514
-
515
- # Create a list of audio data URIs
516
  audio_data_list = []
517
  for audio_path in audio_paths:
518
- if audio_path: # Only process if audio exists
519
  with open(audio_path, "rb") as f:
520
  audio_data = base64.b64encode(f.read()).decode("utf-8")
521
  audio_data_list.append(f"data:audio/wav;base64,{audio_data}")
522
  else:
523
- audio_data_list.append(None) # Placeholder for chunks without audio
524
-
525
- # Create the HTML content
526
- html_content = f"""
527
- <!DOCTYPE html>
528
- <html lang="en">
529
- <head>
530
- <meta charset="UTF-8">
531
- <meta name="viewport" content="width=device-width, initial-scale=1.0">
532
- <title>360 Panorama Viewer with Audio</title>
533
- <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/pannellum@2.5.6/build/pannellum.css"/>
534
- <style>
535
- body {{
536
- margin: 0;
537
- overflow: hidden;
538
- font-family: Arial, sans-serif;
539
- }}
540
- #panorama {{
541
- width: 100vw;
542
- height: 80vh;
543
- }}
544
- .pnlm-hotspot.pnlm-info-hotspot {{
545
- background-color: rgba(0, 150, 255, 0.8);
546
- border-radius: 50%;
547
- width: 30px;
548
- height: 30px;
549
- }}
550
- .pnlm-hotspot.pnlm-info-hotspot .pnlm-sprite {{
551
- filter: brightness(0) invert(1);
552
- }}
553
- .pnlm-tooltip {{
554
- background-color: rgba(0, 0, 0, 0.7);
555
- color: white;
556
- border-radius: 3px;
557
- padding: 5px 10px;
558
- }}
559
- #controls {{
560
- position: absolute;
561
- top: 10px;
562
- right: 10px;
563
- z-index: 1000;
564
- background: rgba(0, 0, 0, 0.7);
565
- color: white;
566
- padding: 10px;
567
- border-radius: 5px;
568
- display: flex;
569
- flex-direction: column;
570
- gap: 10px;
571
- }}
572
- #audio-controls {{
573
- position: fixed;
574
- bottom: 0;
575
- left: 0;
576
- width: 100%;
577
- background: rgba(0, 0, 0, 0.8);
578
- color: white;
579
- padding: 15px;
580
- display: flex;
581
- flex-direction: column;
582
- align-items: center;
583
- z-index: 1000;
584
- }}
585
- #audio-player {{
586
- width: 80%;
587
- margin-bottom: 10px;
588
- }}
589
- #audio-info {{
590
- text-align: center;
591
- font-size: 14px;
592
- }}
593
- button {{
594
- background: #3498db;
595
- color: white;
596
- border: none;
597
- padding: 8px 15px;
598
- border-radius: 3px;
599
- cursor: pointer;
600
- margin: 5px;
601
- }}
602
- button:hover {{
603
- background: #2980b9;
604
- }}
605
- select {{
606
- padding: 5px;
607
- border-radius: 3px;
608
- border: 1px solid #ccc;
609
- }}
610
- </style>
611
- </head>
612
- <body>
613
- <div id="controls">
614
- <select id="image-selector">
615
- {"".join([f'<option value="{i}">Chunk {i+1}</option>' for i in range(len(image_data_list))])}
616
- </select>
617
- </div>
618
-
619
- <div id="panorama"></div>
620
-
621
- <div id="audio-controls">
622
- <audio id="audio-player" controls></audio>
623
- <div id="audio-info">No audio available for this chunk</div>
624
- </div>
625
 
626
- <script type="text/javascript" src="https://cdn.jsdelivr.net/npm/pannellum@2.5.6/build/pannellum.js"></script>
627
- <script>
628
- const images = {json.dumps(image_data_list)};
629
- const audioFiles = {json.dumps(audio_data_list)};
630
- let currentViewer = null;
631
-
632
- function loadPanorama(index) {{
633
- if (currentViewer) {{
634
- currentViewer.destroy();
635
- }}
636
-
637
- currentViewer = pannellum.viewer('panorama', {{
638
- "type": "equirectangular",
639
- "panorama": images[index],
640
- "autoLoad": true,
641
- "autoRotate": -2,
642
- "showZoomCtrl": true,
643
- "showFullscreenCtrl": true,
644
- "hfov": 100
645
- }});
646
-
647
- // Update audio player
648
- updateAudioPlayer(index);
649
- }}
650
-
651
- function updateAudioPlayer(index) {{
652
- const audioPlayer = document.getElementById('audio-player');
653
- const audioInfo = document.getElementById('audio-info');
654
-
655
- if (audioFiles[index]) {{
656
- audioPlayer.src = audioFiles[index];
657
- audioInfo.textContent = 'Playing audio for Chunk ' + (index + 1);
658
- // Try to play automatically (may be blocked by browser policies)
659
- audioPlayer.play().catch(e => {{
660
- audioInfo.textContent = 'Click play to listen to audio for Chunk ' + (index + 1);
661
- }});
662
- }} else {{
663
- audioPlayer.src = '';
664
- audioInfo.textContent = 'No audio available for this chunk';
665
- }}
666
- }}
667
-
668
- // Load the first image initially
669
- loadPanorama(0);
670
-
671
- // Handle image selection changes
672
- document.getElementById('image-selector').addEventListener('change', function(e) {{
673
- const selectedIndex = parseInt(e.target.value);
674
- loadPanorama(selectedIndex);
675
- }});
676
- </script>
677
- </body>
678
- </html>
679
- """
680
-
681
- # Write the HTML to a file
682
- with open(output_path, 'w') as f:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
683
  f.write(html_content)
684
-
685
  return output_path
686
 
687
- # Update the process_and_display function
 
 
 
 
 
 
 
688
  def process_and_display(audio_input, generate_audio, chunk_duration):
689
- # Validate chunk duration
690
  if chunk_duration is None or chunk_duration <= 0:
691
  chunk_duration = 10
692
-
693
- # Show loading indicator
694
- yield [gr.HTML(f"""
 
695
  <div style="text-align: center; margin: 20px;">
696
- <p style="font-size: 18px; color: #4a4a4a;">Processing audio in {chunk_duration}-second chunks...</p>
697
- <div style="border: 4px solid #f3f3f3; border-top: 4px solid #3498db; border-radius: 50%; width: 30px; height: 30px; animation: spin 2s linear infinite; margin: 0 auto;"></div>
698
- <style>@keyframes spin {{ 0% {{ transform: rotate(0deg); }} 100% {{ transform: rotate(360deg); }} }}</style>
699
- <p style="font-size: 14px; color: #4a4a4a;">This may take several minutes depending on the audio length...</p>
700
  </div>
701
- """)] + [gr.Group(visible=False)] * len(group_components) + [None] * (len(output_containers) * 6) + [None, ""]
702
-
 
 
 
 
 
703
  results = get_predictions(audio_input, generate_audio, chunk_duration)
704
-
705
- # Initialize outputs list
706
  outputs = []
707
  group_visibility = []
708
- all_360_images = [] # Collect all 360 images for the viewer
709
- all_music_paths = [] # Collect all music paths for the viewer
710
-
711
- # Process each result
712
  for i, result in enumerate(results):
713
  if i < len(output_containers):
714
- group_visibility.append(gr.Group(visible=True))
715
- outputs.extend([
716
- result['emotion'],
717
- result['transcription'],
718
- result['sentiment'],
719
- result['image'],
720
- result['image_360'],
721
- result['music']
722
- ])
723
- # Collect the 360-processed images and music
724
- if result['image_360']:
725
- all_360_images.append(result['image_360']) # Use the 360-processed image
726
- all_music_paths.append(result['music']) # Can be None if no music generated
 
 
727
  else:
728
- # If we have more results than containers, just extend with None
729
- group_visibility.append(gr.Group(visible=False))
730
- outputs.extend([None] * 6)
731
-
732
- # Hide remaining containers
733
- for i in range(len(results), len(output_containers)):
734
- group_visibility.append(gr.Group(visible=False))
735
- outputs.extend([None] * 6)
736
-
737
- # Create 360 viewer HTML if we have 360 images
738
  viewer_html_path = None
739
  if all_360_images:
740
  with tempfile.NamedTemporaryFile(suffix=".html", delete=False) as tmp_file:
741
  viewer_html_path = create_360_viewer_html(all_360_images, all_music_paths, tmp_file.name)
742
-
743
- # Hide loading indicator and show results
744
  yield [gr.HTML("")] + group_visibility + outputs + [viewer_html_path, ""]
745
 
746
- # Update the clear_all function to handle the new outputs
747
  def clear_all():
748
- # Create a list with None for all outputs
749
- outputs = [None] # For audio input
750
-
751
- # For group components (set to invisible)
752
- outputs.extend([gr.Group(visible=False)] * len(group_components))
753
-
754
- # For all output containers (set to None)
755
- outputs.extend([None] * (len(output_containers) * 6))
756
-
757
- # For loading indicator (empty HTML)
758
- outputs.append(gr.HTML(""))
759
-
760
- # For chunk duration (reset to 10)
761
- outputs.append(10)
762
-
763
- # For example selector (reset to None)
764
- outputs.append(None)
765
-
766
- # For viewer (set to None)
767
- outputs.append(None)
768
-
769
- # For JavaScript output (empty)
770
- outputs.append("")
771
-
772
  return outputs
773
 
774
- # Function to load example audio (placeholder - you need to implement this)
775
- def load_example_audio(example_name):
776
- # This is a placeholder - you need to implement this function
777
- # Return the path to the example audio file based on the example_name
778
- return None
779
 
780
- # Custom CSS for enhanced styling
781
  custom_css = """
782
  .download-section {
783
  background: rgba(255,255,255,255);
@@ -791,37 +645,6 @@ custom_css = """
791
  overflow: hidden;
792
  }
793
 
794
- .download-section::before {
795
- content: "";
796
- position: absolute;
797
- top: -50%;
798
- left: -50%;
799
- width: 200%;
800
- height: 200%;
801
- background: radial-gradient(circle, rgba(255,255,255,0.1) 0%, transparent 70%);
802
- animation: shimmer 3s infinite linear;
803
- pointer-events: none;
804
- }
805
-
806
- @keyframes shimmer {
807
- 0% { transform: rotate(0deg); }
808
- 100% { transform: rotate(360deg); }
809
- }
810
-
811
- .download-section h2 {
812
- color: white;
813
- font-size: 16px;
814
- margin-bottom: 15px;
815
- text-shadow: 1px 1px 3px rgba(0,0,0,0.3);
816
- }
817
-
818
- .download-section p {
819
- color: rgba(255,255,255,0.9);
820
- font-size: 16px;
821
- margin-bottom: 20px;
822
- line-height: 3.5;
823
- }
824
-
825
  .download-button {
826
  background: rgba(155,155,155,255) !important;
827
  color: white !important;
@@ -831,92 +654,19 @@ custom_css = """
831
  font-weight: bold !important;
832
  font-size: 16px !important;
833
  margin-top: 15px !important;
834
- transition: all 0.3s ease !important;
835
  cursor: pointer !important;
836
  display: inline-block !important;
837
  }
 
838
 
839
- .download-button:hover {
840
- transform: translateY(-3px) !important;
841
- box-shadow: 0 8px 20px rgba(0,0,0,0.6) !important;
842
- }
843
-
844
- .download-button:active {
845
- transform: translateY(1px) !important;
846
- }
847
-
848
- .download-icon {
849
- margin-right: 8px;
850
- font-size: 28px;
851
- }
852
-
853
- .feature-list {
854
- display: flex;
855
- justify-content: center;
856
- flex-wrap: wrap;
857
- gap: 15px;
858
- margin: 20px 0;
859
- }
860
-
861
- .feature-item {
862
- background: rgba(255,255,255,0.15);
863
- padding: 10px 15px;
864
- border-radius: 8px;
865
- display: flex;
866
- align-items: center;
867
- gap: 8px;
868
- color: white;
869
- font-size: 14px;
870
- }
871
-
872
- .feature-icon {
873
- font-size: 26px;
874
- }
875
-
876
- .viewer-preview {
877
- margin-top: 20px;
878
- border-radius: 10px;
879
- overflow: hidden;
880
- box-shadow: 0 5px 15px rgba(0,0,0,0.2);
881
- max-width: 400px;
882
- margin-left: auto;
883
- margin-right: auto;
884
- }
885
-
886
- .viewer-preview img {
887
- width: 100%;
888
- display: block;
889
- }
890
-
891
- .instructions {
892
- background: rgba(255,255,255,0.1);
893
- padding: 15px;
894
- border-radius: 8px;
895
- margin-top: 20px;
896
- text-align: left;
897
- }
898
-
899
- .instructions h3 {
900
- color: white;
901
- margin-top: 0;
902
- font-size: 16px;
903
- }
904
-
905
- .instructions ol {
906
- color: rgba(255,255,255,0.9);
907
- padding-left: 20px;
908
- margin-bottom: 0;
909
- }
910
 
911
- .instructions li {
912
- margin-bottom: 8px;
913
- }
914
- """
915
- # Create the Gradio interface with proper output handling
916
  with gr.Blocks(title="Entornos Virtuales Afectivos - Procesamiento por Segmentos", css=custom_css) as interface:
917
  gr.Markdown("# Bello")
918
  gr.Markdown(
919
- """
920
  ***Bello*** explora las sutilezas afectivas de la voz humana a través de la figura del **Teniente Bello**,
921
  el piloto chileno que desapareció misteriosamente en 1914 durante un vuelo de entrenamiento sobre la costa del Pacífico.
922
 
@@ -939,24 +689,13 @@ y semánticos del lenguaje hablado para generar entornos virtuales inmersivos en
939
  • Video Tutorial: [Cómo usar este espacio](https://youtu.be/eVD1lzwVhi8)
940
 
941
  • Para más detalles del proyecto, visita: [www.emotional-machines.com](https://www.emotional-machines.com)
942
- """
943
  )
944
 
945
-
946
  with gr.Row():
947
  with gr.Column(scale=2):
948
  audio_input = gr.Audio(label="Audio de Entrada", type="filepath", sources=["microphone", "upload"])
949
-
950
- # Ejemplos de audio (opcional)
951
- # example_selector = gr.Dropdown(
952
- # label="Seleccionar Audio de Ejemplo",
953
- # choices=["Discurso Feliz", "Historia Triste", "Noticias Neutrales"],
954
- # value=None,
955
- # info="Elige entre audios pregrabados de ejemplo"
956
- # )
957
-
958
- #load_example_btn = gr.Button("Cargar Ejemplo", variant="secondary")
959
-
960
  with gr.Column(scale=1):
961
  chunk_duration_input = gr.Number(
962
  label="Duración de Segmento (segundos)",
@@ -964,86 +703,111 @@ y semánticos del lenguaje hablado para generar entornos virtuales inmersivos en
964
  minimum=1,
965
  maximum=60,
966
  step=1,
967
- info="Duración de cada segmento de audio a procesar (1-60 segundos)"
968
  )
969
  generate_audio_checkbox = gr.Checkbox(
970
- label="Generar Audio (puede tardar más)",
971
  value=False,
972
- info="Desmarca para omitir la generación de música y acelerar el procesamiento"
973
  )
974
  with gr.Row():
975
  process_btn = gr.Button("Generar", variant="primary")
976
  clear_btn = gr.Button("Borrar Todo", variant="secondary")
977
-
978
- loading_indicator = gr.HTML("""
979
- <div id="loading" style="display: none; text-align: center; margin: 20px;">
980
- <p style="font-size: 18px; color: #4a4a4a;">Procesando segmentos de audio...</p>
981
- <div style="border: 4px solid #f3f3f3; border-top: 4px solid #3498db; border-radius: 50%; width: 30px; height: 30px; animation: spin 2s linear infinite; margin: 0 auto;"></div>
982
- <style>@keyframes spin { 0% { transform: rotate(0deg); } 100% { transform: rotate(360deg); } }</style>
983
- </div>
984
- """)
985
-
986
- output_containers = []
987
- group_components = [] # Contenedores de grupos
988
-
989
  for i in range(20):
990
  with gr.Group(visible=False) as chunk_group:
991
  gr.Markdown(f"### Resultados del Segmento {i+1}")
 
992
  with gr.Row():
993
  emotion_output = gr.Label(label="Predicción de Emoción Acústica")
994
  transcription_output = gr.Label(label="Texto Transcrito")
995
  sentiment_output = gr.Label(label="Análisis Sentimental")
 
996
  with gr.Row():
997
  image_output = gr.Image(label="Imagen Equirectangular Generada")
998
  image_360_output = gr.File(label="Descargar Imagen 360", type="filepath")
 
999
  with gr.Row():
1000
  audio_output = gr.Audio(label="Música Generada")
 
 
1001
  gr.HTML("<hr style='margin: 20px 0; border: 1px solid #ccc;'>")
1002
-
1003
  group_components.append(chunk_group)
1004
- output_containers.append({
1005
- 'emotion': emotion_output,
1006
- 'transcription': transcription_output,
1007
- 'sentiment': sentiment_output,
1008
- 'image': image_output,
1009
- 'image_360': image_360_output,
1010
- 'music': audio_output
1011
- })
1012
-
 
 
 
1013
  with gr.Group(visible=True, elem_classes="download-section") as download_group:
1014
  viewer_html_output = gr.File(
1015
- label="Una vez finalizado el procesamiento, descarga tu EVA aquí 🚀",
1016
  type="filepath",
1017
  interactive=False,
1018
- elem_classes="download-button"
1019
  )
1020
-
1021
  js_output = gr.HTML(visible=False)
1022
-
 
1023
  process_btn.click(
1024
  fn=process_and_display,
1025
  inputs=[audio_input, generate_audio_checkbox, chunk_duration_input],
1026
- outputs=[loading_indicator] + group_components + [comp for container in output_containers for comp in [
1027
- container['emotion'],
1028
- container['transcription'],
1029
- container['sentiment'],
1030
- container['image'],
1031
- container['image_360'],
1032
- container['music']
1033
- ]] + [viewer_html_output, js_output]
 
 
 
 
 
 
 
 
1034
  )
1035
-
1036
  clear_btn.click(
1037
  fn=clear_all,
1038
  inputs=[],
1039
- outputs=[audio_input] + group_components + [comp for container in output_containers for comp in [
1040
- container['emotion'],
1041
- container['transcription'],
1042
- container['sentiment'],
1043
- container['image'],
1044
- container['image_360'],
1045
- container['music']
1046
- ]] + [loading_indicator, chunk_duration_input, viewer_html_output, js_output]
 
 
 
 
 
 
 
 
1047
  )
1048
-
1049
- interface.launch(share=True)
 
1
  import gradio as gr
 
 
2
  import numpy as np
3
  import librosa
4
  import requests
 
7
  import os
8
  from tensorflow.keras.models import load_model
9
  from faster_whisper import WhisperModel
 
10
  from textblob import TextBlob
11
  import torch
12
  import scipy.io.wavfile
13
  from transformers import AutoProcessor, MusicgenForConditionalGeneration
14
  import tempfile
15
  import base64
 
 
 
16
  from pydub import AudioSegment
17
  import math
18
  import json
 
 
 
 
 
 
19
  import struct
20
  import cv2
21
 
22
+ # =========================
23
+ # Models
24
+ # =========================
25
+
26
def load_emotion_model(model_path):
    """Load the Keras speech-emotion-recognition model from *model_path*.

    Returns the loaded model, or None when loading fails so the rest of
    the app can keep running (predictions then report the missing model).
    """
    try:
        loaded = load_model(model_path)
    except Exception as exc:
        print("Error loading emotion prediction model:", exc)
        return None
    print("Emotion model loaded successfully")
    return loaded
34
 
35
# Pre-trained LSTM speech-emotion model (RAVDESS); `model` is None if loading failed.
model_path = "mymodel_SER_LSTM_RAVDESS.h5"
model = load_emotion_model(model_path)

# Whisper speech-to-text, forced to CPU with int8 quantization for Space hardware.
model_size = "small"
model2 = WhisperModel(model_size, device="cpu", compute_type="int8")
41
 
42
+ # MusicGen
43
  def load_musicgen_model():
44
  try:
45
  device = "cuda" if torch.cuda.is_available() else "cpu"
 
54
 
55
  processor, music_model, device = load_musicgen_model()
56
 
57
+ # =========================
58
+ # Audio utilities
59
+ # =========================
60
+
61
def chunk_audio(audio_path, chunk_duration=10):
    """Split audio into chunks and return list of chunk file paths"""
    try:
        segment = AudioSegment.from_file(audio_path)
        total_ms = len(segment)
        step_ms = chunk_duration * 1000

        if chunk_duration <= 0:
            raise ValueError("Chunk duration must be positive")

        # Requested chunk longer than the whole clip: process it unsplit.
        if chunk_duration > total_ms / 1000:
            return [audio_path], 1

        num_chunks = math.ceil(total_ms / step_ms)
        chunk_files = []
        for idx in range(num_chunks):
            begin = idx * step_ms
            end = min(begin + step_ms, total_ms)
            piece = segment[begin:end]
            # delete=False: the caller (get_predictions) removes these later.
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
                piece.export(tmp.name, format="wav")
                chunk_files.append(tmp.name)

        return chunk_files, num_chunks
    except Exception as exc:
        # Any failure (bad duration, decode error) degrades to a single chunk.
        print("Error chunking audio:", exc)
        return [audio_path], 1
91
 
 
92
  def transcribe(wav_filepath):
93
  try:
94
  segments, _ = model2.transcribe(wav_filepath, beam_size=5)
 
97
  print("Error transcribing audio:", e)
98
  return "Transcription failed"
99
 
 
100
  def extract_mfcc(wav_file_name):
101
  try:
102
  y, sr = librosa.load(wav_file_name)
 
106
  print("Error extracting MFCC features:", e)
107
  return None
108
 
109
# Class-index → emotion-name mapping used to decode the argmax of the
# LSTM model's prediction vector in predict_emotion_from_audio.
emotions = {
    0: "neutral",
    1: "calm",
    2: "happy",
    3: "sad",
    4: "angry",
    5: "fearful",
    6: "disgust",
    7: "surprised",
}
 
 
120
def predict_emotion_from_audio(wav_filepath):
    """Classify the dominant emotion of a WAV file with the LSTM model.

    Returns an emotion name from `emotions`, or a human-readable error
    string when the model is missing or feature extraction fails.
    """
    try:
        if model is None:
            return "Model not loaded"

        features = extract_mfcc(wav_filepath)
        if features is None:
            return "Error: Unable to extract features"

        # Model expects shape (batch=1, 40 MFCC coefficients, 1 channel).
        features = np.reshape(features, newshape=(1, 40, 1))
        scores = model.predict(features)
        best_label = np.argmax(scores[0])
        return emotions.get(best_label, "Unknown emotion")
    except Exception as exc:
        print("Error predicting emotion:", exc)
        return "Prediction error"
135
 
 
136
def analyze_sentiment(text):
    """Return (label, polarity) for *text* via TextBlob.

    The label is "positive"/"negative"/"neutral" with a ±0.1 dead zone
    around zero polarity; blank input or errors yield ("neutral", 0.0).
    """
    try:
        if not text or text.strip() == "":
            return "neutral", 0.0

        polarity = TextBlob(text).sentiment.polarity

        if polarity > 0.1:
            return "positive", polarity
        if polarity < -0.1:
            return "negative", polarity
        return "neutral", polarity
    except Exception as exc:
        print("Error analyzing sentiment:", exc)
        return "neutral", 0.0
155
 
156
+ # =========================
157
+ # Prompts
158
+ # =========================
159
+
160
def get_image_prompt(sentiment, transcribed_text, chunk_idx, total_chunks):
    """Build the DeepAI text2img prompt for one audio chunk.

    The prompt always asks for an equirectangular 360° graphite sketch of
    the transcription; the visual-statistics clauses (histogram, texture,
    symmetry, contrast) are selected by sentiment, with the neutral wording
    as the fallback. chunk_idx / total_chunks are accepted but currently
    unused (kept in the signature for future per-chunk prompt variants).
    """
    prefix = (
        f"Generate an equirectangular 360° panoramic graphite sketch drawing, detailed pencil texture "
        f"with faint neon glows, cinematic lighting of:{transcribed_text}. "
    )

    if sentiment == "positive":
        style_clause = (
            "Use low histogram frequency in bright bins, dominant color in high RGB range, and high "
            "brightness and color variance. Apply high-frequency texture with strong filter energy, "
            "pronounced gradient magnitude, and strong local contrast. Use high spatial complexity, "
            "increased horizontal and vertical symmetry, high edge density, bright gray levels, and "
            "high contrast. Emphasize rich visual structure, color variation, and texture intensity "
            "across spatial composition."
        )
    elif sentiment == "negative":
        style_clause = (
            "Use high histogram frequency in dark bins, dominant color in low RGB range, and low "
            "brightness and color variance. Apply low-frequency texture with low filter energy, weak "
            "gradient magnitude, and low local contrast. Use low spatial complexity, reduced horizontal "
            "and vertical symmetry, low edge density, dark gray levels, and moderate contrast. Emphasize "
            "coarse structure and limited variation in color, texture, and spatial distribution."
        )
    else:
        style_clause = (
            "Use a balanced histogram frequency across bins, dominant color in a mid RGB range, and "
            "moderate brightness and color variance. Apply medium-frequency texture with moderate filter "
            "energy, standard gradient magnitude, and average local contrast. Use medium spatial "
            "complexity, balanced horizontal and vertical symmetry, medium edge density, mid-range gray "
            "levels, and standard contrast. Emphasize naturalistic structure and typical variation in "
            "color, texture, and spatial distribution."
        )

    return prefix + style_clause
191
 
 
192
def get_music_prompt(emotion, transcribed_text, chunk_idx, total_chunks):
    # Builds the MusicGen text prompt for one chunk: a fixed per-emotion
    # orchestral description with the transcription interpolated at the end.
    # Unknown emotion labels fall back to a generic prompt.
    _ = f"Chunk {chunk_idx+1}/{total_chunks}: "  # kept for future use

    # One ready-made prompt per emotion class produced by predict_emotion_from_audio.
    emotion_prompts = {
        "neutral": f"Generate a neutral orchestral soundtrack with balanced energy and smooth spectral character. Use steady tempo, even rhythmic density, and low dissonance. Keep pitch clarity moderate and loudness stable. Maintain slow harmonic motion and tonal equilibrium. Emphasize balance, consistency, and calm tonal centers. The music should feel even, ambient, and unobtrusive, gently complementing: {transcribed_text}.",
        "calm": f"Generate a calm orchestral soundtrack with slowed motion, sparse rhythmic activity, and warm timbral shading. Use minimal dissonance, smooth spectral texture, and gentle pitch presence. Keep dynamics restrained with rare harmonic shifts and stable tonality. Emphasize warmth, sustained harmonies, and flowing textures that evoke tranquility and serenity inspired by: {transcribed_text}.",
        "happy": f"Generate a happy orchestral soundtrack with lively motion, energetic rhythmic density, and bright timbral color. Use controlled dissonance, vivid spectral texture, and clear melodic focus. Maintain dynamic expressiveness with active harmonic movement and stable tonal grounding. Emphasize joy through playful rhythms, ornamented melodies, and uplifting harmonic progressions inspired by: {transcribed_text}.",
        "sad": f"Generate a sad orchestral soundtrack with reduced motion, sparse rhythmic events, and dark timbral color. Use gentle dissonance, softened spectral texture, and subdued pitch clarity. Keep dynamics restrained with minimal harmonic change and low tonal uncertainty. Emphasize minor coloration, sustained harmonies, and fragile phrasing in response to: {transcribed_text}.",
        "angry": f"Generate an angry orchestral soundtrack with driving motion, dense rhythmic attack, and sharp timbral brightness. Use persistent dissonance, assertive pitch presence, and heightened dynamics. Maintain frequent harmonic shifts and unstable tonal grounding. Emphasize aggressive articulation, rhythmic force, and tension-laden progressions that amplify: {transcribed_text}.",
        "fearful": f"Generate a fearful orchestral soundtrack with unstable motion, fluctuating rhythmic density, and highly variable timbre. Use shifting dissonance, blurred pitch focus, and volatile dynamics. Increase harmonic unpredictability and tonal instability. Emphasize eerie textures, spatial tension, and spectral motion to evoke suspense and anticipation inspired by: {transcribed_text}.",
        "disgust": f"Generate a disgusted orchestral soundtrack with uneven motion, irregular rhythm, and dark, rough timbral texture. Use abrasive dissonance, unstable spectral character, and weakened pitch focus. Maintain uneasy dynamics and unsettled harmonic motion. Emphasize distorted textures, harsh intervals, and tonal ambiguity reflecting: {transcribed_text}.",
        "surprised": f"Generate a surprised orchestral soundtrack with shifting motion, sudden rhythmic variation, and dynamically changing timbre. Use sharp contrasts, heightened pitch clarity, and expressive dynamic swings. Maintain irregular harmonic motion with agile tonal pivots. Emphasize abrupt transitions, playful gestures, and expressive color changes inspired by: {transcribed_text}.",
    }
    # Case-insensitive lookup; unknown labels get the generic fallback prompt.
    return emotion_prompts.get(emotion.lower(), f"Create background music with {emotion} atmosphere that represents: {transcribed_text}")
206
+
207
+ # =========================
208
+ # Music generation
209
+ # =========================
210
+
211
def generate_music(transcribed_text, emotion_prediction, chunk_idx, total_chunks):
    """Generate a short MusicGen clip for one chunk.

    Returns the path of a temp WAV file, or None when the MusicGen model
    failed to load or generation raises.
    """
    try:
        if processor is None or music_model is None:
            return None

        prompt = get_music_prompt(emotion_prediction, transcribed_text, chunk_idx, total_chunks)
        # Keep the conditioning text short; long prompts are truncated to ~200 chars.
        if len(prompt) > 200:
            prompt = prompt[:200] + "..."

        inputs = processor(text=[prompt], padding=True, return_tensors="pt").to(device)
        audio_values = music_model.generate(**inputs, max_new_tokens=512)

        sampling_rate = music_model.config.audio_encoder.sampling_rate
        audio_data = audio_values[0, 0].cpu().numpy()
        # Peak-normalize to [-1, 1]; the 1e-9 floor avoids division by zero on silence.
        audio_data = audio_data / max(1e-9, np.max(np.abs(audio_data)))

        # delete=False so the Gradio audio component can read the file later.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            scipy.io.wavfile.write(tmp_file.name, rate=sampling_rate, data=audio_data)
        return tmp_file.name

    except Exception as e:
        print("Error generating music:", e)
        return None
234
 
235
+ # =========================
236
+ # Image generation (DeepAI)
237
+ # =========================
238
+
239
# DeepAI API key (optional). When unset, generate_image uses a blank canvas
# and upscale_image falls back to a local OpenCV resize instead of the API.
api_key = os.getenv("DeepAI_api_key")
240
 
 
241
def upscale_image(image, target_width=4096, target_height=2048):
    """
    Upscale *image* to target_width x target_height.

    Uses DeepAI's Torch-SRGAN API when an API key is configured; on a missing
    key, a bad API response, or any exception it falls back to a local OpenCV
    Lanczos resize, so the caller always receives a PIL image of the requested
    size.

    Fixes over the previous version: the upload file handle is now closed
    (it was leaked via a bare open()), the temp JPEG is removed on every
    path (previously only on the success path), and the bare `except:` around
    unlink is narrowed to OSError.
    """
    def _local_resize(img):
        # Fallback: plain high-quality Lanczos resize via OpenCV.
        arr = np.array(img)
        resized = cv2.resize(arr, (target_width, target_height), interpolation=cv2.INTER_LANCZOS4)
        return Image.fromarray(resized)

    if not api_key:
        return _local_resize(image)

    tmp_name = None
    try:
        # The API upload needs a file on disk; write the source image to a temp JPEG.
        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_input:
            tmp_name = tmp_input.name
        image.save(tmp_name, "JPEG", quality=95)

        # Context manager closes the upload handle deterministically.
        with open(tmp_name, "rb") as upload:
            response = requests.post(
                "https://api.deepai.org/api/torch-srgan",
                files={"image": upload},
                headers={"api-key": api_key},
            )
        data = response.json()

        if "output_url" in data:
            img_resp = requests.get(data["output_url"])
            upscaled_image = Image.open(BytesIO(img_resp.content))
            # The API does not guarantee exact output dimensions; normalize them.
            if upscaled_image.size != (target_width, target_height):
                upscaled_image = upscaled_image.resize((target_width, target_height), Image.Resampling.LANCZOS)
            return upscaled_image

        print("Error in DeepAI upscaling response:", data)
        return _local_resize(image)

    except Exception as e:
        print(f"Error upscaling image with DeepAI: {e}")
        return _local_resize(image)
    finally:
        # Best-effort temp-file cleanup on success, API error, and exception paths.
        if tmp_name:
            try:
                os.unlink(tmp_name)
            except OSError:
                pass
287
 
 
288
def generate_image(sentiment_prediction, transcribed_text, chunk_idx, total_chunks):
    """Create the 1024x512 panorama for a chunk and upscale it to 4096x2048.

    Without an API key the base image is a plain white canvas; on a bad API
    response or any exception a white canvas is likewise used so the pipeline
    keeps running end-to-end.
    """
    try:
        if api_key:
            prompt = get_image_prompt(sentiment_prediction, transcribed_text, chunk_idx, total_chunks)

            response = requests.post(
                "https://api.deepai.org/api/text2img",
                data={"text": prompt, "width": 1024, "height": 512, "image_generator_version": "hd"},
                headers={"api-key": api_key},
            )
            data = response.json()

            if "output_url" in data:
                img_resp = requests.get(data["output_url"])
                base_image = Image.open(BytesIO(img_resp.content))
            else:
                print("Error in DeepAI response:", data)
                base_image = Image.new("RGB", (1024, 512), color="white")
        else:
            # No API key configured: placeholder canvas keeps the pipeline alive.
            base_image = Image.new("RGB", (1024, 512), color="white")

        return upscale_image(base_image)

    except Exception as e:
        print("Error generating image:", e)
        return Image.new("RGB", (1024, 512), color="white")
 
 
 
 
 
 
 
 
 
315
 
316
+ # =========================
317
+ # 360 metadata injection (XMP)
318
+ # =========================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
319
 
320
  def create_xmp_block(width, height):
 
321
  xmp = (
322
  f'<?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?>\n'
323
  f'<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="ExifTool">\n'
 
339
  return xmp
340
 
341
def write_xmp_to_jpg(input_path, output_path, width, height):
    """Inject a GPano XMP APP1 segment immediately after the JPEG SOI marker.

    Raises ValueError when *input_path* is not a JPEG. input_path and
    output_path may be the same file (read fully before writing).
    """
    with open(input_path, "rb") as src:
        data = src.read()

    if data[0:2] != b"\xFF\xD8":
        raise ValueError("Not a valid JPEG file")

    xmp_payload = create_xmp_block(width, height).encode("utf-8")
    namespace = b"http://ns.adobe.com/xap/1.0/\x00"
    # APP1 length field counts the two length bytes plus namespace + payload.
    segment_length = struct.pack(">H", len(namespace) + len(xmp_payload) + 2)

    rebuilt = b"".join([
        data[0:2],        # SOI
        b"\xFF\xE1",      # APP1 marker
        segment_length,
        namespace,
        xmp_payload,
        data[2:],         # remainder of the original JPEG stream
    ])

    with open(output_path, "wb") as dst:
        dst.write(rebuilt)
366
 
367
def add_360_metadata(img):
    """Save *img* as a 4096x2048 JPEG carrying 360° (GPano) XMP metadata.

    Returns the temp-file path. On failure a plain JPEG without the XMP
    block is returned so the UI still has a downloadable file.
    """
    try:
        target_size = (4096, 2048)
        if (img.width, img.height) != target_size:
            img = img.resize(target_size, Image.Resampling.LANCZOS)

        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as out:
            img.save(out.name, "JPEG", quality=95)

        # Rewrite the file in place with the XMP APP1 segment injected.
        write_xmp_to_jpg(out.name, out.name, img.width, img.height)
        return out.name

    except Exception as exc:
        print(f"Error adding 360 metadata: {str(exc)}")
        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as out:
            img.save(out.name, "JPEG", quality=95)
        return out.name
383
 
384
+ # =========================
385
+ # Chunk processing
386
+ # =========================
387
+
388
def process_chunk(chunk_path, chunk_idx, total_chunks, generate_audio=True):
    """Run the full per-chunk pipeline for one audio segment.

    Produces emotion, transcript, sentiment, the panorama image plus its
    360°-tagged download file, and (optionally) generated music. Always
    returns a result dict; failures yield placeholder values so one bad
    chunk does not abort the remaining chunks.
    """
    try:
        emotion = predict_emotion_from_audio(chunk_path)
        transcript = transcribe(chunk_path)
        sentiment, _polarity = analyze_sentiment(transcript)

        panorama = generate_image(sentiment, transcript, chunk_idx, total_chunks)
        panorama_360_path = add_360_metadata(panorama)

        music_path = generate_music(transcript, emotion, chunk_idx, total_chunks) if generate_audio else None

        return {
            "chunk_index": chunk_idx + 1,
            "emotion": emotion,
            "transcription": transcript,
            "sentiment": sentiment,
            "image": panorama,
            "image_360": panorama_360_path,
            "music": music_path,
        }

    except Exception as exc:
        print(f"Error processing chunk {chunk_idx + 1}:", exc)
        return {
            "chunk_index": chunk_idx + 1,
            "emotion": "Error",
            "transcription": "Transcription failed",
            "sentiment": "error",
            "image": Image.new("RGB", (1024, 512), color="white"),
            "image_360": None,
            "music": None,
        }
422
+
423
def get_predictions(audio_input, generate_audio=True, chunk_duration=10):
    """Split *audio_input* into chunks, process each, then clean up temp files.

    Returns the ordered list of per-chunk result dicts from process_chunk.
    The original audio file is never deleted; only the temporary chunk WAVs
    are removed, and only OSError is swallowed during cleanup (the previous
    bare `except:` could hide real bugs).
    """
    chunk_files, total_chunks = chunk_audio(audio_input, chunk_duration)

    results = []
    for i, chunk_path in enumerate(chunk_files):
        print(f"Processing chunk {i+1}/{total_chunks} ({chunk_duration}s each)")
        results.append(process_chunk(chunk_path, i, total_chunks, generate_audio))

    # Best-effort cleanup of the temporary chunk files.
    for chunk_path in chunk_files:
        if chunk_path != audio_input:
            try:
                os.unlink(chunk_path)
            except OSError:
                pass

    return results
440
+
441
+ # =========================
442
+ # 360 viewer HTML (with audio)
443
+ # =========================
444
+
445
def create_360_viewer_html(image_paths, audio_paths, output_path):
    """Write a self-contained Pannellum 360° viewer HTML file to *output_path*.

    Every panorama image and per-chunk audio track is inlined as a base64
    data URI, so the downloaded file works offline (apart from the Pannellum
    CDN assets). Entries of *audio_paths* may be None when music generation
    was skipped; the viewer's JS shows "No audio available" for those.

    Returns output_path. Fix: the file is now written with an explicit
    encoding="utf-8" to match the page's <meta charset="UTF-8"> regardless of
    the platform's default encoding.
    """
    # Inline every panorama as a JPEG data URI.
    image_data_list = []
    for img_path in image_paths:
        with open(img_path, "rb") as f:
            img_data = base64.b64encode(f.read()).decode("utf-8")
        image_data_list.append(f"data:image/jpeg;base64,{img_data}")

    # Inline audio where available; keep None so the JS can detect "no audio".
    audio_data_list = []
    for audio_path in audio_paths:
        if audio_path:
            with open(audio_path, "rb") as f:
                audio_data = base64.b64encode(f.read()).decode("utf-8")
            audio_data_list.append(f"data:audio/wav;base64,{audio_data}")
        else:
            audio_data_list.append(None)

    html_content = f"""
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>360 Panorama Viewer with Audio</title>
        <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/pannellum@2.5.6/build/pannellum.css"/>
        <style>
            body {{ margin:0; overflow:hidden; font-family:Arial,sans-serif; }}
            #panorama {{ width:100vw; height:80vh; }}
            #controls {{
                position:absolute; top:10px; right:10px; z-index:1000;
                background:rgba(0,0,0,0.7); color:white; padding:10px; border-radius:5px;
                display:flex; flex-direction:column; gap:10px;
            }}
            #audio-controls {{
                position:fixed; bottom:0; left:0; width:100%;
                background:rgba(0,0,0,0.8); color:white; padding:15px;
                display:flex; flex-direction:column; align-items:center; z-index:1000;
            }}
            #audio-player {{ width:80%; margin-bottom:10px; }}
            #audio-info {{ text-align:center; font-size:14px; }}
            select {{ padding:5px; border-radius:3px; border:1px solid #ccc; }}
        </style>
    </head>
    <body>
        <div id="controls">
            <select id="image-selector">
                {"".join([f'<option value="{i}">Chunk {i+1}</option>' for i in range(len(image_data_list))])}
            </select>
        </div>

        <div id="panorama"></div>

        <div id="audio-controls">
            <audio id="audio-player" controls></audio>
            <div id="audio-info">No audio available for this chunk</div>
        </div>

        <script type="text/javascript" src="https://cdn.jsdelivr.net/npm/pannellum@2.5.6/build/pannellum.js"></script>
        <script>
            const images = {json.dumps(image_data_list)};
            const audioFiles = {json.dumps(audio_data_list)};
            let currentViewer = null;

            function loadPanorama(index) {{
                if (currentViewer) currentViewer.destroy();

                currentViewer = pannellum.viewer('panorama', {{
                    type: "equirectangular",
                    panorama: images[index],
                    autoLoad: true,
                    autoRotate: -2,
                    showZoomCtrl: true,
                    showFullscreenCtrl: true,
                    hfov: 100
                }});

                updateAudioPlayer(index);
            }}

            function updateAudioPlayer(index) {{
                const audioPlayer = document.getElementById('audio-player');
                const audioInfo = document.getElementById('audio-info');

                if (audioFiles[index]) {{
                    audioPlayer.src = audioFiles[index];
                    audioInfo.textContent = 'Playing audio for Chunk ' + (index + 1);
                    audioPlayer.play().catch(e => {{
                        audioInfo.textContent = 'Click play to listen to audio for Chunk ' + (index + 1);
                    }});
                }} else {{
                    audioPlayer.src = '';
                    audioInfo.textContent = 'No audio available for this chunk';
                }}
            }}

            loadPanorama(0);

            document.getElementById('image-selector').addEventListener('change', function(e) {{
                const selectedIndex = parseInt(e.target.value);
                loadPanorama(selectedIndex);
            }});
        </script>
    </body>
    </html>
    """
    # Explicit UTF-8 matches the <meta charset="UTF-8"> declared in the page.
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(html_content)
    return output_path
552
 
553
+ # =========================
554
+ # Gradio streaming function
555
+ # =========================
556
+
557
# NOTE: We create these globals before defining process_and_display so the generator can reference them.
# Both lists are populated later inside the gr.Blocks UI loop: one entry per
# chunk slot (the UI builds 20 fixed groups of output components).
output_containers = []
group_components = []
560
+
561
def process_and_display(audio_input, generate_audio, chunk_duration):
    """Gradio streaming callback: yields UI updates for the whole pipeline.

    Yields twice: first a loading screen (spinner HTML, all chunk groups
    hidden, all outputs cleared), then the final state with per-chunk
    results and the downloadable 360° viewer HTML. The flat yield order
    MUST match the outputs wiring of process_btn.click:
    [loading_indicator] + group visibilities + 7 outputs per chunk slot +
    [viewer_html_output, js_output].
    """
    if chunk_duration is None or chunk_duration <= 0:
        chunk_duration = 10

    # Loading screen
    yield (
        [gr.HTML(f"""
    <div style="text-align: center; margin: 20px;">
        <p style="font-size: 18px; color: #4a4a4a;">Processing audio in {chunk_duration}-second chunks...</p>
        <div style="border: 4px solid #f3f3f3; border-top: 4px solid #3498db; border-radius: 50%; width: 30px; height: 30px; animation: spin 2s linear infinite; margin: 0 auto;"></div>
        <style>@keyframes spin {{ 0% {{ transform: rotate(0deg); }} 100% {{ transform: rotate(360deg); }} }}</style>
        <p style="font-size: 14px; color: #4a4a4a;">This may take several minutes depending on the audio length...</p>
    </div>
    """)]
        + [gr.update(visible=False)] * len(group_components)
        # 7 outputs per chunk: emotion, transcription, sentiment, image, image_360, music_player, music_file_download
        + [None] * (len(output_containers) * 7)
        + [None, ""]
    )

    results = get_predictions(audio_input, generate_audio, chunk_duration)

    outputs = []
    group_visibility = []
    all_360_images = []
    all_music_paths = []

    # Fill one group per processed chunk; extra results beyond the fixed
    # number of UI slots are dropped (visibility stays False for them).
    for i, result in enumerate(results):
        if i < len(output_containers):
            group_visibility.append(gr.update(visible=True))
            outputs.extend(
                [
                    result["emotion"],
                    result["transcription"],
                    result["sentiment"],
                    result["image"],
                    result["image_360"],
                    result["music"],  # gr.Audio
                    result["music"],  # gr.File download
                ]
            )
            if result["image_360"]:
                all_360_images.append(result["image_360"])
                all_music_paths.append(result["music"])
        else:
            group_visibility.append(gr.update(visible=False))
            outputs.extend([None] * 7)

    # Hide and clear any remaining unused chunk slots.
    for _ in range(len(results), len(output_containers)):
        group_visibility.append(gr.update(visible=False))
        outputs.extend([None] * 7)

    # Bundle all chunks into a single downloadable 360° viewer page.
    viewer_html_path = None
    if all_360_images:
        with tempfile.NamedTemporaryFile(suffix=".html", delete=False) as tmp_file:
            viewer_html_path = create_360_viewer_html(all_360_images, all_music_paths, tmp_file.name)

    yield [gr.HTML("")] + group_visibility + outputs + [viewer_html_path, ""]
619
 
 
620
def clear_all():
    """Reset every component wired in clear_btn.click (order must match)."""
    hidden_groups = [gr.update(visible=False)] * len(group_components)
    cleared_chunk_outputs = [None] * (len(output_containers) * 7)
    # Order: audio_input, chunk groups, per-chunk outputs, loading HTML,
    # chunk-duration reset, viewer download file, JS slot.
    outputs = (
        [None]
        + hidden_groups
        + cleared_chunk_outputs
        + [gr.HTML(""), 10, None, ""]
    )
    return outputs
630
 
631
+ # =========================
632
+ # UI styling
633
+ # =========================
 
 
634
 
 
635
  custom_css = """
636
  .download-section {
637
  background: rgba(255,255,255,255);
 
645
  overflow: hidden;
646
  }
647
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
648
  .download-button {
649
  background: rgba(155,155,155,255) !important;
650
  color: white !important;
 
654
  font-weight: bold !important;
655
  font-size: 16px !important;
656
  margin-top: 15px !important;
 
657
  cursor: pointer !important;
658
  display: inline-block !important;
659
  }
660
+ """
661
 
662
+ # =========================
663
+ # Gradio app
664
+ # =========================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
665
 
 
 
 
 
 
666
  with gr.Blocks(title="Entornos Virtuales Afectivos - Procesamiento por Segmentos", css=custom_css) as interface:
667
  gr.Markdown("# Bello")
668
  gr.Markdown(
669
+ """
670
  ***Bello*** explora las sutilezas afectivas de la voz humana a través de la figura del **Teniente Bello**,
671
  el piloto chileno que desapareció misteriosamente en 1914 durante un vuelo de entrenamiento sobre la costa del Pacífico.
672
 
 
689
  • Video Tutorial: [Cómo usar este espacio](https://youtu.be/eVD1lzwVhi8)
690
 
691
  • Para más detalles del proyecto, visita: [www.emotional-machines.com](https://www.emotional-machines.com)
692
+ """
693
  )
694
 
 
695
  with gr.Row():
696
  with gr.Column(scale=2):
697
  audio_input = gr.Audio(label="Audio de Entrada", type="filepath", sources=["microphone", "upload"])
698
+
 
 
 
 
 
 
 
 
 
 
699
  with gr.Column(scale=1):
700
  chunk_duration_input = gr.Number(
701
  label="Duración de Segmento (segundos)",
 
703
  minimum=1,
704
  maximum=60,
705
  step=1,
706
+ info="Duración de cada segmento de audio a procesar (1-60 segundos)",
707
  )
708
  generate_audio_checkbox = gr.Checkbox(
709
+ label="Generar Audio (puede tardar más)",
710
  value=False,
711
+ info="Desmarca para omitir la generación de música y acelerar el procesamiento",
712
  )
713
  with gr.Row():
714
  process_btn = gr.Button("Generar", variant="primary")
715
  clear_btn = gr.Button("Borrar Todo", variant="secondary")
716
+
717
+ loading_indicator = gr.HTML(
718
+ """
719
+ <div id="loading" style="display: none; text-align: center; margin: 20px;">
720
+ <p style="font-size: 18px; color: #4a4a4a;">Procesando segmentos de audio...</p>
721
+ <div style="border: 4px solid #f3f3f3; border-top: 4px solid #3498db; border-radius: 50%; width: 30px; height: 30px; animation: spin 2s linear infinite; margin: 0 auto;"></div>
722
+ <style>@keyframes spin { 0% { transform: rotate(0deg); } 100% { transform: rotate(360deg); } }</style>
723
+ </div>
724
+ """
725
+ )
726
+
727
+ # Build chunk outputs (20 slots)
728
  for i in range(20):
729
  with gr.Group(visible=False) as chunk_group:
730
  gr.Markdown(f"### Resultados del Segmento {i+1}")
731
+
732
  with gr.Row():
733
  emotion_output = gr.Label(label="Predicción de Emoción Acústica")
734
  transcription_output = gr.Label(label="Texto Transcrito")
735
  sentiment_output = gr.Label(label="Análisis Sentimental")
736
+
737
  with gr.Row():
738
  image_output = gr.Image(label="Imagen Equirectangular Generada")
739
  image_360_output = gr.File(label="Descargar Imagen 360", type="filepath")
740
+
741
  with gr.Row():
742
  audio_output = gr.Audio(label="Música Generada")
743
+ audio_file_output = gr.File(label="Descargar Música", type="filepath") # ✅ DOWNLOAD
744
+
745
  gr.HTML("<hr style='margin: 20px 0; border: 1px solid #ccc;'>")
746
+
747
  group_components.append(chunk_group)
748
+ output_containers.append(
749
+ {
750
+ "emotion": emotion_output,
751
+ "transcription": transcription_output,
752
+ "sentiment": sentiment_output,
753
+ "image": image_output,
754
+ "image_360": image_360_output,
755
+ "music": audio_output,
756
+ "music_file": audio_file_output,
757
+ }
758
+ )
759
+
760
  with gr.Group(visible=True, elem_classes="download-section") as download_group:
761
  viewer_html_output = gr.File(
762
+ label="Una vez finalizado el procesamiento, descarga tu EVA aquí 🚀",
763
  type="filepath",
764
  interactive=False,
765
+ elem_classes="download-button",
766
  )
767
+
768
  js_output = gr.HTML(visible=False)
769
+
770
+ # IMPORTANT: outputs order must match yields/returns exactly
771
  process_btn.click(
772
  fn=process_and_display,
773
  inputs=[audio_input, generate_audio_checkbox, chunk_duration_input],
774
+ outputs=[loading_indicator]
775
+ + group_components
776
+ + [
777
+ comp
778
+ for container in output_containers
779
+ for comp in [
780
+ container["emotion"],
781
+ container["transcription"],
782
+ container["sentiment"],
783
+ container["image"],
784
+ container["image_360"],
785
+ container["music"],
786
+ container["music_file"], # ✅ ADD
787
+ ]
788
+ ]
789
+ + [viewer_html_output, js_output],
790
  )
791
+
792
  clear_btn.click(
793
  fn=clear_all,
794
  inputs=[],
795
+ outputs=[audio_input]
796
+ + group_components
797
+ + [
798
+ comp
799
+ for container in output_containers
800
+ for comp in [
801
+ container["emotion"],
802
+ container["transcription"],
803
+ container["sentiment"],
804
+ container["image"],
805
+ container["image_360"],
806
+ container["music"],
807
+ container["music_file"], # ✅ ADD
808
+ ]
809
+ ]
810
+ + [loading_indicator, chunk_duration_input, viewer_html_output, js_output],
811
  )
812
+
813
+ interface.launch(share=True)