import gradio as gr
import numpy as np
import librosa
import requests
from io import BytesIO
from PIL import Image
import os
from tensorflow.keras.models import load_model
from faster_whisper import WhisperModel
from textblob import TextBlob
import torch
import scipy.io.wavfile
from transformers import AutoProcessor, MusicgenForConditionalGeneration
import tempfile
import base64
from pydub import AudioSegment
import math
import json
import struct
import cv2

# =========================
# Models
# =========================

def load_emotion_model(model_path):
    """Load the Keras emotion-recognition model stored at *model_path*.

    Args:
        model_path: Path handed to ``load_model``.

    Returns:
        The loaded model, or ``None`` when loading raised; the error is
        printed rather than propagated.
    """
    loaded = None
    try:
        loaded = load_model(model_path)
        print("Emotion model loaded successfully")
    except Exception as exc:
        print("Error loading emotion prediction model:", exc)
        # Discard a model whose success message could not be reported, so
        # every failure inside the try yields None.
        loaded = None
    return loaded

# File name of the Keras emotion model; it is loaded once, at import time.
model_path = "mymodel_SER_LSTM_RAVDESS.h5"
# `model` is None when loading failed (load_emotion_model swallows the error).
model = load_emotion_model(model_path)

# Whisper
# faster-whisper transcription model, configured for CPU with int8 compute.
model_size = "small"
model2 = WhisperModel(model_size, device="cpu", compute_type="int8")

# MusicGen
def load_musicgen_model():
    """Load the ``facebook/musicgen-small`` processor and model.

    The model is moved to CUDA when ``torch.cuda.is_available()`` reports a
    GPU, otherwise it stays on the CPU.

    Returns:
        A ``(processor, music_model, device)`` tuple, or
        ``(None, None, None)`` when anything raised while loading; the
        error is printed rather than propagated.
    """
    checkpoint = "facebook/musicgen-small"
    try:
        target_device = "cpu"
        if torch.cuda.is_available():
            target_device = "cuda"

        text_processor = AutoProcessor.from_pretrained(checkpoint)
        generator = MusicgenForConditionalGeneration.from_pretrained(checkpoint)
        generator.to(target_device)

        print("MusicGen model loaded successfully")
        return text_processor, generator, target_device
    except Exception as exc:
        print("Error loading MusicGen model:", exc)
        return None, None, None

# Loaded once at import time; all three names are None when loading failed.
processor, music_model, device = load_musicgen_model()

# =========================
# Audio utilities
# =========================

def chunk_audio(audio_path, chunk_duration=10):
    """Split an audio file into fixed-length WAV chunks.

    Args:
        audio_path: Path of the audio file to split.
        chunk_duration: Length of each chunk in seconds; must be positive.

    Returns:
        A ``(chunk_files, num_chunks)`` tuple. ``chunk_files`` holds the
        paths of temporary WAV files, which the caller is responsible for
        deleting. When the audio is shorter than one chunk, or when anything
        goes wrong (including an invalid ``chunk_duration``), the original
        file is returned as the only chunk: ``([audio_path], 1)``.
    """
    chunk_files = []
    try:
        # Validate before doing any decoding work.
        if chunk_duration <= 0:
            raise ValueError("Chunk duration must be positive")

        audio = AudioSegment.from_file(audio_path)
        duration_ms = len(audio)
        chunk_ms = chunk_duration * 1000

        if chunk_duration > duration_ms / 1000:
            return [audio_path], 1

        num_chunks = math.ceil(duration_ms / chunk_ms)

        for i in range(num_chunks):
            start_ms = i * chunk_ms
            end_ms = min((i + 1) * chunk_ms, duration_ms)

            # Reserve a unique file name, then close the handle before the
            # export re-opens the path (a second open fails on Windows while
            # the NamedTemporaryFile handle is still open).
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
                chunk_name = tmp_file.name
            chunk_files.append(chunk_name)
            audio[start_ms:end_ms].export(chunk_name, format="wav")

        return chunk_files, num_chunks

    except Exception as e:
        print("Error chunking audio:", e)
        # The caller only receives the original path, so chunks that were
        # already written would otherwise never be deleted.
        for leftover in chunk_files:
            try:
                os.unlink(leftover)
            except OSError:
                pass  # best-effort cleanup of a temp file
        return [audio_path], 1

def transcribe(wav_filepath):
    """Transcribe an audio file with the module-level Whisper model.

    Args:
        wav_filepath: Path of the audio file to transcribe.

    Returns:
        The text of all segments concatenated without a separator, or the
        string ``"Transcription failed"`` when transcription raised.
    """
    try:
        segment_iter, _info = model2.transcribe(wav_filepath, beam_size=5)
        pieces = (seg.text for seg in segment_iter)
        return "".join(pieces)
    except Exception as exc:
        print("Error transcribing audio:", exc)
        return "Transcription failed"

def extract_mfcc(wav_file_name):
    """Compute a 40-coefficient MFCC vector averaged over time.

    Args:
        wav_file_name: Path of the audio file, loaded with ``librosa.load``.

    Returns:
        The mean of each of the 40 MFCC rows, or ``None`` when loading or
        feature extraction raised.
    """
    try:
        samples, sample_rate = librosa.load(wav_file_name)
        coefficients = librosa.feature.mfcc(y=samples, sr=sample_rate, n_mfcc=40)
        # Transpose so that averaging over axis 0 collapses the frame axis.
        per_frame = coefficients.T
        return np.mean(per_frame, axis=0)
    except Exception as exc:
        print("Error extracting MFCC features:", exc)
        return None

# Maps the emotion model's predicted class index to its label; the position
# of each label in the tuple is its class index.
emotions = dict(
    enumerate(
        (
            "neutral",
            "calm",
            "happy",
            "sad",
            "angry",
            "fearful",
            "disgust",
            "surprised",
        )
    )
)

def predict_emotion_from_audio(wav_filepath):
    """Predict the emotion label for an audio file.

    Args:
        wav_filepath: Path of the audio file to classify.

    Returns:
        One of the labels in ``emotions``, or a diagnostic string:
        ``"Model not loaded"`` when the emotion model is unavailable,
        ``"Error: Unable to extract features"`` when MFCC extraction failed,
        ``"Unknown emotion"`` for an unmapped class index, and
        ``"Prediction error"`` when anything raised.
    """
    try:
        if model is None:
            return "Model not loaded"

        features = extract_mfcc(wav_filepath)
        if features is None:
            return "Error: Unable to extract features"

        # Pass the shape positionally: the `newshape=` keyword is deprecated
        # in recent NumPy releases, while the positional form works everywhere.
        model_input = np.reshape(features, (1, 40, 1))
        predictions = model.predict(model_input)

        # Convert the NumPy integer to a plain int before the dict lookup.
        predicted_emotion_label = int(np.argmax(predictions[0]))
        return emotions.get(predicted_emotion_label, "Unknown emotion")
    except Exception as e:
        print("Error predicting emotion:", e)
        return "Prediction error"

def analyze_sentiment(text):
    """Classify text as positive, negative or neutral using TextBlob polarity.

    Args:
        text: The text to analyse; may be empty or ``None``.

    Returns:
        A ``(label, polarity)`` tuple. Polarity above 0.1 is ``"positive"``,
        below -0.1 is ``"negative"``, anything else ``"neutral"``. Empty or
        whitespace-only text, and any failure, yield ``("neutral", 0.0)``.
    """
    try:
        if not text or not text.strip():
            return "neutral", 0.0

        score = TextBlob(text).sentiment.polarity

        if score > 0.1:
            return "positive", score
        if score < -0.1:
            return "negative", score
        return "neutral", score
    except Exception as exc:
        print("Error analyzing sentiment:", exc)
        return "neutral", 0.0

# =========================
# Prompts
# =========================

def get_image_prompt(sentiment, transcribed_text, chunk_idx, total_chunks):
    """Build the text-to-image prompt for one audio chunk.

    All prompts share the same opening (a 360° equirectangular graphite
    sketch of the transcribed text) and differ only in the visual-statistics
    description chosen by *sentiment*.

    Args:
        sentiment: ``"positive"`` or ``"negative"``; any other value selects
            the neutral description.
        transcribed_text: Text interpolated into the prompt as the subject.
        chunk_idx: Zero-based chunk index (only used for the unused label).
        total_chunks: Total number of chunks (only used for the unused label).

    Returns:
        The complete prompt string.
    """
    _chunk_label = f"Chunk {chunk_idx+1}/{total_chunks}: "  # kept for future use

    opening = (
        "Generate an equirectangular 360° panoramic graphite sketch drawing, detailed pencil texture "
        f"with faint neon glows, cinematic lighting of:{transcribed_text}. Use "
    )

    if sentiment == "positive":
        visual_profile = (
            "low histogram frequency "
            "in bright bins, dominant color in high RGB range, and high brightness and color variance. "
            "Apply high-frequency texture with strong filter energy, pronounced gradient magnitude, and "
            "strong local contrast. Use high spatial complexity, increased horizontal and vertical symmetry, "
            "high edge density, bright gray levels, and high contrast. Emphasize rich visual structure, "
            "color variation, and texture intensity across spatial composition."
        )
    elif sentiment == "negative":
        visual_profile = (
            "high histogram frequency "
            "in dark bins, dominant color in low RGB range, and low brightness and color variance. "
            "Apply low-frequency texture with low filter energy, weak gradient magnitude, and low local contrast. "
            "Use low spatial complexity, reduced horizontal and vertical symmetry, low edge density, dark gray levels, "
            "and moderate contrast. Emphasize coarse structure and limited variation in color, texture, and spatial distribution."
        )
    else:
        visual_profile = (
            "a balanced histogram frequency "
            "across bins, dominant color in a mid RGB range, and moderate brightness and color variance. "
            "Apply medium-frequency texture with moderate filter energy, standard gradient magnitude, and average local contrast. "
            "Use medium spatial complexity, balanced horizontal and vertical symmetry, medium edge density, mid-range gray levels, "
            "and standard contrast. Emphasize naturalistic structure and typical variation in color, texture, and spatial distribution."
        )

    return opening + visual_profile

def get_music_prompt(emotion, transcribed_text, chunk_idx, total_chunks):
    """Build the MusicGen text prompt for one audio chunk.

    Args:
        emotion: Emotion label; matched case-insensitively against the eight
            known emotions.
        transcribed_text: Text appended to the prompt as its subject.
        chunk_idx: Zero-based chunk index (only used for the unused label).
        total_chunks: Total number of chunks (only used for the unused label).

    Returns:
        The emotion-specific prompt ending in ``": <transcribed_text>."``, or
        a generic prompt built from the raw *emotion* value when the emotion
        is not one of the known labels.
    """
    _chunk_label = f"Chunk {chunk_idx+1}/{total_chunks}: "  # kept for future use

    # Each entry is the prompt up to (not including) the ": <text>." ending
    # that every known-emotion prompt shares.
    lead_ins = {
        "neutral": "Generate a neutral orchestral soundtrack with balanced energy and smooth spectral character. Use steady tempo, even rhythmic density, and low dissonance. Keep pitch clarity moderate and loudness stable. Maintain slow harmonic motion and tonal equilibrium. Emphasize balance, consistency, and calm tonal centers. The music should feel even, ambient, and unobtrusive, gently complementing",
        "calm": "Generate a calm orchestral soundtrack with slowed motion, sparse rhythmic activity, and warm timbral shading. Use minimal dissonance, smooth spectral texture, and gentle pitch presence. Keep dynamics restrained with rare harmonic shifts and stable tonality. Emphasize warmth, sustained harmonies, and flowing textures that evoke tranquility and serenity inspired by",
        "happy": "Generate a happy orchestral soundtrack with lively motion, energetic rhythmic density, and bright timbral color. Use controlled dissonance, vivid spectral texture, and clear melodic focus. Maintain dynamic expressiveness with active harmonic movement and stable tonal grounding. Emphasize joy through playful rhythms, ornamented melodies, and uplifting harmonic progressions inspired by",
        "sad": "Generate a sad orchestral soundtrack with reduced motion, sparse rhythmic events, and dark timbral color. Use gentle dissonance, softened spectral texture, and subdued pitch clarity. Keep dynamics restrained with minimal harmonic change and low tonal uncertainty. Emphasize minor coloration, sustained harmonies, and fragile phrasing in response to",
        "angry": "Generate an angry orchestral soundtrack with driving motion, dense rhythmic attack, and sharp timbral brightness. Use persistent dissonance, assertive pitch presence, and heightened dynamics. Maintain frequent harmonic shifts and unstable tonal grounding. Emphasize aggressive articulation, rhythmic force, and tension-laden progressions that amplify",
        "fearful": "Generate a fearful orchestral soundtrack with unstable motion, fluctuating rhythmic density, and highly variable timbre. Use shifting dissonance, blurred pitch focus, and volatile dynamics. Increase harmonic unpredictability and tonal instability. Emphasize eerie textures, spatial tension, and spectral motion to evoke suspense and anticipation inspired by",
        "disgust": "Generate a disgusted orchestral soundtrack with uneven motion, irregular rhythm, and dark, rough timbral texture. Use abrasive dissonance, unstable spectral character, and weakened pitch focus. Maintain uneasy dynamics and unsettled harmonic motion. Emphasize distorted textures, harsh intervals, and tonal ambiguity reflecting",
        "surprised": "Generate a surprised orchestral soundtrack with shifting motion, sudden rhythmic variation, and dynamically changing timbre. Use sharp contrasts, heightened pitch clarity, and expressive dynamic swings. Maintain irregular harmonic motion with agile tonal pivots. Emphasize abrupt transitions, playful gestures, and expressive color changes inspired by",
    }

    lead_in = lead_ins.get(emotion.lower())
    if lead_in is None:
        return f"Create background music with {emotion} atmosphere that represents: {transcribed_text}"
    return f"{lead_in}: {transcribed_text}."

# =========================
# Music generation
# =========================

def generate_music(transcribed_text, emotion_prediction, chunk_idx, total_chunks):
    """Generate a short music clip for one chunk with MusicGen.

    Args:
        transcribed_text: Transcription used to build the prompt.
        emotion_prediction: Emotion label used to pick the prompt template.
        chunk_idx: Zero-based chunk index, forwarded to ``get_music_prompt``.
        total_chunks: Total number of chunks, forwarded to ``get_music_prompt``.

    Returns:
        Path of a temporary WAV file holding the peak-normalised audio, or
        ``None`` when MusicGen is not loaded or generation raised. The file
        is not deleted here.
    """
    try:
        if processor is None or music_model is None:
            return None

        text_prompt = get_music_prompt(emotion_prediction, transcribed_text, chunk_idx, total_chunks)
        # NOTE(review): the built-in emotion prompts appear to be longer than
        # 200 characters before the transcription is appended, so this cut
        # likely drops the transcription entirely — confirm this is intended.
        if len(text_prompt) > 200:
            text_prompt = text_prompt[:200] + "..."

        model_inputs = processor(text=[text_prompt], padding=True, return_tensors="pt").to(device)
        generated = music_model.generate(**model_inputs, max_new_tokens=512)

        waveform = generated[0, 0].cpu().numpy()
        # Scale by the peak amplitude; the floor avoids dividing by zero on silence.
        peak = np.max(np.abs(waveform))
        waveform = waveform / max(1e-9, peak)

        sample_rate = music_model.config.audio_encoder.sampling_rate

        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as wav_file:
            out_path = wav_file.name
            scipy.io.wavfile.write(out_path, rate=sample_rate, data=waveform)
            return out_path

    except Exception as exc:
        print("Error generating music:", exc)
        return None

# =========================
# Image generation (DeepAI)
# =========================

# DeepAI API key read from the environment; None when the variable is unset,
# in which case the image functions below skip the remote API.
api_key = os.getenv("DeepAI_api_key")

def _lanczos_resize(image, size):
    """Resize a PIL image to *size* (width, height) with OpenCV Lanczos."""
    resized = cv2.resize(np.array(image), size, interpolation=cv2.INTER_LANCZOS4)
    return Image.fromarray(resized)


def upscale_image(image, target_width=4096, target_height=2048):
    """Upscale an image to the target size, preferring DeepAI super resolution.

    With an API key configured, the image is uploaded to DeepAI's
    ``torch-srgan`` endpoint and the result is resized to the target size if
    needed. Without a key, or when the API call fails or returns no
    ``output_url``, the image is resized locally with OpenCV Lanczos.

    Args:
        image: PIL image to upscale.
        target_width: Output width in pixels.
        target_height: Output height in pixels.

    Returns:
        A PIL image of size ``(target_width, target_height)``.
    """
    target_size = (target_width, target_height)

    if not api_key:
        return _lanczos_resize(image, target_size)

    tmp_path = None
    try:
        # Reserve a file name and close the handle before saving to it.
        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_input:
            tmp_path = tmp_input.name
        image.save(tmp_path, "JPEG", quality=95)

        # Open the upload in a context manager so the handle is always closed;
        # the timeout keeps a stalled connection from hanging the pipeline.
        with open(tmp_path, "rb") as upload:
            response = requests.post(
                "https://api.deepai.org/api/torch-srgan",
                files={"image": upload},
                headers={"api-key": api_key},
                timeout=120,
            )
        data = response.json()

        if "output_url" in data:
            img_resp = requests.get(data["output_url"], timeout=120)
            upscaled_image = Image.open(BytesIO(img_resp.content))

            if upscaled_image.size != target_size:
                upscaled_image = upscaled_image.resize(target_size, Image.Resampling.LANCZOS)

            return upscaled_image

        print("Error in DeepAI upscaling response:", data)

    except Exception as e:
        print(f"Error upscaling image with DeepAI: {e}")

    finally:
        # Remove the upload file on every path, not only on success.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass  # best-effort cleanup of a temp file

    return _lanczos_resize(image, target_size)

def generate_image(sentiment_prediction, transcribed_text, chunk_idx, total_chunks):
    """Generate the panorama image for one chunk and upscale it.

    With an API key configured, the prompt from ``get_image_prompt`` is sent
    to DeepAI's ``text2img`` endpoint. Without a key, or when the response
    has no ``output_url``, a blank white 1024x512 image is used instead. The
    base image is then passed through ``upscale_image``.

    Args:
        sentiment_prediction: Sentiment label used to pick the prompt.
        transcribed_text: Transcription used as the prompt subject.
        chunk_idx: Zero-based chunk index, forwarded to ``get_image_prompt``.
        total_chunks: Total number of chunks, forwarded to ``get_image_prompt``.

    Returns:
        The upscaled PIL image, or a blank white 1024x512 image (not
        upscaled) when anything raised.
    """

    def blank_image():
        return Image.new("RGB", (1024, 512), color="white")

    try:
        base_image = None

        if api_key:
            prompt = get_image_prompt(sentiment_prediction, transcribed_text, chunk_idx, total_chunks)

            # Timeouts keep a stalled connection from hanging the pipeline;
            # a timeout error is handled like any other failure below.
            response = requests.post(
                "https://api.deepai.org/api/text2img",
                data={"text": prompt, "width": 1024, "height": 512, "image_generator_version": "hd"},
                headers={"api-key": api_key},
                timeout=120,
            )
            data = response.json()

            if "output_url" in data:
                img_resp = requests.get(data["output_url"], timeout=120)
                base_image = Image.open(BytesIO(img_resp.content))
            else:
                print("Error in DeepAI response:", data)

        if base_image is None:
            base_image = blank_image()

        return upscale_image(base_image)

    except Exception as e:
        print("Error generating image:", e)
        return blank_image()

# =========================
# 360 metadata injection (XMP)
# =========================

def create_xmp_block(width, height):
    xmp = (
        f'<?xpacket begin="ï»¿" id="W5M0MpCehiHzreSzNTczkc9d"?>\n'
        f'<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="ExifTool">\n'
        f'<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">\n'
        f'<rdf:Description rdf:about=""\n'
        f'xmlns:GPano="http://ns.google.com/photos/1.0/panorama/"\n'
        f'GPano:ProjectionType="equirectangular"\n'
        f'GPano:UsePanoramaViewer="True"\n'
        f'GPano:FullPanoWidthPixels="{width}"\n'
        f'GPano:FullPanoHeightPixels="{height}"\n'
        f'GPano:CroppedAreaImageWidthPixels="{width}"\n'
        f'GPano:CroppedAreaImageHeightPixels="{height}"\n'
        f'GPano:CroppedAreaLeftPixels="0"\n'
        f'GPano:CroppedAreaTopPixels="0"/>\n'
        f'</rdf:RDF>\n'
        f'</x:xmpmeta>\n'
        f'<?xpacket end="w"?>'
    )
    return xmp

def write_xmp_to_jpg(input_path, output_path, width, height):
    """Copy a JPEG, inserting a 360° XMP APP1 segment right after the SOI marker.

    The input is read completely before the output is opened, so the two
    paths may be the same file.

    Args:
        input_path: Path of the source JPEG.
        output_path: Path the tagged JPEG is written to.
        width: Image width in pixels, passed to ``create_xmp_block``.
        height: Image height in pixels, passed to ``create_xmp_block``.

    Raises:
        ValueError: If the input does not start with the JPEG SOI marker.
    """
    with open(input_path, "rb") as src:
        jpeg_bytes = src.read()

    soi_marker = jpeg_bytes[:2]
    if soi_marker != b"\xFF\xD8":
        raise ValueError("Not a valid JPEG file")

    # Segment payload: the XMP namespace identifier followed by the packet.
    payload = b"http://ns.adobe.com/xap/1.0/\x00" + create_xmp_block(width, height).encode("utf-8")
    # The big-endian length field counts its own two bytes plus the payload.
    app1_segment = b"\xFF\xE1" + struct.pack(">H", len(payload) + 2) + payload

    with open(output_path, "wb") as dst:
        dst.write(soi_marker + app1_segment + jpeg_bytes[2:])

def add_360_metadata(img):
    """Save an image as a 4096x2048 JPEG tagged with 360° panorama XMP metadata.

    Args:
        img: PIL image; resized to 4096x2048 when it has a different size.

    Returns:
        Path of a temporary JPEG file, which is not deleted here. When
        tagging fails, the same path holds the JPEG without the XMP segment.
    """
    target_size = (4096, 2048)

    # JPEG cannot store alpha or palette modes; without this conversion both
    # the tagged save and the fallback save below would raise.
    if img.mode != "RGB":
        img = img.convert("RGB")

    # Reserve a file name and close the handle before writing to the path.
    with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_file:
        out_path = tmp_file.name

    try:
        if img.size != target_size:
            img = img.resize(target_size, Image.Resampling.LANCZOS)

        img.save(out_path, "JPEG", quality=95)
        write_xmp_to_jpg(out_path, out_path, img.width, img.height)

    except Exception as e:
        print(f"Error adding 360 metadata: {str(e)}")
        # Reuse the same path so a failed attempt leaves no orphaned file.
        img.save(out_path, "JPEG", quality=95)

    return out_path

# =========================
# Chunk processing
# =========================

def process_chunk(chunk_path, chunk_idx, total_chunks, generate_audio=True):
    """Run the full analysis and generation pipeline on one audio chunk.

    The chunk is classified for emotion, transcribed and scored for
    sentiment; an image is generated and tagged as a 360° panorama; music is
    generated only when *generate_audio* is true.

    Args:
        chunk_path: Path of the audio chunk.
        chunk_idx: Zero-based index of the chunk.
        total_chunks: Total number of chunks.
        generate_audio: Whether to generate music for the chunk.

    Returns:
        A dict with keys ``chunk_index`` (one-based), ``emotion``,
        ``transcription``, ``sentiment``, ``image``, ``image_360`` and
        ``music``. On failure the dict carries error placeholders, a blank
        white 1024x512 image, and ``None`` for ``image_360`` and ``music``.
    """
    try:
        emotion_label = predict_emotion_from_audio(chunk_path)
        text = transcribe(chunk_path)
        sentiment_label, _polarity = analyze_sentiment(text)

        panorama = generate_image(sentiment_label, text, chunk_idx, total_chunks)
        panorama_360_path = add_360_metadata(panorama)

        soundtrack_path = (
            generate_music(text, emotion_label, chunk_idx, total_chunks)
            if generate_audio
            else None
        )

        return dict(
            chunk_index=chunk_idx + 1,
            emotion=emotion_label,
            transcription=text,
            sentiment=sentiment_label,
            image=panorama,
            image_360=panorama_360_path,
            music=soundtrack_path,
        )

    except Exception as exc:
        print(f"Error processing chunk {chunk_idx + 1}:", exc)
        return dict(
            chunk_index=chunk_idx + 1,
            emotion="Error",
            transcription="Transcription failed",
            sentiment="error",
            image=Image.new("RGB", (1024, 512), color="white"),
            image_360=None,
            music=None,
        )

def get_predictions(audio_input, generate_audio=True, chunk_duration=10):
    """Split an audio file into chunks and process each one.

    Args:
        audio_input: Path of the audio file.
        generate_audio: Whether to generate music for each chunk.
        chunk_duration: Chunk length in seconds.

    Returns:
        A list with one ``process_chunk`` result dict per chunk, in order.
    """
    chunk_files, total_chunks = chunk_audio(audio_input, chunk_duration)

    results = []
    try:
        for i, chunk_path in enumerate(chunk_files):
            print(f"Processing chunk {i+1}/{total_chunks} ({chunk_duration}s each)")
            results.append(process_chunk(chunk_path, i, total_chunks, generate_audio))
    finally:
        # Remove the temporary chunk files even if processing was interrupted.
        # The original input is never deleted: chunk_audio returns it as the
        # only "chunk" when no splitting took place.
        for chunk_path in chunk_files:
            if chunk_path == audio_input:
                continue
            try:
                os.unlink(chunk_path)
            except OSError as e:
                # Best-effort cleanup: report the failure and carry on.
                print(f"Could not remove temporary chunk {chunk_path}: {e}")

    return results

# =========================
# 360 viewer HTML (with audio)
# =========================

def create_360_viewer_html(image_paths, audio_paths, output_path):
    """
    Generates an HTML file with A-Frame (WebXR) viewer that:
    - Displays 360° equirectangular images in immersive VR.
    - Plays corresponding audio for each chunk.
    - Supports continuous/random playback with play/pause.

    All images and audio files are embedded in the page as base64, so the
    HTML file is self-contained apart from the A-Frame script it loads.

    Args:
        image_paths: Paths of the JPEG panoramas, one per scene.
        audio_paths: Paths of the audio files, indexed like ``image_paths``;
            an entry that is ``None``/empty or does not exist on disk yields
            a scene without audio.
        output_path: Path the HTML file is written to.

    Returns:
        ``output_path``.

    Raises:
        IndexError: If ``image_paths`` is empty (the first image is used as
            the initial sky texture).
    """
    import json  # local import keeps this function self-contained

    # Read all images into base64 data URLs
    image_data_list = []
    for img_path in image_paths:
        with open(img_path, "rb") as f:
            img_data = base64.b64encode(f.read()).decode("utf-8")
        image_data_list.append(f"data:image/jpeg;base64,{img_data}")

    # Read all audio files into base64 strings (will be converted to Blob URLs at runtime)
    audio_base64_list = []
    for audio_path in audio_paths:
        if audio_path and os.path.exists(audio_path):
            with open(audio_path, "rb") as f:
                audio_base64_list.append(base64.b64encode(f.read()).decode("utf-8"))
        else:
            audio_base64_list.append(None)

    # Serialize as JSON rather than relying on Python's list repr: a missing
    # track must become the JavaScript literal `null`. The repr `None` is an
    # undefined identifier in JavaScript and would abort the whole script.
    images_json = json.dumps(image_data_list)
    audio_json = json.dumps(audio_base64_list)

    html_content = f"""<!DOCTYPE html>
<html lang="es">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=no">
    <title>EVA 360 - Visualizador Afectivo (WebXR)</title>
    <!-- A-Frame with WebXR support -->
    <script src="https://aframe.io/releases/1.7.1/aframe.min.js"></script>
    <style>
        * {{
            margin: 0;
            padding: 0;
            box-sizing: border-box;
            user-select: none;
        }}
        body {{
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
            overflow: hidden;
            background-color: #050816;
        }}
        /* UI overlay */
        .ui-bar {{
            position: fixed;
            bottom: 20px;
            left: 20px;
            right: 20px;
            background: rgba(10, 15, 40, 0.9);
            backdrop-filter: blur(8px);
            border-radius: 48px;
            padding: 12px 24px;
            display: flex;
            flex-wrap: wrap;
            align-items: center;
            justify-content: center;
            gap: 16px;
            z-index: 100;
            border: 1px solid rgba(255,255,255,0.2);
            box-shadow: 0 4px 20px rgba(0,0,0,0.3);
            font-size: 14px;
            color: white;
        }}
        button {{
            background: rgba(100, 140, 255, 0.25);
            border: none;
            color: white;
            padding: 8px 20px;
            border-radius: 40px;
            font-weight: bold;
            cursor: pointer;
            transition: all 0.2s ease;
            font-size: 14px;
            backdrop-filter: blur(4px);
        }}
        button:hover {{
            background: rgba(100, 140, 255, 0.5);
            transform: scale(1.02);
        }}
        button.active {{
            background: #2ecc71;
            color: #000;
        }}
        .toggle-group {{
            display: flex;
            gap: 8px;
            align-items: center;
            background: rgba(0,0,0,0.4);
            padding: 4px 12px;
            border-radius: 40px;
        }}
        .toggle-group label {{
            font-size: 12px;
            opacity: 0.8;
        }}
        #status {{
            margin-left: auto;
            font-size: 12px;
            background: rgba(0,0,0,0.5);
            padding: 6px 14px;
            border-radius: 40px;
            font-family: monospace;
        }}
        @media (max-width: 700px) {{
            .ui-bar {{ padding: 8px 16px; gap: 10px; }}
            button {{ padding: 6px 14px; font-size: 12px; }}
            .toggle-group label {{ font-size: 10px; }}
        }}
        /* A-Frame VR button repositioning (optional) */
        .a-enter-vr {{
            top: 20px;
            right: 20px;
            left: auto;
        }}
    </style>
</head>
<body>

    <!-- A-Frame Scene -->
    <a-scene vr-mode-ui="enabled: true; enterVRButton: default" 
             background="color: #050816"
             renderer="antialias: true"
             keyboard-shortcuts="enterVR: true">
        
        <!-- Sky with 360° image -->
        <a-sky id="sky" src="{image_data_list[0]}" rotation="0 -90 0"></a-sky>
        
        <!-- Optional: subtle ambient lighting -->
        <a-light type="ambient" intensity="0.3"></a-light>
    </a-scene>

    <!-- UI Controls -->
    <div class="ui-bar">
        <button id="playPauseBtn">▶ Play</button>
        
        <div class="toggle-group">
            <label>Continuous</label>
            <button id="continuousBtn" class="active">ON</button>
        </div>
        
        <div class="toggle-group">
            <label>Random</label>
            <button id="randomBtn">OFF</button>
        </div>
        
        <div id="status">Scene 1 / {len(image_data_list)}</div>
    </div>

    <script>
        // ----- Configuration -----
        const images = {images_json};
        const audioBase64 = {audio_json};
        const totalScenes = images.length;
        
        // ----- State -----
        let state = {{
            currentIndex: 0,
            isPlaying: false,
            continuous: true,
            random: false,
            audioElement: null,
            currentAudioBlobUrl: null,
            waitingForAudio: false,
            loopTimeout: null
        }};
        
        // DOM elements
        const skyEl = document.getElementById('sky');
        const playPauseBtn = document.getElementById('playPauseBtn');
        const continuousBtn = document.getElementById('continuousBtn');
        const randomBtn = document.getElementById('randomBtn');
        const statusDiv = document.getElementById('status');
        
        // Helper: convert base64 to Blob URL
        function base64ToAudioUrl(base64Data) {{
            if (!base64Data) return null;
            const binary = atob(base64Data);
            const array = new Uint8Array(binary.length);
            for (let i = 0; i < binary.length; i++) {{
                array[i] = binary.charCodeAt(i);
            }}
            const blob = new Blob([array], {{ type: 'audio/wav' }});
            return URL.createObjectURL(blob);
        }}
        
        // Load scene by index (update sky texture)
        function loadScene(index) {{
            if (index < 0 || index >= totalScenes) return;
            state.currentIndex = index;
            // Update sky texture
            skyEl.setAttribute('src', images[index]);
            // Update status text
            statusDiv.innerText = `Scene ${{index + 1}} / ${{totalScenes}}`;
        }}
        
        // Stop current audio and clean up
        function stopAudio() {{
            if (state.audioElement) {{
                state.audioElement.pause();
                state.audioElement.src = '';
                state.audioElement.onended = null;
            }}
            if (state.currentAudioBlobUrl) {{
                URL.revokeObjectURL(state.currentAudioBlobUrl);
                state.currentAudioBlobUrl = null;
            }}
            if (state.loopTimeout) {{
                clearTimeout(state.loopTimeout);
                state.loopTimeout = null;
            }}
            state.waitingForAudio = false;
        }}
        
        // Play audio for current scene (if exists)
        function playCurrentAudio() {{
            return new Promise((resolve) => {{
                stopAudio(); // clean previous
                
                const audioBase = audioBase64[state.currentIndex];
                if (!audioBase) {{
                    // No audio for this scene -> wait random delay then resolve
                    const delay = 1000 + Math.random() * 4000;
                    state.loopTimeout = setTimeout(() => {{
                        resolve();
                    }}, delay);
                    return;
                }}
                
                // Create audio element
                const audio = new Audio();
                state.audioElement = audio;
                const audioUrl = base64ToAudioUrl(audioBase);
                state.currentAudioBlobUrl = audioUrl;
                audio.src = audioUrl;
                audio.preload = 'auto';
                
                audio.onended = () => {{
                    resolve();
                }};
                
                audio.onerror = (e) => {{
                    console.warn("Audio error", e);
                    resolve(); // continue even if audio fails
                }};
                
                // Play (must be called within user gesture, but we already have the Play button)
                audio.play().catch(err => {{
                    console.warn("Autoplay blocked, but continuing", err);
                    resolve(); // continue without audio
                }});
            }});
        }}
        
        // Determine next index based on random/continuous
        function getNextIndex() {{
            if (state.random) {{
                let newIdx = Math.floor(Math.random() * totalScenes);
                while (totalScenes > 1 && newIdx === state.currentIndex) {{
                    newIdx = Math.floor(Math.random() * totalScenes);
                }}
                return newIdx;
            }} else {{
                return (state.currentIndex + 1) % totalScenes;
            }}
        }}
        
        // Main loop: play current scene audio, then move to next if continuous
        async function runLoop() {{
            if (!state.isPlaying) return;
            
            // Load the current scene (already set, but ensure)
            loadScene(state.currentIndex);
            
            // Wait for audio (or random delay)
            await playCurrentAudio();
            
            // After audio ends, check if still playing and continuous mode
            if (state.isPlaying && state.continuous) {{
                const next = getNextIndex();
                state.currentIndex = next;
                runLoop(); // continue to next scene
            }} else {{
                // If not continuous, stop playing
                if (!state.continuous) {{
                    state.isPlaying = false;
                    playPauseBtn.innerText = "▶ Play";
                }}
            }}
        }}
        
        // Start / Restart playback from current scene
        function startPlayback() {{
            if (state.isPlaying) return;
            state.isPlaying = true;
            playPauseBtn.innerText = "⏸ Pause";
            // Cancel any pending timeouts
            if (state.loopTimeout) clearTimeout(state.loopTimeout);
            // Begin loop
            runLoop();
        }}
        
        // Pause playback
        function pausePlayback() {{
            state.isPlaying = false;
            playPauseBtn.innerText = "▶ Play";
            stopAudio();
        }}
        
        // Toggle play/pause
        function togglePlayback() {{
            if (state.isPlaying) {{
                pausePlayback();
            }} else {{
                startPlayback();
            }}
        }}
        
        // Toggle continuous mode
        function toggleContinuous() {{
            state.continuous = !state.continuous;
            continuousBtn.classList.toggle('active', state.continuous);
            continuousBtn.innerText = state.continuous ? "ON" : "OFF";
            // If continuous is turned OFF while playing, we let current audio finish but then stop
            // (handled in runLoop)
        }}
        
        // Toggle random mode
        function toggleRandom() {{
            state.random = !state.random;
            randomBtn.classList.toggle('active', state.random);
            randomBtn.innerText = state.random ? "ON" : "OFF";
        }}
        
        // ----- Event Listeners -----
        playPauseBtn.addEventListener('click', togglePlayback);
        continuousBtn.addEventListener('click', toggleContinuous);
        randomBtn.addEventListener('click', toggleRandom);
        
        // Optional: Reset scene index when user manually? Keep as is.
        // Preload first scene
        loadScene(0);
        
        // Small warning if no audio files and continuous: will just rotate on delay
        console.log("WebXR viewer ready. Images:", totalScenes, "Audio tracks:", audioBase64.filter(a => a).length);
    </script>
</body>
</html>"""

    with open(output_path, "w", encoding="utf-8") as f:
        f.write(html_content)
    return output_path

# =========================
# Gradio streaming function
# =========================

# NOTE: These module-level lists are created before process_and_display is
# defined so the generator can reference them. Both start empty here;
# process_and_display sizes its yielded outputs from their lengths.
# NOTE(review): presumably the UI-building code fills them with Gradio
# components — confirm against the rest of the file.
output_containers = []
group_components = []

def process_and_display(audio_input, generate_audio, chunk_duration):
    """Stream the UI updates for one processing run.

    Generator used as the ``process_btn`` click handler. It yields twice: first
    a loading screen with every segment slot hidden and cleared, then the final
    results. Each yielded list follows the ``outputs=`` wiring of
    ``process_btn.click(...)``: loading indicator, one visibility update per
    segment group, seven values per segment slot, the viewer HTML file path
    and the JS output.

    Args:
        audio_input: Value of the input audio component; forwarded to
            ``get_predictions``.
        generate_audio: Checkbox value; forwarded to ``get_predictions``.
        chunk_duration: Segment length in seconds. ``None`` or a non-positive
            value falls back to 10.

    Yields:
        list: Values for the wired output components, in wiring order.
    """
    if chunk_duration is None or chunk_duration <= 0:
        chunk_duration = 10

    slot_count = len(output_containers)
    # emotion, transcription, sentiment, image, image_360, music_player, music_file_download
    fields_per_slot = 7

    # Loading screen
    yield (
        [gr.HTML(f"""
        <div style="text-align: center; margin: 20px;">
          <p style="font-size: 18px; color: #4a4a4a;">Processing audio in {chunk_duration}-second chunks...</p>
          <div style="border: 4px solid #f3f3f3; border-top: 4px solid #3498db; border-radius: 50%; width: 30px; height: 30px; animation: spin 2s linear infinite; margin: 0 auto;"></div>
          <style>@keyframes spin {{ 0% {{ transform: rotate(0deg); }} 100% {{ transform: rotate(360deg); }} }}</style>
          <p style="font-size: 14px; color: #4a4a4a;">This may take several minutes depending on the audio length...</p>
        </div>
        """)]
        + [gr.update(visible=False)] * len(group_components)
        + [None] * (slot_count * fields_per_slot)
        + [None, ""]
    )

    results = get_predictions(audio_input, generate_audio, chunk_duration)

    # Only as many segments as there are UI slots can be displayed. Emitting
    # values for surplus segments would make the yielded list longer than the
    # wired outputs, so they are dropped here (as before, they are not added to
    # the viewer either).
    shown_results = list(results)[:slot_count]

    outputs = []
    group_visibility = []
    all_360_images = []
    all_music_paths = []

    for result in shown_results:
        group_visibility.append(gr.update(visible=True))
        outputs.extend(
            [
                result["emotion"],
                result["transcription"],
                result["sentiment"],
                result["image"],
                result["image_360"],
                result["music"],  # gr.Audio
                result["music"],  # gr.File download
            ]
        )
        if result["image_360"]:
            # Keep image and music lists index-aligned: a segment without a
            # 360 image contributes neither, so later scenes are not paired
            # with an earlier segment's track.
            all_360_images.append(result["image_360"])
            all_music_paths.append(result["music"])

    # Hide and clear every slot that received no result.
    unused_slots = slot_count - len(shown_results)
    group_visibility.extend([gr.update(visible=False)] * unused_slots)
    outputs.extend([None] * (unused_slots * fields_per_slot))

    viewer_html_path = None
    if all_360_images:
        # Reserve a unique path, then close the handle before the viewer
        # writer reopens the file by name (an open handle blocks that on
        # Windows).
        with tempfile.NamedTemporaryFile(suffix=".html", delete=False) as tmp_file:
            tmp_path = tmp_file.name
        viewer_html_path = create_360_viewer_html(all_360_images, all_music_paths, tmp_path)

    yield [gr.HTML("")] + group_visibility + outputs + [viewer_html_path, ""]

def clear_all():
    """Return the values that reset the whole interface.

    The order of the returned list mirrors the ``outputs=`` wiring of
    ``clear_btn.click(...)``: input audio, one visibility update per segment
    group, seven empty values per segment slot, then the loading indicator,
    the segment-duration field, the viewer file and the JS output.
    """
    hidden_groups = [gr.update(visible=False)] * len(group_components)
    empty_slot_values = [None] * (len(output_containers) * 7)
    return [
        None,  # audio_input
        *hidden_groups,
        *empty_slot_values,
        gr.HTML(""),  # loading_indicator
        10,  # chunk_duration_input reset
        None,  # viewer_html_output
        "",  # js_output
    ]

# =========================
# UI styling
# =========================

# Stylesheet handed to gr.Blocks(css=...). It defines two classes:
#   .download-section - the bordered panel around the viewer download area
#   .download-button  - the file component that offers the generated HTML viewer
# NOTE(review): the rgba() alpha values of 255 lie outside CSS's 0-1 alpha range;
# presumably fully opaque colours are intended - confirm before changing.
custom_css = """
.download-section {
    background: rgba(255,255,255,255);
    padding: 25px;
    border-radius: 15px;
    border: 3px solid #764ba2;
    text-align: left;
    margin: 25px 0;
    box-shadow: 0 10px 30px rgba(0,0,0,0.15);
    position: relative;
    overflow: hidden;
}

.download-button {
    background: rgba(155,155,155,255) !important;
    color: white !important;
    border: none !important;
    padding: 12px 30px !important;
    border-radius: 0px !important;
    font-weight: bold !important;
    font-size: 16px !important;
    margin-top: 15px !important;
    cursor: pointer !important;
    display: inline-block !important;
}
"""

# =========================
# Gradio app
# =========================

# Layout and event wiring for the app. Segment result panels are pre-built as
# hidden groups and revealed by process_and_display as results arrive.
with gr.Blocks(title="Entornos Virtuales Afectivos - Procesamiento por Segmentos", css=custom_css) as interface:
    gr.Markdown("# Bello")
    gr.Markdown(
        """
***Bello*** explora las sutilezas afectivas de la voz humana a través de la figura del **Teniente Bello**,  
el piloto chileno que desapareció misteriosamente en 1914 durante un vuelo de entrenamiento sobre la costa del Pacífico.  

Este espacio invita a habitar lo desconocido, desde la emoción y la palabra.
Usando técnicas multimodales de reconocimiento de emociones en el habla, el proyecto analiza parámetros acústicos, prosódicos  
y semánticos del lenguaje hablado para generar entornos virtuales inmersivos en 360°.

### Cómo interactuar

1. Graba tu voz (o sube un audio) imaginando qué pudo haberle sucedido al Teniente Alejandro Bello.  
2. Establece la duración de cada segmento para dividir tu grabación en trozos.  
3. Marca la casilla si quieres generar audio para cada segmento.  
4. Genera tu Entorno Virtual Afectivo **EVA** y espera los resultados.  
5. Descarga el archivo HTML.  
6. Abre tu creación con cualquier navegador web.  

---
**Más información:**  

• Video Tutorial: [Cómo usar este espacio](https://youtu.be/eVD1lzwVhi8)  

• Para más detalles del proyecto, visita: [www.emotional-machines.com](https://www.emotional-machines.com)
        """
    )

    # Input column (audio) next to the settings/actions column.
    with gr.Row():
        with gr.Column(scale=2):
            audio_input = gr.Audio(label="Audio de Entrada", type="filepath", sources=["microphone", "upload"])

        with gr.Column(scale=1):
            chunk_duration_input = gr.Number(
                label="Duración de Segmento (segundos)",
                value=10,
                minimum=1,
                maximum=60,
                step=1,
                info="Duración de cada segmento de audio a procesar (1-60 segundos)",
            )
            generate_audio_checkbox = gr.Checkbox(
                label="Generar Audio (puede tardar más)",
                value=False,
                info="Desmarca para omitir la generación de música y acelerar el procesamiento",
            )
            with gr.Row():
                process_btn = gr.Button("Generar", variant="primary")
                clear_btn = gr.Button("Borrar Todo", variant="secondary")

    loading_indicator = gr.HTML(
        """
<div id="loading" style="display: none; text-align: center; margin: 20px;">
  <p style="font-size: 18px; color: #4a4a4a;">Procesando segmentos de audio...</p>
  <div style="border: 4px solid #f3f3f3; border-top: 4px solid #3498db; border-radius: 50%; width: 30px; height: 30px; animation: spin 2s linear infinite; margin: 0 auto;"></div>
  <style>@keyframes spin { 0% { transform: rotate(0deg); } 100% { transform: rotate(360deg); } }</style>
</div>
"""
    )

    # Number of segment result panels built up front. Event outputs must be
    # declared before any audio is processed, so the panels are pre-allocated.
    chunk_slot_count = 20

    # Build one hidden result panel per slot and register its components.
    for i in range(chunk_slot_count):
        with gr.Group(visible=False) as chunk_group:
            gr.Markdown(f"### Resultados del Segmento {i+1}")

            with gr.Row():
                emotion_output = gr.Label(label="Predicción de Emoción Acústica")
                transcription_output = gr.Label(label="Texto Transcrito")
                sentiment_output = gr.Label(label="Análisis Sentimental")

            with gr.Row():
                image_output = gr.Image(label="Imagen Equirectangular Generada")
                image_360_output = gr.File(label="Descargar Imagen 360", type="filepath")

            with gr.Row():
                audio_output = gr.Audio(label="Música Generada")
                # Same track as the player above, offered as a downloadable file.
                audio_file_output = gr.File(label="Descargar Música", type="filepath")

            gr.HTML("<hr style='margin: 20px 0; border: 1px solid #ccc;'>")

        group_components.append(chunk_group)
        output_containers.append(
            {
                "emotion": emotion_output,
                "transcription": transcription_output,
                "sentiment": sentiment_output,
                "image": image_output,
                "image_360": image_360_output,
                "music": audio_output,
                "music_file": audio_file_output,
            }
        )

    with gr.Group(visible=True, elem_classes="download-section") as download_group:
        viewer_html_output = gr.File(
            label="Una vez finalizado el procesamiento, descarga tu EVA aquí 🚀",
            type="filepath",
            interactive=False,
            elem_classes="download-button",
        )

    js_output = gr.HTML(visible=False)

    # Flatten the per-slot components once so both event wirings share the same
    # order: the seven values process_and_display and clear_all emit per slot.
    slot_field_order = (
        "emotion",
        "transcription",
        "sentiment",
        "image",
        "image_360",
        "music",
        "music_file",
    )
    chunk_components = [
        container[field]
        for container in output_containers
        for field in slot_field_order
    ]

    # IMPORTANT: outputs order must match yields/returns exactly
    process_btn.click(
        fn=process_and_display,
        inputs=[audio_input, generate_audio_checkbox, chunk_duration_input],
        outputs=[loading_indicator]
        + group_components
        + chunk_components
        + [viewer_html_output, js_output],
    )

    clear_btn.click(
        fn=clear_all,
        inputs=[],
        outputs=[audio_input]
        + group_components
        + chunk_components
        + [loading_indicator, chunk_duration_input, viewer_html_output, js_output],
    )

# Start the server only after the Blocks context has closed, so the layout is
# finalized before it is served.
interface.launch(server_name="0.0.0.0", server_port=7860)