Spaces:

jfforero
/

Speech2Scene3

Build error

App Files Files Community

jfforero committed on Sep 6, 2025

Commit

e6f2ff7

verified ·

1 Parent(s): eca105b

Update app.py

Browse files

Files changed (1) hide show

app.py +118 -617

app.py CHANGED Viewed

@@ -30,471 +30,79 @@ import base64
 from io import BytesIO
 import struct
 import cv2
-# Load the emotion prediction model
-def load_emotion_model(model_path):
-    try:
-        model = load_model(model_path)
-        print("Emotion model loaded successfully")
-        return model
-    except Exception as e:
-        print("Error loading emotion prediction model:", e)
-        return None
-model_path = 'mymodel_SER_LSTM_RAVDESS.h5'
-model = load_emotion_model(model_path)
-# Initialize WhisperModel
-model_size = "small"
-model2 = WhisperModel(model_size, device="cpu", compute_type="int8")
-# Load MusicGen model
-def load_musicgen_model():
-    try:
-        device = "cuda" if torch.cuda.is_available() else "cpu"
-        processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
-        music_model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
-        music_model.to(device)
-        print("MusicGen model loaded successfully")
-        return processor, music_model, device
-    except Exception as e:
-        print("Error loading MusicGen model:", e)
-        return None, None, None
-processor, music_model, device = load_musicgen_model()
-# Function to chunk audio into segments
-def chunk_audio(audio_path, chunk_duration=10):
-    """Split audio into chunks and return list of chunk file paths"""
-    try:
-        # Load audio file
-        audio = AudioSegment.from_file(audio_path)
-        duration_ms = len(audio)
-        chunk_ms = chunk_duration * 1000
-        # Validate chunk duration
-        if chunk_duration <= 0:
-            raise ValueError("Chunk duration must be positive")
-        if chunk_duration > duration_ms / 1000:
-            # If chunk duration is longer than audio, return the whole audio
-            return [audio_path], 1
-        chunks = []
-        chunk_files = []
-        # Calculate number of chunks
-        num_chunks = math.ceil(duration_ms / chunk_ms)
-        for i in range(num_chunks):
-            start_ms = i * chunk_ms
-            end_ms = min((i + 1) * chunk_ms, duration_ms)
-            # Extract chunk
-            chunk = audio[start_ms:end_ms]
-            chunks.append(chunk)
-            # Save chunk to temporary file
-            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
-                chunk.export(tmp_file.name, format="wav")
-                chunk_files.append(tmp_file.name)
-        return chunk_files, num_chunks
-    except Exception as e:
-        print("Error chunking audio:", e)
-        # Return original file as single chunk if chunking fails
-        return [audio_path], 1
-# Function to transcribe audio
-def transcribe(wav_filepath):
-    try:
-        segments, _ = model2.transcribe(wav_filepath, beam_size=5)
-        return "".join([segment.text for segment in segments])
-    except Exception as e:
-        print("Error transcribing audio:", e)
-        return "Transcription failed"
-# Function to extract MFCC features from audio
-def extract_mfcc(wav_file_name):
-    try:
-        y, sr = librosa.load(wav_file_name)
-        mfccs = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40).T, axis=0)
-        return mfccs
-    except Exception as e:
-        print("Error extracting MFCC features:", e)
-        return None
-# Emotions dictionary
-emotions = {0: 'neutral', 1: 'calm', 2: 'happy', 3: 'sad', 4: 'angry', 5: 'fearful', 6: 'disgust', 7: 'surprised'}
-# Function to predict emotion from audio
-def predict_emotion_from_audio(wav_filepath):
-    try:
-        if model is None:
-            return "Model not loaded"
-        test_point = extract_mfcc(wav_filepath)
-        if test_point is not None:
-            test_point = np.reshape(test_point, newshape=(1, 40, 1))
-            predictions = model.predict(test_point)
-            predicted_emotion_label = np.argmax(predictions[0])
-            return emotions.get(predicted_emotion_label, "Unknown emotion")
-        else:
-            return "Error: Unable to extract features"
-    except Exception as e:
-        print("Error predicting emotion:", e)
-        return "Prediction error"
-# Function to analyze sentiment from text
-def analyze_sentiment(text):
-    try:
-        if not text or text.strip() == "":
-            return "neutral", 0.0
-        analysis = TextBlob(text)
-        polarity = analysis.sentiment.polarity
-        if polarity > 0.1:
-            sentiment = "positive"
-        elif polarity < -0.1:
-            sentiment = "negative"
-        else:
-            sentiment = "neutral"
-        return sentiment, polarity
-    except Exception as e:
-        print("Error analyzing sentiment:", e)
-        return "neutral", 0.0
-# Function to get image prompt based on sentiment
-def get_image_prompt(sentiment, transcribed_text, chunk_idx, total_chunks):
-    base_prompt = f"Chunk {chunk_idx+1}/{total_chunks}: "
-    if sentiment == "positive":
-        return base_prompt + f"Generate a vibrant, uplifting equirectangular 360 image texture with bright colors, joyful atmosphere, and optimistic vibes representing: [{transcribed_text}]. The scene should evoke happiness and positivity."
-    elif sentiment == "negative":
-        return base_prompt + f"Generate a moody, dramatic equirectangular 360 image texture with dark tones, intense atmosphere, and emotional depth representing: [{transcribed_text}]. The scene should convey melancholy and intensity."
-    else:  # neutral
-        return base_prompt + f"Generate a balanced, serene equirectangular 360 image texture with harmonious colors, peaceful atmosphere, and calm vibes representing: [{transcribed_text}]. The scene should evoke tranquility and balance."
-# Function to get music prompt based on emotion
-def get_music_prompt(emotion, transcribed_text, chunk_idx, total_chunks):
-    base_prompt = f"Chunk {chunk_idx+1}/{total_chunks}: "
-    emotion_prompts = {
-        'neutral': f"Create ambient, background music with neutral tones, subtle melodies, and unobtrusive atmosphere that complements: {transcribed_text}. The music should be calm and balanced.",
-        'calm': f"Generate soothing, peaceful music with gentle melodies, soft instrumentation, and relaxing vibes that represents: {transcribed_text}. The music should evoke tranquility and serenity.",
-        'happy': f"Create joyful, upbeat music with cheerful melodies, bright instrumentation, and energetic rhythms that celebrates: {transcribed_text}. The music should evoke happiness and positivity.",
-        'sad': f"Generate emotional, melancholic music with poignant melodies, soft strings, and heartfelt atmosphere that reflects: {transcribed_text}. The music should evoke sadness and reflection.",
-        'angry': f"Create intense, powerful music with driving rhythms, aggressive instrumentation, and strong dynamics that expresses: {transcribed_text}. The music should evoke anger and intensity.",
-        'fearful': f"Generate suspenseful, tense music with eerie melodies, atmospheric sounds, and unsettling vibes that represents: {transcribed_text}. The music should evoke fear and anticipation.",
-        'disgust': f"Create dark, unsettling music with dissonant harmonies, unusual sounds, and uncomfortable atmosphere that reflects: {transcribed_text}. The music should evoke discomfort and unease.",
-        'surprised': f"Generate dynamic, unexpected music with sudden changes, playful melodies, and surprising elements that represents: {transcribed_text}. The music should evoke surprise and wonder."
-    }
-    return base_prompt + emotion_prompts.get(emotion.lower(),
-        f"Create background music with {emotion} atmosphere that represents: {transcribed_text}")
-# Function to generate music with MusicGen (using acoustic emotion prediction)
-def generate_music(transcribed_text, emotion_prediction, chunk_idx, total_chunks):
-    try:
-        if processor is None or music_model is None:
-            return None
-        # Get specific prompt based on emotion
-        prompt = get_music_prompt(emotion_prediction, transcribed_text, chunk_idx, total_chunks)
-        # Limit prompt length to avoid model issues
-        if len(prompt) > 200:
-            prompt = prompt[:200] + "..."
-        inputs = processor(
-            text=[prompt],
-            padding=True,
-            return_tensors="pt",
-        ).to(device)
-        # Generate audio
-        audio_values = music_model.generate(**inputs, max_new_tokens=512)
-        # Convert to numpy array and sample rate
-        sampling_rate = music_model.config.audio_encoder.sampling_rate
-        audio_data = audio_values[0, 0].cpu().numpy()
-        # Normalize audio data
-        audio_data = audio_data / np.max(np.abs(audio_data))
-        # Create a temporary file to save the audio
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
-            scipy.io.wavfile.write(tmp_file.name, rate=sampling_rate, data=audio_data)
-            return tmp_file.name
-    except Exception as e:
-        print("Error generating music:", e)
-        return None
-# --- DeepAI Image Generation (Text2Img) ---
-api_key = os.getenv("DeepAI_api_key")
-# Function to upscale image using Lanczos interpolation
-def upscale_image(image, target_width=4096, target_height=2048):
-    """
-    Upscale image using DeepAI's Torch-SRGAN API for super resolution
-    """
-    try:
-        if not api_key:
-            print("No API key available for upscaling")
-            # Fallback to OpenCV if no API key
-            img_array = np.array(image)
-            upscaled = cv2.resize(
-                img_array,
-                (target_width, target_height),
-                interpolation=cv2.INTER_LANCZOS4
-            )
-            return Image.fromarray(upscaled)
-        # Save the image to a temporary file
-        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_input:
-            image.save(tmp_input.name, "JPEG", quality=95)
-            # Make request to DeepAI torch-srgan API
-            response = requests.post(
-                "https://api.deepai.org/api/torch-srgan",
-                files={'image': open(tmp_input.name, 'rb')},
-                headers={'api-key': api_key}
-            )
-            data = response.json()
-            if 'output_url' in data:
-                # Download the upscaled image
-                img_resp = requests.get(data['output_url'])
-                upscaled_image = Image.open(BytesIO(img_resp.content))
-                # Ensure the image meets our target dimensions
-                if upscaled_image.size != (target_width, target_height):
-                    upscaled_image = upscaled_image.resize(
-                        (target_width, target_height),
-                        Image.Resampling.LANCZOS
-                    )
-                # Clean up temporary file
-                os.unlink(tmp_input.name)
-                return upscaled_image
-            else:
-                print("Error in DeepAI upscaling response:", data)
-                # Fallback to OpenCV if API fails
-                img_array = np.array(image)
-                upscaled = cv2.resize(
-                    img_array,
-                    (target_width, target_height),
-                    interpolation=cv2.INTER_LANCZOS4
-                )
-                return Image.fromarray(upscaled)
-    except Exception as e:
-        print(f"Error upscaling image with DeepAI: {e}")
-        # Fallback to OpenCV if any error occurs
-        img_array = np.array(image)
-        upscaled = cv2.resize(
-            img_array,
-            (target_width, target_height),
-            interpolation=cv2.INTER_LANCZOS4
-        )
-        return Image.fromarray(upscaled)
-# Function to generate image using DeepAI API
-def generate_image(sentiment_prediction, transcribed_text, chunk_idx, total_chunks):
-    try:
-        if not api_key:
-            # fallback white image if no API key
-            base_image = Image.new('RGB', (1024,512), color='white')
         else:
-            # Get specific prompt based on sentiment
-            prompt = get_image_prompt(sentiment_prediction, transcribed_text, chunk_idx, total_chunks)
-            # Make request to DeepAI text2img API
-            response = requests.post(
-                "https://api.deepai.org/api/text2img",
-                data={
-                    'text': prompt,
-                    'width': 1024,
-                    'height': 512,
-                    'image_generator_version': 'hd'
-                },
-                headers={'api-key': api_key}
-            )
-            data = response.json()
-            if 'output_url' in data:
-                # Download the generated image
-                img_resp = requests.get(data['output_url'])
-                base_image = Image.open(BytesIO(img_resp.content))
-            else:
-                print("Error in DeepAI response:", data)
-                # Return a fallback image
-                base_image = Image.new('RGB', (1024,512), color='white')
-        # Upscale the image for better quality in 360 viewer
-        upscaled_image = upscale_image(base_image)
-        return upscaled_image
-    except Exception as e:
-        print("Error generating image:", e)
-        # Return a fallback image
-        return Image.new('RGB', (1024,512), color='white')
-# Function to process a single chunk
-def process_chunk(chunk_path, chunk_idx, total_chunks, generate_audio=True):
-    try:
-        # Get acoustic emotion prediction (for music)
-        emotion_prediction = predict_emotion_from_audio(chunk_path)
-        # Get transcribed text
-        transcribed_text = transcribe(chunk_path)
-        # Analyze sentiment of transcribed text (for image)
-        sentiment, polarity = analyze_sentiment(transcribed_text)
-        # Generate image using SENTIMENT analysis with specific prompt
-        image = generate_image(sentiment, transcribed_text, chunk_idx, total_chunks)
-        # Add 360 metadata to the image
-        image_with_360_path = add_360_metadata(image)
-        # Generate music only if audio generation is enabled
-        music_path = None
-        if generate_audio:
-            music_path = generate_music(transcribed_text, emotion_prediction, chunk_idx, total_chunks)
-        return {
-            'chunk_index': chunk_idx + 1,
-            'emotion': emotion_prediction,
-            'transcription': transcribed_text,
-            'sentiment': sentiment,
-            'image': image,  # Original image for display in Gradio
-            'image_360': image_with_360_path,  # Image with 360 metadata
-            'music': music_path
-        }
-    except Exception as e:
-        print(f"Error processing chunk {chunk_idx + 1}:", e)
-        # Return a fallback result with all required keys
-        return {
-            'chunk_index': chunk_idx + 1,
-            'emotion': "Error",
-            'transcription': "Transcription failed",
-            'sentiment': "Sentiment: error",
-            'image': Image.new('RGB', (1440, 770), color='white'),
-            'image_360': None,
-            'music': None
-        }
-# Function to get predictions for all chunks
-def get_predictions(audio_input, generate_audio=True, chunk_duration=10):
-    # Chunk the audio into segments
-    chunk_files, total_chunks = chunk_audio(audio_input, chunk_duration)
-    results = []
-    # Process each chunk
-    for i, chunk_path in enumerate(chunk_files):
-        print(f"Processing chunk {i+1}/{total_chunks} ({chunk_duration}s each)")
-        result = process_chunk(chunk_path, i, total_chunks, generate_audio)
-        results.append(result)
-    # Clean up temporary chunk files
-    for chunk_path in chunk_files:
-        try:
-            if chunk_path != audio_input:  # Don't delete original input file
-                os.unlink(chunk_path)
-        except:
-            pass
-    return results
-def create_xmp_block(width, height):
-    """Create XMP metadata block following ExifTool's exact format."""
-    xmp = (
-        f'<?xpacket begin="ï»¿" id="W5M0MpCehiHzreSzNTczkc9d"?>\n'
-        f'<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="ExifTool">\n'
-        f'<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">\n'
-        f'<rdf:Description rdf:about=""\n'
-        f'xmlns:GPano="http://ns.google.com/photos/1.0/panorama/"\n'
-        f'GPano:ProjectionType="equirectangular"\n'
-        f'GPano:UsePanoramaViewer="True"\n'
-        f'GPano:FullPanoWidthPixels="{width}"\n'
-        f'GPano:FullPanoHeightPixels="{height}"\n'
-        f'GPano:CroppedAreaImageWidthPixels="{width}"\n'
-        f'GPano:CroppedAreaImageHeightPixels="{height}"\n'
-        f'GPano:CroppedAreaLeftPixels="0"\n'
-        f'GPano:CroppedAreaTopPixels="0"/>\n'
-        f'</rdf:RDF>\n'
-        f'</x:xmpmeta>\n'
-        f'<?xpacket end="w"?>'
-    )
-    return xmp
-def write_xmp_to_jpg(input_path, output_path, width, height):
-    """Write XMP metadata to JPEG file following ExifTool's method."""
-    # Read the original JPEG
-    with open(input_path, 'rb') as f:
-        data = f.read()
-    # Find the start of image marker
-    if data[0:2] != b'\xFF\xD8':
-        raise ValueError("Not a valid JPEG file")
-    # Create XMP data
-    xmp_data = create_xmp_block(width, height)
-    # Create APP1 segment for XMP
-    app1_marker = b'\xFF\xE1'
-    xmp_header = b'http://ns.adobe.com/xap/1.0/\x00'
-    xmp_bytes = xmp_data.encode('utf-8')
-    length = len(xmp_header) + len(xmp_bytes) + 2  # +2 for length bytes
-    length_bytes = struct.pack('>H', length)
-    # Construct new file content
-    output = bytearray()
-    output.extend(data[0:2])  # SOI marker
-    output.extend(app1_marker)
-    output.extend(length_bytes)
-    output.extend(xmp_header)
-    output.extend(xmp_bytes)
-    output.extend(data[2:])  # Rest of the original file
-    # Write the new file
-    with open(output_path, 'wb') as f:
-        f.write(output)
-def add_360_metadata(img):
-    """Add 360 photo metadata to a PIL Image and return the path to the processed image."""
-    try:
-        # First, ensure the image is upscaled to 4096x2048
-        target_width, target_height = 4096, 2048
-        if img.width != target_width or img.height != target_height:
-            img = img.resize((target_width, target_height), Image.Resampling.LANCZOS)
-        # Create a temporary file
-        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_file:
-            # First save as high-quality JPEG
-            img.save(tmp_file.name, "JPEG", quality=95)
-            # Then inject XMP metadata directly into JPEG file
-            write_xmp_to_jpg(tmp_file.name, tmp_file.name, img.width, img.height)
-            return tmp_file.name
-    except Exception as e:
-        print(f"Error adding 360 metadata: {str(e)}")
-        # Fallback: return the original image path
-        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_file:
-            img.save(tmp_file.name, "JPEG", quality=95)
-            return tmp_file.name
 def create_360_viewer_html(image_paths, audio_paths, output_path):
     """Create an HTML file with a 360 viewer and audio player for the given images and audio."""
     # Create a list of image data URIs
@@ -514,14 +122,14 @@ def create_360_viewer_html(image_paths, audio_paths, output_path):
         else:
             audio_data_list.append(None)  # Placeholder for chunks without audio
-    # Create the HTML content
     html_content = f"""
     <!DOCTYPE html>
     <html lang="en">
     <head>
         <meta charset="UTF-8">
         <meta name="viewport" content="width=device-width, initial-scale=1.0">
-        <title>360 Panorama Viewer with Audio</title>
         <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/pannellum@2.5.6/build/pannellum.css"/>
         <style>
             body {{
@@ -599,9 +207,46 @@ def create_360_viewer_html(image_paths, audio_paths, output_path):
                 border-radius: 3px;
                 border: 1px solid #ccc;
             }}
         </style>
     </head>
     <body>
         <div id="controls">
             <select id="image-selector">
                 {"".join([f'<option value="{i}">Chunk {i+1}</option>' for i in range(len(image_data_list))])}
@@ -657,6 +302,16 @@ def create_360_viewer_html(image_paths, audio_paths, output_path):
                 }}
             }}
             // Load the first image initially
             loadPanorama(0);
@@ -676,135 +331,7 @@ def create_360_viewer_html(image_paths, audio_paths, output_path):
     return output_path
-# Update the process_and_display function
-def process_and_display(audio_input, generate_audio, chunk_duration):
-    # Validate chunk duration
-    if chunk_duration is None or chunk_duration <= 0:
-        chunk_duration = 10
-    # Show loading indicator
-    yield [gr.HTML(f"""
-        <div style="text-align: center; margin: 20px;">
-            <p style="font-size: 18px; color: #4a4a4a;">Processing audio in {chunk_duration}-second chunks...</p>
-            <div style="border: 4px solid #f3f3f3; border-top: 4px solid #3498db; border-radius: 50%; width: 30px; height: 30px; animation: spin 2s linear infinite; margin: 0 auto;"></div>
-            <style>@keyframes spin {{ 0% {{ transform: rotate(0deg); }} 100% {{ transform: rotate(360deg); }} }}</style>
-            <p style="font-size: 14px; color: #4a4a4a;">This may take several minutes depending on the audio length...</p>
-        </div>
-    """)] + [gr.Group(visible=False)] * len(group_components) + [None] * (len(output_containers) * 6) + [None, ""]
-    results = get_predictions(audio_input, generate_audio, chunk_duration)
-    # Initialize outputs list
-    outputs = []
-    group_visibility = []
-    all_360_images = []  # Collect all 360 images for the viewer
-    all_music_paths = []  # Collect all music paths for the viewer
-    # Process each result
-    for i, result in enumerate(results):
-        if i < len(output_containers):
-            group_visibility.append(gr.Group(visible=True))
-            outputs.extend([
-                result['emotion'],
-                result['transcription'],
-                result['sentiment'],
-                result['image'],
-                result['image_360'],
-                result['music']
-            ])
-            # Collect the 360-processed images and music
-            if result['image_360']:
-                all_360_images.append(result['image_360'])  # Use the 360-processed image
-            all_music_paths.append(result['music'])  # Can be None if no music generated
-        else:
-            # If we have more results than containers, just extend with None
-            group_visibility.append(gr.Group(visible=False))
-            outputs.extend([None] * 6)
-    # Hide remaining containers
-    for i in range(len(results), len(output_containers)):
-        group_visibility.append(gr.Group(visible=False))
-        outputs.extend([None] * 6)
-    # Create 360 viewer HTML if we have 360 images
-    viewer_html_path = None
-    if all_360_images:
-        with tempfile.NamedTemporaryFile(suffix=".html", delete=False) as tmp_file:
-            viewer_html_path = create_360_viewer_html(all_360_images, all_music_paths, tmp_file.name)
-    # After processing, return the results along with other outputs
-    yield [gr.HTML("")] + group_visibility + outputs + [viewer_html_path, js_output, results]
-# Update the clear_all function to handle the new outputs
-def clear_all():
-    # Create a list with None for all outputs
-    outputs = [None]  # For audio input
-    # For group components (set to invisible)
-    outputs.extend([gr.Group(visible=False)] * len(group_components))
-    # For all output containers (set to None)
-    outputs.extend([None] * (len(output_containers) * 6))
-    # For loading indicator (empty HTML)
-    outputs.append(gr.HTML(""))
-    # For chunk duration (reset to 10)
-    outputs.append(10)
-    # For example selector (reset to None)
-    outputs.append(None)
-    # For viewer (set to None)
-    outputs.append(None)
-    # For JavaScript output (empty)
-    outputs.append("")
-    return outputs
-# Function to load example audio (placeholder - you need to implement this)
-def load_example_audio(example_name):
-    # This is a placeholder - you need to implement this function
-    # Return the path to the example audio file based on the example_name
-    return None
-# Function to generate a shareable link
-def generate_share_link(audio_input=None, generate_audio=True, chunk_duration=10):
-    try:
-        # Check if we're on Hugging Face Spaces
-        space_id = os.getenv('SPACE_ID')
-        if space_id:
-            space_url = f"https://huggingface.co/spaces/{space_id}"
-            return f"Your Space is already public! Share this URL: {space_url}\n\nTo share specific results, ask others to process the same audio with the same settings."
-        else:
-            if hasattr(interface, 'share_url') and interface.share_url:
-                return "Share this URL to let others use the app: " + interface.share_url + "\n\nTo share specific results, ask others to process the same audio with the same settings."
-            else:
-                return "Share link is not available. Make sure to set share=True when launching."
-    except Exception as e:
-        return f"Error generating share link: {str(e)}"
 # Create the Gradio interface with proper output handling
 with gr.Blocks(title="Affective Virtual Environments - Chunked Processing") as interface:
@@ -845,9 +372,6 @@ with gr.Blocks(title="Affective Virtual Environments - Chunked Processing") as i
             with gr.Row():
                 process_btn = gr.Button("Process Audio", variant="primary")
                 clear_btn = gr.Button("Clear All", variant="secondary")
-            # Add share button
-            share_btn = gr.Button("Generate Share Link", variant="secondary")
-            share_output = gr.Textbox(label="Share Link", interactive=False)
     # Add a loading indicator
     loading_indicator = gr.HTML("""
@@ -887,11 +411,12 @@ with gr.Blocks(title="Affective Virtual Environments - Chunked Processing") as i
             'music': audio_output
         })
-    # Add component for 360 viewer
     viewer_html_output = gr.File(
-        label="Download 360 Viewer",
         type="filepath",
-        interactive=False
     )
     # Add a hidden HTML component for JavaScript execution
@@ -909,9 +434,6 @@ with gr.Blocks(title="Affective Virtual Environments - Chunked Processing") as i
         return example_path, example_name
     # Set up the button clicks
-# Update the process_btn click handler to include results in the output
-    # Remove the results_state component and simplify the process_btn click handler
     process_btn.click(
         fn=process_and_display,
         inputs=[audio_input, generate_audio_checkbox, chunk_duration_input],
@@ -924,10 +446,6 @@ with gr.Blocks(title="Affective Virtual Environments - Chunked Processing") as i
             container['music']
         ]] + [viewer_html_output, js_output]
     )
-# Remove the results_state component
     clear_btn.click(
         fn=clear_all,
@@ -947,23 +465,6 @@ with gr.Blocks(title="Affective Virtual Environments - Chunked Processing") as i
         inputs=[example_selector],
         outputs=[audio_input, example_selector]
     )
-    # Update the share button to not expect results
-    share_btn.click(
-        fn=generate_share_link,
-        inputs=[audio_input, generate_audio_checkbox, chunk_duration_input],
-        outputs=[share_output]
-    )
-    # Check if we're running on Hugging Face Spaces
-    is_spaces = os.getenv('SPACE_ID') is not None
-    # Launch with appropriate settings
-    if is_spaces:
-        # On Spaces, don't use share=True as it's not supported
-        interface.launch()
-    else:
-        # Running locally, use share=True to generate a public link
-        interface.launch(share=True)

 from io import BytesIO
 import struct
 import cv2
+import shutil
+from datetime import datetime
+# [Keep all your existing code until the process_and_display function]
+# Update the process_and_display function to create a named HTML file
def process_and_display(audio_input, generate_audio, chunk_duration):
    """Run the chunked audio pipeline and stream UI updates to Gradio.

    This is a generator. It first yields a "loading" state, then yields the
    final state once ``get_predictions`` has returned. Every yielded list
    uses the same positional layout::

        [loading HTML]
        + one ``gr.Group`` visibility update per chunk slot
        + six values per chunk slot (emotion, transcription, sentiment,
          image, 360 image, music)
        + [viewer HTML file path, JS output string]

    Both yields must have identical length and ordering, otherwise values
    end up in the wrong positions.

    Args:
        audio_input: Audio value forwarded unchanged to ``get_predictions``.
        generate_audio: Flag forwarded unchanged to ``get_predictions``.
        chunk_duration: Chunk length in seconds; ``None`` or a non-positive
            value falls back to 10.

    Yields:
        list: Output values in the layout described above.
    """
    # Validate chunk duration
    if chunk_duration is None or chunk_duration <= 0:
        chunk_duration = 10

    # Number of chunk slots available in the UI.
    # NOTE(review): assumes group_components has one entry per output
    # container, as the loading yield below already relies on.
    slot_count = len(output_containers)

    # Show loading indicator
    yield [gr.HTML(f"""
        <div style="text-align: center; margin: 20px;">
            <p style="font-size: 18px; color: #4a4a4a;">Processing audio in {chunk_duration}-second chunks...</p>
            <div style="border: 4px solid #f3f3f3; border-top: 4px solid #3498db; border-radius: 50%; width: 30px; height: 30px; animation: spin 2s linear infinite; margin: 0 auto;"></div>
            <style>@keyframes spin {{ 0% {{ transform: rotate(0deg); }} 100% {{ transform: rotate(360deg); }} }}</style>
            <p style="font-size: 14px; color: #4a4a4a;">This may take several minutes depending on the audio length...</p>
        </div>
    """)] + [gr.Group(visible=False)] * len(group_components) + [None] * (slot_count * 6) + [None, ""]

    # Treat a missing result set as "no chunks" so the UI is simply cleared.
    results = get_predictions(audio_input, generate_audio, chunk_duration) or []

    # Only the first `slot_count` results can be displayed. Emitting entries
    # for surplus results would lengthen the yielded list and shift every
    # following value into the wrong position.
    shown_results = results[:slot_count]

    outputs = []
    group_visibility = []
    all_360_images = []   # 360-processed images for the viewer
    all_music_paths = []  # Music path (or None) paired with each 360 image

    for result in shown_results:
        group_visibility.append(gr.Group(visible=True))
        outputs.extend([
            result['emotion'],
            result['transcription'],
            result['sentiment'],
            result['image'],
            result['image_360'],
            result['music'],
        ])
        # Collect image and music together so both lists stay index-aligned;
        # a chunk without a 360 image contributes to neither.
        if result['image_360']:
            all_360_images.append(result['image_360'])
            all_music_paths.append(result['music'])  # May be None

    # Hide the chunk slots that received no result.
    unused_slots = slot_count - len(shown_results)
    group_visibility.extend(gr.Group(visible=False) for _ in range(unused_slots))
    outputs.extend([None] * (unused_slots * 6))

    # Create 360 viewer HTML if we have 360 images
    viewer_html_path = None
    if all_360_images:
        # Timestamped filename so successive runs get distinct downloads
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        html_filename = f"MyAVE_{timestamp}.html"

        # The directory is intentionally not removed here: the file must
        # outlive this call so it can be served for download.
        output_dir = tempfile.mkdtemp()
        viewer_html_path = os.path.join(output_dir, html_filename)

        create_360_viewer_html(all_360_images, all_music_paths, viewer_html_path)

    # Final state: same trailing pair as the loading yield (file path, JS string).
    yield [gr.HTML("")] + group_visibility + outputs + [viewer_html_path, ""]
+# Update the create_360_viewer_html function to include a download button in the HTML itself
 def create_360_viewer_html(image_paths, audio_paths, output_path):
     """Create an HTML file with a 360 viewer and audio player for the given images and audio."""
     # Create a list of image data URIs
         else:
             audio_data_list.append(None)  # Placeholder for chunks without audio
+    # Create the HTML content with a styled download button
     html_content = f"""
     <!DOCTYPE html>
     <html lang="en">
     <head>
         <meta charset="UTF-8">
         <meta name="viewport" content="width=device-width, initial-scale=1.0">
+        <title>My AVE - 360 Panorama Viewer with Audio</title>
         <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/pannellum@2.5.6/build/pannellum.css"/>
         <style>
             body {{
                 border-radius: 3px;
                 border: 1px solid #ccc;
             }}
+            .download-btn {{
+                background: linear-gradient(to bottom, #4CAF50, #45a049);
+                color: white;
+                border: none;
+                padding: 12px 24px;
+                text-align: center;
+                text-decoration: none;
+                display: inline-block;
+                font-size: 16px;
+                margin: 10px 2px;
+                cursor: pointer;
+                border-radius: 25px;
+                box-shadow: 0 4px 8px rgba(0,0,0,0.2);
+                transition: all 0.3s ease;
+            }}
+            .download-btn:hover {{
+                background: linear-gradient(to bottom, #45a049, #4CAF50);
+                box-shadow: 0 6px 12px rgba(0,0,0,0.3);
+                transform: translateY(-2px);
+            }}
+            .header {{
+                display: flex;
+                justify-content: space-between;
+                align-items: center;
+                padding: 10px 20px;
+                background: rgba(0, 0, 0, 0.8);
+                color: white;
+            }}
+            .title {{
+                font-size: 24px;
+                font-weight: bold;
+            }}
         </style>
     </head>
     <body>
+        <div class="header">
+            <div class="title">My Affective Virtual Environment</div>
+            <button class="download-btn" onclick="downloadHTML()">Download This AVE</button>
+        </div>
         <div id="controls">
             <select id="image-selector">
                 {"".join([f'<option value="{i}">Chunk {i+1}</option>' for i in range(len(image_data_list))])}
                 }}
             }}
+            function downloadHTML() {{
+                // Create a download link for the current HTML file
+                const a = document.createElement('a');
+                a.href = window.location.href;
+                a.download = 'MyAVE.html';
+                document.body.appendChild(a);
+                a.click();
+                document.body.removeChild(a);
+            }}
             // Load the first image initially
             loadPanorama(0);
     return output_path
+# [Keep the rest of your code but remove the share button and related functions]
 # Create the Gradio interface with proper output handling
 with gr.Blocks(title="Affective Virtual Environments - Chunked Processing") as interface:
             with gr.Row():
                 process_btn = gr.Button("Process Audio", variant="primary")
                 clear_btn = gr.Button("Clear All", variant="secondary")
     # Add a loading indicator
     loading_indicator = gr.HTML("""
             'music': audio_output
         })
+    # Add component for 360 viewer with a fixed name
     viewer_html_output = gr.File(
+        label="Download Complete AVE Experience (HTML)",
         type="filepath",
+        interactive=False,
+        file_count="single"
     )
     # Add a hidden HTML component for JavaScript execution
         return example_path, example_name
     # Set up the button clicks
     process_btn.click(
         fn=process_and_display,
         inputs=[audio_input, generate_audio_checkbox, chunk_duration_input],
             container['music']
         ]] + [viewer_html_output, js_output]
     )
     clear_btn.click(
         fn=clear_all,
         inputs=[example_selector],
         outputs=[audio_input, example_selector]
     )
+    # Launch the interface
+    interface.launch()