import os

# Limit native thread pools and TensorFlow logging. These must be set before
# TensorFlow (and other heavy numeric libraries) are imported to take effect.
os.environ["OMP_NUM_THREADS"] = "4"
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

import gradio as gr
import pyvista as pv
from pyvista import examples
import numpy as np
import librosa
import requests
from io import BytesIO
from PIL import Image, ImageFilter
from tensorflow.keras.models import load_model
from faster_whisper import WhisperModel
import random
from textblob import TextBlob
import torch
import scipy.io.wavfile
from transformers import AutoProcessor, MusicgenForConditionalGeneration
import tempfile
import base64
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import soundfile as sf
from pydub import AudioSegment
import math
import json
import imageio
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
import struct
import cv2
# Load the emotion prediction model
def load_emotion_model(model_path):
    try:
        model = load_model(model_path)
        print("Emotion model loaded successfully")
        return model
    except Exception as e:
        print("Error loading emotion prediction model:", e)
        return None
model_path = 'mymodel_SER_LSTM_RAVDESS.h5'
model = load_emotion_model(model_path)
# Initialize WhisperModel
model_size = "small"
model2 = WhisperModel(model_size, device="cpu", compute_type="int8")
# Load MusicGen model
def load_musicgen_model():
    try:
        device = "cuda" if torch.cuda.is_available() else "cpu"
        processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
        music_model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
        music_model.to(device)
        print("MusicGen model loaded successfully")
        return processor, music_model, device
    except Exception as e:
        print("Error loading MusicGen model:", e)
        return None, None, None
processor, music_model, device = load_musicgen_model()
# Function to chunk audio into segments
def chunk_audio(audio_path, chunk_duration=10):
"""Split audio into chunks and return list of chunk file paths"""
try:
# Load audio file
audio = AudioSegment.from_file(audio_path)
duration_ms = len(audio)
chunk_ms = chunk_duration * 1000
# Validate chunk duration
if chunk_duration <= 0:
raise ValueError("Chunk duration must be positive")
if chunk_duration > duration_ms / 1000:
# If chunk duration is longer than audio, return the whole audio
return [audio_path], 1
chunks = []
chunk_files = []
# Calculate number of chunks
num_chunks = math.ceil(duration_ms / chunk_ms)
for i in range(num_chunks):
start_ms = i * chunk_ms
end_ms = min((i + 1) * chunk_ms, duration_ms)
# Extract chunk
chunk = audio[start_ms:end_ms]
chunks.append(chunk)
# Save chunk to temporary file
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
chunk.export(tmp_file.name, format="wav")
chunk_files.append(tmp_file.name)
return chunk_files, num_chunks
except Exception as e:
print("Error chunking audio:", e)
# Return original file as single chunk if chunking fails
return [audio_path], 1
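# Usage sketch (hedged; the path below is hypothetical):
#   chunk_files, num_chunks = chunk_audio("speech.wav", chunk_duration=10)
# The returned paths are temporary WAV files; callers are expected to delete them
# once processing is done (see get_predictions below).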
# Function to transcribe audio
def transcribe(wav_filepath):
    try:
        segments, _ = model2.transcribe(wav_filepath, beam_size=5)
        return "".join([segment.text for segment in segments])
    except Exception as e:
        print("Error transcribing audio:", e)
        return "Transcription failed"
# Function to extract MFCC features from audio
def extract_mfcc(wav_file_name):
    try:
        y, sr = librosa.load(wav_file_name)
        mfccs = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40).T, axis=0)
        return mfccs
    except Exception as e:
        print("Error extracting MFCC features:", e)
        return None
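# The 40 MFCCs are mean-pooled over time, so every clip is reduced to a single
# 40-dimensional vector regardless of its duration; predict_emotion_from_audio
# reshapes it to (1, 40, 1), i.e. (batch, timesteps, features), before feeding the LSTM.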
# Emotions dictionary
emotions = {0: 'neutral', 1: 'calm', 2: 'happy', 3: 'sad', 4: 'angry', 5: 'fearful', 6: 'disgust', 7: 'surprised'}
# Function to predict emotion from audio
def predict_emotion_from_audio(wav_filepath):
    try:
        if model is None:
            return "Model not loaded"
        test_point = extract_mfcc(wav_filepath)
        if test_point is not None:
            test_point = np.reshape(test_point, newshape=(1, 40, 1))
            predictions = model.predict(test_point)
            predicted_emotion_label = np.argmax(predictions[0])
            return emotions.get(predicted_emotion_label, "Unknown emotion")
        else:
            return "Error: Unable to extract features"
    except Exception as e:
        print("Error predicting emotion:", e)
        return "Prediction error"
# Function to analyze sentiment from text
def analyze_sentiment(text):
    try:
        if not text or text.strip() == "":
            return "neutral", 0.0
        analysis = TextBlob(text)
        polarity = analysis.sentiment.polarity
        if polarity > 0.1:
            sentiment = "positive"
        elif polarity < -0.1:
            sentiment = "negative"
        else:
            sentiment = "neutral"
        return sentiment, polarity
    except Exception as e:
        print("Error analyzing sentiment:", e)
        return "neutral", 0.0
# Function to get image prompt based on sentiment
def get_image_prompt(sentiment, transcribed_text, chunk_idx, total_chunks):
base_prompt = f"Chunk {chunk_idx+1}/{total_chunks}: "
if sentiment == "positive":
return f"Generate a non-figurative equirectangular 360 abstract texture of:{transcribed_text}. Use low histogram frequency in bright bins, dominant color in high RGB range, and high brightness and color variance. Apply high-frequency texture with strong filter energy, pronounced gradient magnitude, and strong local contrast. Use high spatial complexity, increased horizontal and vertical symmetry, high edge density, bright gray levels, and high contrast. Emphasize rich visual structure, color variation, and texture intensity across spatial composition."
elif sentiment == "negative":
return f"Generate a non-figurative equirectangular 360 abstract texture of:{transcribed_text}. Use high histogram frequency in dark bins, dominant color in low RGB range, and low brightness and color variance. Apply low-frequency texture with low filter energy, weak gradient magnitude, and low local contrast. Use low spatial complexity, reduced horizontal and vertical symmetry, low edge density, dark gray levels, and moderate contrast. Emphasize coarse structure and limited variation in color, texture, and spatial distribution."
else: # neutral
return f"Generate a non-figurative abstract equirectangular 360 abstract texture of:{transcribed_text}. Use a balanced histogram frequency across bins, dominant color in a mid RGB range, and moderate brightness and color variance. Apply medium-frequency texture with moderate filter energy, standard gradient magnitude, and average local contrast. Use medium spatial complexity, balanced horizontal and vertical symmetry, medium edge density, mid-range gray levels, and standard contrast. Emphasize naturalistic structure and typical variation in color, texture, and spatial distribution."
# Function to get music prompt based on emotion
def get_music_prompt(emotion, transcribed_text, chunk_idx, total_chunks):
base_prompt = f"Chunk {chunk_idx+1}/{total_chunks}: "
emotion_prompts = {
'neutral': f"Generate a neutral soundtrack with balanced energy and smooth spectral profile. Use moderate tempo (~100 BPM), onset rate around 2.8/sec, spectral centroid near 1000 Hz, and low dissonance. Keep pitch salience moderate (0.50) and loudness stable (~0.70 dB). Maintain low harmonic change rate (~0.05/sec) and tonal entropy 1.5 for equilibrium. Emphasize tonal balance, steady dynamics, and calm tonal centers. The music should feel even, ambient, and unobtrusive, complementing: {transcribed_text}.",
'calm': f"Generate a calm soundtrack with a slow tempo (~85 BPM), low onset rate (~2.2/sec), soft spectral centroid (~850 Hz), and smooth timbral evolution. Use low dissonance, high spectral flatness, and gentle pitch salience (~0.48). Keep loudness low (~0.65 dB) with infrequent harmonic changes (~0.04/sec) and stable tonality (Krumhansl value 0.80, major mode). The music should evoke tranquility and serenity through warm timbres, sustained harmonies, and flowing textures inspired by: {transcribed_text}.",
'happy': f"Generate a happy soundtrack with fast tempo (~127 BPM), dense rhythmic activity (~4.2 onsets/sec), and bright timbre (spectral centroid ~1321 Hz). Use variable dissonance and peaked spectral kurtosis to create vivid texture. Maintain pitch salience (~0.54), loudness (~0.90 dB), and chord change rate (~0.07/sec). Keep tonal entropy moderate (1.95) and Krumhansl value (0.83, major mode). The music should convey joy and positivity through energetic rhythms, ornamented melodic contours, and harmonically grounded progressions inspired by: {transcribed_text}.",
'sad': f"Generate a sad soundtrack with slow tempo (~72 BPM), sparse onset rate (~2.0/sec), and dark timbre (spectral centroid ~720 Hz). Use moderate dissonance, low spectral kurtosis, and soft pitch salience (~0.45). Keep loudness subdued (~0.60 dB) with rare harmonic changes (~0.05/sec) and low tonal entropy (~1.4). Emphasize minor mode with gentle phrasing and sustained harmonic textures. The music should evoke sadness, intimacy, and reflection in relation to: {transcribed_text}.",
'angry': f"Generate an angry soundtrack with moderately fast tempo (~120 BPM), onset rate (~3.4/sec), and bright, sharp timbre (spectral centroid ~2002 Hz). Use flat spectral kurtosis and stable dissonance. Maintain clear pitch salience (~0.58), high loudness (~0.96 dB), and frequent chord changes (~0.10/sec). Set tonal entropy to 2.57 and Krumhansl key profile (~0.54, minor mode). The music should express anger through strong rhythmic drive, aggressive articulation, and harmonically unstable progressions that reflect: {transcribed_text}.",
'fearful': f"Generate a fearful soundtrack with irregular tempo (~95 BPM), fluctuating onset rate (~3.0/sec), and high spectral variability (centroid ~1750 Hz). Use unstable dissonance, low pitch salience (~0.42), and dynamic loudness (~0.80 dB). Increase chord change irregularity (~0.09/sec) and tonal entropy (2.4, minor mode). Emphasize eerie textures, spatial tension, and spectral modulation. The music should evoke suspense, fear, and anticipation inspired by: {transcribed_text}.",
'disgust': f"Generate a disgusted soundtrack with moderate tempo (~90 BPM), irregular onset rate (~2.5/sec), and dark, rough timbre (spectral centroid ~950 Hz). Use dissonant harmonies, unstable spectral kurtosis, and low pitch salience (~0.40). Keep loudness (~0.75 dB) and tonal entropy (~2.2, minor mode). The music should evoke discomfort and unease through distorted textures, rough intervals, and unstable harmonic motion reflecting: {transcribed_text}.",
'surprised': f"Generate a surprised soundtrack with variable tempo (~110 BPM), fluctuating onset rate (~3.8/sec), and dynamic spectral centroid (~1500 Hz). Use high spectral kurtosis and pitch salience (~0.57) to accent sudden contrasts. Loudness should vary (~0.85 dB) with irregular chord changes (~0.11/sec) and moderate tonal entropy (~2.0, major mode). The music should evoke surprise and wonder through abrupt transitions, playful motifs, and expressive timbral changes inspired by: {transcribed_text}."
}
    return emotion_prompts.get(
        emotion.lower(),
        f"Create background music with {emotion} atmosphere that represents: {transcribed_text}"
    )
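# Note: these emotion prompts run well past the 200-character cap applied in
# generate_music below, so only their opening clause actually reaches MusicGen.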
# Function to generate music with MusicGen (using acoustic emotion prediction)
def generate_music(transcribed_text, emotion_prediction, chunk_idx, total_chunks):
    try:
        if processor is None or music_model is None:
            return None
        # Get specific prompt based on emotion
        prompt = get_music_prompt(emotion_prediction, transcribed_text, chunk_idx, total_chunks)
        # Limit prompt length to avoid model issues
        if len(prompt) > 200:
            prompt = prompt[:200] + "..."
        inputs = processor(
            text=[prompt],
            padding=True,
            return_tensors="pt",
        ).to(device)
        # Generate audio
        audio_values = music_model.generate(**inputs, max_new_tokens=512)
        # Convert to numpy array and sample rate
        sampling_rate = music_model.config.audio_encoder.sampling_rate
        audio_data = audio_values[0, 0].cpu().numpy()
        # Normalize audio data
        audio_data = audio_data / np.max(np.abs(audio_data))
        # Create a temporary file to save the audio
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            scipy.io.wavfile.write(tmp_file.name, rate=sampling_rate, data=audio_data)
            return tmp_file.name
    except Exception as e:
        print("Error generating music:", e)
        return None
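# MusicGen-small generates roughly 50 codebook frames per second of audio, so
# max_new_tokens=512 corresponds to on the order of ten seconds of music per chunk
# (approximate figure; the exact duration depends on the model configuration).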
# --- DeepAI Image Generation (Text2Img) ---
api_key = os.getenv("DeepAI_api_key")
# Function to upscale image using DeepAI's Torch-SRGAN (falls back to OpenCV Lanczos)
def upscale_image(image, target_width=4096, target_height=2048):
    """
    Upscale image using DeepAI's Torch-SRGAN API for super resolution
    """
    try:
        if not api_key:
            print("No API key available for upscaling")
            # Fallback to OpenCV if no API key
            img_array = np.array(image)
            upscaled = cv2.resize(
                img_array,
                (target_width, target_height),
                interpolation=cv2.INTER_LANCZOS4
            )
            return Image.fromarray(upscaled)
        # Save the image to a temporary file
        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_input:
            image.save(tmp_input.name, "JPEG", quality=95)
        # Make request to DeepAI torch-srgan API
        with open(tmp_input.name, 'rb') as img_file:
            response = requests.post(
                "https://api.deepai.org/api/torch-srgan",
                files={'image': img_file},
                headers={'api-key': api_key}
            )
        data = response.json()
        # Clean up temporary file
        os.unlink(tmp_input.name)
        if 'output_url' in data:
            # Download the upscaled image
            img_resp = requests.get(data['output_url'])
            upscaled_image = Image.open(BytesIO(img_resp.content))
            # Ensure the image meets our target dimensions
            if upscaled_image.size != (target_width, target_height):
                upscaled_image = upscaled_image.resize(
                    (target_width, target_height),
                    Image.Resampling.LANCZOS
                )
            return upscaled_image
        else:
            print("Error in DeepAI upscaling response:", data)
            # Fallback to OpenCV if API fails
            img_array = np.array(image)
            upscaled = cv2.resize(
                img_array,
                (target_width, target_height),
                interpolation=cv2.INTER_LANCZOS4
            )
            return Image.fromarray(upscaled)
    except Exception as e:
        print(f"Error upscaling image with DeepAI: {e}")
        # Fallback to OpenCV if any error occurs
        img_array = np.array(image)
        upscaled = cv2.resize(
            img_array,
            (target_width, target_height),
            interpolation=cv2.INTER_LANCZOS4
        )
        return Image.fromarray(upscaled)
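# 4096x2048 keeps the 2:1 width-to-height ratio that equirectangular 360 images
# require; panorama viewers assume this ratio when mapping the texture onto a sphere.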
# Function to generate the equirectangular image from sentiment and transcription (DeepAI text2img)
def generate_image(sentiment_prediction, transcribed_text, chunk_idx, total_chunks):
    try:
        if not api_key:
            # Fallback white image if no API key
            base_image = Image.new('RGB', (1024, 512), color='white')
        else:
            # Get specific prompt based on sentiment
            prompt = get_image_prompt(sentiment_prediction, transcribed_text, chunk_idx, total_chunks)
            # Make request to DeepAI text2img API
            response = requests.post(
                "https://api.deepai.org/api/text2img",
                data={
                    'text': prompt,
                    'width': 1024,
                    'height': 512,
                    'image_generator_version': 'hd'
                },
                headers={'api-key': api_key}
            )
            data = response.json()
            if 'output_url' in data:
                # Download the generated image
                img_resp = requests.get(data['output_url'])
                base_image = Image.open(BytesIO(img_resp.content))
            else:
                print("Error in DeepAI response:", data)
                # Use a fallback image
                base_image = Image.new('RGB', (1024, 512), color='white')
        # Upscale the image for better quality in the 360 viewer
        upscaled_image = upscale_image(base_image)
        return upscaled_image
    except Exception as e:
        print("Error generating image:", e)
        # Return a fallback image
        return Image.new('RGB', (1024, 512), color='white')
# Function to process a single chunk
def process_chunk(chunk_path, chunk_idx, total_chunks, generate_audio=True):
    try:
        # Get acoustic emotion prediction (for music)
        emotion_prediction = predict_emotion_from_audio(chunk_path)
        # Get transcribed text
        transcribed_text = transcribe(chunk_path)
        # Analyze sentiment of transcribed text (for image)
        sentiment, polarity = analyze_sentiment(transcribed_text)
        # Generate image using SENTIMENT analysis with specific prompt
        image = generate_image(sentiment, transcribed_text, chunk_idx, total_chunks)
        # Add 360 metadata to the image
        image_with_360_path = add_360_metadata(image)
        # Generate music only if audio generation is enabled
        music_path = None
        if generate_audio:
            music_path = generate_music(transcribed_text, emotion_prediction, chunk_idx, total_chunks)
        return {
            'chunk_index': chunk_idx + 1,
            'emotion': emotion_prediction,
            'transcription': transcribed_text,
            'sentiment': sentiment,
            'image': image,  # Original image for display in Gradio
            'image_360': image_with_360_path,  # Image with 360 metadata
            'music': music_path
        }
    except Exception as e:
        print(f"Error processing chunk {chunk_idx + 1}:", e)
        # Return a fallback result with all required keys
        return {
            'chunk_index': chunk_idx + 1,
            'emotion': "Error",
            'transcription': "Transcription failed",
            'sentiment': "Sentiment: error",
            'image': Image.new('RGB', (1440, 770), color='white'),
            'image_360': None,
            'music': None
        }
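# Per-chunk pipeline: the acoustic emotion (LSTM over MFCCs) drives the MusicGen
# prompt, while the Whisper transcription and its TextBlob sentiment drive the
# DeepAI image prompt; both outputs are packaged into one result dict per chunk.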
# Function to get predictions for all chunks
def get_predictions(audio_input, generate_audio=True, chunk_duration=10):
    # Chunk the audio into segments
    chunk_files, total_chunks = chunk_audio(audio_input, chunk_duration)
    results = []
    # Process each chunk
    for i, chunk_path in enumerate(chunk_files):
        print(f"Processing chunk {i+1}/{total_chunks} ({chunk_duration}s each)")
        result = process_chunk(chunk_path, i, total_chunks, generate_audio)
        results.append(result)
    # Clean up temporary chunk files
    for chunk_path in chunk_files:
        try:
            if chunk_path != audio_input:  # Don't delete original input file
                os.unlink(chunk_path)
        except OSError:
            pass
    return results
def create_xmp_block(width, height):
"""Create XMP metadata block following ExifTool's exact format."""
xmp = (
f'<?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?>\n'
f'<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="ExifTool">\n'
f'<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">\n'
f'<rdf:Description rdf:about=""\n'
f'xmlns:GPano="http://ns.google.com/photos/1.0/panorama/"\n'
f'GPano:ProjectionType="equirectangular"\n'
f'GPano:UsePanoramaViewer="True"\n'
f'GPano:FullPanoWidthPixels="{width}"\n'
f'GPano:FullPanoHeightPixels="{height}"\n'
f'GPano:CroppedAreaImageWidthPixels="{width}"\n'
f'GPano:CroppedAreaImageHeightPixels="{height}"\n'
f'GPano:CroppedAreaLeftPixels="0"\n'
f'GPano:CroppedAreaTopPixels="0"/>\n'
f'</rdf:RDF>\n'
f'</x:xmpmeta>\n'
f'<?xpacket end="w"?>'
)
return xmp
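# These GPano tags follow Google's Photo Sphere XMP schema; viewers that support
# 360 photos use them to recognise the file as an equirectangular panorama.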
def write_xmp_to_jpg(input_path, output_path, width, height):
"""Write XMP metadata to JPEG file following ExifTool's method."""
# Read the original JPEG
with open(input_path, 'rb') as f:
data = f.read()
# Find the start of image marker
if data[0:2] != b'\xFF\xD8':
raise ValueError("Not a valid JPEG file")
# Create XMP data
xmp_data = create_xmp_block(width, height)
# Create APP1 segment for XMP
app1_marker = b'\xFF\xE1'
xmp_header = b'http://ns.adobe.com/xap/1.0/\x00'
xmp_bytes = xmp_data.encode('utf-8')
length = len(xmp_header) + len(xmp_bytes) + 2 # +2 for length bytes
length_bytes = struct.pack('>H', length)
# Construct new file content
output = bytearray()
output.extend(data[0:2]) # SOI marker
output.extend(app1_marker)
output.extend(length_bytes)
output.extend(xmp_header)
output.extend(xmp_bytes)
output.extend(data[2:]) # Rest of the original file
# Write the new file
with open(output_path, 'wb') as f:
f.write(output)
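# Worked example of the APP1 length field above (hedged, assuming a UTF-8 packet):
# the Adobe XMP header "http://ns.adobe.com/xap/1.0/\x00" is 29 bytes, so for a
# 1000-byte XMP packet the segment length is 29 + 1000 + 2 = 1031, stored as a
# big-endian 16-bit value immediately after the 0xFFE1 marker.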
def add_360_metadata(img):
"""Add 360 photo metadata to a PIL Image and return the path to the processed image."""
try:
# First, ensure the image is upscaled to 4096x2048
target_width, target_height = 4096, 2048
if img.width != target_width or img.height != target_height:
img = img.resize((target_width, target_height), Image.Resampling.LANCZOS)
# Create a temporary file
with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_file:
# First save as high-quality JPEG
img.save(tmp_file.name, "JPEG", quality=95)
# Then inject XMP metadata directly into JPEG file
write_xmp_to_jpg(tmp_file.name, tmp_file.name, img.width, img.height)
return tmp_file.name
except Exception as e:
print(f"Error adding 360 metadata: {str(e)}")
# Fallback: return the original image path
with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_file:
img.save(tmp_file.name, "JPEG", quality=95)
return tmp_file.name
def create_360_viewer_html(image_paths, audio_paths, output_path):
"""Create an HTML file with a 360 viewer and audio player for the given images and audio."""
# Create a list of image data URIs
image_data_list = []
for img_path in image_paths:
with open(img_path, "rb") as f:
img_data = base64.b64encode(f.read()).decode("utf-8")
image_data_list.append(f"data:image/jpeg;base64,{img_data}")
# Create a list of audio data URIs
audio_data_list = []
for audio_path in audio_paths:
if audio_path: # Only process if audio exists
with open(audio_path, "rb") as f:
audio_data = base64.b64encode(f.read()).decode("utf-8")
audio_data_list.append(f"data:audio/wav;base64,{audio_data}")
else:
audio_data_list.append(None) # Placeholder for chunks without audio
# Create the HTML content
html_content = f"""
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>360 Panorama Viewer with Audio</title>
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/pannellum@2.5.6/build/pannellum.css"/>
<style>
body {{
margin: 0;
overflow: hidden;
font-family: Arial, sans-serif;
}}
#panorama {{
width: 100vw;
height: 80vh;
}}
.pnlm-hotspot.pnlm-info-hotspot {{
background-color: rgba(0, 150, 255, 0.8);
border-radius: 50%;
width: 30px;
height: 30px;
}}
.pnlm-hotspot.pnlm-info-hotspot .pnlm-sprite {{
filter: brightness(0) invert(1);
}}
.pnlm-tooltip {{
background-color: rgba(0, 0, 0, 0.7);
color: white;
border-radius: 3px;
padding: 5px 10px;
}}
#controls {{
position: absolute;
top: 10px;
right: 10px;
z-index: 1000;
background: rgba(0, 0, 0, 0.7);
color: white;
padding: 10px;
border-radius: 5px;
display: flex;
flex-direction: column;
gap: 10px;
}}
#audio-controls {{
position: fixed;
bottom: 0;
left: 0;
width: 100%;
background: rgba(0, 0, 0, 0.8);
color: white;
padding: 15px;
display: flex;
flex-direction: column;
align-items: center;
z-index: 1000;
}}
#audio-player {{
width: 80%;
margin-bottom: 10px;
}}
#audio-info {{
text-align: center;
font-size: 14px;
}}
button {{
background: #3498db;
color: white;
border: none;
padding: 8px 15px;
border-radius: 3px;
cursor: pointer;
margin: 5px;
}}
button:hover {{
background: #2980b9;
}}
select {{
padding: 5px;
border-radius: 3px;
border: 1px solid #ccc;
}}
</style>
</head>
<body>
<div id="controls">
<select id="image-selector">
{"".join([f'<option value="{i}">Chunk {i+1}</option>' for i in range(len(image_data_list))])}
</select>
</div>
<div id="panorama"></div>
<div id="audio-controls">
<audio id="audio-player" controls></audio>
<div id="audio-info">No audio available for this chunk</div>
</div>
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/pannellum@2.5.6/build/pannellum.js"></script>
<script>
const images = {json.dumps(image_data_list)};
const audioFiles = {json.dumps(audio_data_list)};
let currentViewer = null;
function loadPanorama(index) {{
if (currentViewer) {{
currentViewer.destroy();
}}
currentViewer = pannellum.viewer('panorama', {{
"type": "equirectangular",
"panorama": images[index],
"autoLoad": true,
"autoRotate": -2,
"showZoomCtrl": true,
"showFullscreenCtrl": true,
"hfov": 100
}});
// Update audio player
updateAudioPlayer(index);
}}
function updateAudioPlayer(index) {{
const audioPlayer = document.getElementById('audio-player');
const audioInfo = document.getElementById('audio-info');
if (audioFiles[index]) {{
audioPlayer.src = audioFiles[index];
audioInfo.textContent = 'Playing audio for Chunk ' + (index + 1);
// Try to play automatically (may be blocked by browser policies)
audioPlayer.play().catch(e => {{
audioInfo.textContent = 'Click play to listen to audio for Chunk ' + (index + 1);
}});
}} else {{
audioPlayer.src = '';
audioInfo.textContent = 'No audio available for this chunk';
}}
}}
// Load the first image initially
loadPanorama(0);
// Handle image selection changes
document.getElementById('image-selector').addEventListener('change', function(e) {{
const selectedIndex = parseInt(e.target.value);
loadPanorama(selectedIndex);
}});
</script>
</body>
</html>
"""
    # Write the HTML to a file
    with open(output_path, 'w') as f:
        f.write(html_content)
    return output_path
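# The viewer embeds every image and audio clip as a base64 data URI, so the
# downloaded HTML is self-contained apart from the Pannellum CDN assets; expect
# roughly a 33% size overhead over the raw JPEG/WAV bytes from base64 encoding.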
# Process the audio and stream results to the UI (generator: the first yield shows a loading indicator)
def process_and_display(audio_input, generate_audio, chunk_duration):
    # Validate chunk duration
    if chunk_duration is None or chunk_duration <= 0:
        chunk_duration = 10
    # Show loading indicator
    yield [gr.HTML(f"""
<div style="text-align: center; margin: 20px;">
<p style="font-size: 18px; color: #4a4a4a;">Processing audio in {chunk_duration}-second chunks...</p>
<div style="border: 4px solid #f3f3f3; border-top: 4px solid #3498db; border-radius: 50%; width: 30px; height: 30px; animation: spin 2s linear infinite; margin: 0 auto;"></div>
<style>@keyframes spin {{ 0% {{ transform: rotate(0deg); }} 100% {{ transform: rotate(360deg); }} }}</style>
<p style="font-size: 14px; color: #4a4a4a;">This may take several minutes depending on the audio length...</p>
</div>
""")] + [gr.Group(visible=False)] * len(group_components) + [None] * (len(output_containers) * 6) + [None, ""]
    results = get_predictions(audio_input, generate_audio, chunk_duration)
    # Initialize outputs list
    outputs = []
    group_visibility = []
    all_360_images = []  # Collect all 360 images for the viewer
    all_music_paths = []  # Collect all music paths for the viewer
    # Process each result
    for i, result in enumerate(results):
        if i < len(output_containers):
            group_visibility.append(gr.Group(visible=True))
            outputs.extend([
                result['emotion'],
                result['transcription'],
                result['sentiment'],
                result['image'],
                result['image_360'],
                result['music']
            ])
            # Collect the 360-processed images and music
            if result['image_360']:
                all_360_images.append(result['image_360'])  # Use the 360-processed image
                all_music_paths.append(result['music'])  # Can be None if no music generated
        else:
            # More results than available UI slots: skip the extras so the number of
            # returned values still matches the registered output components
            print(f"Warning: chunk {i+1} has no output slot and will not be displayed")
    # Hide remaining containers
    for i in range(len(results), len(output_containers)):
        group_visibility.append(gr.Group(visible=False))
        outputs.extend([None] * 6)
    # Create 360 viewer HTML if we have 360 images
    viewer_html_path = None
    if all_360_images:
        with tempfile.NamedTemporaryFile(suffix=".html", delete=False) as tmp_file:
            viewer_html_path = create_360_viewer_html(all_360_images, all_music_paths, tmp_file.name)
    # Hide loading indicator and show results
    yield [gr.HTML("")] + group_visibility + outputs + [viewer_html_path, ""]
# Reset every input and output component to its initial state
def clear_all():
    # Create a list with None for all outputs
    outputs = [None]  # For audio input
    # For group components (set to invisible)
    outputs.extend([gr.Group(visible=False)] * len(group_components))
    # For all output containers (set to None)
    outputs.extend([None] * (len(output_containers) * 6))
    # For loading indicator (empty HTML)
    outputs.append(gr.HTML(""))
    # For chunk duration (reset to 10)
    outputs.append(10)
    # For viewer (set to None)
    outputs.append(None)
    # For JavaScript output (empty)
    outputs.append("")
    # Note: the example selector is commented out in the UI and is not registered as an
    # output of the Clear button, so it is intentionally not reset here.
    return outputs
# Function to load example audio (placeholder - you need to implement this)
def load_example_audio(example_name):
    # This is a placeholder - you need to implement this function
    # Return the path to the example audio file based on the example_name
    return None
# Custom CSS for enhanced styling
custom_css = """
.download-section {
background: rgba(255,255,255,1);
padding: 25px;
border-radius: 15px;
border: 3px solid #764ba2;
text-align: left;
margin: 25px 0;
box-shadow: 0 10px 30px rgba(0,0,0,0.15);
position: relative;
overflow: hidden;
}
.download-section::before {
content: "";
position: absolute;
top: -50%;
left: -50%;
width: 200%;
height: 200%;
background: radial-gradient(circle, rgba(255,255,255,0.1) 0%, transparent 70%);
animation: shimmer 3s infinite linear;
pointer-events: none;
}
@keyframes shimmer {
0% { transform: rotate(0deg); }
100% { transform: rotate(360deg); }
}
.download-section h2 {
color: white;
font-size: 16px;
margin-bottom: 15px;
text-shadow: 1px 1px 3px rgba(0,0,0,0.3);
}
.download-section p {
color: rgba(255,255,255,0.9);
font-size: 16px;
margin-bottom: 20px;
line-height: 3.5;
}
.download-button {
background: rgba(155,155,155,1) !important;
color: white !important;
border: none !important;
padding: 12px 30px !important;
border-radius: 0px !important;
font-weight: bold !important;
font-size: 16px !important;
margin-top: 15px !important;
transition: all 0.3s ease !important;
cursor: pointer !important;
display: inline-block !important;
}
.download-button:hover {
transform: translateY(-3px) !important;
box-shadow: 0 8px 20px rgba(0,0,0,0.6) !important;
}
.download-button:active {
transform: translateY(1px) !important;
}
.download-icon {
margin-right: 8px;
font-size: 28px;
}
.feature-list {
display: flex;
justify-content: center;
flex-wrap: wrap;
gap: 15px;
margin: 20px 0;
}
.feature-item {
background: rgba(255,255,255,0.15);
padding: 10px 15px;
border-radius: 8px;
display: flex;
align-items: center;
gap: 8px;
color: white;
font-size: 14px;
}
.feature-icon {
font-size: 26px;
}
.viewer-preview {
margin-top: 20px;
border-radius: 10px;
overflow: hidden;
box-shadow: 0 5px 15px rgba(0,0,0,0.2);
max-width: 400px;
margin-left: auto;
margin-right: auto;
}
.viewer-preview img {
width: 100%;
display: block;
}
.instructions {
background: rgba(255,255,255,0.1);
padding: 15px;
border-radius: 8px;
margin-top: 20px;
text-align: left;
}
.instructions h3 {
color: white;
margin-top: 0;
font-size: 16px;
}
.instructions ol {
color: rgba(255,255,255,0.9);
padding-left: 20px;
margin-bottom: 0;
}
.instructions li {
margin-bottom: 8px;
}
"""
# Create the Gradio interface with proper output handling
with gr.Blocks(title="Affective Virtual Environments - Chunked Processing", css=custom_css) as interface:
gr.Markdown("# The Emotional Machine")
gr.Markdown(
"""
**The Emotional Machine** is a digital media project that generates virtual environments using multimodal speech emotion recognition as its main mode of interaction.
### How to interact
1. Record your voice or upload an audio file.
2. Define the length to chunk your sample.
3. Use the checkbox if you want to generate audio for each chunk.
4. Generate your Affective Virtual Environment and wait for the results.
5. Download the HTML file.
6. Open your creation using any web browser.
---
**Learn more:**
• Video Tutorial: [How to Use this space ](https://youtu.be/eVD1lzwVhi8)
• For more information about the project, visit: [www.emotional-machines.com](https://www.emotional-machines.com)
"""
)
    with gr.Row():
        with gr.Column(scale=2):
            audio_input = gr.Audio(label="Input Audio", type="filepath", sources=["microphone", "upload"])
            # Add example audio selection
            # example_selector = gr.Dropdown(
            #     label="Select Example Audio",
            #     choices=["Happy Speech", "Sad Story", "Neutral News"],
            #     value=None,
            #     info="Choose from pre-recorded example speeches"
            # )
            # Add button to load selected example
            # load_example_btn = gr.Button("Load Example", variant="secondary")
        with gr.Column(scale=1):
            # Add chunk duration input
            chunk_duration_input = gr.Number(
                label="Chunk Duration (seconds)",
                value=10,
                minimum=1,
                maximum=60,
                step=1,
                info="Duration of each audio segment to process (1-60 seconds)"
            )
            # Add checkbox for audio generation
            generate_audio_checkbox = gr.Checkbox(
                label="Generate Audio (may take longer)",
                value=False,
                info="Uncheck to skip music generation and speed up processing"
            )
    with gr.Row():
        process_btn = gr.Button("Generate", variant="primary")
        clear_btn = gr.Button("Clear All", variant="secondary")
    # Add a loading indicator
    loading_indicator = gr.HTML("""
<div id="loading" style="display: none; text-align: center; margin: 20px;">
<p style="font-size: 18px; color: #4a4a4a;">Processing audio chunks...</p>
<div style="border: 4px solid #f3f3f3; border-top: 4px solid #3498db; border-radius: 50%; width: 30px; height: 30px; animation: spin 2s linear infinite; margin: 0 auto;"></div>
<style>@keyframes spin { 0% { transform: rotate(0deg); } 100% { transform: rotate(360deg); } }</style>
</div>
""")
    # Create output components for each chunk type
    output_containers = []
    group_components = []  # Store group components separately
    # We'll create up to 20 chunk slots to accommodate different chunk durations
    for i in range(20):
        with gr.Group(visible=False) as chunk_group:
            gr.Markdown(f"### Chunk {i+1} Results")
            with gr.Row():
                emotion_output = gr.Label(label="Acoustic Emotion Prediction")
                transcription_output = gr.Label(label="Transcribed Text")
                sentiment_output = gr.Label(label="Sentiment Analysis")
            with gr.Row():
                image_output = gr.Image(label="Generated Equirectangular Image")
                image_360_output = gr.File(label="Download 360 Image", type="filepath")
            with gr.Row():
                audio_output = gr.Audio(label="Generated Music")
            gr.HTML("<hr style='margin: 20px 0; border: 1px solid #ccc;'>")
        group_components.append(chunk_group)
        output_containers.append({
            'emotion': emotion_output,
            'transcription': transcription_output,
            'sentiment': sentiment_output,
            'image': image_output,
            'image_360': image_360_output,
            'music': audio_output
        })
    # Enhanced Download 360 Viewer Section
    with gr.Group(visible=True, elem_classes="download-section") as download_group:
        gr.Markdown("""
""")
        # Enhanced download button
        viewer_html_output = gr.File(
            label="Once processing is complete, download your AVE from here 🚀",
            type="filepath",
            interactive=False,
            elem_classes="download-button"
        )
    # Add a hidden HTML component for JavaScript execution
    js_output = gr.HTML(visible=False)
    # Function to handle example selection
    def load_example(example_name):
        if not example_name:
            return None, None
        # Get the path to the example audio file
        example_path = load_example_audio(example_name)
        # Return the example path to update the audio component
        return example_path, example_name
    # Set up the button clicks
    process_btn.click(
        fn=process_and_display,
        inputs=[audio_input, generate_audio_checkbox, chunk_duration_input],
        outputs=[loading_indicator] + group_components + [comp for container in output_containers for comp in [
            container['emotion'],
            container['transcription'],
            container['sentiment'],
            container['image'],
            container['image_360'],
            container['music']
        ]] + [viewer_html_output, js_output]
    )
    clear_btn.click(
        fn=clear_all,
        inputs=[],
        outputs=[audio_input] + group_components + [comp for container in output_containers for comp in [
            container['emotion'],
            container['transcription'],
            container['sentiment'],
            container['image'],
            container['image_360'],
            container['music']
        ]] + [loading_indicator, chunk_duration_input, viewer_html_output, js_output]
    )
    # load_example_btn.click(
    #     fn=load_example,
    #     inputs=[example_selector],
    #     outputs=[audio_input, example_selector]
    # )
# Check if we're running on Hugging Face Spaces
is_spaces = os.getenv('SPACE_ID') is not None
# Launch with appropriate settings (only request a share link when not on Spaces)
interface.launch(share=not is_spaces)