# Standard library
import os
import math
import json
import struct
import base64
import tempfile
from io import BytesIO

# Third-party
import gradio as gr
import numpy as np
import librosa
import requests
import torch
import scipy.io.wavfile
import cv2
from PIL import Image
from pydub import AudioSegment
from textblob import TextBlob
from tensorflow.keras.models import load_model
from faster_whisper import WhisperModel
from transformers import AutoProcessor, MusicgenForConditionalGeneration


def load_emotion_model(model_path):
    """Load the pre-trained Keras LSTM speech-emotion-recognition (SER) model."""
    try:
        model = load_model(model_path)
        print("Emotion model loaded successfully")
        return model
    except Exception as e:
        print("Error loading emotion prediction model:", e)
        return None


model_path = 'mymodel_SER_LSTM_RAVDESS.h5'
model = load_emotion_model(model_path)


# faster-whisper speech-to-text model (CPU inference).
model_size = "small"
model2 = WhisperModel(model_size, device="cpu", compute_type="int8")

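# Note on the settings above: compute_type="int8" runs the Whisper weights
# quantized to 8-bit integers, which cuts memory use and speeds up CPU
# inference at a small accuracy cost; "float32" (or "float16" on GPU) are the
# higher-fidelity alternatives if resources allow.
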
def load_musicgen_model():
    """Load MusicGen (small) and its processor, moving the model to GPU when available."""
    try:
        device = "cuda" if torch.cuda.is_available() else "cpu"
        processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
        music_model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
        music_model.to(device)
        print("MusicGen model loaded successfully")
        return processor, music_model, device
    except Exception as e:
        print("Error loading MusicGen model:", e)
        return None, None, None


processor, music_model, device = load_musicgen_model()


def chunk_audio(audio_path, chunk_duration=10):
    """Split audio into chunks; return (list of chunk file paths, number of chunks)."""
    try:
        if chunk_duration <= 0:
            raise ValueError("Chunk duration must be positive")

        audio = AudioSegment.from_file(audio_path)
        duration_ms = len(audio)
        chunk_ms = chunk_duration * 1000

        # Audio shorter than one chunk: process the original file as-is.
        if chunk_duration > duration_ms / 1000:
            return [audio_path], 1

        chunk_files = []
        num_chunks = math.ceil(duration_ms / chunk_ms)

        for i in range(num_chunks):
            start_ms = i * chunk_ms
            end_ms = min((i + 1) * chunk_ms, duration_ms)
            chunk = audio[start_ms:end_ms]

            # Export each chunk to its own temporary WAV file.
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
                chunk.export(tmp_file.name, format="wav")
                chunk_files.append(tmp_file.name)

        return chunk_files, num_chunks

    except Exception as e:
        print("Error chunking audio:", e)
        # Fall back to treating the whole file as a single chunk.
        return [audio_path], 1

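# Worked example: a 25 s recording with chunk_duration=10 yields
# math.ceil(25000 / 10000) = 3 chunks of 10 s, 10 s, and 5 s; the last
# chunk is simply whatever remains, since end_ms is clamped to duration_ms.
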
def transcribe(wav_filepath):
    """Transcribe speech to text with faster-whisper."""
    try:
        segments, _ = model2.transcribe(wav_filepath, beam_size=5)
        return "".join(segment.text for segment in segments)
    except Exception as e:
        print("Error transcribing audio:", e)
        return "Transcription failed"

def extract_mfcc(wav_file_name):
    """Return the time-averaged 40-coefficient MFCC vector used by the SER model."""
    try:
        y, sr = librosa.load(wav_file_name)
        mfccs = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40).T, axis=0)
        return mfccs
    except Exception as e:
        print("Error extracting MFCC features:", e)
        return None

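# Shape note: librosa.feature.mfcc returns an (n_mfcc, n_frames) array, so the
# transpose-then-mean above collapses the time axis and always yields a fixed
# 40-dimensional vector regardless of clip length -- exactly what the
# (1, 40, 1) reshape in predict_emotion_from_audio below expects.
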
# Output classes of the SER model (the eight RAVDESS emotion categories).
emotions = {0: 'neutral', 1: 'calm', 2: 'happy', 3: 'sad', 4: 'angry', 5: 'fearful', 6: 'disgust', 7: 'surprised'}


def predict_emotion_from_audio(wav_filepath):
    """Predict the emotion class of an audio file from its MFCC features."""
    try:
        if model is None:
            return "Model not loaded"

        test_point = extract_mfcc(wav_filepath)
        if test_point is not None:
            test_point = np.reshape(test_point, newshape=(1, 40, 1))
            predictions = model.predict(test_point)
            predicted_emotion_label = np.argmax(predictions[0])
            return emotions.get(predicted_emotion_label, "Unknown emotion")
        else:
            return "Error: Unable to extract features"
    except Exception as e:
        print("Error predicting emotion:", e)
        return "Prediction error"

def analyze_sentiment(text):
    """Classify text as positive/negative/neutral from TextBlob polarity."""
    try:
        if not text or text.strip() == "":
            return "neutral", 0.0

        analysis = TextBlob(text)
        polarity = analysis.sentiment.polarity

        # A small dead zone around 0 keeps near-neutral text from flipping class.
        if polarity > 0.1:
            sentiment = "positive"
        elif polarity < -0.1:
            sentiment = "negative"
        else:
            sentiment = "neutral"

        return sentiment, polarity
    except Exception as e:
        print("Error analyzing sentiment:", e)
        return "neutral", 0.0

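# Illustrative polarity values (TextBlob polarity lies in [-1.0, 1.0]):
#   "I love this place"  -> roughly +0.5  -> "positive"
#   "I hate this place"  -> roughly -0.8  -> "negative"
#   "The plane departed" -> roughly  0.0  -> "neutral"
# Exact numbers depend on TextBlob's pattern lexicon, so treat them as
# approximations rather than guaranteed outputs.
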
def get_image_prompt(sentiment, transcribed_text, chunk_idx, total_chunks):
    """Build the text-to-image prompt for one chunk, conditioned on sentiment.

    chunk_idx and total_chunks are accepted for interface symmetry with
    get_music_prompt but are not embedded in the prompt itself.
    """
    if sentiment == "positive":
        return f"Generate an equirectangular 360° panoramic graphite sketch drawing, detailed pencil texture with faint neon glows, cinematic lighting of: {transcribed_text}. Use low histogram frequency in bright bins, dominant color in high RGB range, and high brightness and color variance. Apply high-frequency texture with strong filter energy, pronounced gradient magnitude, and strong local contrast. Use high spatial complexity, increased horizontal and vertical symmetry, high edge density, bright gray levels, and high contrast. Emphasize rich visual structure, color variation, and texture intensity across spatial composition."

    elif sentiment == "negative":
        return f"Generate an equirectangular 360° panoramic graphite sketch drawing, detailed pencil texture with faint neon glows, cinematic lighting of: {transcribed_text}. Use high histogram frequency in dark bins, dominant color in low RGB range, and low brightness and color variance. Apply low-frequency texture with low filter energy, weak gradient magnitude, and low local contrast. Use low spatial complexity, reduced horizontal and vertical symmetry, low edge density, dark gray levels, and moderate contrast. Emphasize coarse structure and limited variation in color, texture, and spatial distribution."

    else:
        return f"Generate an equirectangular 360° panoramic graphite sketch drawing, detailed pencil texture with faint neon glows, cinematic lighting of: {transcribed_text}. Use a balanced histogram frequency across bins, dominant color in a mid RGB range, and moderate brightness and color variance. Apply medium-frequency texture with moderate filter energy, standard gradient magnitude, and average local contrast. Use medium spatial complexity, balanced horizontal and vertical symmetry, medium edge density, mid-range gray levels, and standard contrast. Emphasize naturalistic structure and typical variation in color, texture, and spatial distribution."

def get_music_prompt(emotion, transcribed_text, chunk_idx, total_chunks):
    """Build the MusicGen prompt for one chunk, conditioned on the predicted emotion."""
    emotion_prompts = {
        'neutral': f"Generate a neutral orchestral soundtrack with balanced energy and smooth spectral profile. Use moderate tempo (~100 BPM), onset rate around 2.8/sec, spectral centroid near 1000 Hz, and low dissonance. Keep pitch salience moderate (0.50) and loudness stable (~0.70 dB). Maintain low harmonic change rate (~0.05/sec) and tonal entropy 1.5 for equilibrium. Emphasize tonal balance, steady dynamics, and calm tonal centers. The music should feel even, ambient, and unobtrusive, complementing: {transcribed_text}.",
        'calm': f"Generate a calm orchestral soundtrack with a slow tempo (~85 BPM), low onset rate (~2.2/sec), soft spectral centroid (~850 Hz), and smooth timbral evolution. Use low dissonance, high spectral flatness, and gentle pitch salience (~0.48). Keep loudness low (~0.65 dB) with infrequent harmonic changes (~0.04/sec) and stable tonality (Krumhansl value 0.80, major mode). The music should evoke tranquility and serenity through warm timbres, sustained harmonies, and flowing textures inspired by: {transcribed_text}.",
        'happy': f"Generate a happy orchestral soundtrack with fast tempo (~127 BPM), dense rhythmic activity (~4.2 onsets/sec), and bright timbre (spectral centroid ~1321 Hz). Use variable dissonance and peaked spectral kurtosis to create vivid texture. Maintain pitch salience (~0.54), loudness (~0.90 dB), and chord change rate (~0.07/sec). Keep tonal entropy moderate (1.95) and Krumhansl value (0.83, major mode). The music should convey joy and positivity through energetic rhythms, ornamented melodic contours, and harmonically grounded progressions inspired by: {transcribed_text}.",
        'sad': f"Generate a sad orchestral soundtrack with slow tempo (~72 BPM), sparse onset rate (~2.0/sec), and dark timbre (spectral centroid ~720 Hz). Use moderate dissonance, low spectral kurtosis, and soft pitch salience (~0.45). Keep loudness subdued (~0.60 dB) with rare harmonic changes (~0.05/sec) and low tonal entropy (~1.4). Emphasize minor mode with gentle phrasing and sustained harmonic textures. The music should evoke sadness, intimacy, and reflection in relation to: {transcribed_text}.",
        'angry': f"Generate an angry orchestral soundtrack with moderately fast tempo (~120 BPM), onset rate (~3.4/sec), and bright, sharp timbre (spectral centroid ~2002 Hz). Use flat spectral kurtosis and stable dissonance. Maintain clear pitch salience (~0.58), high loudness (~0.96 dB), and frequent chord changes (~0.10/sec). Set tonal entropy to 2.57 and Krumhansl key profile (~0.54, minor mode). The music should express anger through strong rhythmic drive, aggressive articulation, and harmonically unstable progressions that reflect: {transcribed_text}.",
        'fearful': f"Generate a fearful orchestral soundtrack with irregular tempo (~95 BPM), fluctuating onset rate (~3.0/sec), and high spectral variability (centroid ~1750 Hz). Use unstable dissonance, low pitch salience (~0.42), and dynamic loudness (~0.80 dB). Increase chord change irregularity (~0.09/sec) and tonal entropy (2.4, minor mode). Emphasize eerie textures, spatial tension, and spectral modulation. The music should evoke suspense, fear, and anticipation inspired by: {transcribed_text}.",
        'disgust': f"Generate a disgusted orchestral soundtrack with moderate tempo (~90 BPM), irregular onset rate (~2.5/sec), and dark, rough timbre (spectral centroid ~950 Hz). Use dissonant harmonies, unstable spectral kurtosis, and low pitch salience (~0.40). Keep loudness (~0.75 dB) and tonal entropy (~2.2, minor mode). The music should evoke discomfort and unease through distorted textures, rough intervals, and unstable harmonic motion reflecting: {transcribed_text}.",
        'surprised': f"Generate a surprised orchestral soundtrack with variable tempo (~110 BPM), fluctuating onset rate (~3.8/sec), and dynamic spectral centroid (~1500 Hz). Use high spectral kurtosis and pitch salience (~0.57) to accent sudden contrasts. Loudness should vary (~0.85 dB) with irregular chord changes (~0.11/sec) and moderate tonal entropy (~2.0, major mode). The music should evoke surprise and wonder through abrupt transitions, playful motifs, and expressive timbral changes inspired by: {transcribed_text}."
    }

    return emotion_prompts.get(
        emotion.lower(),
        f"Create background music with {emotion} atmosphere that represents: {transcribed_text}"
    )

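# Length note (approximate): MusicGen's audio codec emits tokens at roughly
# 50 per second of audio, so max_new_tokens=512 in generate_music below
# corresponds to about 10 seconds of music per chunk.
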
def generate_music(transcribed_text, emotion_prediction, chunk_idx, total_chunks):
    """Generate a short music clip for one chunk; returns a WAV path or None."""
    try:
        if processor is None or music_model is None:
            return None

        prompt = get_music_prompt(emotion_prediction, transcribed_text, chunk_idx, total_chunks)

        # Truncate very long prompts to keep the text conditioning manageable.
        if len(prompt) > 200:
            prompt = prompt[:200] + "..."

        inputs = processor(
            text=[prompt],
            padding=True,
            return_tensors="pt",
        ).to(device)

        audio_values = music_model.generate(**inputs, max_new_tokens=512)

        sampling_rate = music_model.config.audio_encoder.sampling_rate
        audio_data = audio_values[0, 0].cpu().numpy()

        # Peak-normalize to [-1, 1], guarding against an all-zero buffer.
        peak = np.max(np.abs(audio_data))
        if peak > 0:
            audio_data = audio_data / peak

        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            scipy.io.wavfile.write(tmp_file.name, rate=sampling_rate, data=audio_data.astype(np.float32))
            return tmp_file.name

    except Exception as e:
        print("Error generating music:", e)
        return None

# DeepAI API key read from the environment; image generation and upscaling
# fall back to local placeholders/resizing when it is not set.
api_key = os.getenv("DeepAI_api_key")


def upscale_image(image, target_width=4096, target_height=2048):
    """
    Upscale image using DeepAI's Torch-SRGAN API for super resolution,
    falling back to a local Lanczos resize when the API is unavailable.
    """
    def lanczos_fallback():
        # Plain Lanczos resize via OpenCV (no super resolution).
        img_array = np.array(image)
        upscaled = cv2.resize(
            img_array,
            (target_width, target_height),
            interpolation=cv2.INTER_LANCZOS4
        )
        return Image.fromarray(upscaled)

    try:
        if not api_key:
            print("No API key available for upscaling")
            return lanczos_fallback()

        # Write the base image to a temporary JPEG for the API upload.
        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_input:
            image.save(tmp_input.name, "JPEG", quality=95)

        with open(tmp_input.name, 'rb') as f:
            response = requests.post(
                "https://api.deepai.org/api/torch-srgan",
                files={'image': f},
                headers={'api-key': api_key}
            )
        os.unlink(tmp_input.name)

        data = response.json()

        if 'output_url' in data:
            img_resp = requests.get(data['output_url'])
            upscaled_image = Image.open(BytesIO(img_resp.content))

            # The API does not guarantee the exact target size; resize if needed.
            if upscaled_image.size != (target_width, target_height):
                upscaled_image = upscaled_image.resize(
                    (target_width, target_height),
                    Image.Resampling.LANCZOS
                )
            return upscaled_image

        print("Error in DeepAI upscaling response:", data)
        return lanczos_fallback()

    except Exception as e:
        print(f"Error upscaling image with DeepAI: {e}")
        return lanczos_fallback()

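# Aspect-ratio note: both the 1024x512 base render below and the 4096x2048
# upscale above keep the 2:1 width-to-height ratio that an equirectangular
# projection requires (360 degrees of longitude over 180 degrees of latitude).
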
def generate_image(sentiment_prediction, transcribed_text, chunk_idx, total_chunks):
    """Generate and upscale one equirectangular image; returns a PIL Image."""
    try:
        if not api_key:
            # No API key: use a blank placeholder so the pipeline still completes.
            base_image = Image.new('RGB', (1024, 512), color='white')
        else:
            prompt = get_image_prompt(sentiment_prediction, transcribed_text, chunk_idx, total_chunks)

            response = requests.post(
                "https://api.deepai.org/api/text2img",
                data={
                    'text': prompt,
                    'width': 1024,
                    'height': 512,
                    'image_generator_version': 'hd'
                },
                headers={'api-key': api_key}
            )

            data = response.json()
            if 'output_url' in data:
                img_resp = requests.get(data['output_url'])
                base_image = Image.open(BytesIO(img_resp.content))
            else:
                print("Error in DeepAI response:", data)
                base_image = Image.new('RGB', (1024, 512), color='white')

        # Upscale to 4096x2048 so the panorama stays sharp in the viewer.
        return upscale_image(base_image)

    except Exception as e:
        print("Error generating image:", e)
        return Image.new('RGB', (1024, 512), color='white')

def process_chunk(chunk_path, chunk_idx, total_chunks, generate_audio=True):
    """Run the full pipeline (emotion -> transcription -> sentiment -> image -> music) on one chunk."""
    try:
        # 1. Acoustic emotion from MFCC features.
        emotion_prediction = predict_emotion_from_audio(chunk_path)

        # 2. Speech-to-text.
        transcribed_text = transcribe(chunk_path)

        # 3. Text sentiment.
        sentiment, polarity = analyze_sentiment(transcribed_text)

        # 4. Equirectangular image conditioned on sentiment and transcript.
        image = generate_image(sentiment, transcribed_text, chunk_idx, total_chunks)

        # 5. Inject GPano XMP metadata so viewers treat it as a 360 photo.
        image_with_360_path = add_360_metadata(image)

        # 6. Optional music conditioned on the acoustic emotion.
        music_path = None
        if generate_audio:
            music_path = generate_music(transcribed_text, emotion_prediction, chunk_idx, total_chunks)

        return {
            'chunk_index': chunk_idx + 1,
            'emotion': emotion_prediction,
            'transcription': transcribed_text,
            'sentiment': sentiment,
            'image': image,
            'image_360': image_with_360_path,
            'music': music_path
        }
    except Exception as e:
        print(f"Error processing chunk {chunk_idx + 1}:", e)
        return {
            'chunk_index': chunk_idx + 1,
            'emotion': "Error",
            'transcription': "Transcription failed",
            'sentiment': "Sentiment: error",
            'image': Image.new('RGB', (1024, 512), color='white'),
            'image_360': None,
            'music': None
        }

def get_predictions(audio_input, generate_audio=True, chunk_duration=10):
    """Chunk the input audio and run process_chunk on every chunk."""
    chunk_files, total_chunks = chunk_audio(audio_input, chunk_duration)

    results = []
    for i, chunk_path in enumerate(chunk_files):
        print(f"Processing chunk {i+1}/{total_chunks} ({chunk_duration}s each)")
        result = process_chunk(chunk_path, i, total_chunks, generate_audio)
        results.append(result)

    # Clean up temporary chunk files (but never the original upload).
    for chunk_path in chunk_files:
        try:
            if chunk_path != audio_input:
                os.unlink(chunk_path)
        except OSError:
            pass

    return results


def create_xmp_block(width, height):
    """Create XMP metadata block following ExifTool's exact format."""
    xmp = (
        '<?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?>\n'
        '<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="ExifTool">\n'
        '<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">\n'
        '<rdf:Description rdf:about=""\n'
        'xmlns:GPano="http://ns.google.com/photos/1.0/panorama/"\n'
        'GPano:ProjectionType="equirectangular"\n'
        'GPano:UsePanoramaViewer="True"\n'
        f'GPano:FullPanoWidthPixels="{width}"\n'
        f'GPano:FullPanoHeightPixels="{height}"\n'
        f'GPano:CroppedAreaImageWidthPixels="{width}"\n'
        f'GPano:CroppedAreaImageHeightPixels="{height}"\n'
        'GPano:CroppedAreaLeftPixels="0"\n'
        'GPano:CroppedAreaTopPixels="0"/>\n'
        '</rdf:RDF>\n'
        '</x:xmpmeta>\n'
        '<?xpacket end="w"?>'
    )
    return xmp

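# The GPano:* properties above come from Google's Photo Sphere XMP spec:
# ProjectionType="equirectangular" plus matching full/cropped dimensions is
# the minimum most 360 viewers need to treat a flat JPEG as a navigable
# panorama rather than an ordinary photo.
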
def write_xmp_to_jpg(input_path, output_path, width, height):
    """Write XMP metadata to a JPEG file following ExifTool's method."""
    with open(input_path, 'rb') as f:
        data = f.read()

    # A JPEG must start with the SOI marker (FF D8).
    if data[0:2] != b'\xFF\xD8':
        raise ValueError("Not a valid JPEG file")

    xmp_data = create_xmp_block(width, height)

    # Build the APP1 segment: marker, 2-byte big-endian length (which counts
    # itself), the XMP namespace identifier, then the XMP packet.
    app1_marker = b'\xFF\xE1'
    xmp_header = b'http://ns.adobe.com/xap/1.0/\x00'
    xmp_bytes = xmp_data.encode('utf-8')
    length = len(xmp_header) + len(xmp_bytes) + 2
    length_bytes = struct.pack('>H', length)

    # Splice the APP1 segment in directly after the SOI marker.
    output = bytearray()
    output.extend(data[0:2])
    output.extend(app1_marker)
    output.extend(length_bytes)
    output.extend(xmp_header)
    output.extend(xmp_bytes)
    output.extend(data[2:])

    with open(output_path, 'wb') as f:
        f.write(output)

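# Resulting byte layout (illustrative):
#
#   FF D8                                SOI marker, kept from the original
#   FF E1                                APP1 marker
#   <2-byte length>                      covers the namespace + packet + itself
#   http://ns.adobe.com/xap/1.0/\x00     XMP namespace identifier
#   <?xpacket ... ?>                     packet from create_xmp_block()
#   ...                                  rest of the original JPEG stream
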
def add_360_metadata(img):
    """Add 360 photo metadata to a PIL Image and return the path to the processed image."""
    try:
        # Viewers expect a 2:1 equirectangular frame; enforce 4096x2048.
        target_width, target_height = 4096, 2048
        if img.width != target_width or img.height != target_height:
            img = img.resize((target_width, target_height), Image.Resampling.LANCZOS)

        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_file:
            img.save(tmp_file.name, "JPEG", quality=95)

        # Rewrite the file in place with the GPano XMP block injected.
        write_xmp_to_jpg(tmp_file.name, tmp_file.name, img.width, img.height)

        return tmp_file.name

    except Exception as e:
        print(f"Error adding 360 metadata: {str(e)}")
        # Fall back to a plain JPEG without 360 metadata.
        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_file:
            img.save(tmp_file.name, "JPEG", quality=95)
            return tmp_file.name

def create_360_viewer_html(image_paths, audio_paths, output_path):
    """Create an HTML file with a 360 viewer and audio player for the given images and audio."""
    image_data_list = []
    for img_path in image_paths:
        with open(img_path, "rb") as f:
            img_data = base64.b64encode(f.read()).decode("utf-8")
        image_data_list.append(f"data:image/jpeg;base64,{img_data}")

    audio_data_list = []
    for audio_path in audio_paths:
        if audio_path:
            with open(audio_path, "rb") as f:
                audio_data = base64.b64encode(f.read()).decode("utf-8")
            audio_data_list.append(f"data:audio/wav;base64,{audio_data}")
        else:
            audio_data_list.append(None)

    html_content = f"""
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>360 Panorama Viewer with Audio</title>
    <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/pannellum@2.5.6/build/pannellum.css"/>
    <style>
        body {{
            margin: 0;
            overflow: hidden;
            font-family: Arial, sans-serif;
        }}
        #panorama {{
            width: 100vw;
            height: 80vh;
        }}
        .pnlm-hotspot.pnlm-info-hotspot {{
            background-color: rgba(0, 150, 255, 0.8);
            border-radius: 50%;
            width: 30px;
            height: 30px;
        }}
        .pnlm-hotspot.pnlm-info-hotspot .pnlm-sprite {{
            filter: brightness(0) invert(1);
        }}
        .pnlm-tooltip {{
            background-color: rgba(0, 0, 0, 0.7);
            color: white;
            border-radius: 3px;
            padding: 5px 10px;
        }}
        #controls {{
            position: absolute;
            top: 10px;
            right: 10px;
            z-index: 1000;
            background: rgba(0, 0, 0, 0.7);
            color: white;
            padding: 10px;
            border-radius: 5px;
            display: flex;
            flex-direction: column;
            gap: 10px;
        }}
        #audio-controls {{
            position: fixed;
            bottom: 0;
            left: 0;
            width: 100%;
            background: rgba(0, 0, 0, 0.8);
            color: white;
            padding: 15px;
            display: flex;
            flex-direction: column;
            align-items: center;
            z-index: 1000;
        }}
        #audio-player {{
            width: 80%;
            margin-bottom: 10px;
        }}
        #audio-info {{
            text-align: center;
            font-size: 14px;
        }}
        button {{
            background: #3498db;
            color: white;
            border: none;
            padding: 8px 15px;
            border-radius: 3px;
            cursor: pointer;
            margin: 5px;
        }}
        button:hover {{
            background: #2980b9;
        }}
        select {{
            padding: 5px;
            border-radius: 3px;
            border: 1px solid #ccc;
        }}
    </style>
</head>
<body>
    <div id="controls">
        <select id="image-selector">
            {"".join([f'<option value="{i}">Chunk {i+1}</option>' for i in range(len(image_data_list))])}
        </select>
    </div>

    <div id="panorama"></div>

    <div id="audio-controls">
        <audio id="audio-player" controls></audio>
        <div id="audio-info">No audio available for this chunk</div>
    </div>

    <script type="text/javascript" src="https://cdn.jsdelivr.net/npm/pannellum@2.5.6/build/pannellum.js"></script>
    <script>
        const images = {json.dumps(image_data_list)};
        const audioFiles = {json.dumps(audio_data_list)};
        let currentViewer = null;

        function loadPanorama(index) {{
            if (currentViewer) {{
                currentViewer.destroy();
            }}

            currentViewer = pannellum.viewer('panorama', {{
                "type": "equirectangular",
                "panorama": images[index],
                "autoLoad": true,
                "autoRotate": -2,
                "showZoomCtrl": true,
                "showFullscreenCtrl": true,
                "hfov": 100
            }});

            // Update audio player
            updateAudioPlayer(index);
        }}

        function updateAudioPlayer(index) {{
            const audioPlayer = document.getElementById('audio-player');
            const audioInfo = document.getElementById('audio-info');

            if (audioFiles[index]) {{
                audioPlayer.src = audioFiles[index];
                audioInfo.textContent = 'Playing audio for Chunk ' + (index + 1);
                // Try to play automatically (may be blocked by browser policies)
                audioPlayer.play().catch(e => {{
                    audioInfo.textContent = 'Click play to listen to audio for Chunk ' + (index + 1);
                }});
            }} else {{
                audioPlayer.src = '';
                audioInfo.textContent = 'No audio available for this chunk';
            }}
        }}

        // Load the first image initially
        loadPanorama(0);

        // Handle image selection changes
        document.getElementById('image-selector').addEventListener('change', function(e) {{
            const selectedIndex = parseInt(e.target.value);
            loadPanorama(selectedIndex);
        }});
    </script>
</body>
</html>
"""

    with open(output_path, 'w') as f:
        f.write(html_content)

    return output_path

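# Note on the exported viewer above: every panorama and music clip is inlined
# as a base64 data URI, so the downloaded HTML file is self-contained and
# works offline (apart from the Pannellum CDN assets), at the cost of roughly
# a 4/3 size overhead over the raw media.
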
def process_and_display(audio_input, generate_audio, chunk_duration):
    """Generator: the first yield shows a spinner, the second delivers all results."""
    if chunk_duration is None or chunk_duration <= 0:
        chunk_duration = 10

    # First yield: spinner visible, all chunk groups hidden, outputs cleared.
    yield [gr.HTML(f"""
    <div style="text-align: center; margin: 20px;">
        <p style="font-size: 18px; color: #4a4a4a;">Processing audio in {chunk_duration}-second chunks...</p>
        <div style="border: 4px solid #f3f3f3; border-top: 4px solid #3498db; border-radius: 50%; width: 30px; height: 30px; animation: spin 2s linear infinite; margin: 0 auto;"></div>
        <style>@keyframes spin {{ 0% {{ transform: rotate(0deg); }} 100% {{ transform: rotate(360deg); }} }}</style>
        <p style="font-size: 14px; color: #4a4a4a;">This may take several minutes depending on the audio length...</p>
    </div>
    """)] + [gr.Group(visible=False)] * len(group_components) + [None] * (len(output_containers) * 6) + [None, ""]

    results = get_predictions(audio_input, generate_audio, chunk_duration)

    outputs = []
    group_visibility = []
    all_360_images = []
    all_music_paths = []

    for i, result in enumerate(results):
        if i >= len(output_containers):
            # More chunks than pre-built result slots: the extras are processed
            # but cannot be displayed, so stop here to keep the output count fixed.
            break

        group_visibility.append(gr.Group(visible=True))
        outputs.extend([
            result['emotion'],
            result['transcription'],
            result['sentiment'],
            result['image'],
            result['image_360'],
            result['music']
        ])

        if result['image_360']:
            all_360_images.append(result['image_360'])
            all_music_paths.append(result['music'])

    # Hide any unused result slots.
    for i in range(len(results), len(output_containers)):
        group_visibility.append(gr.Group(visible=False))
        outputs.extend([None] * 6)

    # Bundle every panorama and music clip into one downloadable viewer page.
    viewer_html_path = None
    if all_360_images:
        with tempfile.NamedTemporaryFile(suffix=".html", delete=False) as tmp_file:
            viewer_html_path = create_360_viewer_html(all_360_images, all_music_paths, tmp_file.name)

    # Second yield: clear the spinner and publish the results.
    yield [gr.HTML("")] + group_visibility + outputs + [viewer_html_path, ""]

def clear_all():
    """Reset every UI component to its initial state.

    The return list must match clear_btn.click's outputs exactly:
    audio input, 20 groups, 20 x 6 chunk outputs, then the four trailing
    components (loading indicator, chunk duration, viewer file, JS output).
    """
    outputs = [None]  # audio input

    # Hide all chunk result groups.
    outputs.extend([gr.Group(visible=False)] * len(group_components))

    # Clear the six outputs of every chunk slot.
    outputs.extend([None] * (len(output_containers) * 6))

    outputs.append(gr.HTML(""))  # loading indicator
    outputs.append(10)           # chunk duration back to the default
    outputs.append(None)         # viewer HTML download
    outputs.append("")           # hidden JS output

    return outputs

def load_example_audio(example_name):
    """Placeholder for loading bundled example audio (currently unused)."""
    return None

custom_css = """
.download-section {
    background: rgba(255,255,255,1);
    padding: 25px;
    border-radius: 15px;
    border: 3px solid #764ba2;
    text-align: left;
    margin: 25px 0;
    box-shadow: 0 10px 30px rgba(0,0,0,0.15);
    position: relative;
    overflow: hidden;
}

.download-section::before {
    content: "";
    position: absolute;
    top: -50%;
    left: -50%;
    width: 200%;
    height: 200%;
    background: radial-gradient(circle, rgba(255,255,255,0.1) 0%, transparent 70%);
    animation: shimmer 3s infinite linear;
    pointer-events: none;
}

@keyframes shimmer {
    0% { transform: rotate(0deg); }
    100% { transform: rotate(360deg); }
}

.download-section h2 {
    color: white;
    font-size: 16px;
    margin-bottom: 15px;
    text-shadow: 1px 1px 3px rgba(0,0,0,0.3);
}

.download-section p {
    color: rgba(255,255,255,0.9);
    font-size: 16px;
    margin-bottom: 20px;
    line-height: 3.5;
}

.download-button {
    background: rgba(155,155,155,1) !important;
    color: white !important;
    border: none !important;
    padding: 12px 30px !important;
    border-radius: 0px !important;
    font-weight: bold !important;
    font-size: 16px !important;
    margin-top: 15px !important;
    transition: all 0.3s ease !important;
    cursor: pointer !important;
    display: inline-block !important;
}

.download-button:hover {
    transform: translateY(-3px) !important;
    box-shadow: 0 8px 20px rgba(0,0,0,0.6) !important;
}

.download-button:active {
    transform: translateY(1px) !important;
}

.download-icon {
    margin-right: 8px;
    font-size: 28px;
}

.feature-list {
    display: flex;
    justify-content: center;
    flex-wrap: wrap;
    gap: 15px;
    margin: 20px 0;
}

.feature-item {
    background: rgba(255,255,255,0.15);
    padding: 10px 15px;
    border-radius: 8px;
    display: flex;
    align-items: center;
    gap: 8px;
    color: white;
    font-size: 14px;
}

.feature-icon {
    font-size: 26px;
}

.viewer-preview {
    margin-top: 20px;
    border-radius: 10px;
    overflow: hidden;
    box-shadow: 0 5px 15px rgba(0,0,0,0.2);
    max-width: 400px;
    margin-left: auto;
    margin-right: auto;
}

.viewer-preview img {
    width: 100%;
    display: block;
}

.instructions {
    background: rgba(255,255,255,0.1);
    padding: 15px;
    border-radius: 8px;
    margin-top: 20px;
    text-align: left;
}

.instructions h3 {
    color: white;
    margin-top: 0;
    font-size: 16px;
}

.instructions ol {
    color: rgba(255,255,255,0.9);
    padding-left: 20px;
    margin-bottom: 0;
}

.instructions li {
    margin-bottom: 8px;
}
"""


with gr.Blocks(title="Affective Virtual Environments - Chunk-Based Processing", css=custom_css) as interface:
    gr.Markdown("# Bello")
    gr.Markdown(
        """
        ***Bello*** explores the affective subtleties of the human voice through the figure of **Lieutenant Bello**,
        the Chilean pilot who mysteriously disappeared in 1914 during a training flight over the Pacific coast.

        This space invites you to inhabit the unknown through emotion and the spoken word.
        Using multimodal speech emotion recognition techniques, the project analyzes acoustic, prosodic,
        and semantic parameters of spoken language to generate immersive 360° virtual environments.

        ### How to interact

        1. Record your voice (or upload an audio file) imagining what might have happened to Lieutenant Alejandro Bello.
        2. Set the duration of each chunk your recording will be split into.
        3. Check the box if you want music generated for each chunk.
        4. Generate your Affective Virtual Environment (**EVA**) and wait for the results.
        5. Download the HTML file.
        6. Open your creation in any web browser.

        ---
        **More information:**

        • Video tutorial: [How to use this space](https://youtu.be/eVD1lzwVhi8)

        • For more details about the project, visit: [www.emotional-machines.com](https://www.emotional-machines.com)
        """
    )

    with gr.Row():
        with gr.Column(scale=2):
            audio_input = gr.Audio(label="Input Audio", type="filepath", sources=["microphone", "upload"])

        with gr.Column(scale=1):
            chunk_duration_input = gr.Number(
                label="Chunk Duration (seconds)",
                value=10,
                minimum=1,
                maximum=60,
                step=1,
                info="Duration of each audio chunk to process (1-60 seconds)"
            )
            generate_audio_checkbox = gr.Checkbox(
                label="Generate Audio (may take longer)",
                value=False,
                info="Uncheck to skip music generation and speed up processing"
            )

    with gr.Row():
        process_btn = gr.Button("Generate", variant="primary")
        clear_btn = gr.Button("Clear All", variant="secondary")

    loading_indicator = gr.HTML("""
    <div id="loading" style="display: none; text-align: center; margin: 20px;">
        <p style="font-size: 18px; color: #4a4a4a;">Processing audio chunks...</p>
        <div style="border: 4px solid #f3f3f3; border-top: 4px solid #3498db; border-radius: 50%; width: 30px; height: 30px; animation: spin 2s linear infinite; margin: 0 auto;"></div>
        <style>@keyframes spin { 0% { transform: rotate(0deg); } 100% { transform: rotate(360deg); } }</style>
    </div>
    """)

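    # Gradio's component graph is fixed at build time, so 20 hidden result
    # groups are created up front and toggled visible per processed chunk;
    # recordings that split into more than 20 chunks are still processed, but
    # only the first 20 appear in the UI.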
    output_containers = []
    group_components = []

    for i in range(20):
        with gr.Group(visible=False) as chunk_group:
            gr.Markdown(f"### Chunk {i+1} Results")
            with gr.Row():
                emotion_output = gr.Label(label="Acoustic Emotion Prediction")
                transcription_output = gr.Label(label="Transcribed Text")
                sentiment_output = gr.Label(label="Sentiment Analysis")
            with gr.Row():
                image_output = gr.Image(label="Generated Equirectangular Image")
                image_360_output = gr.File(label="Download 360 Image", type="filepath")
            with gr.Row():
                audio_output = gr.Audio(label="Generated Music")
            gr.HTML("<hr style='margin: 20px 0; border: 1px solid #ccc;'>")

        group_components.append(chunk_group)
        output_containers.append({
            'emotion': emotion_output,
            'transcription': transcription_output,
            'sentiment': sentiment_output,
            'image': image_output,
            'image_360': image_360_output,
            'music': audio_output
        })

    with gr.Group(visible=True, elem_classes="download-section") as download_group:
        viewer_html_output = gr.File(
            label="Once processing is finished, download your EVA here 🚀",
            type="filepath",
            interactive=False,
            elem_classes="download-button"
        )

    js_output = gr.HTML(visible=False)

    process_btn.click(
        fn=process_and_display,
        inputs=[audio_input, generate_audio_checkbox, chunk_duration_input],
        outputs=[loading_indicator] + group_components + [comp for container in output_containers for comp in [
            container['emotion'],
            container['transcription'],
            container['sentiment'],
            container['image'],
            container['image_360'],
            container['music']
        ]] + [viewer_html_output, js_output]
    )

    clear_btn.click(
        fn=clear_all,
        inputs=[],
        outputs=[audio_input] + group_components + [comp for container in output_containers for comp in [
            container['emotion'],
            container['transcription'],
            container['sentiment'],
            container['image'],
            container['image_360'],
            container['music']
        ]] + [loading_indicator, chunk_duration_input, viewer_html_output, js_output]
    )

interface.launch(share=True)