Spaces:
Sleeping
Sleeping
File size: 43,293 Bytes
bf98ab3 db2648a bf98ab3 86bcbf0 bf98ab3 48a6187 0056a5d d81cc2a 0056a5d c752745 234fd40 0056a5d c752745 0056a5d 1bb5d46 bf98ab3 1bb5d46 bf98ab3 1bb5d46 bf98ab3 d81cc2a bf98ab3 1bb5d46 bf98ab3 1bb5d46 bf98ab3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 
528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 
1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 |
import gradio as gr
import pyvista as pv
from pyvista import examples
import numpy as np
import librosa
import requests
from io import BytesIO
from PIL import Image
import os
from tensorflow.keras.models import load_model
from faster_whisper import WhisperModel
import random
from textblob import TextBlob
import torch
import scipy.io.wavfile
from transformers import AutoProcessor, MusicgenForConditionalGeneration
import tempfile
import base64
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import soundfile as sf
from pydub import AudioSegment
import math
import json
import imageio
from PIL import Image, ImageFilter
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
import base64
from io import BytesIO
import struct
import cv2
import os
# Limit OpenMP to four worker threads and raise TensorFlow's native log
# threshold.
# NOTE(review): these assignments run after tensorflow/torch/numpy have already
# been imported above; presumably they need to be set before those imports to
# take full effect -- confirm and move if so.
os.environ["OMP_NUM_THREADS"] = "4"
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
def load_emotion_model(model_path):
    """Load the Keras speech-emotion model stored at *model_path*.

    Returns the loaded model, or ``None`` when loading fails for any reason
    (the error is printed rather than raised so the app can still start).
    """
    try:
        loaded = load_model(model_path)
        print("Emotion model loaded successfully")
        return loaded
    except Exception as err:
        print("Error loading emotion prediction model:", err)
        return None


# Module-level emotion classifier shared by predict_emotion_from_audio().
model_path = 'mymodel_SER_LSTM_RAVDESS.h5'
model = load_emotion_model(model_path)
# Initialize WhisperModel
# Speech-to-text model used by transcribe(): the "small" checkpoint, run on
# CPU with int8 compute.
model_size = "small"
model2 = WhisperModel(model_size, device="cpu", compute_type="int8")
def load_musicgen_model():
    """Load the MusicGen processor and model and pick a compute device.

    Returns a ``(processor, model, device)`` tuple where *device* is
    ``"cuda"`` when CUDA is available and ``"cpu"`` otherwise. If anything
    fails, the error is printed and ``(None, None, None)`` is returned.
    """
    checkpoint = "facebook/musicgen-small"
    try:
        if torch.cuda.is_available():
            target_device = "cuda"
        else:
            target_device = "cpu"
        text_processor = AutoProcessor.from_pretrained(checkpoint)
        generator = MusicgenForConditionalGeneration.from_pretrained(checkpoint)
        generator.to(target_device)
        print("MusicGen model loaded successfully")
        return text_processor, generator, target_device
    except Exception as err:
        print("Error loading MusicGen model:", err)
        return None, None, None


# Module-level MusicGen handles shared by generate_music().
processor, music_model, device = load_musicgen_model()
def chunk_audio(audio_path, chunk_duration=10):
    """Split an audio file into fixed-length WAV chunks.

    Args:
        audio_path: Path of the audio file to split.
        chunk_duration: Length of each chunk in seconds; must be positive.

    Returns:
        A ``(chunk_paths, num_chunks)`` tuple. Each chunk is exported to its
        own temporary ``.wav`` file which the caller is responsible for
        deleting. When the audio is shorter than one chunk, or when anything
        goes wrong (including an invalid duration), the original file is
        returned as the single chunk: ``([audio_path], 1)``.
    """
    chunk_files = []
    try:
        # Validate before paying for decoding the audio.
        if chunk_duration <= 0:
            raise ValueError("Chunk duration must be positive")

        audio = AudioSegment.from_file(audio_path)
        duration_ms = len(audio)
        chunk_ms = chunk_duration * 1000

        if chunk_duration > duration_ms / 1000:
            # If chunk duration is longer than audio, return the whole audio
            return [audio_path], 1

        num_chunks = math.ceil(duration_ms / chunk_ms)
        for i in range(num_chunks):
            start_ms = i * chunk_ms
            end_ms = min((i + 1) * chunk_ms, duration_ms)

            # Reserve a temp file name, close the handle, then export to it.
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
                chunk_path = tmp_file.name
            # Track the path before exporting so a failed export is cleaned up.
            chunk_files.append(chunk_path)
            audio[start_ms:end_ms].export(chunk_path, format="wav")

        return chunk_files, num_chunks
    except Exception as e:
        print("Error chunking audio:", e)
        # Do not leave partially written chunk files behind.
        for leftover in chunk_files:
            try:
                os.unlink(leftover)
            except OSError:
                # Best-effort cleanup; the fallback result below is still valid.
                pass
        # Return original file as single chunk if chunking fails
        return [audio_path], 1
def transcribe(wav_filepath):
    """Transcribe *wav_filepath* with the module-level Whisper model.

    Returns the concatenated text of every recognised segment, or the literal
    string ``"Transcription failed"`` if transcription raises.
    """
    try:
        recognised, _info = model2.transcribe(wav_filepath, beam_size=5)
        pieces = (part.text for part in recognised)
        return "".join(pieces)
    except Exception as err:
        print("Error transcribing audio:", err)
        return "Transcription failed"
def extract_mfcc(wav_file_name):
    """Compute a 40-value MFCC feature vector for an audio file.

    The audio is loaded with librosa, 40 MFCCs are computed, and the
    transposed coefficient matrix is averaged along axis 0. Returns ``None``
    (after printing the error) when loading or feature extraction fails.
    """
    try:
        signal, sample_rate = librosa.load(wav_file_name)
        coefficients = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=40)
        return np.mean(coefficients.T, axis=0)
    except Exception as err:
        print("Error extracting MFCC features:", err)
        return None
# Maps the classifier's output index to an emotion label.
emotions = {0: 'neutral', 1: 'calm', 2: 'happy', 3: 'sad', 4: 'angry', 5: 'fearful', 6: 'disgust', 7: 'surprised'}


def predict_emotion_from_audio(wav_filepath):
    """Predict the acoustic emotion of an audio file.

    Args:
        wav_filepath: Path of the audio file to classify.

    Returns:
        One of the labels in ``emotions``, or a diagnostic string:
        ``"Model not loaded"`` when the module-level model is ``None``,
        ``"Error: Unable to extract features"`` when MFCC extraction fails,
        ``"Unknown emotion"`` for an out-of-range class index, and
        ``"Prediction error"`` when anything raises.
    """
    try:
        if model is None:
            return "Model not loaded"

        test_point = extract_mfcc(wav_filepath)
        if test_point is None:
            return "Error: Unable to extract features"

        # Shape passed positionally: the ``newshape`` keyword is deprecated
        # in recent NumPy releases.
        test_point = np.reshape(test_point, (1, 40, 1))
        predictions = model.predict(test_point)
        # Plain int so the dictionary lookup does not depend on NumPy scalars.
        predicted_emotion_label = int(np.argmax(predictions[0]))
        return emotions.get(predicted_emotion_label, "Unknown emotion")
    except Exception as e:
        print("Error predicting emotion:", e)
        return "Prediction error"
def analyze_sentiment(text):
    """Classify *text* as positive, negative or neutral using TextBlob.

    Returns a ``(label, polarity)`` tuple. Polarity above 0.1 is
    ``"positive"``, below -0.1 is ``"negative"``, anything in between is
    ``"neutral"``. Empty or whitespace-only text, and any failure during
    analysis, yield ``("neutral", 0.0)``.
    """
    try:
        if not text or not text.strip():
            return "neutral", 0.0

        polarity = TextBlob(text).sentiment.polarity
        if polarity > 0.1:
            return "positive", polarity
        if polarity < -0.1:
            return "negative", polarity
        return "neutral", polarity
    except Exception as err:
        print("Error analyzing sentiment:", err)
        return "neutral", 0.0
def get_image_prompt(sentiment, transcribed_text, chunk_idx, total_chunks):
    """Build the text-to-image prompt for a chunk from its text sentiment.

    Args:
        sentiment: ``"positive"`` or ``"negative"``; any other value selects
            the neutral prompt.
        transcribed_text: Transcription embedded in the prompt as its subject.
        chunk_idx: Zero-based chunk index. Currently unused; kept so existing
            callers keep working.
        total_chunks: Total number of chunks. Currently unused; kept so
            existing callers keep working.

    Returns:
        The prompt string.
    """
    # The previous ``base_prompt`` local was never used and only made the
    # function fail on non-integer chunk indices, so it has been removed.
    if sentiment == "positive":
        return f"Generate a non-figurative equirectangular 360 abstract texture of:{transcribed_text}. Use low histogram frequency in bright bins, dominant color in high RGB range, and high brightness and color variance. Apply high-frequency texture with strong filter energy, pronounced gradient magnitude, and strong local contrast. Use high spatial complexity, increased horizontal and vertical symmetry, high edge density, bright gray levels, and high contrast. Emphasize rich visual structure, color variation, and texture intensity across spatial composition."
    if sentiment == "negative":
        return f"Generate a non-figurative equirectangular 360 abstract texture of:{transcribed_text}. Use high histogram frequency in dark bins, dominant color in low RGB range, and low brightness and color variance. Apply low-frequency texture with low filter energy, weak gradient magnitude, and low local contrast. Use low spatial complexity, reduced horizontal and vertical symmetry, low edge density, dark gray levels, and moderate contrast. Emphasize coarse structure and limited variation in color, texture, and spatial distribution."
    # neutral
    return f"Generate a non-figurative abstract equirectangular 360 abstract texture of:{transcribed_text}. Use a balanced histogram frequency across bins, dominant color in a mid RGB range, and moderate brightness and color variance. Apply medium-frequency texture with moderate filter energy, standard gradient magnitude, and average local contrast. Use medium spatial complexity, balanced horizontal and vertical symmetry, medium edge density, mid-range gray levels, and standard contrast. Emphasize naturalistic structure and typical variation in color, texture, and spatial distribution."
def get_music_prompt(emotion, transcribed_text, chunk_idx, total_chunks):
    """Build the MusicGen text prompt for a chunk from its acoustic emotion.

    Args:
        emotion: Emotion label; matched case-insensitively against the eight
            known labels. Any other value produces a generic prompt that
            names the emotion verbatim.
        transcribed_text: Transcription appended to the prompt.
        chunk_idx: Zero-based chunk index. Currently unused; kept so existing
            callers keep working.
        total_chunks: Total number of chunks. Currently unused; kept so
            existing callers keep working.

    Returns:
        The prompt string.
    """
    # The previous ``base_prompt`` local was never used and only made the
    # function fail on non-integer chunk indices, so it has been removed.
    emotion_prompts = {
        'neutral': f"Generate a neutral soundtrack with balanced energy and smooth spectral profile. Use moderate tempo (~100 BPM), onset rate around 2.8/sec, spectral centroid near 1000 Hz, and low dissonance. Keep pitch salience moderate (0.50) and loudness stable (~0.70 dB). Maintain low harmonic change rate (~0.05/sec) and tonal entropy 1.5 for equilibrium. Emphasize tonal balance, steady dynamics, and calm tonal centers. The music should feel even, ambient, and unobtrusive, complementing: {transcribed_text}.",
        'calm': f"Generate a calm soundtrack with a slow tempo (~85 BPM), low onset rate (~2.2/sec), soft spectral centroid (~850 Hz), and smooth timbral evolution. Use low dissonance, high spectral flatness, and gentle pitch salience (~0.48). Keep loudness low (~0.65 dB) with infrequent harmonic changes (~0.04/sec) and stable tonality (Krumhansl value 0.80, major mode). The music should evoke tranquility and serenity through warm timbres, sustained harmonies, and flowing textures inspired by: {transcribed_text}.",
        'happy': f"Generate a happy soundtrack with fast tempo (~127 BPM), dense rhythmic activity (~4.2 onsets/sec), and bright timbre (spectral centroid ~1321 Hz). Use variable dissonance and peaked spectral kurtosis to create vivid texture. Maintain pitch salience (~0.54), loudness (~0.90 dB), and chord change rate (~0.07/sec). Keep tonal entropy moderate (1.95) and Krumhansl value (0.83, major mode). The music should convey joy and positivity through energetic rhythms, ornamented melodic contours, and harmonically grounded progressions inspired by: {transcribed_text}.",
        'sad': f"Generate a sad soundtrack with slow tempo (~72 BPM), sparse onset rate (~2.0/sec), and dark timbre (spectral centroid ~720 Hz). Use moderate dissonance, low spectral kurtosis, and soft pitch salience (~0.45). Keep loudness subdued (~0.60 dB) with rare harmonic changes (~0.05/sec) and low tonal entropy (~1.4). Emphasize minor mode with gentle phrasing and sustained harmonic textures. The music should evoke sadness, intimacy, and reflection in relation to: {transcribed_text}.",
        'angry': f"Generate an angry soundtrack with moderately fast tempo (~120 BPM), onset rate (~3.4/sec), and bright, sharp timbre (spectral centroid ~2002 Hz). Use flat spectral kurtosis and stable dissonance. Maintain clear pitch salience (~0.58), high loudness (~0.96 dB), and frequent chord changes (~0.10/sec). Set tonal entropy to 2.57 and Krumhansl key profile (~0.54, minor mode). The music should express anger through strong rhythmic drive, aggressive articulation, and harmonically unstable progressions that reflect: {transcribed_text}.",
        'fearful': f"Generate a fearful soundtrack with irregular tempo (~95 BPM), fluctuating onset rate (~3.0/sec), and high spectral variability (centroid ~1750 Hz). Use unstable dissonance, low pitch salience (~0.42), and dynamic loudness (~0.80 dB). Increase chord change irregularity (~0.09/sec) and tonal entropy (2.4, minor mode). Emphasize eerie textures, spatial tension, and spectral modulation. The music should evoke suspense, fear, and anticipation inspired by: {transcribed_text}.",
        'disgust': f"Generate a disgusted soundtrack with moderate tempo (~90 BPM), irregular onset rate (~2.5/sec), and dark, rough timbre (spectral centroid ~950 Hz). Use dissonant harmonies, unstable spectral kurtosis, and low pitch salience (~0.40). Keep loudness (~0.75 dB) and tonal entropy (~2.2, minor mode). The music should evoke discomfort and unease through distorted textures, rough intervals, and unstable harmonic motion reflecting: {transcribed_text}.",
        'surprised': f"Generate a surprised soundtrack with variable tempo (~110 BPM), fluctuating onset rate (~3.8/sec), and dynamic spectral centroid (~1500 Hz). Use high spectral kurtosis and pitch salience (~0.57) to accent sudden contrasts. Loudness should vary (~0.85 dB) with irregular chord changes (~0.11/sec) and moderate tonal entropy (~2.0, major mode). The music should evoke surprise and wonder through abrupt transitions, playful motifs, and expressive timbral changes inspired by: {transcribed_text}.",
    }
    return emotion_prompts.get(
        emotion.lower(),
        f"Create background music with {emotion} atmosphere that represents: {transcribed_text}",
    )
def generate_music(transcribed_text, emotion_prediction, chunk_idx, total_chunks):
    """Generate a short soundtrack with MusicGen for one chunk.

    The prompt is chosen from the acoustic emotion prediction and the
    transcription, fed to the module-level MusicGen processor/model, and the
    first generated channel is peak-normalised and written to a temporary
    WAV file.

    Args:
        transcribed_text: Transcription of the chunk.
        emotion_prediction: Emotion label used to select the prompt.
        chunk_idx: Zero-based chunk index (forwarded to the prompt builder).
        total_chunks: Total number of chunks (forwarded to the prompt builder).

    Returns:
        Path of the temporary ``.wav`` file, or ``None`` when MusicGen is not
        loaded or generation fails. The caller owns the file.
    """
    try:
        if processor is None or music_model is None:
            return None

        prompt = get_music_prompt(emotion_prediction, transcribed_text, chunk_idx, total_chunks)

        # Limit prompt length to avoid model issues
        # NOTE(review): the emotion templates put the transcription at the
        # very end, so cutting at 200 characters usually drops it entirely --
        # confirm whether that is intended.
        if len(prompt) > 200:
            prompt = prompt[:200] + "..."

        inputs = processor(
            text=[prompt],
            padding=True,
            return_tensors="pt",
        ).to(device)

        audio_values = music_model.generate(**inputs, max_new_tokens=512)

        sampling_rate = music_model.config.audio_encoder.sampling_rate
        audio_data = audio_values[0, 0].cpu().numpy()

        # Peak-normalise, but leave an all-zero (silent) signal untouched:
        # dividing by a zero peak would fill the file with NaNs.
        peak = np.max(np.abs(audio_data))
        if peak > 0:
            audio_data = audio_data / peak

        # Reserve a temp file name, close the handle, then write to it.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            wav_path = tmp_file.name
        scipy.io.wavfile.write(wav_path, rate=sampling_rate, data=audio_data)
        return wav_path
    except Exception as e:
        print("Error generating music:", e)
        return None
# --- DeepAI Image Generation (Text2Img) ---
# API key for the DeepAI endpoints; when unset, the image functions fall back
# to local processing.
api_key = os.getenv("DeepAI_api_key")


def _resize_with_opencv(image, target_width, target_height):
    """Resize *image* locally with OpenCV Lanczos interpolation."""
    resized = cv2.resize(
        np.array(image),
        (target_width, target_height),
        interpolation=cv2.INTER_LANCZOS4,
    )
    return Image.fromarray(resized)


def upscale_image(image, target_width=4096, target_height=2048):
    """Upscale a PIL image to ``target_width`` x ``target_height``.

    Uses DeepAI's torch-srgan super-resolution endpoint when an API key is
    configured, then resizes the result to the exact target size if needed.
    Without a key, or if the request fails in any way, the image is resized
    locally with OpenCV Lanczos interpolation instead.

    Args:
        image: Source PIL image.
        target_width: Output width in pixels.
        target_height: Output height in pixels.

    Returns:
        A PIL image of the requested size.
    """
    if not api_key:
        print("No API key available for upscaling")
        return _resize_with_opencv(image, target_width, target_height)

    # Without a timeout a stalled connection would block the request forever.
    request_timeout = 120
    tmp_path = None
    try:
        # Reserve a temp file name, close the handle, then save to it.
        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_input:
            tmp_path = tmp_input.name
        image.save(tmp_path, "JPEG", quality=95)

        # Context manager so the upload handle is always closed.
        with open(tmp_path, 'rb') as image_file:
            response = requests.post(
                "https://api.deepai.org/api/torch-srgan",
                files={'image': image_file},
                headers={'api-key': api_key},
                timeout=request_timeout,
            )
        data = response.json()

        if 'output_url' not in data:
            print("Error in DeepAI upscaling response:", data)
            return _resize_with_opencv(image, target_width, target_height)

        img_resp = requests.get(data['output_url'], timeout=request_timeout)
        upscaled_image = Image.open(BytesIO(img_resp.content))

        # Ensure the image meets our target dimensions
        if upscaled_image.size != (target_width, target_height):
            upscaled_image = upscaled_image.resize(
                (target_width, target_height),
                Image.Resampling.LANCZOS,
            )
        return upscaled_image
    except Exception as e:
        print(f"Error upscaling image with DeepAI: {e}")
        return _resize_with_opencv(image, target_width, target_height)
    finally:
        # Remove the upload file on every path, not only on success.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                # Best-effort cleanup; a leftover temp file is not fatal.
                pass
def generate_image(sentiment_prediction, transcribed_text, chunk_idx, total_chunks):
    """Generate the panorama texture for one chunk.

    With an API key configured, a sentiment-specific prompt is sent to
    DeepAI's text2img endpoint and the resulting image is downloaded. Without
    a key, or when the response carries no ``output_url``, a white 1024x512
    placeholder is used. The image is then passed through ``upscale_image``.

    Args:
        sentiment_prediction: Sentiment label used to select the prompt.
        transcribed_text: Transcription embedded in the prompt.
        chunk_idx: Zero-based chunk index (forwarded to the prompt builder).
        total_chunks: Total number of chunks (forwarded to the prompt builder).

    Returns:
        A PIL image. If anything raises, a white 1024x512 placeholder is
        returned without upscaling.
    """
    placeholder_size = (1024, 512)
    # Without a timeout a stalled connection would block the request forever.
    request_timeout = 120
    try:
        base_image = None
        if api_key:
            prompt = get_image_prompt(sentiment_prediction, transcribed_text, chunk_idx, total_chunks)
            response = requests.post(
                "https://api.deepai.org/api/text2img",
                data={
                    'text': prompt,
                    'width': 1024,
                    'height': 512,
                    'image_generator_version': 'hd',
                },
                headers={'api-key': api_key},
                timeout=request_timeout,
            )
            data = response.json()
            if 'output_url' in data:
                img_resp = requests.get(data['output_url'], timeout=request_timeout)
                base_image = Image.open(BytesIO(img_resp.content))
            else:
                print("Error in DeepAI response:", data)

        if base_image is None:
            # No key, or the API returned no image: use a white placeholder.
            base_image = Image.new('RGB', placeholder_size, color='white')

        # Upscale the image for better quality in 360 viewer
        return upscale_image(base_image)
    except Exception as e:
        print("Error generating image:", e)
        return Image.new('RGB', placeholder_size, color='white')
def process_chunk(chunk_path, chunk_idx, total_chunks, generate_audio=True):
    """Run the full analysis/generation pipeline on one audio chunk.

    Steps: acoustic emotion prediction, transcription, text sentiment, image
    generation (driven by sentiment), 360 metadata injection and, when
    *generate_audio* is true, music generation (driven by emotion).

    Returns a dict with the keys ``chunk_index`` (1-based), ``emotion``,
    ``transcription``, ``sentiment``, ``image``, ``image_360`` and ``music``.
    If any step raises, a fallback dict with the same keys is returned.
    """
    try:
        acoustic_emotion = predict_emotion_from_audio(chunk_path)
        spoken_text = transcribe(chunk_path)
        text_sentiment, _polarity = analyze_sentiment(spoken_text)

        # The image follows the text sentiment; the music follows the
        # acoustic emotion.
        panorama = generate_image(text_sentiment, spoken_text, chunk_idx, total_chunks)
        panorama_360_path = add_360_metadata(panorama)

        soundtrack_path = (
            generate_music(spoken_text, acoustic_emotion, chunk_idx, total_chunks)
            if generate_audio
            else None
        )

        return dict(
            chunk_index=chunk_idx + 1,
            emotion=acoustic_emotion,
            transcription=spoken_text,
            sentiment=text_sentiment,
            image=panorama,                # shown directly in Gradio
            image_360=panorama_360_path,   # JPEG path carrying 360 metadata
            music=soundtrack_path,
        )
    except Exception as err:
        print(f"Error processing chunk {chunk_idx + 1}:", err)
        # Same keys as the success case so callers can index unconditionally.
        return dict(
            chunk_index=chunk_idx + 1,
            emotion="Error",
            transcription="Transcription failed",
            sentiment="Sentiment: error",
            image=Image.new('RGB', (1440, 770), color='white'),
            image_360=None,
            music=None,
        )
def get_predictions(audio_input, generate_audio=True, chunk_duration=10):
    """Split *audio_input* into chunks and process each one.

    Args:
        audio_input: Path of the audio file to analyse.
        generate_audio: Whether music should be generated for each chunk.
        chunk_duration: Chunk length in seconds.

    Returns:
        A list with one result dict per chunk, in chunk order (see
        ``process_chunk`` for the keys).
    """
    chunk_files, total_chunks = chunk_audio(audio_input, chunk_duration)
    results = []
    try:
        for i, chunk_path in enumerate(chunk_files):
            print(f"Processing chunk {i+1}/{total_chunks} ({chunk_duration}s each)")
            results.append(process_chunk(chunk_path, i, total_chunks, generate_audio))
    finally:
        # Clean up temporary chunk files even if processing raised.
        for chunk_path in chunk_files:
            if chunk_path == audio_input:
                # Don't delete original input file
                continue
            try:
                os.unlink(chunk_path)
            except OSError as e:
                # Cleanup is best-effort, but report it instead of hiding it.
                print(f"Could not remove temporary chunk {chunk_path}: {e}")
    return results
def create_xmp_block(width, height):
    """Build the GPano XMP packet describing a full equirectangular panorama.

    *width* and *height* are written as both the full-panorama size and the
    cropped-area size, with a zero crop offset. Returns the packet as a
    string with newline-separated lines and no trailing newline.
    """
    packet_lines = [
        '<?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?>',
        '<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="ExifTool">',
        '<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">',
        '<rdf:Description rdf:about=""',
        'xmlns:GPano="http://ns.google.com/photos/1.0/panorama/"',
        'GPano:ProjectionType="equirectangular"',
        'GPano:UsePanoramaViewer="True"',
        f'GPano:FullPanoWidthPixels="{width}"',
        f'GPano:FullPanoHeightPixels="{height}"',
        f'GPano:CroppedAreaImageWidthPixels="{width}"',
        f'GPano:CroppedAreaImageHeightPixels="{height}"',
        'GPano:CroppedAreaLeftPixels="0"',
        'GPano:CroppedAreaTopPixels="0"/>',
        '</rdf:RDF>',
        '</x:xmpmeta>',
        '<?xpacket end="w"?>',
    ]
    return "\n".join(packet_lines)
def write_xmp_to_jpg(input_path, output_path, width, height):
    """Copy a JPEG to *output_path* with a GPano XMP APP1 segment inserted.

    The segment is placed directly after the two-byte SOI marker and the rest
    of the original file follows unchanged. *input_path* and *output_path*
    may be the same file because the input is read completely first.

    Raises:
        ValueError: If the input does not start with the JPEG SOI marker.
    """
    with open(input_path, 'rb') as src:
        jpeg_bytes = src.read()

    soi_marker, remainder = jpeg_bytes[0:2], jpeg_bytes[2:]
    if soi_marker != b'\xFF\xD8':
        raise ValueError("Not a valid JPEG file")

    # APP1 payload: the XMP namespace header followed by the UTF-8 packet.
    payload = b'http://ns.adobe.com/xap/1.0/\x00' + create_xmp_block(width, height).encode('utf-8')
    # The segment length field counts its own two bytes as well as the payload.
    segment = b'\xFF\xE1' + struct.pack('>H', len(payload) + 2) + payload

    with open(output_path, 'wb') as dst:
        dst.write(soi_marker + segment + remainder)
def add_360_metadata(img):
    """Save a PIL image as a 4096x2048 JPEG tagged as a 360 panorama.

    The image is converted to RGB (JPEG cannot store alpha or palette
    modes), resized to 4096x2048 if necessary, written to a temporary JPEG
    and then tagged in place with GPano XMP metadata.

    Args:
        img: PIL image to process.

    Returns:
        Path of the temporary ``.jpg`` file. If resizing or metadata
        injection fails, a plain JPEG without 360 metadata is written and
        its path returned instead. The caller owns the file.
    """
    # Without this, saving RGBA or palette images as JPEG raises, and the
    # fallback below would fail in exactly the same way.
    if img.mode != "RGB":
        img = img.convert("RGB")

    try:
        target_width, target_height = 4096, 2048
        if img.width != target_width or img.height != target_height:
            img = img.resize((target_width, target_height), Image.Resampling.LANCZOS)

        # Reserve a temp file name and close the handle before the file is
        # written and then rewritten in place.
        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_file:
            jpeg_path = tmp_file.name
        img.save(jpeg_path, "JPEG", quality=95)
        write_xmp_to_jpg(jpeg_path, jpeg_path, img.width, img.height)
        return jpeg_path
    except Exception as e:
        print(f"Error adding 360 metadata: {str(e)}")
        # Fallback: a plain JPEG of the image without 360 metadata.
        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_file:
            fallback_path = tmp_file.name
        img.save(fallback_path, "JPEG", quality=95)
        return fallback_path
def create_360_viewer_html(image_paths, audio_paths, output_path):
    """Write a self-contained Pannellum viewer page for the given chunks.

    Every image (and every audio file that is present) is embedded as a
    base64 data URI, so the page does not reference the original files.
    A falsy entry in *audio_paths* becomes ``null`` in the page, which the
    script shows as "No audio available for this chunk".

    Args:
        image_paths: Paths of the panorama JPEGs, one per selectable chunk.
        audio_paths: Paths of the matching audio files, looked up in the page
            by the same index as the image.
        output_path: Where to write the HTML file.

    Returns:
        *output_path*.
    """

    def as_data_uri(path, mime_type):
        # Read the file and wrap its base64 text in a data: URI.
        with open(path, "rb") as handle:
            encoded = base64.b64encode(handle.read()).decode("utf-8")
        return f"data:{mime_type};base64,{encoded}"

    image_data_list = [as_data_uri(path, "image/jpeg") for path in image_paths]
    audio_data_list = [
        as_data_uri(path, "audio/wav") if path else None
        for path in audio_paths
    ]

    # Pieces interpolated into the template below.
    option_tags = "".join(
        f'<option value="{i}">Chunk {i+1}</option>'
        for i in range(len(image_data_list))
    )
    images_json = json.dumps(image_data_list)
    audio_json = json.dumps(audio_data_list)

    html_content = f"""
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>360 Panorama Viewer with Audio</title>
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/pannellum@2.5.6/build/pannellum.css"/>
<style>
body {{
margin: 0;
overflow: hidden;
font-family: Arial, sans-serif;
}}
#panorama {{
width: 100vw;
height: 80vh;
}}
.pnlm-hotspot.pnlm-info-hotspot {{
background-color: rgba(0, 150, 255, 0.8);
border-radius: 50%;
width: 30px;
height: 30px;
}}
.pnlm-hotspot.pnlm-info-hotspot .pnlm-sprite {{
filter: brightness(0) invert(1);
}}
.pnlm-tooltip {{
background-color: rgba(0, 0, 0, 0.7);
color: white;
border-radius: 3px;
padding: 5px 10px;
}}
#controls {{
position: absolute;
top: 10px;
right: 10px;
z-index: 1000;
background: rgba(0, 0, 0, 0.7);
color: white;
padding: 10px;
border-radius: 5px;
display: flex;
flex-direction: column;
gap: 10px;
}}
#audio-controls {{
position: fixed;
bottom: 0;
left: 0;
width: 100%;
background: rgba(0, 0, 0, 0.8);
color: white;
padding: 15px;
display: flex;
flex-direction: column;
align-items: center;
z-index: 1000;
}}
#audio-player {{
width: 80%;
margin-bottom: 10px;
}}
#audio-info {{
text-align: center;
font-size: 14px;
}}
button {{
background: #3498db;
color: white;
border: none;
padding: 8px 15px;
border-radius: 3px;
cursor: pointer;
margin: 5px;
}}
button:hover {{
background: #2980b9;
}}
select {{
padding: 5px;
border-radius: 3px;
border: 1px solid #ccc;
}}
</style>
</head>
<body>
<div id="controls">
<select id="image-selector">
{option_tags}
</select>
</div>
<div id="panorama"></div>
<div id="audio-controls">
<audio id="audio-player" controls></audio>
<div id="audio-info">No audio available for this chunk</div>
</div>
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/pannellum@2.5.6/build/pannellum.js"></script>
<script>
const images = {images_json};
const audioFiles = {audio_json};
let currentViewer = null;
function loadPanorama(index) {{
if (currentViewer) {{
currentViewer.destroy();
}}
currentViewer = pannellum.viewer('panorama', {{
"type": "equirectangular",
"panorama": images[index],
"autoLoad": true,
"autoRotate": -2,
"showZoomCtrl": true,
"showFullscreenCtrl": true,
"hfov": 100
}});
// Update audio player
updateAudioPlayer(index);
}}
function updateAudioPlayer(index) {{
const audioPlayer = document.getElementById('audio-player');
const audioInfo = document.getElementById('audio-info');
if (audioFiles[index]) {{
audioPlayer.src = audioFiles[index];
audioInfo.textContent = 'Playing audio for Chunk ' + (index + 1);
// Try to play automatically (may be blocked by browser policies)
audioPlayer.play().catch(e => {{
audioInfo.textContent = 'Click play to listen to audio for Chunk ' + (index + 1);
}});
}} else {{
audioPlayer.src = '';
audioInfo.textContent = 'No audio available for this chunk';
}}
}}
// Load the first image initially
loadPanorama(0);
// Handle image selection changes
document.getElementById('image-selector').addEventListener('change', function(e) {{
const selectedIndex = parseInt(e.target.value);
loadPanorama(selectedIndex);
}});
</script>
</body>
</html>
"""

    with open(output_path, 'w') as page:
        page.write(html_content)
    return output_path
def process_and_display(audio_input, generate_audio, chunk_duration):
    """Generator that drives the UI while an audio file is processed.

    Yields twice, each time a flat list laid out as: loading indicator, one
    visibility update per group, six values per output container (emotion,
    transcription, sentiment, image, 360 image path, music path), the viewer
    HTML path and a trailing empty string. The first yield shows a spinner
    and hides every group; the second carries the results.

    Args:
        audio_input: Path of the audio file to analyse.
        generate_audio: Whether music should be generated for each chunk.
        chunk_duration: Chunk length in seconds; ``None`` or a non-positive
            value falls back to 10.
    """
    # Validate chunk duration
    if chunk_duration is None or chunk_duration <= 0:
        chunk_duration = 10

    slot_count = len(output_containers)

    # Show loading indicator
    loading_html = f"""
<div style="text-align: center; margin: 20px;">
<p style="font-size: 18px; color: #4a4a4a;">Processing audio in {chunk_duration}-second chunks...</p>
<div style="border: 4px solid #f3f3f3; border-top: 4px solid #3498db; border-radius: 50%; width: 30px; height: 30px; animation: spin 2s linear infinite; margin: 0 auto;"></div>
<style>@keyframes spin {{ 0% {{ transform: rotate(0deg); }} 100% {{ transform: rotate(360deg); }} }}</style>
<p style="font-size: 14px; color: #4a4a4a;">This may take several minutes depending on the audio length...</p>
</div>
"""
    yield (
        [gr.HTML(loading_html)]
        + [gr.Group(visible=False)] * len(group_components)
        + [None] * (slot_count * 6)
        + [None, ""]
    )

    results = get_predictions(audio_input, generate_audio, chunk_duration)

    # Only as many chunks as there are containers can be shown. Emitting
    # values for the overflow would make this yield longer than the first
    # one and no longer match the component list.
    if len(results) > slot_count:
        print(f"Only the first {slot_count} of {len(results)} chunks can be displayed")

    outputs = []
    group_visibility = []
    all_360_images = []   # 360-tagged images for the viewer
    all_music_paths = []  # matching music paths (None when no music)

    for result in results[:slot_count]:
        group_visibility.append(gr.Group(visible=True))
        outputs.extend([
            result['emotion'],
            result['transcription'],
            result['sentiment'],
            result['image'],
            result['image_360'],
            result['music'],
        ])
        # Keep images and music index-aligned for the viewer.
        if result['image_360']:
            all_360_images.append(result['image_360'])
            all_music_paths.append(result['music'])

    # Hide remaining containers
    unused_slots = slot_count - len(group_visibility)
    group_visibility.extend(gr.Group(visible=False) for _ in range(unused_slots))
    outputs.extend([None] * (unused_slots * 6))

    # Create 360 viewer HTML if we have 360 images
    viewer_html_path = None
    if all_360_images:
        # Reserve a temp file name, close the handle, then write the page.
        with tempfile.NamedTemporaryFile(suffix=".html", delete=False) as tmp_file:
            html_target = tmp_file.name
        viewer_html_path = create_360_viewer_html(all_360_images, all_music_paths, html_target)

    # Hide loading indicator and show results
    yield [gr.HTML("")] + group_visibility + outputs + [viewer_html_path, ""]
def clear_all():
    """Return the reset value for every component wired to the clear action.

    Order: audio input, one hidden group per group component, six empty
    values per output container, empty loading indicator, chunk duration
    (10), example selector, viewer, and the JavaScript output string.
    """
    hidden_groups = [gr.Group(visible=False)] * len(group_components)
    blank_fields = [None] * (len(output_containers) * 6)
    return [
        None,             # audio input
        *hidden_groups,   # group components, all invisible
        *blank_fields,    # every output container field
        gr.HTML(""),      # loading indicator
        10,               # chunk duration
        None,             # example selector
        None,             # viewer
        "",               # JavaScript output
    ]
def load_example_audio(example_name):
    """Placeholder for resolving an example name to an audio file path.

    Not implemented yet: always returns ``None`` regardless of
    *example_name*.
    """
    # TODO: map example_name to the path of the bundled example audio file.
    return None
# Stylesheet handed to gr.Blocks(css=custom_css) when the interface is built.
# ".download-section" and ".download-button" are attached to the download
# group and the viewer file component through their elem_classes.
# NOTE(review): the remaining selectors (.download-icon, .feature-*,
# .viewer-preview, .instructions) have no matching elem_classes in the UI
# code reviewed here -- possibly unused; confirm before removing.
# NOTE(review): the alpha value 255 in the rgba(...) backgrounds is outside
# CSS's 0-1 alpha range and is presumably clamped to fully opaque; if so,
# the white h2/p text inside .download-section sits on a white background --
# verify this is intended.
custom_css = """
.download-section {
background: rgba(255,255,255,255);
padding: 25px;
border-radius: 15px;
border: 3px solid #764ba2;
text-align: left;
margin: 25px 0;
box-shadow: 0 10px 30px rgba(0,0,0,0.15);
position: relative;
overflow: hidden;
}
.download-section::before {
content: "";
position: absolute;
top: -50%;
left: -50%;
width: 200%;
height: 200%;
background: radial-gradient(circle, rgba(255,255,255,0.1) 0%, transparent 70%);
animation: shimmer 3s infinite linear;
pointer-events: none;
}
@keyframes shimmer {
0% { transform: rotate(0deg); }
100% { transform: rotate(360deg); }
}
.download-section h2 {
color: white;
font-size: 16px;
margin-bottom: 15px;
text-shadow: 1px 1px 3px rgba(0,0,0,0.3);
}
.download-section p {
color: rgba(255,255,255,0.9);
font-size: 16px;
margin-bottom: 20px;
line-height: 3.5;
}
.download-button {
background: rgba(155,155,155,255) !important;
color: white !important;
border: none !important;
padding: 12px 30px !important;
border-radius: 0px !important;
font-weight: bold !important;
font-size: 16px !important;
margin-top: 15px !important;
transition: all 0.3s ease !important;
cursor: pointer !important;
display: inline-block !important;
}
.download-button:hover {
transform: translateY(-3px) !important;
box-shadow: 0 8px 20px rgba(0,0,0,0.6) !important;
}
.download-button:active {
transform: translateY(1px) !important;
}
.download-icon {
margin-right: 8px;
font-size: 28px;
}
.feature-list {
display: flex;
justify-content: center;
flex-wrap: wrap;
gap: 15px;
margin: 20px 0;
}
.feature-item {
background: rgba(255,255,255,0.15);
padding: 10px 15px;
border-radius: 8px;
display: flex;
align-items: center;
gap: 8px;
color: white;
font-size: 14px;
}
.feature-icon {
font-size: 26px;
}
.viewer-preview {
margin-top: 20px;
border-radius: 10px;
overflow: hidden;
box-shadow: 0 5px 15px rgba(0,0,0,0.2);
max-width: 400px;
margin-left: auto;
margin-right: auto;
}
.viewer-preview img {
width: 100%;
display: block;
}
.instructions {
background: rgba(255,255,255,0.1);
padding: 15px;
border-radius: 8px;
margin-top: 20px;
text-align: left;
}
.instructions h3 {
color: white;
margin-top: 0;
font-size: 16px;
}
.instructions ol {
color: rgba(255,255,255,0.9);
padding-left: 20px;
margin-bottom: 0;
}
.instructions li {
margin-bottom: 8px;
}
"""
# Create the Gradio interface with proper output handling.
#
# Per-chunk result fields, in the exact order the event handlers emit them
# (six values per chunk slot). Keeping the order in one place means the
# "Generate" and "Clear All" wirings below cannot drift apart.
CHUNK_FIELDS = ('emotion', 'transcription', 'sentiment', 'image', 'image_360', 'music')

# Number of chunk result slots built up front. The slots start hidden and
# are revealed by the processing callback; 20 slots accommodate different
# chunk durations.
CHUNK_SLOT_COUNT = 20

with gr.Blocks(title="Affective Virtual Environments - Chunked Processing", css=custom_css) as interface:
    gr.Markdown("# The Emotional Machine")
    gr.Markdown(
        """
**The Emotional Machine** is a digital media project that generates virtual environments using multimodal speech emotion recognition as its main mode of interaction.
### How to interact
1. Record your voice or upload an audio file.
2. Define the length to chunk your sample.
3. Use the checkbox if you want to generate audio for each chunk.
4. Generate your Affective Virtual Environment and wait for the results.
5. Download the HTML file.
6. Open your creation using any web browser.
---
**Learn more:**
• Video Tutorial: [How to Use this space ](https://youtu.be/eVD1lzwVhi8)
• For more information about the project, visit: [www.emotional-machines.com](https://www.emotional-machines.com)
"""
    )

    # --- Inputs ---------------------------------------------------------
    with gr.Row():
        with gr.Column(scale=2):
            audio_input = gr.Audio(label="Input Audio", type="filepath", sources=["microphone", "upload"])
            # NOTE: the example-audio selector is currently disabled; it is
            # kept here (with `load_example` below) so it can be re-enabled.
            # Add example audio selection
            # example_selector = gr.Dropdown(
            # label="Select Example Audio",
            # choices=["Happy Speech", "Sad Story", "Neutral News"],
            # value=None,
            # info="Choose from pre-recorded example speeches"
            # )
            # Add button to load selected example
            #load_example_btn = gr.Button("Load Example", variant="secondary")
        with gr.Column(scale=1):
            # Length of each audio segment handed to the pipeline.
            chunk_duration_input = gr.Number(
                label="Chunk Duration (seconds)",
                value=10,
                minimum=1,
                maximum=60,
                step=1,
                info="Duration of each audio segment to process (1-60 seconds)"
            )
            # Music generation is opt-in because it lengthens processing.
            generate_audio_checkbox = gr.Checkbox(
                label="Generate Audio (may take longer)",
                value=False,
                info="Uncheck to skip music generation and speed up processing"
            )

    with gr.Row():
        process_btn = gr.Button("Generate", variant="primary")
        clear_btn = gr.Button("Clear All", variant="secondary")

    # Loading indicator; both event handlers overwrite it with empty HTML.
    loading_indicator = gr.HTML("""
<div id="loading" style="display: none; text-align: center; margin: 20px;">
<p style="font-size: 18px; color: #4a4a4a;">Processing audio chunks...</p>
<div style="border: 4px solid #f3f3f3; border-top: 4px solid #3498db; border-radius: 50%; width: 30px; height: 30px; animation: spin 2s linear infinite; margin: 0 auto;"></div>
<style>@keyframes spin { 0% { transform: rotate(0deg); } 100% { transform: rotate(360deg); } }</style>
</div>
""")

    # --- Per-chunk result slots -------------------------------------------
    output_containers = []  # one dict of result components per slot
    group_components = []   # the gr.Group wrapping each slot (toggled visible)
    for i in range(CHUNK_SLOT_COUNT):
        with gr.Group(visible=False) as chunk_group:
            gr.Markdown(f"### Chunk {i+1} Results")
            with gr.Row():
                emotion_output = gr.Label(label="Acoustic Emotion Prediction")
                transcription_output = gr.Label(label="Transcribed Text")
                sentiment_output = gr.Label(label="Sentimental Analysis")
            with gr.Row():
                image_output = gr.Image(label="Generated Equirectangular Image")
                image_360_output = gr.File(label="Download 360 Image", type="filepath")
            with gr.Row():
                audio_output = gr.Audio(label="Generated Music")
            gr.HTML("<hr style='margin: 20px 0; border: 1px solid #ccc;'>")
        group_components.append(chunk_group)
        output_containers.append({
            'emotion': emotion_output,
            'transcription': transcription_output,
            'sentiment': sentiment_output,
            'image': image_output,
            'image_360': image_360_output,
            'music': audio_output
        })

    # --- Download section for the generated 360 viewer -------------------
    with gr.Group(visible=True, elem_classes="download-section") as download_group:
        gr.Markdown("""
""")
        # File component that receives the generated viewer HTML path.
        viewer_html_output = gr.File(
            label=" Once processing is complete, download your AVE from here 🚀",
            type="filepath",
            interactive=False,
            elem_classes="download-button"
        )

    # Hidden HTML component for JavaScript execution.
    js_output = gr.HTML(visible=False)

    def load_example(example_name):
        """Return ``(audio_path, example_name)`` for the chosen example.

        Returns ``(None, None)`` when no example name is given. Not wired to
        any event at the moment: the button that used it is commented out.
        """
        if not example_name:
            return None, None
        # Get the path to the example audio file
        example_path = load_example_audio(example_name)
        # Return the example path to update the audio component
        return example_path, example_name

    # Flatten the per-slot components once, slot by slot and in CHUNK_FIELDS
    # order, so both handlers below share a single, identical ordering.
    chunk_output_components = [
        container[field]
        for container in output_containers
        for field in CHUNK_FIELDS
    ]

    # --- Event wiring -----------------------------------------------------
    # Output order must match what each handler emits positionally.
    process_btn.click(
        fn=process_and_display,
        inputs=[audio_input, generate_audio_checkbox, chunk_duration_input],
        outputs=(
            [loading_indicator]
            + group_components
            + chunk_output_components
            + [viewer_html_output, js_output]
        )
    )
    clear_btn.click(
        fn=clear_all,
        inputs=[],
        outputs=(
            [audio_input]
            + group_components
            + chunk_output_components
            + [loading_indicator, chunk_duration_input, viewer_html_output, js_output]
        )
    )
    #load_example_btn.click(
    # fn=load_example,
    # inputs=[example_selector],
    # outputs=[audio_input, example_selector]
    #)
# Check if we're running on Hugging Face Spaces (detected through the
# SPACE_ID environment variable).
is_spaces = os.getenv('SPACE_ID') is not None
# Only request a public share link when NOT on Spaces. Previously the flag
# above was computed but ignored and share=True was passed unconditionally.
interface.launch(share=not is_spaces)