import os
# Set threading and TensorFlow logging knobs before the heavy libraries load;
# set after import, they may be read too late to take effect
os.environ["OMP_NUM_THREADS"] = "4"
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
import gradio as gr
import pyvista as pv
from pyvista import examples
import numpy as np
import librosa
import requests
from io import BytesIO
from PIL import Image, ImageFilter
from tensorflow.keras.models import load_model
from faster_whisper import WhisperModel
import random
from textblob import TextBlob
import torch
import scipy.io.wavfile
from transformers import AutoProcessor, MusicgenForConditionalGeneration
import tempfile
import base64
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import soundfile as sf
from pydub import AudioSegment
import math
import json
import imageio
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
import struct
import cv2
# Load the emotion prediction model
def load_emotion_model(model_path):
    try:
        model = load_model(model_path)
        print("Emotion model loaded successfully")
        return model
    except Exception as e:
        print("Error loading emotion prediction model:", e)
        return None
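# Pretrained speech-emotion classifier; judging by the filename, an LSTM
# trained on RAVDESS that takes 40 mean MFCCs shaped (1, 40, 1)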
model_path = 'mymodel_SER_LSTM_RAVDESS.h5'
model = load_emotion_model(model_path)
# Initialize WhisperModel
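# "small" trades a little accuracy for speed; compute_type="int8" runs
# quantized inference to keep CPU transcription fast and memory-light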
model_size = "small"
model2 = WhisperModel(model_size, device="cpu", compute_type="int8")
# Load MusicGen model
def load_musicgen_model():
    try:
        device = "cuda" if torch.cuda.is_available() else "cpu"
        processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
        music_model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
        music_model.to(device)
        print("MusicGen model loaded successfully")
        return processor, music_model, device
    except Exception as e:
        print("Error loading MusicGen model:", e)
        return None, None, None
processor, music_model, device = load_musicgen_model()
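# Note: musicgen-small outputs mono audio at 32 kHz; at its roughly 50 Hz token
# rate, the max_new_tokens=512 used below comes to about 10 seconds of music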
# Function to chunk audio into segments
def chunk_audio(audio_path, chunk_duration=10):
    """Split audio into chunks and return a list of chunk file paths."""
    try:
        # Load audio file
        audio = AudioSegment.from_file(audio_path)
        duration_ms = len(audio)
        chunk_ms = chunk_duration * 1000
        # Validate chunk duration
        if chunk_duration <= 0:
            raise ValueError("Chunk duration must be positive")
        if chunk_duration > duration_ms / 1000:
            # If the requested chunk is longer than the audio, return the whole file
            return [audio_path], 1
        chunk_files = []
        # Calculate number of chunks
        num_chunks = math.ceil(duration_ms / chunk_ms)
        for i in range(num_chunks):
            start_ms = i * chunk_ms
            end_ms = min((i + 1) * chunk_ms, duration_ms)
            # Extract chunk and save it to a temporary WAV file
            chunk = audio[start_ms:end_ms]
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
                chunk.export(tmp_file.name, format="wav")
                chunk_files.append(tmp_file.name)
        return chunk_files, num_chunks
    except Exception as e:
        print("Error chunking audio:", e)
        # Fall back to treating the original file as a single chunk
        return [audio_path], 1
# Function to transcribe audio
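# beam_size=5 uses beam search, which is slower than greedy decoding but gives
# more stable transcriptions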
def transcribe(wav_filepath):
    try:
        segments, _ = model2.transcribe(wav_filepath, beam_size=5)
        return "".join([segment.text for segment in segments])
    except Exception as e:
        print("Error transcribing audio:", e)
        return "Transcription failed"
# Function to extract MFCC features from audio
def extract_mfcc(wav_file_name):
    try:
        y, sr = librosa.load(wav_file_name)
        mfccs = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40).T, axis=0)
        return mfccs
    except Exception as e:
        print("Error extracting MFCC features:", e)
        return None
# Emotions dictionary
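# Label order mirrors the RAVDESS emotion codes 01-08, zero-indexed here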
emotions = {0: 'neutral', 1: 'calm', 2: 'happy', 3: 'sad', 4: 'angry', 5: 'fearful', 6: 'disgust', 7: 'surprised'}
# Function to predict emotion from audio
def predict_emotion_from_audio(wav_filepath):
    try:
        if model is None:
            return "Model not loaded"
        test_point = extract_mfcc(wav_filepath)
        if test_point is not None:
            test_point = np.reshape(test_point, newshape=(1, 40, 1))
            predictions = model.predict(test_point)
            predicted_emotion_label = np.argmax(predictions[0])
            return emotions.get(predicted_emotion_label, "Unknown emotion")
        else:
            return "Error: Unable to extract features"
    except Exception as e:
        print("Error predicting emotion:", e)
        return "Prediction error"
# Function to analyze sentiment from text
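# TextBlob polarity lies in [-1.0, 1.0]; the +/-0.1 band below is a dead zone
# so weakly polarized text is treated as neutral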
def analyze_sentiment(text):
    try:
        if not text or text.strip() == "":
            return "neutral", 0.0
        analysis = TextBlob(text)
        polarity = analysis.sentiment.polarity
        if polarity > 0.1:
            sentiment = "positive"
        elif polarity < -0.1:
            sentiment = "negative"
        else:
            sentiment = "neutral"
        return sentiment, polarity
    except Exception as e:
        print("Error analyzing sentiment:", e)
        return "neutral", 0.0
# Function to get image prompt based on sentiment
def get_image_prompt(sentiment, transcribed_text, chunk_idx, total_chunks):
    if sentiment == "positive":
        return f"Generate a non-figurative equirectangular 360 abstract texture of:{transcribed_text}. Use low histogram frequency in bright bins, dominant color in high RGB range, and high brightness and color variance. Apply high-frequency texture with strong filter energy, pronounced gradient magnitude, and strong local contrast. Use high spatial complexity, increased horizontal and vertical symmetry, high edge density, bright gray levels, and high contrast. Emphasize rich visual structure, color variation, and texture intensity across spatial composition."
    elif sentiment == "negative":
        return f"Generate a non-figurative equirectangular 360 abstract texture of:{transcribed_text}. Use high histogram frequency in dark bins, dominant color in low RGB range, and low brightness and color variance. Apply low-frequency texture with low filter energy, weak gradient magnitude, and low local contrast. Use low spatial complexity, reduced horizontal and vertical symmetry, low edge density, dark gray levels, and moderate contrast. Emphasize coarse structure and limited variation in color, texture, and spatial distribution."
    else:  # neutral
        return f"Generate a non-figurative equirectangular 360 abstract texture of:{transcribed_text}. Use a balanced histogram frequency across bins, dominant color in a mid RGB range, and moderate brightness and color variance. Apply medium-frequency texture with moderate filter energy, standard gradient magnitude, and average local contrast. Use medium spatial complexity, balanced horizontal and vertical symmetry, medium edge density, mid-range gray levels, and standard contrast. Emphasize naturalistic structure and typical variation in color, texture, and spatial distribution."
# Function to get music prompt based on emotion
def get_music_prompt(emotion, transcribed_text, chunk_idx, total_chunks):
    emotion_prompts = {
        'neutral': f"Generate a neutral soundtrack with balanced energy and smooth spectral profile. Use moderate tempo (~100 BPM), onset rate around 2.8/sec, spectral centroid near 1000 Hz, and low dissonance. Keep pitch salience moderate (0.50) and loudness stable (~0.70 dB). Maintain low harmonic change rate (~0.05/sec) and tonal entropy 1.5 for equilibrium. Emphasize tonal balance, steady dynamics, and calm tonal centers. The music should feel even, ambient, and unobtrusive, complementing: {transcribed_text}.",
        'calm': f"Generate a calm soundtrack with a slow tempo (~85 BPM), low onset rate (~2.2/sec), soft spectral centroid (~850 Hz), and smooth timbral evolution. Use low dissonance, high spectral flatness, and gentle pitch salience (~0.48). Keep loudness low (~0.65 dB) with infrequent harmonic changes (~0.04/sec) and stable tonality (Krumhansl value 0.80, major mode). The music should evoke tranquility and serenity through warm timbres, sustained harmonies, and flowing textures inspired by: {transcribed_text}.",
        'happy': f"Generate a happy soundtrack with fast tempo (~127 BPM), dense rhythmic activity (~4.2 onsets/sec), and bright timbre (spectral centroid ~1321 Hz). Use variable dissonance and peaked spectral kurtosis to create vivid texture. Maintain pitch salience (~0.54), loudness (~0.90 dB), and chord change rate (~0.07/sec). Keep tonal entropy moderate (1.95) and Krumhansl value (0.83, major mode). The music should convey joy and positivity through energetic rhythms, ornamented melodic contours, and harmonically grounded progressions inspired by: {transcribed_text}.",
        'sad': f"Generate a sad soundtrack with slow tempo (~72 BPM), sparse onset rate (~2.0/sec), and dark timbre (spectral centroid ~720 Hz). Use moderate dissonance, low spectral kurtosis, and soft pitch salience (~0.45). Keep loudness subdued (~0.60 dB) with rare harmonic changes (~0.05/sec) and low tonal entropy (~1.4). Emphasize minor mode with gentle phrasing and sustained harmonic textures. The music should evoke sadness, intimacy, and reflection in relation to: {transcribed_text}.",
        'angry': f"Generate an angry soundtrack with moderately fast tempo (~120 BPM), onset rate (~3.4/sec), and bright, sharp timbre (spectral centroid ~2002 Hz). Use flat spectral kurtosis and stable dissonance. Maintain clear pitch salience (~0.58), high loudness (~0.96 dB), and frequent chord changes (~0.10/sec). Set tonal entropy to 2.57 and Krumhansl key profile (~0.54, minor mode). The music should express anger through strong rhythmic drive, aggressive articulation, and harmonically unstable progressions that reflect: {transcribed_text}.",
        'fearful': f"Generate a fearful soundtrack with irregular tempo (~95 BPM), fluctuating onset rate (~3.0/sec), and high spectral variability (centroid ~1750 Hz). Use unstable dissonance, low pitch salience (~0.42), and dynamic loudness (~0.80 dB). Increase chord change irregularity (~0.09/sec) and tonal entropy (2.4, minor mode). Emphasize eerie textures, spatial tension, and spectral modulation. The music should evoke suspense, fear, and anticipation inspired by: {transcribed_text}.",
        'disgust': f"Generate a disgusted soundtrack with moderate tempo (~90 BPM), irregular onset rate (~2.5/sec), and dark, rough timbre (spectral centroid ~950 Hz). Use dissonant harmonies, unstable spectral kurtosis, and low pitch salience (~0.40). Keep loudness (~0.75 dB) and tonal entropy (~2.2, minor mode). The music should evoke discomfort and unease through distorted textures, rough intervals, and unstable harmonic motion reflecting: {transcribed_text}.",
        'surprised': f"Generate a surprised soundtrack with variable tempo (~110 BPM), fluctuating onset rate (~3.8/sec), and dynamic spectral centroid (~1500 Hz). Use high spectral kurtosis and pitch salience (~0.57) to accent sudden contrasts. Loudness should vary (~0.85 dB) with irregular chord changes (~0.11/sec) and moderate tonal entropy (~2.0, major mode). The music should evoke surprise and wonder through abrupt transitions, playful motifs, and expressive timbral changes inspired by: {transcribed_text}."
    }
    return emotion_prompts.get(
        emotion.lower(),
        f"Create background music with {emotion} atmosphere that represents: {transcribed_text}"
    )
# Function to generate music with MusicGen (using acoustic emotion prediction)
def generate_music(transcribed_text, emotion_prediction, chunk_idx, total_chunks):
    try:
        if processor is None or music_model is None:
            return None
        # Get specific prompt based on emotion
        prompt = get_music_prompt(emotion_prediction, transcribed_text, chunk_idx, total_chunks)
        # Limit prompt length to avoid model issues
        if len(prompt) > 200:
            prompt = prompt[:200] + "..."
        inputs = processor(
            text=[prompt],
            padding=True,
            return_tensors="pt",
        ).to(device)
        # Generate audio
        audio_values = music_model.generate(**inputs, max_new_tokens=512)
        # Convert to numpy array and sample rate
        sampling_rate = music_model.config.audio_encoder.sampling_rate
        audio_data = audio_values[0, 0].cpu().numpy()
        # Normalize audio data (guard against an all-zero buffer)
        peak = np.max(np.abs(audio_data))
        if peak > 0:
            audio_data = audio_data / peak
        # Create a temporary file to save the audio
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            scipy.io.wavfile.write(tmp_file.name, rate=sampling_rate, data=audio_data)
            return tmp_file.name
    except Exception as e:
        print("Error generating music:", e)
        return None
# --- DeepAI Image Generation (Text2Img) ---
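# Expects a DeepAI API key in the DeepAI_api_key environment variable; without
# it, generation falls back to white placeholder images and upscaling falls
# back to local OpenCV Lanczos resizing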
api_key = os.getenv("DeepAI_api_key")
# Function to upscale an image via DeepAI's Torch-SRGAN API, with an OpenCV
# Lanczos fallback when no API key is set or the request fails
def upscale_image(image, target_width=4096, target_height=2048):
    """
    Upscale image using DeepAI's Torch-SRGAN API for super resolution.
    Falls back to OpenCV Lanczos resizing if the API is unavailable.
    """
    def _lanczos_fallback():
        img_array = np.array(image)
        upscaled = cv2.resize(
            img_array,
            (target_width, target_height),
            interpolation=cv2.INTER_LANCZOS4
        )
        return Image.fromarray(upscaled)
    try:
        if not api_key:
            print("No API key available for upscaling")
            return _lanczos_fallback()
        # Save the image to a temporary file
        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_input:
            image.save(tmp_input.name, "JPEG", quality=95)
        try:
            # Make request to DeepAI torch-srgan API
            with open(tmp_input.name, 'rb') as f:
                response = requests.post(
                    "https://api.deepai.org/api/torch-srgan",
                    files={'image': f},
                    headers={'api-key': api_key}
                )
        finally:
            # Clean up the temporary file whether or not the request succeeded
            os.unlink(tmp_input.name)
        data = response.json()
        if 'output_url' in data:
            # Download the upscaled image
            img_resp = requests.get(data['output_url'])
            upscaled_image = Image.open(BytesIO(img_resp.content))
            # Ensure the image meets our target dimensions
            if upscaled_image.size != (target_width, target_height):
                upscaled_image = upscaled_image.resize(
                    (target_width, target_height),
                    Image.Resampling.LANCZOS
                )
            return upscaled_image
        print("Error in DeepAI upscaling response:", data)
        return _lanczos_fallback()
    except Exception as e:
        print(f"Error upscaling image with DeepAI: {e}")
        return _lanczos_fallback()
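# 1024x512 keeps the 2:1 aspect ratio equirectangular 360 viewers expect;
# upscaling then targets 4096x2048 for a sharper panorama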
# Function to generate an image with DeepAI text2img based on sentiment
def generate_image(sentiment_prediction, transcribed_text, chunk_idx, total_chunks):
    try:
        if not api_key:
            # Fall back to a white image if no API key is set
            base_image = Image.new('RGB', (1024, 512), color='white')
        else:
            # Get specific prompt based on sentiment
            prompt = get_image_prompt(sentiment_prediction, transcribed_text, chunk_idx, total_chunks)
            # Make request to DeepAI text2img API
            response = requests.post(
                "https://api.deepai.org/api/text2img",
                data={
                    'text': prompt,
                    'width': 1024,
                    'height': 512,
                    'image_generator_version': 'hd'
                },
                headers={'api-key': api_key}
            )
            data = response.json()
            if 'output_url' in data:
                # Download the generated image
                img_resp = requests.get(data['output_url'])
                base_image = Image.open(BytesIO(img_resp.content))
            else:
                print("Error in DeepAI response:", data)
                # Use a fallback image
                base_image = Image.new('RGB', (1024, 512), color='white')
        # Upscale the image for better quality in the 360 viewer
        return upscale_image(base_image)
    except Exception as e:
        print("Error generating image:", e)
        # Return a fallback image
        return Image.new('RGB', (1024, 512), color='white')
# Function to process a single chunk
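# Per-chunk pipeline: acoustic emotion (drives the music prompt) ->
# transcription -> text sentiment (drives the image prompt)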
def process_chunk(chunk_path, chunk_idx, total_chunks, generate_audio=True):
    try:
        # Get acoustic emotion prediction (for music)
        emotion_prediction = predict_emotion_from_audio(chunk_path)
        # Get transcribed text
        transcribed_text = transcribe(chunk_path)
        # Analyze sentiment of transcribed text (for image)
        sentiment, polarity = analyze_sentiment(transcribed_text)
        # Generate image using SENTIMENT analysis with specific prompt
        image = generate_image(sentiment, transcribed_text, chunk_idx, total_chunks)
        # Add 360 metadata to the image
        image_with_360_path = add_360_metadata(image)
        # Generate music only if audio generation is enabled
        music_path = None
        if generate_audio:
            music_path = generate_music(transcribed_text, emotion_prediction, chunk_idx, total_chunks)
        return {
            'chunk_index': chunk_idx + 1,
            'emotion': emotion_prediction,
            'transcription': transcribed_text,
            'sentiment': sentiment,
            'image': image,  # Original image for display in Gradio
            'image_360': image_with_360_path,  # Image with 360 metadata
            'music': music_path
        }
    except Exception as e:
        print(f"Error processing chunk {chunk_idx + 1}:", e)
        # Return a fallback result with all required keys
        return {
            'chunk_index': chunk_idx + 1,
            'emotion': "Error",
            'transcription': "Transcription failed",
            'sentiment': "error",
            'image': Image.new('RGB', (1440, 770), color='white'),
            'image_360': None,
            'music': None
        }
# Function to get predictions for all chunks
def get_predictions(audio_input, generate_audio=True, chunk_duration=10):
    # Chunk the audio into segments
    chunk_files, total_chunks = chunk_audio(audio_input, chunk_duration)
    results = []
    # Process each chunk
    for i, chunk_path in enumerate(chunk_files):
        print(f"Processing chunk {i+1}/{total_chunks} ({chunk_duration}s each)")
        result = process_chunk(chunk_path, i, total_chunks, generate_audio)
        results.append(result)
    # Clean up temporary chunk files
    for chunk_path in chunk_files:
        try:
            if chunk_path != audio_input:  # Don't delete the original input file
                os.unlink(chunk_path)
        except OSError:
            pass
    return results
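# Minimal usage sketch (hypothetical path) for a quick local test:
#   results = get_predictions("speech.wav", generate_audio=True, chunk_duration=10)
#   for r in results:
#       print(r['chunk_index'], r['emotion'], r['sentiment'], r['transcription'])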
def create_xmp_block(width, height):
    """Create XMP metadata block following ExifTool's exact format."""
    # Standard GPano tag set for a full, uncropped equirectangular panorama
    xmp = (
        f'<?xpacket begin="\ufeff" id="W5M0MpCehiHzreSzNTczkc9d"?>\n'
        f'<x:xmpmeta xmlns:x="adobe:ns:meta/">\n'
        f'  <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">\n'
        f'    <rdf:Description rdf:about="" xmlns:GPano="http://ns.google.com/photos/1.0/panorama/">\n'
        f'      <GPano:ProjectionType>equirectangular</GPano:ProjectionType>\n'
        f'      <GPano:FullPanoWidthPixels>{width}</GPano:FullPanoWidthPixels>\n'
        f'      <GPano:FullPanoHeightPixels>{height}</GPano:FullPanoHeightPixels>\n'
        f'      <GPano:CroppedAreaImageWidthPixels>{width}</GPano:CroppedAreaImageWidthPixels>\n'
        f'      <GPano:CroppedAreaImageHeightPixels>{height}</GPano:CroppedAreaImageHeightPixels>\n'
        f'      <GPano:CroppedAreaLeftPixels>0</GPano:CroppedAreaLeftPixels>\n'
        f'      <GPano:CroppedAreaTopPixels>0</GPano:CroppedAreaTopPixels>\n'
        f'    </rdf:Description>\n'
        f'  </rdf:RDF>\n'
        f'</x:xmpmeta>\n'
        f'<?xpacket end="w"?>'
    )
    return xmp