jfforero committed on
Commit 2f2ec24 · verified · 1 Parent(s): a585b58

first commit

Files changed (1)
  1. app.py +423 -0
app.py ADDED
@@ -0,0 +1,423 @@
import gradio as gr
import pyvista as pv
from pyvista import examples
import numpy as np
import librosa
import requests
from io import BytesIO
from PIL import Image
import os
from tensorflow.keras.models import load_model
from faster_whisper import WhisperModel
import random
from textblob import TextBlob
import torch
import scipy.io.wavfile
from transformers import AutoProcessor, MusicgenForConditionalGeneration
import tempfile
import base64
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import soundfile as sf
from pydub import AudioSegment
import math
import json

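# Runtime dependencies implied by the imports above (an editorial sketch, not a
# pinned list from the repo): gradio, pyvista, numpy, librosa, requests, pillow,
# tensorflow, faster-whisper, textblob, torch, scipy, transformers, plotly,
# soundfile, and pydub; pydub additionally needs ffmpeg available on the system
# PATH to decode non-WAV uploads.
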
# Load the emotion prediction model
def load_emotion_model(model_path):
    try:
        model = load_model(model_path)
        print("Emotion model loaded successfully")
        return model
    except Exception as e:
        print("Error loading emotion prediction model:", e)
        return None

model_path = 'mymodel_SER_LSTM_RAVDESS.h5'
model = load_emotion_model(model_path)

# Initialize WhisperModel
model_size = "small"
model2 = WhisperModel(model_size, device="cpu", compute_type="int8")

# Load MusicGen model
def load_musicgen_model():
    try:
        device = "cuda" if torch.cuda.is_available() else "cpu"
        processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
        music_model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
        music_model.to(device)
        print("MusicGen model loaded successfully")
        return processor, music_model, device
    except Exception as e:
        print("Error loading MusicGen model:", e)
        return None, None, None

processor, music_model, device = load_musicgen_model()

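# Note: the SER weights file 'mymodel_SER_LSTM_RAVDESS.h5' is expected next to
# app.py. Each loader above swallows its own failure, so the app still starts if
# a model is missing and the corresponding step falls back to a placeholder
# result instead of raising.
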
# Function to chunk audio into 5-second segments
def chunk_audio(audio_path, chunk_duration=5):
    """Split audio into 5-second chunks and return list of chunk file paths"""
    try:
        # Load audio file
        audio = AudioSegment.from_file(audio_path)
        duration_ms = len(audio)
        chunk_ms = chunk_duration * 1000

        chunks = []
        chunk_files = []

        # Calculate number of chunks
        num_chunks = math.ceil(duration_ms / chunk_ms)

        for i in range(num_chunks):
            start_ms = i * chunk_ms
            end_ms = min((i + 1) * chunk_ms, duration_ms)

            # Extract chunk
            chunk = audio[start_ms:end_ms]
            chunks.append(chunk)

            # Save chunk to temporary file
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
                chunk.export(tmp_file.name, format="wav")
                chunk_files.append(tmp_file.name)

        return chunk_files, num_chunks

    except Exception as e:
        print("Error chunking audio:", e)
        # Return original file as single chunk if chunking fails
        return [audio_path], 1

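# The chunks are written as temporary .wav files (delete=False) and removed again
# in get_predictions() once every segment has been processed.
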
# Function to transcribe audio
def transcribe(wav_filepath):
    try:
        segments, _ = model2.transcribe(wav_filepath, beam_size=5)
        return "".join([segment.text for segment in segments])
    except Exception as e:
        print("Error transcribing audio:", e)
        return "Transcription failed"

# Function to extract MFCC features from audio
def extract_mfcc(wav_file_name):
    try:
        y, sr = librosa.load(wav_file_name)
        mfccs = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40).T, axis=0)
        return mfccs
    except Exception as e:
        print("Error extracting MFCC features:", e)
        return None

# Emotions dictionary
emotions = {0: 'neutral', 1: 'calm', 2: 'happy', 3: 'sad', 4: 'angry', 5: 'fearful', 6: 'disgust', 7: 'surprised'}

# Function to predict emotion from audio
def predict_emotion_from_audio(wav_filepath):
    try:
        if model is None:
            return "Model not loaded"

        test_point = extract_mfcc(wav_filepath)
        if test_point is not None:
            # Reshape the 40 mean MFCCs to (batch, timesteps, features) for the LSTM model
            test_point = np.reshape(test_point, newshape=(1, 40, 1))
            predictions = model.predict(test_point)
            predicted_emotion_label = np.argmax(predictions[0])
            return emotions.get(predicted_emotion_label, "Unknown emotion")
        else:
            return "Error: Unable to extract features"
    except Exception as e:
        print("Error predicting emotion:", e)
        return "Prediction error"

# Function to analyze sentiment from text
def analyze_sentiment(text):
    try:
        if not text or text.strip() == "":
            return "neutral", 0.0

        analysis = TextBlob(text)
        polarity = analysis.sentiment.polarity

        if polarity > 0.1:
            sentiment = "positive"
        elif polarity < -0.1:
            sentiment = "negative"
        else:
            sentiment = "neutral"

        return sentiment, polarity
    except Exception as e:
        print("Error analyzing sentiment:", e)
        return "neutral", 0.0

# Function to get image prompt based on sentiment
def get_image_prompt(sentiment, transcribed_text, chunk_idx, total_chunks):
    base_prompt = f"Chunk {chunk_idx+1}/{total_chunks}: "

    if sentiment == "positive":
        return base_prompt + f"Generate a vibrant, uplifting equirectangular 360 image texture with bright colors, joyful atmosphere, and optimistic vibes representing: [{transcribed_text}]. The scene should evoke happiness and positivity."

    elif sentiment == "negative":
        return base_prompt + f"Generate a moody, dramatic equirectangular 360 image texture with dark tones, intense atmosphere, and emotional depth representing: [{transcribed_text}]. The scene should convey melancholy and intensity."

    else:  # neutral
        return base_prompt + f"Generate a balanced, serene equirectangular 360 image texture with harmonious colors, peaceful atmosphere, and calm vibes representing: [{transcribed_text}]. The scene should evoke tranquility and balance."

# Function to get music prompt based on emotion
def get_music_prompt(emotion, transcribed_text, chunk_idx, total_chunks):
    base_prompt = f"Chunk {chunk_idx+1}/{total_chunks}: "

    emotion_prompts = {
        'neutral': f"Create ambient, background music with neutral tones, subtle melodies, and unobtrusive atmosphere that complements: {transcribed_text}. The music should be calm and balanced.",
        'calm': f"Generate soothing, peaceful music with gentle melodies, soft instrumentation, and relaxing vibes that represents: {transcribed_text}. The music should evoke tranquility and serenity.",
        'happy': f"Create joyful, upbeat music with cheerful melodies, bright instrumentation, and energetic rhythms that celebrates: {transcribed_text}. The music should evoke happiness and positivity.",
        'sad': f"Generate emotional, melancholic music with poignant melodies, soft strings, and heartfelt atmosphere that reflects: {transcribed_text}. The music should evoke sadness and reflection.",
        'angry': f"Create intense, powerful music with driving rhythms, aggressive instrumentation, and strong dynamics that expresses: {transcribed_text}. The music should evoke anger and intensity.",
        'fearful': f"Generate suspenseful, tense music with eerie melodies, atmospheric sounds, and unsettling vibes that represents: {transcribed_text}. The music should evoke fear and anticipation.",
        'disgust': f"Create dark, unsettling music with dissonant harmonies, unusual sounds, and uncomfortable atmosphere that reflects: {transcribed_text}. The music should evoke discomfort and unease.",
        'surprised': f"Generate dynamic, unexpected music with sudden changes, playful melodies, and surprising elements that represents: {transcribed_text}. The music should evoke surprise and wonder."
    }

    return base_prompt + emotion_prompts.get(emotion.lower(),
        f"Create background music with {emotion} atmosphere that represents: {transcribed_text}")

# Function to generate music with MusicGen (using acoustic emotion prediction)
def generate_music(transcribed_text, emotion_prediction, chunk_idx, total_chunks):
    try:
        if processor is None or music_model is None:
            return None

        # Get specific prompt based on emotion
        prompt = get_music_prompt(emotion_prediction, transcribed_text, chunk_idx, total_chunks)

        # Limit prompt length to avoid model issues
        if len(prompt) > 200:
            prompt = prompt[:200] + "..."

        inputs = processor(
            text=[prompt],
            padding=True,
            return_tensors="pt",
        ).to(device)

        # Generate audio
        audio_values = music_model.generate(**inputs, max_new_tokens=512)

        # Convert to numpy array and sample rate
        sampling_rate = music_model.config.audio_encoder.sampling_rate
        audio_data = audio_values[0, 0].cpu().numpy()

        # Normalize audio data (guard against an all-zero output)
        peak = np.max(np.abs(audio_data))
        if peak > 0:
            audio_data = audio_data / peak

        # Create a temporary file to save the audio
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            scipy.io.wavfile.write(tmp_file.name, rate=sampling_rate, data=audio_data)
            return tmp_file.name

    except Exception as e:
        print("Error generating music:", e)
        return None

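# Note (editorial assumption): with max_new_tokens=512 each generated clip is
# roughly 10 seconds long (MusicGen emits about 50 audio tokens per second), i.e.
# about twice the length of each 5-second input chunk.
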
# --- DeepAI Image Generation (Text2Img) ---
api_key = os.getenv("DeepAI_api_key")

def generate_image(sentiment_prediction, transcribed_text, chunk_idx, total_chunks):
    try:
        if not api_key:
            # fallback white image if no API key
            return Image.new('RGB', (1024, 512), color='white')

        # Get specific prompt based on sentiment
        prompt = get_image_prompt(sentiment_prediction, transcribed_text, chunk_idx, total_chunks)

        # Make request to DeepAI text2img API
        response = requests.post(
            "https://api.deepai.org/api/text2img",
            data={
                'text': prompt,
                'width': 1024,
                'height': 512,
                'image_generator_version': 'hd'
            },
            headers={'api-key': api_key}
        )

        data = response.json()
        if 'output_url' in data:
            # Download the generated image
            img_resp = requests.get(data['output_url'])
            return Image.open(BytesIO(img_resp.content))
        else:
            print("Error in DeepAI response:", data)
            # Return a fallback image
            return Image.new('RGB', (1024, 512), color='white')
    except Exception as e:
        print("Error generating image:", e)
        # Return a fallback image
        return Image.new('RGB', (1024, 512), color='white')

# Function to process a single chunk
def process_chunk(chunk_path, chunk_idx, total_chunks):
    try:
        # Get acoustic emotion prediction (for music)
        emotion_prediction = predict_emotion_from_audio(chunk_path)

        # Get transcribed text
        transcribed_text = transcribe(chunk_path)

        # Analyze sentiment of transcribed text (for image)
        sentiment, polarity = analyze_sentiment(transcribed_text)

        # Generate image using SENTIMENT analysis with specific prompt
        image = generate_image(sentiment, transcribed_text, chunk_idx, total_chunks)

        # Generate music using ACOUSTIC EMOTION prediction with specific prompt
        music_path = generate_music(transcribed_text, emotion_prediction, chunk_idx, total_chunks)

        return {
            'chunk_index': chunk_idx + 1,
            'emotion': emotion_prediction,
            'transcription': transcribed_text,
            'sentiment': sentiment,
            'image': image,
            'music': music_path
        }
    except Exception as e:
        print(f"Error processing chunk {chunk_idx + 1}:", e)
        # Return a fallback result with all required keys
        return {
            'chunk_index': chunk_idx + 1,
            'emotion': "Error",
            'transcription': "Transcription failed",
            'sentiment': "error",
            'image': Image.new('RGB', (1024, 512), color='white'),
            'music': None
        }

# Function to get predictions for all chunks
def get_predictions(audio_input):
    # Chunk the audio into 5-second segments
    chunk_files, total_chunks = chunk_audio(audio_input, chunk_duration=5)

    results = []

    # Process each chunk
    for i, chunk_path in enumerate(chunk_files):
        print(f"Processing chunk {i+1}/{total_chunks}")
        result = process_chunk(chunk_path, i, total_chunks)
        results.append(result)

    # Clean up temporary chunk files
    for chunk_path in chunk_files:
        try:
            if chunk_path != audio_input:  # Don't delete original input file
                os.unlink(chunk_path)
        except OSError:
            pass

    return results

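# A minimal headless sketch (editorial, not part of the original app): with the
# model files and the DeepAI_api_key environment variable in place, the chunked
# pipeline can be exercised without the UI, e.g.:
#
#     results = get_predictions("sample.wav")  # "sample.wav" is a hypothetical local file
#     for r in results:
#         print(r['chunk_index'], r['emotion'], r['sentiment'])
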
# Create the Gradio interface with proper output handling
with gr.Blocks(title="Affective Virtual Environments - Chunked Processing") as interface:
    gr.Markdown("# Affective Virtual Environments")
    gr.Markdown("Create an AVE using your voice. Audio is split into 5-second chunks, with separate predictions and generations for each segment.")

    with gr.Row():
        audio_input = gr.Audio(label="Input Audio", type="filepath", sources=["microphone", "upload"])
        process_btn = gr.Button("Process Audio", variant="primary")

    # Add a loading indicator
    loading_indicator = gr.HTML("""
        <div id="loading" style="display: none; text-align: center; margin: 20px;">
            <p style="font-size: 18px; color: #4a4a4a;">Processing audio chunks...</p>
            <div style="border: 4px solid #f3f3f3; border-top: 4px solid #3498db; border-radius: 50%; width: 30px; height: 30px; animation: spin 2s linear infinite; margin: 0 auto;"></div>
            <style>@keyframes spin {0% { transform: rotate(0deg); } 100% { transform: rotate(360deg); }}</style>
        </div>
    """)

    # Create output components for each chunk type
    output_containers = []
    group_components = []  # Store group components separately

    # We'll create up to 10 chunk slots (adjust as needed)
    for i in range(10):
        with gr.Group(visible=False) as chunk_group:
            gr.Markdown(f"### Chunk {i+1} Results")
            with gr.Row():
                emotion_output = gr.Label(label="Acoustic Emotion Prediction")
                transcription_output = gr.Label(label="Transcribed Text")
                sentiment_output = gr.Label(label="Sentiment Analysis")
            with gr.Row():
                image_output = gr.Image(label="Generated Equirectangular Image")
                audio_output = gr.Audio(label="Generated Music")
            gr.HTML("<hr style='margin: 20px 0; border: 1px solid #ccc;'>")

        group_components.append(chunk_group)  # Store the group component
        output_containers.append({
            'transcription': transcription_output,
            'emotion': emotion_output,
            'sentiment': sentiment_output,
            'image': image_output,
            'music': audio_output
        })

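    # Note: the click() wiring below flattens the outputs as
    # [loading_indicator] + 10 group updates + (transcription, emotion, sentiment,
    # image, music) per slot, so every yield from process_and_display must return
    # exactly that many values in exactly that order.
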
    def process_and_display(audio_input):
        # Show loading indicator
        yield [gr.HTML("""
            <div style="text-align: center; margin: 20px;">
                <p style="font-size: 18px; color: #4a4a4a;">Processing audio chunks...</p>
                <div style="border: 4px solid #f3f3f3; border-top: 4px solid #3498db; border-radius: 50%; width: 30px; height: 30px; animation: spin 2s linear infinite; margin: 0 auto;"></div>
                <style>@keyframes spin {0% { transform: rotate(0deg); } 100% { transform: rotate(360deg); }}</style>
            </div>
        """)] + [gr.Group(visible=False)] * len(group_components) + [None] * (len(output_containers) * 5)

        results = get_predictions(audio_input)

        # Initialize outputs list
        outputs = []
        group_visibility = []

        # Process each result; chunks beyond the available slots are dropped,
        # otherwise the yielded list would no longer match the outputs list
        for i, result in enumerate(results):
            if i < len(output_containers):
                group_visibility.append(gr.Group(visible=True))
                outputs.extend([
                    result['transcription'],
                    result['emotion'],
                    result['sentiment'],
                    result['image'],
                    result['music']
                ])

        # Hide remaining containers
        for i in range(len(results), len(output_containers)):
            group_visibility.append(gr.Group(visible=False))
            outputs.extend([None] * 5)

        # Hide loading indicator and show results
        yield [gr.HTML("")] + group_visibility + outputs

    # Set up the button click
    process_btn.click(
        fn=process_and_display,
        inputs=audio_input,
        outputs=[loading_indicator] + group_components + [comp for container in output_containers for comp in [
            container['transcription'],
            container['emotion'],
            container['sentiment'],
            container['image'],
            container['music']
        ]]
    )

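# Note (editorial): on Gradio 3.x, generator callbacks such as process_and_display
# require the queue to be enabled (interface.queue()) before launching; on Gradio
# 4.x the queue is on by default, so launch() alone should suffice.
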
interface.launch()