import gradio as gr
import numpy as np
import librosa
import requests
from io import BytesIO
from PIL import Image
import os
import random
from tensorflow.keras.models import load_model
from faster_whisper import WhisperModel

# Load the emotion prediction model
def load_emotion_model(model_path):
    try:
        model = load_model(model_path)
        return model
    except Exception as e:
        print("Error loading emotion prediction model:", e)
        return None

model_path = 'mymodel_SER_LSTM_RAVDESS.h5'
model = load_emotion_model(model_path)

# Initialize WhisperModel
model_size = "small"
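# Runs on CPU with int8 quantization to keep memory and compute requirements low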
model2 = WhisperModel(model_size, device="cpu", compute_type="int8")

# Function to transcribe audio
def transcribe(wav_filepath):
    segments, _ = model2.transcribe(wav_filepath, beam_size=5)
    return "".join([segment.text for segment in segments])

# Function to extract MFCC features from audio
def extract_mfcc(wav_file_name):
    try:
        y, sr = librosa.load(wav_file_name)
        mfccs = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40).T, axis=0)
        return mfccs
    except Exception as e:
        print("Error extracting MFCC features:", e)
        return None

# Emotions dictionary (aligned with training: 0–7)
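# The eight classes match the RAVDESS emotion set (codes 01-08 mapped to indices 0-7)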
emotions = {
    0: 'neutral',
    1: 'calm',
    2: 'happy',
    3: 'sad',
    4: 'angry',
    5: 'fearful',
    6: 'disgust',
    7: 'surprised'
}

# Function to predict emotion from audio
def predict_emotion_from_audio(wav_filepath):
    try:
        test_point = extract_mfcc(wav_filepath)
        if test_point is None:
            return "Error: Unable to extract features"
        # Reshape to match the model input (batch, timesteps, features)
        test_point = np.reshape(test_point, (1, 40, 1))
        # Predict
        predictions = model.predict(test_point, verbose=0)
        # Get index (0–7, consistent with training)
        predicted_emotion_label = np.argmax(predictions[0])
        return emotions[predicted_emotion_label]
    except Exception as e:
        print("Error predicting emotion:", e)
        return None

# DeepAI API key is read from the environment
api_key = os.getenv("DeepAI_api_key")

# Function to generate an image using the DeepAI Image Editor API
def generate_image(emotion_prediction, transcribed_text):
    try:
        url = "https://api.deepai.org/api/image-editor"
        headers = {
            'api-key': api_key
        }
        # Select a random base image from TerraIncognita0.jpg to TerraIncognita9.jpg
        image_file_path = f'TAI_Images/TerraIncognita{random.randint(0, 9)}.jpg'
        with open(image_file_path, 'rb') as image_file:
            files = {
                'image': image_file,
                'text': f"""
                Transform this image into an ancient exploratory map of an unknown land. The territory emerges from: {transcribed_text}. Its geography, symbols, and distortions should evoke the emotion of {emotion_prediction}."""
            }
            response = requests.post(url, headers=headers, files=files)
        response_data = response.json()
        if 'output_url' in response_data:
            return response_data['output_url']
        else:
            return None
    except Exception as e:
        print("Error generating image:", e)
        return None

# Function to get predictions
def get_predictions(audio_input):
    emotion_prediction = predict_emotion_from_audio(audio_input)
    transcribed_text = transcribe(audio_input)
    image_url = generate_image(emotion_prediction, transcribed_text)
    if image_url is None:
        return emotion_prediction, transcribed_text, None
    # Download the generated image and return it as a PIL image
    image = Image.open(BytesIO(requests.get(image_url).content))
    return emotion_prediction, transcribed_text, image

# Create the Gradio interface
interface = gr.Interface(
    fn=get_predictions,
    inputs=gr.Audio(label="Input Audio", type="filepath", sources=["microphone"]),
    outputs=[
        gr.Label(label="Acoustic Prediction"),
        gr.Textbox(label="Transcribed Text"),
        gr.Image(type='pil', label="Generated Image")
    ],
    title="Terra Australis Ignota: Map Generation",
    description="""
    Imagine Terra Australis Ignota.
    Speak freely. Describe what emerges:
    a place, a presence, a climate, a memory, a distortion.
    ↳ Reference: [Terra Australis Ignota](https://territoriosvirtuales.wordpress.com/1570/05/20/terra-australis-ignota/)

    To interact:
    Click Record and speak. Then stop the recording.
    You can listen, edit if needed, and submit your audio file to compose a map.
    """,
    flagging_mode="never"  # removes the flag button
)

interface.launch(server_name="0.0.0.0", server_port=7860)