Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import numpy as np | |
| import librosa | |
| import requests | |
| from io import BytesIO | |
| from PIL import Image | |
| import os | |
| from tensorflow.keras.models import load_model | |
| from faster_whisper import WhisperModel | |
def load_emotion_model(model_path):
    """Load the Keras emotion-prediction model stored at ``model_path``.

    Any exception raised while loading is printed to stdout and converted
    into a ``None`` return value instead of propagating to the caller.
    """
    try:
        loaded = load_model(model_path)
    except Exception as e:
        print("Error loading emotion prediction model:", e)
        return None
    else:
        return loaded
# Emotion classifier, loaded once at import time from the bundled .h5 file.
model_path = 'mymodel_SER_LSTM_RAVDESS.h5'
model = load_emotion_model(model_path)

# Speech-to-text engine: the "small" Whisper model on CPU with int8 compute.
model_size = "small"
model2 = WhisperModel(
    model_size,
    device="cpu",
    compute_type="int8",
)
def transcribe(wav_filepath):
    """Return the text recognised in the audio file at ``wav_filepath``.

    The file is transcribed with the module-level Whisper model (beam size
    5) and the text of every segment is concatenated without a separator.
    """
    segments, _info = model2.transcribe(wav_filepath, beam_size=5)
    return "".join(part.text for part in segments)
def extract_mfcc(wav_file_name):
    """Return a 40-value MFCC summary of the audio file ``wav_file_name``.

    The file is decoded with librosa, 40 MFCCs are computed, and the
    transposed coefficient matrix is averaged along axis 0, which yields
    one mean value per coefficient.

    Returns ``None`` (after printing the error) if anything goes wrong.
    """
    try:
        signal, sample_rate = librosa.load(wav_file_name)
        coefficients = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=40)
        return np.mean(coefficients.T, axis=0)
    except Exception as e:
        print("Error extracting MFCC features:", e)
        return None
# Class index -> emotion name for indices 0-7.
# Per the original note, this ordering is aligned with the training labels.
emotions = dict(enumerate((
    'neutral',
    'calm',
    'happy',
    'sad',
    'angry',
    'fearful',
    'disgust',
    'surprised',
)))
def predict_emotion_from_audio(wav_filepath):
    """Classify the emotion expressed in the audio file at ``wav_filepath``.

    Returns the emotion name looked up in ``emotions`` for the
    highest-scoring class. Returns the string
    ``"Error: Unable to extract features"`` when MFCC extraction yields
    ``None``, and ``None`` (after printing the error) when any exception
    is raised along the way.
    """
    try:
        features = extract_mfcc(wav_filepath)
        if features is None:
            return "Error: Unable to extract features"

        # Arrange the 40 coefficients in the (1, 40, 1) layout fed to the model.
        batch = np.reshape(features, (1, 40, 1))
        scores = model.predict(batch, verbose=0)[0]

        # The index of the best score is the key into the emotions table.
        return emotions[np.argmax(scores)]
    except Exception as e:
        print("Error predicting emotion:", e)
        return None
# DeepAI credentials are read from the environment once, at import time.
api_key = os.getenv("DeepAI_api_key")

import random


def generate_image(emotion_prediction, transcribed_text, output_resolution=(1024, 1024)):
    """Ask the DeepAI image-editor API to redraw a base image as an imagined map.

    One of the ten bundled base images (``TAI_Images/TerraIncognita0.jpg``
    to ``TerraIncognita9.jpg``) is chosen at random and uploaded together
    with a prompt built from the transcription and the predicted emotion.

    Args:
        emotion_prediction: Emotion name interpolated into the prompt.
        transcribed_text: Spoken description interpolated into the prompt.
        output_resolution: Accepted for interface compatibility; it is not
            sent to the API.

    Returns:
        The ``output_url`` reported by the API, or ``None`` when the
        response has no such field or any error occurs (the error is
        printed to stdout).
    """
    url = "https://api.deepai.org/api/image-editor"
    headers = {
        'api-key': api_key
    }
    # Upper bound in seconds for the HTTP call, so a stalled request cannot
    # block the UI handler forever.
    request_timeout = 120
    try:
        # Select a random image file from TerraIncognita0.jpg to TerraIncognita9.jpg
        image_file_path = f'TAI_Images/TerraIncognita{random.randint(0, 9)}.jpg'
        prompt = (
            "\n"
            "Transform this image into an ancient exploratory map of an unknown land. "
            f"The territory emerges from: {transcribed_text}. "
            f"Its geography, symbols, and distortions should evoke the emotion of {emotion_prediction}."
        )
        # The context manager closes the image file even if the upload fails.
        with open(image_file_path, 'rb') as image_file:
            files = {
                'image': image_file,
                'text': prompt,
            }
            response = requests.post(
                url,
                headers=headers,
                files=files,
                timeout=request_timeout,
            )
        response_data = response.json()
        if 'output_url' in response_data:
            return response_data['output_url']
        return None
    except Exception as e:
        print("Error generating image:", e)
        return None
def get_predictions(audio_input):
    """Run the full pipeline for one recording.

    Args:
        audio_input: Path of the recorded audio file.

    Returns:
        A tuple ``(emotion_prediction, transcribed_text, image)`` where
        ``image`` is whatever ``generate_image`` returns (an image URL or
        ``None``).
    """
    emotion_prediction = predict_emotion_from_audio(audio_input)
    transcribed_text = transcribe(audio_input)

    # The predictor may return None or an error message instead of a label.
    # Only a real emotion name is allowed into the image prompt; otherwise
    # fall back to a neutral mood so the map can still be generated.
    if emotion_prediction in emotions.values():
        prompt_emotion = emotion_prediction
    else:
        prompt_emotion = 'neutral'

    # generate_image takes (emotion, text) and reads the API key from module
    # state itself. The key must never be passed here, because both
    # arguments are interpolated into the prompt sent to the API.
    image = generate_image(prompt_emotion, transcribed_text)
    return emotion_prediction, transcribed_text, image
# Input: a microphone recording handed to the callback as a file path.
_audio_input = gr.Audio(label="Input Audio", type="filepath", sources=["microphone"])

# Outputs, in the order returned by get_predictions.
_output_components = [
    gr.Label("Acoustic Prediction", label="Acoustic Prediction"),
    gr.Label("Transcribed Text", label="Transcribed Text"),
    gr.Image(type='pil', label="Generated Image"),
]

# Introductory text shown under the title.
_description = """
Imagine Terra Australis Ignota.
Speak freely. Describe what emerges:
a place, a presence, a climate, a memory, a distortion.
↳ Reference: [Terra Australis Ignota](https://territoriosvirtuales.wordpress.com/1570/05/20/terra-australis-ignota/)
To interact:
Click Record and speak. Then stop the recording.
You can listen, edit if needed, and submit your audio file to compose a map.
"""

# Assemble the Gradio interface around the prediction pipeline.
interface = gr.Interface(
    fn=get_predictions,
    inputs=_audio_input,
    outputs=_output_components,
    title="Terra Australis Ignota: Map Generation",
    description=_description,
    flagging_mode="never",  # hides the flag button
)

# Serve on all network interfaces at port 7860.
interface.launch(server_name="0.0.0.0", server_port=7860)