Update app.py
app.py CHANGED
@@ -7,11 +7,13 @@ from PIL import Image
 import os
 from tensorflow.keras.models import load_model
 from faster_whisper import WhisperModel
+import random

 # Load the emotion prediction model
 def load_emotion_model(model_path):
     try:
         model = load_model(model_path)
+        print("Emotion model loaded successfully")
         return model
     except Exception as e:
         print("Error loading emotion prediction model:", e)
@@ -26,8 +28,12 @@ model2 = WhisperModel(model_size, device="cpu", compute_type="int8")

 # Function to transcribe audio
 def transcribe(wav_filepath):
-
-
+    try:
+        segments, _ = model2.transcribe(wav_filepath, beam_size=5)
+        return "".join([segment.text for segment in segments])
+    except Exception as e:
+        print("Error transcribing audio:", e)
+        return "Transcription failed"

 # Function to extract MFCC features from audio
 def extract_mfcc(wav_file_name):
@@ -45,59 +51,76 @@ emotions = {1: 'neutral', 2: 'calm', 3: 'happy', 4: 'sad', 5: 'angry', 6: 'fearf
 # Function to predict emotion from audio
 def predict_emotion_from_audio(wav_filepath):
     try:
+        if model is None:
+            return "Model not loaded"
+
         test_point = extract_mfcc(wav_filepath)
         if test_point is not None:
             test_point = np.reshape(test_point, newshape=(1, 40, 1))
             predictions = model.predict(test_point)
-            predicted_emotion_label = np.argmax(predictions[0])
-            return emotions
+            predicted_emotion_label = np.argmax(predictions[0]) + 1  # Adding 1 to match your emotion dictionary
+            return emotions.get(predicted_emotion_label, "Unknown emotion")
         else:
             return "Error: Unable to extract features"
     except Exception as e:
         print("Error predicting emotion:", e)
-        return
+        return "Prediction error"

 api_key = os.getenv("DeepAI_api_key")

 # Function to generate an image using DeepAI Text to Image API
-
-
-
-
-
-import random
-
 def generate_image(emotion_prediction, transcribed_text, output_resolution=(1024, 1024)):
-
     try:
+        if not api_key:
+            return "API key not found"
+
         url = "https://api.deepai.org/api/image-editor"
         headers = {
             'api-key': api_key
         }
+
         # Select a random image file from TerraIncognita0.jpg to TerraIncognita9.jpg
-
-
-
-
-
-
-
+        random_index = random.randint(0, 9)
+        image_file_path = f'TAI_Images/TerraIncognita{random_index}.jpg'
+
+        # Check if the file exists
+        if not os.path.exists(image_file_path):
+            return f"Image file not found: {image_file_path}"
+
+        prompt_text = f"Generate Patagonian Monsters' with a {emotion_prediction} attitude, representing the idea of: [ {transcribed_text} ]. Illustrate this using asemic writings in an old map style."
+
+        with open(image_file_path, 'rb') as image_file:
+            files = {
+                'image': image_file,
+            }
+            data = {
+                'text': prompt_text
+            }
+            response = requests.post(url, headers=headers, files=files, data=data)
+
         response_data = response.json()
         if 'output_url' in response_data:
-            return
+            # Download the image and return it as a PIL Image
+            image_response = requests.get(response_data['output_url'])
+            return Image.open(BytesIO(image_response.content))
         else:
+            print("Error in DeepAI response:", response_data)
             return None
     except Exception as e:
         print("Error generating image:", e)
         return None

-
 # Function to get predictions
 def get_predictions(audio_input):
     emotion_prediction = predict_emotion_from_audio(audio_input)
     transcribed_text = transcribe(audio_input)
-
-
+
+    # Handle case where emotion_prediction might be None
+    if emotion_prediction is None:
+        emotion_prediction = "Unknown"
+
+    image = generate_image(emotion_prediction, transcribed_text)
+
     return emotion_prediction, transcribed_text, image

 # Create the Gradio interface
@@ -105,13 +128,12 @@ interface = gr.Interface(
     fn=get_predictions,
     inputs=gr.Audio(label="Input Audio", type="filepath", sources=["microphone"]),
     outputs=[
-        gr.Label(
-        gr.Label(
+        gr.Label(label="Acoustic Prediction"),
+        gr.Label(label="Transcribed Text"),
         gr.Image(type='pil', label="Generated Image")
     ],
     title="Affective Virtual Environments",
     description="Create an AVE using your voice."
 )

-
 interface.launch()
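For a quick local check of the updated pipeline, the three functions can be exercised without the Gradio UI. The snippet below is a minimal sketch, not part of the commit: it assumes the functions are importable from app.py (which would require guarding interface.launch() behind if __name__ == "__main__":), and sample.wav is a hypothetical placeholder path.

# Hypothetical smoke test; sample.wav is a placeholder, not part of the commit.
from app import transcribe, predict_emotion_from_audio, generate_image

wav_path = "sample.wav"

emotion = predict_emotion_from_audio(wav_path)   # e.g. 'happy', or an error string
text = transcribe(wav_path)                      # transcription, or "Transcription failed"
image = generate_image(emotion, text)            # PIL Image, None, or an error string

print("Emotion:", emotion)
print("Transcript:", text)
if image is not None and not isinstance(image, str):
    image.save("generated.png")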