AVE2

Runtime error

App Files Files Community

jfforero committed on May 6, 2024

Commit

f2b7d46

verified ·

1 Parent(s): 81019be

Update app.py

Browse files

Files changed (1) hide show

app.py +55 -40

app.py CHANGED Viewed

@@ -1,12 +1,12 @@
 import gradio as gr
 import numpy as np
 import librosa
 import requests
 from io import BytesIO
 from PIL import Image
 import os
 from tensorflow.keras.models import load_model
-from faster_whisper import WhisperModel
 # Load the emotion prediction model
 def load_emotion_model(model_path):
@@ -17,14 +17,25 @@ def load_emotion_model(model_path):
         print("Error loading emotion prediction model:", e)
         return None
 model_size = "small"
-# Run on CPU with INT8 compute
-model = WhisperModel(model_size, device="cpu", compute_type="int8")
-# Load emotion prediction model
-model_path = 'mymodel_SER_LSTM_RAVDESS.h5'
-emotion_model = load_emotion_model(model_path)
 # Function to extract MFCC features from audio
 def extract_mfcc(wav_file_name):
     try:
@@ -44,7 +55,7 @@ def predict_emotion_from_audio(wav_filepath):
         test_point = extract_mfcc(wav_filepath)
         if test_point is not None:
             test_point = np.reshape(test_point, newshape=(1, 40, 1))
-            predictions = emotion_model.predict(test_point)
             predicted_emotion_label = np.argmax(predictions[0]) + 1
             return emotions[predicted_emotion_label]
         else:
@@ -55,43 +66,47 @@ def predict_emotion_from_audio(wav_filepath):
 api_key = os.getenv("DeepAI_api_key")
 # Predict emotion from audio
 def get_predictions(audio_input):
-    try:
-        audio_data = audio_input.read()  # Read the audio data
-        emotion_prediction = predict_emotion_from_audio(audio_data)
-        image = generate_image(api_key, emotion_prediction)
-        return emotion_prediction, image
-    except Exception as e:
-        print("Error processing audio:", e)
-        return None, None
 # Define a function to generate an image using DeepAI Text to Image API
 def generate_image(api_key, text):
-    try:
-        url = "https://api.deepai.org/api/text2img"
-        headers = {'api-key': api_key}
-        response = requests.post(
-            url,
-            data={'text': text},
-            headers=headers
-        )
-        response_data = response.json()
-        if 'output_url' in response_data:
-            image_url = response_data['output_url']
-            image_response = requests.get(image_url)
-            image = Image.open(BytesIO(image_response.content))
-            return image
-        else:
-            return None
-    except Exception as e:
-        print("Error generating image:", e)
         return None
 # Create the Gradio interface
-with gr.Interface(get_predictions,
-                  inputs=gr.inputs.Audio(label="Input Audio", type="file"),
-                  outputs=[gr.outputs.Text(label="Prediction"), gr.outputs.Image(label="Generated Image")],
-                  title="Emotional Machines Test",
-                  description="Load or Record an audio file to perform emotion analysis") as iface:
-    iface.launch()

 import gradio as gr
 import numpy as np
 import librosa
+import time
 import requests
 from io import BytesIO
 from PIL import Image
 import os
 from tensorflow.keras.models import load_model
 # Load the emotion prediction model
 def load_emotion_model(model_path):
         print("Error loading emotion prediction model:", e)
         return None
+model_path = 'mymodel_SER_LSTM_RAVDESS.h5'
+model = load_emotion_model(model_path)
+#####
+from faster_whisper import WhisperModel
 model_size = "small"
+# Run on CPU with INT8 compute
+model2 = WhisperModel(model_size, device="cpu", compute_type="int8")
+def transcribe(audio):
+    segments, _ = model2.transcribe(audio, beam_size=5)
+    return "".join([segment.text for segment in segments])
+#########
 # Function to extract MFCC features from audio
 def extract_mfcc(wav_file_name):
     try:
         test_point = extract_mfcc(wav_filepath)
         if test_point is not None:
             test_point = np.reshape(test_point, newshape=(1, 40, 1))
+            predictions = model.predict(test_point)
             predicted_emotion_label = np.argmax(predictions[0]) + 1
             return emotions[predicted_emotion_label]
         else:
 api_key = os.getenv("DeepAI_api_key")
 # Predict emotion from audio
 def get_predictions(audio_input):
+    emotion_prediction = predict_emotion_from_audio(audio_input)
+    # Generate an image from the predicted emotion text via generate_image()
+    image = generate_image(api_key, emotion_prediction)
+    return emotion_prediction, image
 # Define a function to generate an image using DeepAI Text to Image API
 def generate_image(api_key, text):
+    url = "https://api.deepai.org/api/text2img"
+    headers = {'api-key': api_key}
+    response = requests.post(
+        url,
+        data={
+            'text': text,
+        },
+        headers=headers
+    )
+    response_data = response.json()
+    if 'output_url' in response_data:
+        image_url = response_data['output_url']
+        image_response = requests.get(image_url)
+        image = Image.open(BytesIO(image_response.content))
+        return image
+    else:
         return None
+####
 # Create the Gradio interface
+with gr.Blocks() as interface:
+    gr.Markdown("Emotional Machines test: Load or Record an audio file to speech emotion analysis")
+    with gr.Tabs():
+        with gr.Tab("Acoustic and Semantic Predictions"):
+            with gr.Row():
+                input_audio = gr.Audio(label="Input Audio", type="filepath")
+                submit_button = gr.Button("Submit")
+            output_label = [gr.Label("Prediction"), gr.Image(type='pil')]  # Two output components: emotion label and generated PIL image
+    # Set the function to be called when the button is clicked
+    submit_button.click(get_predictions, inputs=input_audio, outputs=output_label)
+interface.launch()