jfforero committed on
Commit
15be55e
·
verified ·
1 Parent(s): 60e3c05

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +49 -27
app.py CHANGED
@@ -7,11 +7,13 @@ from PIL import Image
7
  import os
8
  from tensorflow.keras.models import load_model
9
  from faster_whisper import WhisperModel
 
10
 
11
  # Load the emotion prediction model
12
  def load_emotion_model(model_path):
13
  try:
14
  model = load_model(model_path)
 
15
  return model
16
  except Exception as e:
17
  print("Error loading emotion prediction model:", e)
@@ -26,8 +28,12 @@ model2 = WhisperModel(model_size, device="cpu", compute_type="int8")
26
 
27
  # Function to transcribe audio
28
  def transcribe(wav_filepath):
29
- segments, _ = model2.transcribe(wav_filepath, beam_size=5)
30
- return "".join([segment.text for segment in segments])
 
 
 
 
31
 
32
  # Function to extract MFCC features from audio
33
  def extract_mfcc(wav_file_name):
@@ -45,59 +51,76 @@ emotions = {1: 'neutral', 2: 'calm', 3: 'happy', 4: 'sad', 5: 'angry', 6: 'fearf
45
  # Function to predict emotion from audio
46
  def predict_emotion_from_audio(wav_filepath):
47
  try:
 
 
 
48
  test_point = extract_mfcc(wav_filepath)
49
  if test_point is not None:
50
  test_point = np.reshape(test_point, newshape=(1, 40, 1))
51
  predictions = model.predict(test_point)
52
- predicted_emotion_label = np.argmax(predictions[0])
53
- return emotions[predicted_emotion_label]
54
  else:
55
  return "Error: Unable to extract features"
56
  except Exception as e:
57
  print("Error predicting emotion:", e)
58
- return None
59
 
60
  api_key = os.getenv("DeepAI_api_key")
61
 
62
  # Function to generate an image using DeepAI Text to Image API
63
-
64
-
65
-
66
-
67
-
68
- import random
69
-
70
  def generate_image(emotion_prediction, transcribed_text, output_resolution=(1024, 1024)):
71
-
72
  try:
 
 
 
73
  url = "https://api.deepai.org/api/image-editor"
74
  headers = {
75
  'api-key': api_key
76
  }
 
77
  # Select a random image file from TerraIncognita0.jpg to TerraIncognita9.jpg
78
- image_file_path = f'TAI_Images/TerraIncognita{random.randint(0, 9)}.jpg'
79
- files = {
80
- 'image': open(image_file_path, 'rb'),
81
- 'text': "Generate Patagonian Monsters' with a " + emotion_prediction + " attitude, representing the idea of: [ "+ transcribed_text + "]. Illustrate this using asemic writings in an old map style."
82
-
83
- }
84
- response = requests.post(url, headers=headers, files=files)
 
 
 
 
 
 
 
 
 
 
 
85
  response_data = response.json()
86
  if 'output_url' in response_data:
87
- return response_data['output_url']
 
 
88
  else:
 
89
  return None
90
  except Exception as e:
91
  print("Error generating image:", e)
92
  return None
93
 
94
-
95
  # Function to get predictions
96
  def get_predictions(audio_input):
97
  emotion_prediction = predict_emotion_from_audio(audio_input)
98
  transcribed_text = transcribe(audio_input)
99
- texto_imagen = emotion_prediction + transcribed_text
100
- image = generate_image(api_key, texto_imagen)
 
 
 
 
 
101
  return emotion_prediction, transcribed_text, image
102
 
103
  # Create the Gradio interface
@@ -105,13 +128,12 @@ interface = gr.Interface(
105
  fn=get_predictions,
106
  inputs=gr.Audio(label="Input Audio", type="filepath", sources=["microphone"]),
107
  outputs=[
108
- gr.Label("Acoustic Prediction", label="Acoustic Prediction"),
109
- gr.Label("Transcribed Text", label="Transcribed Text"),
110
  gr.Image(type='pil', label="Generated Image")
111
  ],
112
  title="Affective Virtual Environments",
113
  description="Create an AVE using your voice."
114
  )
115
 
116
-
117
  interface.launch()
 
7
  import os
8
  from tensorflow.keras.models import load_model
9
  from faster_whisper import WhisperModel
10
+ import random
11
 
12
  # Load the emotion prediction model
13
  def load_emotion_model(model_path):
14
  try:
15
  model = load_model(model_path)
16
+ print("Emotion model loaded successfully")
17
  return model
18
  except Exception as e:
19
  print("Error loading emotion prediction model:", e)
 
28
 
29
  # Function to transcribe audio
30
def transcribe(wav_filepath):
    """Transcribe the speech in *wav_filepath* to text with the Whisper model.

    Returns the concatenation of all segment texts, or the string
    "Transcription failed" if the model raises.
    """
    try:
        # NOTE: segments is consumed inside the try block on purpose —
        # faster-whisper yields segments lazily, so errors can surface
        # during iteration, not only at the transcribe() call.
        segments, _ = model2.transcribe(wav_filepath, beam_size=5)
        parts = []
        for segment in segments:
            parts.append(segment.text)
        return "".join(parts)
    except Exception as e:
        print("Error transcribing audio:", e)
        return "Transcription failed"
37
 
38
  # Function to extract MFCC features from audio
39
  def extract_mfcc(wav_file_name):
 
51
  # Function to predict emotion from audio
52
def predict_emotion_from_audio(wav_filepath):
    """Predict an emotion label for the audio file at *wav_filepath*.

    Returns one of the labels from the module-level ``emotions`` dict on
    success, or one of the sentinel strings "Model not loaded",
    "Error: Unable to extract features", "Unknown emotion",
    "Prediction error" on the corresponding failure path.
    """
    try:
        # Guard clauses: bail out early on each failure mode.
        if model is None:
            return "Model not loaded"

        features = extract_mfcc(wav_filepath)
        if features is None:
            return "Error: Unable to extract features"

        batch = np.reshape(features, newshape=(1, 40, 1))
        scores = model.predict(batch)
        # +1 because the `emotions` dictionary is keyed starting at 1.
        label = np.argmax(scores[0]) + 1
        return emotions.get(label, "Unknown emotion")
    except Exception as e:
        print("Error predicting emotion:", e)
        return "Prediction error"
68
 
69
  api_key = os.getenv("DeepAI_api_key")
70
 
71
  # Function to generate an image using DeepAI Text to Image API
 
 
 
 
 
 
 
72
def generate_image(emotion_prediction, transcribed_text, output_resolution=(1024, 1024)):
    """Generate an image via the DeepAI image-editor API.

    Parameters
    ----------
    emotion_prediction : str
        Emotion label woven into the prompt.
    transcribed_text : str
        Transcribed speech woven into the prompt.
    output_resolution : tuple
        Kept for interface compatibility; the API controls the actual
        output size, so this value is not used here.

    Returns a ``PIL.Image`` on success, or ``None`` on any failure
    (missing API key, missing base image, API or network error).
    Errors are printed instead of raised so the Gradio callback never
    crashes, and failures return None — not an error string — so the
    value stays compatible with the ``gr.Image(type='pil')`` output.
    """
    try:
        if not api_key:
            print("Error generating image: API key not found")
            return None

        url = "https://api.deepai.org/api/image-editor"
        headers = {
            'api-key': api_key
        }

        # Select a random base image from TerraIncognita0.jpg to TerraIncognita9.jpg
        image_file_path = f'TAI_Images/TerraIncognita{random.randint(0, 9)}.jpg'
        if not os.path.exists(image_file_path):
            print(f"Image file not found: {image_file_path}")
            return None

        prompt_text = f"Generate Patagonian Monsters' with a {emotion_prediction} attitude, representing the idea of: [ {transcribed_text} ]. Illustrate this using asemic writings in an old map style."

        with open(image_file_path, 'rb') as image_file:
            files = {
                'image': image_file,
            }
            data = {
                'text': prompt_text
            }
            # Timeout so a hung API call cannot block the UI indefinitely.
            response = requests.post(url, headers=headers, files=files, data=data, timeout=60)

        response_data = response.json()
        if 'output_url' in response_data:
            # Download the generated image and return it as a PIL Image.
            image_response = requests.get(response_data['output_url'], timeout=60)
            return Image.open(BytesIO(image_response.content))

        print("Error in DeepAI response:", response_data)
        return None
    except Exception as e:
        print("Error generating image:", e)
        return None
112
 
 
113
  # Function to get predictions
114
def get_predictions(audio_input):
    """Run the full pipeline on one audio file.

    Returns a 3-tuple ``(emotion, transcript, image)`` suitable for the
    Gradio outputs: the predicted emotion label, the transcribed text,
    and the generated image (or None if generation failed).
    """
    emotion = predict_emotion_from_audio(audio_input)
    # Fall back to a placeholder so the image prompt never contains None.
    if emotion is None:
        emotion = "Unknown"

    transcript = transcribe(audio_input)
    picture = generate_image(emotion, transcript)
    return emotion, transcript, picture
125
 
126
  # Create the Gradio interface
 
128
  fn=get_predictions,
129
  inputs=gr.Audio(label="Input Audio", type="filepath", sources=["microphone"]),
130
  outputs=[
131
+ gr.Label(label="Acoustic Prediction"),
132
+ gr.Label(label="Transcribed Text"),
133
  gr.Image(type='pil', label="Generated Image")
134
  ],
135
  title="Affective Virtual Environments",
136
  description="Create an AVE using your voice."
137
  )
138
 
 
139
  interface.launch()