jfforero committed
Commit a1c3a87 · verified · 1 Parent(s): d628f83

Update app.py

Files changed (1):
  1. app.py +66 -5
app.py CHANGED
@@ -8,7 +8,11 @@ import os
 from tensorflow.keras.models import load_model
 from faster_whisper import WhisperModel
 import random
-from textblob import TextBlob # Added for sentiment analysis
+from textblob import TextBlob
+import torch
+import scipy.io.wavfile
+from transformers import AutoProcessor, MusicgenForConditionalGeneration
+import tempfile
 
 # Load the emotion prediction model
 def load_emotion_model(model_path):
@@ -27,6 +31,21 @@ model = load_emotion_model(model_path)
 model_size = "small"
 model2 = WhisperModel(model_size, device="cpu", compute_type="int8")
 
+# Load MusicGen model
+def load_musicgen_model():
+    try:
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
+        music_model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
+        music_model.to(device)
+        print("MusicGen model loaded successfully")
+        return processor, music_model, device
+    except Exception as e:
+        print("Error loading MusicGen model:", e)
+        return None, None, None
+
+processor, music_model, device = load_musicgen_model()
+
 # Function to transcribe audio
 def transcribe(wav_filepath):
     try:
@@ -88,6 +107,44 @@ def analyze_sentiment(text):
         print("Error analyzing sentiment:", e)
         return "sentiment analysis error", 0.0
 
+# Function to generate music with MusicGen
+def generate_music(transcribed_text, emotion_prediction):
+    try:
+        if processor is None or music_model is None:
+            return None
+
+        # Create a prompt that combines the emotion and transcription
+        prompt = f"Background music that is {emotion_prediction} and represents: {transcribed_text}"
+
+        # Limit prompt length to avoid model issues
+        if len(prompt) > 200:
+            prompt = prompt[:200] + "..."
+
+        inputs = processor(
+            text=[prompt],
+            padding=True,
+            return_tensors="pt",
+        ).to(device)
+
+        # Generate audio
+        audio_values = music_model.generate(**inputs, max_new_tokens=512)
+
+        # Convert to numpy array and sample rate
+        sampling_rate = music_model.config.audio_encoder.sampling_rate
+        audio_data = audio_values[0, 0].cpu().numpy()
+
+        # Normalize audio data
+        audio_data = audio_data / np.max(np.abs(audio_data))
+
+        # Create a temporary file to save the audio
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
+            scipy.io.wavfile.write(tmp_file.name, rate=sampling_rate, data=audio_data)
+            return tmp_file.name
+
+    except Exception as e:
+        print("Error generating music:", e)
+        return None
+
 api_key = os.getenv("DeepAI_api_key")
 
 # Function to generate an image using DeepAI Text to Image API
@@ -146,7 +203,10 @@ def get_predictions(audio_input):
 
     image = generate_image(emotion_prediction, transcribed_text)
 
-    return emotion_prediction, transcribed_text, f"Sentiment: {sentiment} (Polarity: {polarity:.2f})", image
+    # Generate music based on transcription and emotion
+    music_path = generate_music(transcribed_text, emotion_prediction)
+
+    return emotion_prediction, transcribed_text, f"Sentiment: {sentiment} (Polarity: {polarity:.2f})", image, music_path
 
 # Create the Gradio interface
 interface = gr.Interface(
@@ -155,11 +215,12 @@ interface = gr.Interface(
     outputs=[
         gr.Label(label="Acoustic Prediction"),
         gr.Label(label="Transcribed Text"),
-        gr.Label(label="Sentiment Analysis"), # Added sentiment analysis output
-        gr.Image(type='pil', label="Generated Image")
+        gr.Label(label="Sentiment Analysis"),
+        gr.Image(type='pil', label="Generated Image"),
+        gr.Audio(label="Generated Music", type="filepath") # Added music output
     ],
     title="Affective Virtual Environments",
-    description="Create an AVE using your voice. Get emotion prediction, transcription, sentiment analysis, and a generated image."
+    description="Create an AVE using your voice. Get emotion prediction, transcription, sentiment analysis, a generated image, and music."
 )
 
 interface.launch()
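
For reference, a minimal standalone sketch of the MusicGen flow this commit wires in. The model id, generation call, and sampling-rate lookup mirror the diff above; the prompt text and output filename are illustrative placeholders, not part of app.py.

import scipy.io.wavfile
import torch
from transformers import AutoProcessor, MusicgenForConditionalGeneration

device = "cuda" if torch.cuda.is_available() else "cpu"
processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small").to(device)

# Illustrative prompt in the same "emotion + transcription" shape used by generate_music()
inputs = processor(
    text=["Background music that is happy and represents: a walk in the park"],
    padding=True,
    return_tensors="pt",
).to(device)

# max_new_tokens=512 corresponds to roughly 10 seconds of audio for musicgen-small
audio = model.generate(**inputs, max_new_tokens=512)

# The MusicGen checkpoints decode audio at the audio encoder's sampling rate (32 kHz)
rate = model.config.audio_encoder.sampling_rate
scipy.io.wavfile.write("musicgen_sample.wav", rate=rate, data=audio[0, 0].cpu().numpy())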