jfforero committed
Commit 7cdb54d · verified · 1 Parent(s): 278770c

Update app.py

Files changed (1):
  1. app.py +39 -119
app.py CHANGED
@@ -8,13 +8,7 @@ import os
 from tensorflow.keras.models import load_model
 from faster_whisper import WhisperModel
 import random
-from textblob import TextBlob
-import torch
-import scipy.io.wavfile
-from transformers import AutoProcessor, MusicgenForConditionalGeneration
-import tempfile
-from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
-import torch
+from textblob import TextBlob # Added for sentiment analysis
 
 # Load the emotion prediction model
 def load_emotion_model(model_path):
@@ -33,51 +27,6 @@ model = load_emotion_model(model_path)
 model_size = "small"
 model2 = WhisperModel(model_size, device="cpu", compute_type="int8")
 
-# Load MusicGen model
-def load_musicgen_model():
-    try:
-        device = "cuda" if torch.cuda.is_available() else "cpu"
-        processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
-        music_model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
-        music_model.to(device)
-        print("MusicGen model loaded successfully")
-        return processor, music_model, device
-    except Exception as e:
-        print("Error loading MusicGen model:", e)
-        return None, None, None
-
-processor, music_model, device = load_musicgen_model()
-
-# Load Stable Diffusion model
-def load_stable_diffusion_model():
-    try:
-        # Use GPU if available, otherwise CPU
-        sd_device = "cuda" if torch.cuda.is_available() else "cpu"
-
-        # Load Stable Diffusion 2.1
-        model_id = "stabilityai/stable-diffusion-2-1"
-
-        # Use the DPMSolverMultistepScheduler for faster inference
-        pipe = StableDiffusionPipeline.from_pretrained(
-            model_id,
-            torch_dtype=torch.float16 if sd_device == "cuda" else torch.float32,
-            safety_checker=None  # Disable safety checker for more creative generations
-        )
-        pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
-        pipe = pipe.to(sd_device)
-
-        # Optimize for CPU if needed
-        if sd_device == "cpu":
-            pipe.enable_attention_slicing()
-
-        print("Stable Diffusion model loaded successfully")
-        return pipe, sd_device
-    except Exception as e:
-        print("Error loading Stable Diffusion model:", e)
-        return None, None
-
-sd_pipe, sd_device = load_stable_diffusion_model()
-
 # Function to transcribe audio
 def transcribe(wav_filepath):
     try:
@@ -139,72 +88,48 @@ def analyze_sentiment(text):
         print("Error analyzing sentiment:", e)
         return "sentiment analysis error", 0.0
 
-# Function to generate music with MusicGen
-def generate_music(transcribed_text, emotion_prediction):
+api_key = os.getenv("DeepAI_api_key")
+
+# Function to generate an image using DeepAI Text to Image API
+def generate_image(emotion_prediction, transcribed_text, output_resolution=(1024, 1024)):
     try:
-        if processor is None or music_model is None:
-            return None
+        if not api_key:
+            return "API key not found"
 
-        # Create a prompt that combines the emotion and transcription
-        prompt = f"Background music that is {emotion_prediction} and represents: {transcribed_text}"
+        url = "https://api.deepai.org/api/image-editor"
+        headers = {
+            'api-key': api_key
+        }
 
-        # Limit prompt length to avoid model issues
-        if len(prompt) > 200:
-            prompt = prompt[:200] + "..."
-
-        inputs = processor(
-            text=[prompt],
-            padding=True,
-            return_tensors="pt",
-        ).to(device)
+        # Select a random image file from TerraIncognita0.jpg to TerraIncognita9.jpg
+        random_index = random.randint(0, 9)
+        image_file_path = f'TAI_Images/TerraIncognita{random_index}.jpg'
 
-        # Generate audio
-        audio_values = music_model.generate(**inputs, max_new_tokens=512)
-
-        # Convert to numpy array and sample rate
-        sampling_rate = music_model.config.audio_encoder.sampling_rate
-        audio_data = audio_values[0, 0].cpu().numpy()
-
-        # Normalize audio data
-        audio_data = audio_data / np.max(np.abs(audio_data))
+        # Check if the file exists
+        if not os.path.exists(image_file_path):
+            return f"Image file not found: {image_file_path}"
+
+        prompt_text = f"Generate Patagonian Monsters' with a {emotion_prediction} attitude, representing the idea of: [ {transcribed_text} ]. Illustrate this using asemic writings in an old map style."
 
-        # Create a temporary file to save the audio
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
-            scipy.io.wavfile.write(tmp_file.name, rate=sampling_rate, data=audio_data)
-            return tmp_file.name
+        with open(image_file_path, 'rb') as image_file:
+            files = {
+                'image': image_file,
+            }
+            data = {
+                'text': prompt_text
+            }
+            response = requests.post(url, headers=headers, files=files, data=data)
 
-    except Exception as e:
-        print("Error generating music:", e)
-        return None
-
-# Function to generate image with Stable Diffusion
-def generate_image(emotion_prediction, transcribed_text):
-    try:
-        if sd_pipe is None:
+        response_data = response.json()
+        if 'output_url' in response_data:
+            # Download the image and return it as a PIL Image
+            image_response = requests.get(response_data['output_url'])
+            return Image.open(BytesIO(image_response.content))
+        else:
+            print("Error in DeepAI response:", response_data)
             return None
-
-        # Create a detailed prompt for image generation
-        prompt = f"Patagonian Monsters with a {emotion_prediction} attitude, representing: {transcribed_text}. " \
-                 f"Asemic writings in an old map style, vintage illustration, detailed, high quality, 4k resolution"
-
-        # Negative prompt to avoid unwanted elements
-        negative_prompt = "blurry, low quality, distorted, ugly, bad anatomy, text, watermark, signature"
-
-        # Generate image
-        with torch.autocast("cuda" if sd_device == "cuda" else "cpu"):
-            image = sd_pipe(
-                prompt=prompt,
-                negative_prompt=negative_prompt,
-                height=1024,
-                width=512,
-                num_inference_steps=25,
-                guidance_scale=7.5
-            ).images[0]
-
-        return image
-
     except Exception as e:
-        print("Error generating image with Stable Diffusion:", e)
+        print("Error generating image:", e)
        return None
 
 # Function to get predictions
@@ -219,13 +144,9 @@ def get_predictions(audio_input):
     # Analyze sentiment of transcribed text
     sentiment, polarity = analyze_sentiment(transcribed_text)
 
-    # Generate image with Stable Diffusion
     image = generate_image(emotion_prediction, transcribed_text)
 
-    # Generate music based on transcription and emotion
-    music_path = generate_music(transcribed_text, emotion_prediction)
-
-    return emotion_prediction, transcribed_text, f"Sentiment: {sentiment} (Polarity: {polarity:.2f})", image, music_path
+    return emotion_prediction, transcribed_text, f"Sentiment: {sentiment} (Polarity: {polarity:.2f})", image
 
 # Create the Gradio interface
 interface = gr.Interface(
@@ -234,12 +155,11 @@ interface = gr.Interface(
     outputs=[
         gr.Label(label="Acoustic Prediction"),
         gr.Label(label="Transcribed Text"),
-        gr.Label(label="Sentiment Analysis"),
-        gr.Image(type='pil', label="Generated Image"),
-        gr.Audio(label="Generated Music", type="filepath")
+        gr.Label(label="Sentiment Analysis"), # Added sentiment analysis output
+        gr.Image(type='pil', label="Generated Image")
     ],
     title="Affective Virtual Environments",
-    description="Create an AVE using your voice. Get emotion prediction, transcription, sentiment analysis, a generated image, and music."
+    description="Create an AVE using your voice. Get emotion prediction, transcription, sentiment analysis, and a generated image."
 )
 
 interface.launch()
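
For reference, the generate_image path added above can be smoke-tested outside Gradio. The sketch below mirrors the committed request/response handling for the DeepAI image-editor endpoint; it assumes the DeepAI_api_key environment variable is set, and sample.jpg is a hypothetical stand-in for the TAI_Images/TerraIncognita*.jpg files the Space ships with. Note that the new code also relies on requests, PIL.Image, and io.BytesIO already being imported above the first hunk of app.py.

# Minimal sketch of the DeepAI image-editor call added in this commit.
# Assumptions: DeepAI_api_key is set in the environment; sample.jpg is a
# hypothetical local file standing in for TAI_Images/TerraIncognita{0-9}.jpg.
import os
from io import BytesIO

import requests
from PIL import Image

api_key = os.getenv("DeepAI_api_key")
assert api_key, "DeepAI_api_key not set"

url = "https://api.deepai.org/api/image-editor"
prompt_text = "Generate Patagonian Monsters with a calm attitude. Illustrate this using asemic writings in an old map style."

with open("sample.jpg", "rb") as image_file:
    # Same call shape as app.py: an image upload plus a text prompt
    response = requests.post(
        url,
        headers={"api-key": api_key},
        files={"image": image_file},
        data={"text": prompt_text},
    )

response_data = response.json()
if "output_url" in response_data:
    # As in app.py: download the edited image and open it with PIL
    image = Image.open(BytesIO(requests.get(response_data["output_url"]).content))
    image.show()
else:
    print("Error in DeepAI response:", response_data)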
 
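The analyze_sentiment helper that feeds the new "Sentiment Analysis" output is referenced but not shown in these hunks; only its error branch appears. Below is a minimal TextBlob-based sketch consistent with the (label, polarity) return shape and the f"Sentiment: {sentiment} (Polarity: {polarity:.2f})" formatting above. The body and thresholds are illustrative assumptions, not the committed implementation.

# Hypothetical sketch of analyze_sentiment; the committed body is not in this diff.
from textblob import TextBlob

def analyze_sentiment(text):
    try:
        # TextBlob polarity ranges from -1.0 (negative) to 1.0 (positive)
        polarity = TextBlob(text).sentiment.polarity
        if polarity > 0.1:       # illustrative threshold
            return "positive", polarity
        elif polarity < -0.1:    # illustrative threshold
            return "negative", polarity
        return "neutral", polarity
    except Exception as e:
        # Error branch shown in the diff context above
        print("Error analyzing sentiment:", e)
        return "sentiment analysis error", 0.0

print(analyze_sentiment("I love these old maps"))  # expected: a 'positive' label with polarity > 0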