jfforero committed on
Commit 2392587 · verified · 1 Parent(s): e6f2ff7

Update app.py

Files changed (1):
  1. app.py +556 -68
app.py CHANGED
@@ -33,76 +33,470 @@ import cv2
 import shutil
 from datetime import datetime
 
-# [Keep all your existing code until the process_and_display function]
-
-# Update the process_and_display function to create a named HTML file
-def process_and_display(audio_input, generate_audio, chunk_duration):
-    # Validate chunk duration
-    if chunk_duration is None or chunk_duration <= 0:
-        chunk_duration = 10
-
-    # Show loading indicator
-    yield [gr.HTML(f"""
-    <div style="text-align: center; margin: 20px;">
-        <p style="font-size: 18px; color: #4a4a4a;">Processing audio in {chunk_duration}-second chunks...</p>
-        <div style="border: 4px solid #f3f3f3; border-top: 4px solid #3498db; border-radius: 50%; width: 30px; height: 30px; animation: spin 2s linear infinite; margin: 0 auto;"></div>
-        <style>@keyframes spin {{ 0% {{ transform: rotate(0deg); }} 100% {{ transform: rotate(360deg); }} }}</style>
-        <p style="font-size: 14px; color: #4a4a4a;">This may take several minutes depending on the audio length...</p>
-    </div>
-    """)] + [gr.Group(visible=False)] * len(group_components) + [None] * (len(output_containers) * 6) + [None, ""]
-
-    results = get_predictions(audio_input, generate_audio, chunk_duration)
-
-    # Initialize outputs list
-    outputs = []
-    group_visibility = []
-    all_360_images = []  # Collect all 360 images for the viewer
-    all_music_paths = []  # Collect all music paths for the viewer
-
-    # Process each result
-    for i, result in enumerate(results):
-        if i < len(output_containers):
-            group_visibility.append(gr.Group(visible=True))
-            outputs.extend([
-                result['emotion'],
-                result['transcription'],
-                result['sentiment'],
-                result['image'],
-                result['image_360'],
-                result['music']
-            ])
-            # Collect the 360-processed images and music
-            if result['image_360']:
-                all_360_images.append(result['image_360'])  # Use the 360-processed image
-                all_music_paths.append(result['music'])  # Can be None if no music was generated
-        else:
-            # If we have more results than containers, just extend with None
-            group_visibility.append(gr.Group(visible=False))
-            outputs.extend([None] * 6)
-
-    # Hide remaining containers
-    for i in range(len(results), len(output_containers)):
-        group_visibility.append(gr.Group(visible=False))
-        outputs.extend([None] * 6)
-
-    # Create 360 viewer HTML if we have 360 images
-    viewer_html_path = None
-    if all_360_images:
-        # Create a timestamp for unique filenames
-        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-        html_filename = f"MyAVE_{timestamp}.html"
-
-        # Create a temporary directory for our output
-        output_dir = tempfile.mkdtemp()
-        viewer_html_path = os.path.join(output_dir, html_filename)
-
-        # Create the HTML file
-        create_360_viewer_html(all_360_images, all_music_paths, viewer_html_path)
-
-    # After processing, return the results along with other outputs
-    yield [gr.HTML("")] + group_visibility + outputs + [viewer_html_path, js_output, results]
-
-# Update the create_360_viewer_html function to include a download button in the HTML itself
 def create_360_viewer_html(image_paths, audio_paths, output_path):
     """Create an HTML file with a 360 viewer and audio player for the given images and audio."""
     # Create a list of image data URIs
@@ -331,7 +725,107 @@ def create_360_viewer_html(image_paths, audio_paths, output_path):
 
     return output_path
 
-# [Keep the rest of your code but remove the share button and related functions]
 
 # Create the Gradio interface with proper output handling
 with gr.Blocks(title="Affective Virtual Environments - Chunked Processing") as interface:
@@ -374,13 +868,7 @@ with gr.Blocks(title="Affective Virtual Environments - Chunked Processing") as interface:
     clear_btn = gr.Button("Clear All", variant="secondary")
 
     # Add a loading indicator
-    loading_indicator = gr.HTML("""
-    <div id="loading" style="display: none; text-align: center; margin: 20px;">
-        <p style="font-size: 18px; color: #4a4a4a;">Processing audio chunks...</p>
-        <div style="border: 4px solid #f3f3f3; border-top: 4px solid #3498db; border-radius: 50%; width: 30px; height: 30px; animation: spin 2s linear infinite; margin: 0 auto;"></div>
-        <style>@keyframes spin { 0% { transform: rotate(0deg); } 100% { transform: rotate(360deg); } }</style>
-    </div>
-    """)
 
     # Create output components for each chunk type
     output_containers = []
 
 import shutil
 from datetime import datetime
 
+# Load the emotion prediction model
+def load_emotion_model(model_path):
+    try:
+        model = load_model(model_path)
+        print("Emotion model loaded successfully")
+        return model
+    except Exception as e:
+        print("Error loading emotion prediction model:", e)
+        return None
+
+model_path = 'mymodel_SER_LSTM_RAVDESS.h5'
+model = load_emotion_model(model_path)
+
+# Initialize WhisperModel
+model_size = "small"
+model2 = WhisperModel(model_size, device="cpu", compute_type="int8")
+
+# Load MusicGen model
+def load_musicgen_model():
+    try:
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
+        music_model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
+        music_model.to(device)
+        print("MusicGen model loaded successfully")
+        return processor, music_model, device
+    except Exception as e:
+        print("Error loading MusicGen model:", e)
+        return None, None, None
+
+processor, music_model, device = load_musicgen_model()
+
+# Function to chunk audio into segments
+def chunk_audio(audio_path, chunk_duration=10):
+    """Split audio into chunks and return list of chunk file paths"""
+    try:
+        # Load audio file
+        audio = AudioSegment.from_file(audio_path)
+        duration_ms = len(audio)
+        chunk_ms = chunk_duration * 1000
+
+        # Validate chunk duration
+        if chunk_duration <= 0:
+            raise ValueError("Chunk duration must be positive")
+
+        if chunk_duration > duration_ms / 1000:
+            # If chunk duration is longer than audio, return the whole audio
+            return [audio_path], 1
+
+        chunks = []
+        chunk_files = []
+
+        # Calculate number of chunks
+        num_chunks = math.ceil(duration_ms / chunk_ms)
+
+        for i in range(num_chunks):
+            start_ms = i * chunk_ms
+            end_ms = min((i + 1) * chunk_ms, duration_ms)
+
+            # Extract chunk
+            chunk = audio[start_ms:end_ms]
+            chunks.append(chunk)
+
+            # Save chunk to temporary file
+            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
+                chunk.export(tmp_file.name, format="wav")
+                chunk_files.append(tmp_file.name)
+
+        return chunk_files, num_chunks
+
+    except Exception as e:
+        print("Error chunking audio:", e)
+        # Return original file as single chunk if chunking fails
+        return [audio_path], 1
+
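A quick usage sketch for the chunker in isolation (not part of this commit; "sample.wav" is a hypothetical input path):

    chunk_files, num_chunks = chunk_audio("sample.wav", chunk_duration=10)
    print(f"Split into {num_chunks} chunk(s)")
    for path in chunk_files:
        if path != "sample.wav":   # the original path is returned unchanged on fallback
            os.unlink(path)        # chunk files are delete=False temp files, so clean up
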
+# Function to transcribe audio
+def transcribe(wav_filepath):
+    try:
+        segments, _ = model2.transcribe(wav_filepath, beam_size=5)
+        return "".join([segment.text for segment in segments])
+    except Exception as e:
+        print("Error transcribing audio:", e)
+        return "Transcription failed"
+
+# Function to extract MFCC features from audio
+def extract_mfcc(wav_file_name):
+    try:
+        y, sr = librosa.load(wav_file_name)
+        mfccs = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40).T, axis=0)
+        return mfccs
+    except Exception as e:
+        print("Error extracting MFCC features:", e)
+        return None
+
+# Emotions dictionary
+emotions = {0: 'neutral', 1: 'calm', 2: 'happy', 3: 'sad', 4: 'angry', 5: 'fearful', 6: 'disgust', 7: 'surprised'}
+
+# Function to predict emotion from audio
+def predict_emotion_from_audio(wav_filepath):
+    try:
+        if model is None:
+            return "Model not loaded"
+
+        test_point = extract_mfcc(wav_filepath)
+        if test_point is not None:
+            test_point = np.reshape(test_point, newshape=(1, 40, 1))
+            predictions = model.predict(test_point)
+            predicted_emotion_label = np.argmax(predictions[0])
+            return emotions.get(predicted_emotion_label, "Unknown emotion")
+        else:
+            return "Error: Unable to extract features"
+    except Exception as e:
+        print("Error predicting emotion:", e)
+        return "Prediction error"
+
+# Function to analyze sentiment from text
+def analyze_sentiment(text):
+    try:
+        if not text or text.strip() == "":
+            return "neutral", 0.0
+
+        analysis = TextBlob(text)
+        polarity = analysis.sentiment.polarity
+
+        if polarity > 0.1:
+            sentiment = "positive"
+        elif polarity < -0.1:
+            sentiment = "negative"
+        else:
+            sentiment = "neutral"
+
+        return sentiment, polarity
+    except Exception as e:
+        print("Error analyzing sentiment:", e)
+        return "neutral", 0.0
+
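For reference, the ±0.1 polarity cutoffs map short texts roughly like this (illustrative only; exact TextBlob scores depend on the text):

    print(analyze_sentiment("What a wonderful, happy day!"))  # e.g. ('positive', ~0.9)
    print(analyze_sentiment("This is a terrible result."))    # e.g. ('negative', ~-1.0)
    print(analyze_sentiment("The meeting starts at noon."))   # ('neutral', 0.0)
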
+# Function to get image prompt based on sentiment
+def get_image_prompt(sentiment, transcribed_text, chunk_idx, total_chunks):
+    base_prompt = f"Chunk {chunk_idx+1}/{total_chunks}: "
+
+    if sentiment == "positive":
+        return base_prompt + f"Generate a vibrant, uplifting equirectangular 360 image texture with bright colors, joyful atmosphere, and optimistic vibes representing: [{transcribed_text}]. The scene should evoke happiness and positivity."
+    elif sentiment == "negative":
+        return base_prompt + f"Generate a moody, dramatic equirectangular 360 image texture with dark tones, intense atmosphere, and emotional depth representing: [{transcribed_text}]. The scene should convey melancholy and intensity."
+    else:  # neutral
+        return base_prompt + f"Generate a balanced, serene equirectangular 360 image texture with harmonious colors, peaceful atmosphere, and calm vibes representing: [{transcribed_text}]. The scene should evoke tranquility and balance."
+
+# Function to get music prompt based on emotion
+def get_music_prompt(emotion, transcribed_text, chunk_idx, total_chunks):
+    base_prompt = f"Chunk {chunk_idx+1}/{total_chunks}: "
+
+    emotion_prompts = {
+        'neutral': f"Create ambient, background music with neutral tones, subtle melodies, and unobtrusive atmosphere that complements: {transcribed_text}. The music should be calm and balanced.",
+        'calm': f"Generate soothing, peaceful music with gentle melodies, soft instrumentation, and relaxing vibes that represents: {transcribed_text}. The music should evoke tranquility and serenity.",
+        'happy': f"Create joyful, upbeat music with cheerful melodies, bright instrumentation, and energetic rhythms that celebrates: {transcribed_text}. The music should evoke happiness and positivity.",
+        'sad': f"Generate emotional, melancholic music with poignant melodies, soft strings, and heartfelt atmosphere that reflects: {transcribed_text}. The music should evoke sadness and reflection.",
+        'angry': f"Create intense, powerful music with driving rhythms, aggressive instrumentation, and strong dynamics that expresses: {transcribed_text}. The music should evoke anger and intensity.",
+        'fearful': f"Generate suspenseful, tense music with eerie melodies, atmospheric sounds, and unsettling vibes that represents: {transcribed_text}. The music should evoke fear and anticipation.",
+        'disgust': f"Create dark, unsettling music with dissonant harmonies, unusual sounds, and uncomfortable atmosphere that reflects: {transcribed_text}. The music should evoke discomfort and unease.",
+        'surprised': f"Generate dynamic, unexpected music with sudden changes, playful melodies, and surprising elements that represents: {transcribed_text}. The music should evoke surprise and wonder."
+    }
+
+    return base_prompt + emotion_prompts.get(emotion.lower(),
+        f"Create background music with {emotion} atmosphere that represents: {transcribed_text}")
+
+# Function to generate music with MusicGen (using acoustic emotion prediction)
+def generate_music(transcribed_text, emotion_prediction, chunk_idx, total_chunks):
+    try:
+        if processor is None or music_model is None:
+            return None
+
+        # Get specific prompt based on emotion
+        prompt = get_music_prompt(emotion_prediction, transcribed_text, chunk_idx, total_chunks)
+
+        # Limit prompt length to avoid model issues
+        if len(prompt) > 200:
+            prompt = prompt[:200] + "..."
+
+        inputs = processor(
+            text=[prompt],
+            padding=True,
+            return_tensors="pt",
+        ).to(device)
+
+        # Generate audio
+        audio_values = music_model.generate(**inputs, max_new_tokens=512)
+
+        # Convert to numpy array and sample rate
+        sampling_rate = music_model.config.audio_encoder.sampling_rate
+        audio_data = audio_values[0, 0].cpu().numpy()
+
+        # Normalize audio data (guard against division by zero on silent output)
+        peak = np.max(np.abs(audio_data))
+        if peak > 0:
+            audio_data = audio_data / peak
+
+        # Create a temporary file to save the audio
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
+            scipy.io.wavfile.write(tmp_file.name, rate=sampling_rate, data=audio_data)
+            return tmp_file.name
+
+    except Exception as e:
+        print("Error generating music:", e)
+        return None
 
+# --- DeepAI Image Generation (Text2Img) ---
+api_key = os.getenv("DeepAI_api_key")
+
+# Function to upscale an image using DeepAI's Torch-SRGAN API, with a Lanczos fallback
+def upscale_image(image, target_width=4096, target_height=2048):
+    """
+    Upscale image using DeepAI's Torch-SRGAN API for super resolution
+    """
+    try:
+        if not api_key:
+            print("No API key available for upscaling")
+            # Fallback to OpenCV if no API key
+            img_array = np.array(image)
+            upscaled = cv2.resize(
+                img_array,
+                (target_width, target_height),
+                interpolation=cv2.INTER_LANCZOS4
+            )
+            return Image.fromarray(upscaled)
+
+        # Save the image to a temporary file
+        with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_input:
+            image.save(tmp_input.name, "JPEG", quality=95)
+
+        # Make request to DeepAI torch-srgan API
+        with open(tmp_input.name, 'rb') as image_file:
+            response = requests.post(
+                "https://api.deepai.org/api/torch-srgan",
+                files={'image': image_file},
+                headers={'api-key': api_key}
+            )
+
+        data = response.json()
+
+        if 'output_url' in data:
+            # Download the upscaled image
+            img_resp = requests.get(data['output_url'])
+            upscaled_image = Image.open(BytesIO(img_resp.content))
+
+            # Ensure the image meets our target dimensions
+            if upscaled_image.size != (target_width, target_height):
+                upscaled_image = upscaled_image.resize(
+                    (target_width, target_height),
+                    Image.Resampling.LANCZOS
+                )
+
+            # Clean up temporary file
+            os.unlink(tmp_input.name)
+            return upscaled_image
+        else:
+            print("Error in DeepAI upscaling response:", data)
+            # Fallback to OpenCV if API fails
+            img_array = np.array(image)
+            upscaled = cv2.resize(
+                img_array,
+                (target_width, target_height),
+                interpolation=cv2.INTER_LANCZOS4
+            )
+            return Image.fromarray(upscaled)
+
+    except Exception as e:
+        print(f"Error upscaling image with DeepAI: {e}")
+        # Fallback to OpenCV if any error occurs
+        img_array = np.array(image)
+        upscaled = cv2.resize(
+            img_array,
+            (target_width, target_height),
+            interpolation=cv2.INTER_LANCZOS4
+        )
+        return Image.fromarray(upscaled)
+
+
311
+ # Function to generate image using DeepAI API
312
+ def generate_image(sentiment_prediction, transcribed_text, chunk_idx, total_chunks):
313
+ try:
314
+ if not api_key:
315
+ # fallback white image if no API key
316
+ base_image = Image.new('RGB', (1024,512), color='white')
317
+ else:
318
+ # Get specific prompt based on sentiment
319
+ prompt = get_image_prompt(sentiment_prediction, transcribed_text, chunk_idx, total_chunks)
320
+
321
+ # Make request to DeepAI text2img API
322
+ response = requests.post(
323
+ "https://api.deepai.org/api/text2img",
324
+ data={
325
+ 'text': prompt,
326
+ 'width': 1024,
327
+ 'height': 512,
328
+ 'image_generator_version': 'hd'
329
+ },
330
+ headers={'api-key': api_key}
331
+ )
332
+
333
+ data = response.json()
334
+ if 'output_url' in data:
335
+ # Download the generated image
336
+ img_resp = requests.get(data['output_url'])
337
+ base_image = Image.open(BytesIO(img_resp.content))
338
+ else:
339
+ print("Error in DeepAI response:", data)
340
+ # Return a fallback image
341
+ base_image = Image.new('RGB', (1024,512), color='white')
342
+
343
+ # Upscale the image for better quality in 360 viewer
344
+ upscaled_image = upscale_image(base_image)
345
+ return upscaled_image
346
+
347
+ except Exception as e:
348
+ print("Error generating image:", e)
349
+ # Return a fallback image
350
+ return Image.new('RGB', (1024,512), color='white')
351
+
352
+ # Function to process a single chunk
353
+ def process_chunk(chunk_path, chunk_idx, total_chunks, generate_audio=True):
354
+ try:
355
+ # Get acoustic emotion prediction (for music)
356
+ emotion_prediction = predict_emotion_from_audio(chunk_path)
357
+
358
+ # Get transcribed text
359
+ transcribed_text = transcribe(chunk_path)
360
+
361
+ # Analyze sentiment of transcribed text (for image)
362
+ sentiment, polarity = analyze_sentiment(transcribed_text)
363
+
364
+ # Generate image using SENTIMENT analysis with specific prompt
365
+ image = generate_image(sentiment, transcribed_text, chunk_idx, total_chunks)
366
+
367
+ # Add 360 metadata to the image
368
+ image_with_360_path = add_360_metadata(image)
369
+
370
+ # Generate music only if audio generation is enabled
371
+ music_path = None
372
+ if generate_audio:
373
+ music_path = generate_music(transcribed_text, emotion_prediction, chunk_idx, total_chunks)
374
+
375
+ return {
376
+ 'chunk_index': chunk_idx + 1,
377
+ 'emotion': emotion_prediction,
378
+ 'transcription': transcribed_text,
379
+ 'sentiment': sentiment,
380
+ 'image': image, # Original image for display in Gradio
381
+ 'image_360': image_with_360_path, # Image with 360 metadata
382
+ 'music': music_path
383
+ }
384
+ except Exception as e:
385
+ print(f"Error processing chunk {chunk_idx + 1}:", e)
386
+ # Return a fallback result with all required keys
387
+ return {
388
+ 'chunk_index': chunk_idx + 1,
389
+ 'emotion': "Error",
390
+ 'transcription': "Transcription failed",
391
+ 'sentiment': "Sentiment: error",
392
+ 'image': Image.new('RGB', (1440, 770), color='white'),
393
+ 'image_360': None,
394
+ 'music': None
395
+ }
+
+# Function to get predictions for all chunks
+def get_predictions(audio_input, generate_audio=True, chunk_duration=10):
+    # Chunk the audio into segments
+    chunk_files, total_chunks = chunk_audio(audio_input, chunk_duration)
+
+    results = []
+
+    # Process each chunk
+    for i, chunk_path in enumerate(chunk_files):
+        print(f"Processing chunk {i+1}/{total_chunks} ({chunk_duration}s each)")
+        result = process_chunk(chunk_path, i, total_chunks, generate_audio)
+        results.append(result)
+
+    # Clean up temporary chunk files
+    for chunk_path in chunk_files:
+        try:
+            if chunk_path != audio_input:  # Don't delete the original input file
+                os.unlink(chunk_path)
+        except OSError:
+            pass
+
+    return results
+
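End to end, one call drives the whole per-chunk pipeline (a sketch, not part of this commit; "speech.wav" is a hypothetical path):

    results = get_predictions("speech.wav", generate_audio=True, chunk_duration=10)
    for r in results:
        print(r['chunk_index'], r['emotion'], r['sentiment'], r['transcription'][:40])
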
+def create_xmp_block(width, height):
+    """Create XMP metadata block following ExifTool's exact format."""
+    xmp = (
+        f'<?xpacket begin="" id="W5M0MpCehiHzreSzNTczkc9d"?>\n'
+        f'<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="ExifTool">\n'
+        f'<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">\n'
+        f'<rdf:Description rdf:about=""\n'
+        f'xmlns:GPano="http://ns.google.com/photos/1.0/panorama/"\n'
+        f'GPano:ProjectionType="equirectangular"\n'
+        f'GPano:UsePanoramaViewer="True"\n'
+        f'GPano:FullPanoWidthPixels="{width}"\n'
+        f'GPano:FullPanoHeightPixels="{height}"\n'
+        f'GPano:CroppedAreaImageWidthPixels="{width}"\n'
+        f'GPano:CroppedAreaImageHeightPixels="{height}"\n'
+        f'GPano:CroppedAreaLeftPixels="0"\n'
+        f'GPano:CroppedAreaTopPixels="0"/>\n'
+        f'</rdf:RDF>\n'
+        f'</x:xmpmeta>\n'
+        f'<?xpacket end="w"?>'
+    )
+    return xmp
+
+def write_xmp_to_jpg(input_path, output_path, width, height):
+    """Write XMP metadata to JPEG file following ExifTool's method."""
+    # Read the original JPEG
+    with open(input_path, 'rb') as f:
+        data = f.read()
+
+    # Find the start of image marker
+    if data[0:2] != b'\xFF\xD8':
+        raise ValueError("Not a valid JPEG file")
+
+    # Create XMP data
+    xmp_data = create_xmp_block(width, height)
+
+    # Create APP1 segment for XMP
+    app1_marker = b'\xFF\xE1'
+    xmp_header = b'http://ns.adobe.com/xap/1.0/\x00'
+    xmp_bytes = xmp_data.encode('utf-8')
+    length = len(xmp_header) + len(xmp_bytes) + 2  # +2 for the length bytes themselves
+    length_bytes = struct.pack('>H', length)
+
+    # Construct new file content
+    output = bytearray()
+    output.extend(data[0:2])  # SOI marker
+    output.extend(app1_marker)
+    output.extend(length_bytes)
+    output.extend(xmp_header)
+    output.extend(xmp_bytes)
+    output.extend(data[2:])  # Rest of the original file
+
+    # Write the new file
+    with open(output_path, 'wb') as f:
+        f.write(output)
+
+
475
+ def add_360_metadata(img):
476
+ """Add 360 photo metadata to a PIL Image and return the path to the processed image."""
477
+ try:
478
+ # First, ensure the image is upscaled to 4096x2048
479
+ target_width, target_height = 4096, 2048
480
+ if img.width != target_width or img.height != target_height:
481
+ img = img.resize((target_width, target_height), Image.Resampling.LANCZOS)
482
+
483
+ # Create a temporary file
484
+ with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_file:
485
+ # First save as high-quality JPEG
486
+ img.save(tmp_file.name, "JPEG", quality=95)
487
+
488
+ # Then inject XMP metadata directly into JPEG file
489
+ write_xmp_to_jpg(tmp_file.name, tmp_file.name, img.width, img.height)
490
+
491
+ return tmp_file.name
492
+
493
+ except Exception as e:
494
+ print(f"Error adding 360 metadata: {str(e)}")
495
+ # Fallback: return the original image path
496
+ with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_file:
497
+ img.save(tmp_file.name, "JPEG", quality=95)
498
+ return tmp_file.name
499
 
 
500
  def create_360_viewer_html(image_paths, audio_paths, output_path):
501
  """Create an HTML file with a 360 viewer and audio player for the given images and audio."""
502
  # Create a list of image data URIs
 
     return output_path
 
+# Update the process_and_display function
+def process_and_display(audio_input, generate_audio, chunk_duration):
+    # Validate chunk duration
+    if chunk_duration is None or chunk_duration <= 0:
+        chunk_duration = 10
+
+    # Show loading indicator
+    yield [gr.HTML(f"""
+    <div style="text-align: center; margin: 20px;">
+        <p style="font-size: 18px; color: #4a4a4a;">Processing audio in {chunk_duration}-second chunks...</p>
+        <div style="border: 4px solid #f3f3f3; border-top: 4px solid #3498db; border-radius: 50%; width: 30px; height: 30px; animation: spin 2s linear infinite; margin: 0 auto;"></div>
+        <style>@keyframes spin {{ 0% {{ transform: rotate(0deg); }} 100% {{ transform: rotate(360deg); }} }}</style>
+        <p style="font-size: 14px; color: #4a4a4a;">This may take several minutes depending on the audio length...</p>
+    </div>
+    """)] + [gr.Group(visible=False)] * len(group_components) + [None] * (len(output_containers) * 6) + [None, ""]
+
+    results = get_predictions(audio_input, generate_audio, chunk_duration)
+
+    # Initialize outputs list
+    outputs = []
+    group_visibility = []
+    all_360_images = []  # Collect all 360 images for the viewer
+    all_music_paths = []  # Collect all music paths for the viewer
+
+    # Process each result
+    for i, result in enumerate(results):
+        if i < len(output_containers):
+            group_visibility.append(gr.Group(visible=True))
+            outputs.extend([
+                result['emotion'],
+                result['transcription'],
+                result['sentiment'],
+                result['image'],
+                result['image_360'],
+                result['music']
+            ])
+            # Collect the 360-processed images and music
+            if result['image_360']:
+                all_360_images.append(result['image_360'])  # Use the 360-processed image
+                all_music_paths.append(result['music'])  # Can be None if no music was generated
+        else:
+            # If we have more results than containers, just extend with None
+            group_visibility.append(gr.Group(visible=False))
+            outputs.extend([None] * 6)
+
+    # Hide remaining containers
+    for i in range(len(results), len(output_containers)):
+        group_visibility.append(gr.Group(visible=False))
+        outputs.extend([None] * 6)
+
+    # Create 360 viewer HTML if we have 360 images
+    viewer_html_path = None
+    if all_360_images:
+        # Create a timestamp for unique filenames
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        html_filename = f"MyAVE_{timestamp}.html"
+
+        # Create a temporary directory for our output
+        output_dir = tempfile.mkdtemp()
+        viewer_html_path = os.path.join(output_dir, html_filename)
+
+        # Create the HTML file
+        create_360_viewer_html(all_360_images, all_music_paths, viewer_html_path)
+
+    # After processing, return the results along with other outputs
+    yield [gr.HTML("")] + group_visibility + outputs + [viewer_html_path, js_output]
+
+# Update the clear_all function to handle the new outputs
+def clear_all():
+    # Create a list with None for all outputs
+    outputs = [None]  # For audio input
+
+    # For group components (set to invisible)
+    outputs.extend([gr.Group(visible=False)] * len(group_components))
+
+    # For all output containers (set to None)
+    for _ in output_containers:
+        outputs.extend([None, None, None, None, None, None])  # emotion, transcription, sentiment, image, image_360, music
+
+    # For loading indicator (empty HTML)
+    outputs.append(gr.HTML(""))
+
+    # For chunk duration (reset to 10)
+    outputs.append(10)
+
+    # For example selector (reset to None)
+    outputs.append(None)
+
+    # For viewer (set to None)
+    outputs.append(None)
+
+    # For JavaScript output (empty)
+    outputs.append("")
+
+    return outputs
+
+# Function to load example audio (placeholder - not yet implemented)
+def load_example_audio(example_name):
+    # Placeholder: return the path to the example audio file for example_name
+    return None
 
 # Create the Gradio interface with proper output handling
 with gr.Blocks(title="Affective Virtual Environments - Chunked Processing") as interface:
 
     clear_btn = gr.Button("Clear All", variant="secondary")
 
     # Add a loading indicator
+    loading_indicator = gr.HTML("")
 
     # Create output components for each chunk type
     output_containers = []
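
For context, a generator callback like process_and_display is wired to its button in the remainder of the interface (not shown in this diff) roughly as below. The component names here (process_btn, audio_input, generate_audio_checkbox, chunk_duration_slider, chunk_outputs, viewer_file) are assumptions, not confirmed by the diff; what matters is that the output order matches the yielded lists: loading HTML, group visibilities, per-chunk outputs, viewer path, JS output.

    process_btn.click(
        fn=process_and_display,  # generator: first yield shows the spinner, second the results
        inputs=[audio_input, generate_audio_checkbox, chunk_duration_slider],
        outputs=[loading_indicator] + group_components + chunk_outputs + [viewer_file, js_output],
    )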