YoussefA7med committed on
Commit
5f2ffce
·
verified ·
1 Parent(s): 0cdf4c6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +160 -118
app.py CHANGED
@@ -8,18 +8,6 @@ from PIL import Image
8
  from dotenv import load_dotenv
9
  import gradio as gr
10
  from gradio_client import Client
11
- import logging
12
-
13
- # Configure logging
14
- logging.basicConfig(
15
- level=logging.INFO,
16
- format='%(asctime)s - %(levelname)s - %(message)s',
17
- handlers=[
18
- logging.FileHandler('app.log'),
19
- logging.StreamHandler()
20
- ]
21
- )
22
- logger = logging.getLogger(__name__)
23
 
24
  # Load environment variables
25
  load_dotenv()
@@ -100,7 +88,7 @@ def img_detector(model, image_url):
100
  "content": [
101
  {
102
  "type": "text",
103
- "text": "What is the product in this image? Please provide a very detailed description."
104
  },
105
  {
106
  "type": "image_url",
@@ -111,7 +99,8 @@ def img_detector(model, image_url):
111
  ]
112
  }
113
  ]
114
- })
 
115
  )
116
 
117
  # Check if the request was successful
@@ -168,7 +157,8 @@ def extract_product_info(vlm_description, lang):
168
  "temperature": random.uniform(0.9, 1),
169
  "max_tokens": 1000,
170
  "response_format": {"type": "json_object"}
171
- }
 
172
  )
173
 
174
  result = response.json()["choices"][0]["message"]["content"]
@@ -188,124 +178,78 @@ def contains_arabic(text):
188
 
189
  # Function to generate audio from text
190
  def text_to_speech(message: str, language: str) -> str:
191
- logger.info(f"Starting TTS for message length: {len(message)}, language: {language}")
192
-
193
  clean_text = re.sub(r'<[^>]+>', '', message)
194
  clean_text = clean_text.lstrip().replace("\n", " ")
195
 
196
  if len(clean_text) > 500:
197
  clean_text = clean_text[:500] + "..."
198
- logger.info("Text truncated to 500 characters")
199
-
200
- logger.info(f"Clean text for TTS: '{clean_text}'")
201
 
202
  filename = f"audio/audio_{uuid.uuid4().hex}.mp3"
203
- logger.info(f"Target audio filename: {filename}")
204
 
205
  # Determine if text contains Arabic
206
  is_arabic = contains_arabic(clean_text)
207
- logger.info(f"Text contains Arabic: {is_arabic}")
208
 
209
- # Use a simpler emotion for better compatibility
210
- emotion = "neutral" if not is_arabic else "neutral, speaking in Arabic"
211
- logger.info(f"TTS emotion: {emotion}")
 
212
 
213
- # Try multiple voice options and simpler parameters
214
- voice_options = ["nova", "alloy", "echo", "fable", "onyx", "shimmer"]
215
-
216
- for voice in voice_options:
217
- try:
218
- # Log TTS API call parameters
219
- logger.info(f"Trying TTS API with voice: {voice}")
220
- logger.info("Calling TTS API with parameters:")
221
- logger.info(f" - password: {'*' * len(TTS_PASSWORD) if TTS_PASSWORD else 'None'}")
222
- logger.info(f" - prompt: '{clean_text}'")
223
- logger.info(f" - voice: {voice}")
224
- logger.info(f" - emotion: {emotion}")
225
- logger.info(f" - use_random_seed: True")
226
- logger.info(f" - specific_seed: 12345")
227
-
228
- # Call the TTS API with simpler parameters
229
- result = tts_client.predict(
230
- password=TTS_PASSWORD,
231
- prompt=clean_text,
232
- voice=voice,
233
- emotion=emotion,
234
- use_random_seed=True,
235
- specific_seed=12345,
236
- api_name="/text_to_speech_app"
237
- )
238
 
239
- # Log detailed result information
240
- logger.info(f"TTS API result type: {type(result)}")
241
- logger.info(f"TTS API result: {result}")
242
-
243
- # Handle different response types
244
- if isinstance(result, tuple):
245
- logger.info(f"Result is tuple with {len(result)} items")
246
-
247
- # Check if this is an error response
248
- if len(result) == 2 and result[0] is None and isinstance(result[1], str):
249
- if "error" in result[1].lower() or "try again" in result[1].lower():
250
- logger.warning(f"TTS API returned error with voice {voice}: {result[1]}")
251
- if voice != voice_options[-1]: # Not the last voice to try
252
- logger.info(f"Trying next voice option...")
253
- continue
254
- else:
255
- logger.error("All voice options failed")
256
- raise Exception(f"TTS API failed with all voices. Last error: {result[1]}")
257
-
258
- for i, item in enumerate(result):
259
- logger.info(f" Tuple item {i}: type={type(item)}, value={item}")
260
-
261
- if isinstance(item, str):
262
- if item.startswith('http'):
263
- logger.info(f"Item {i} is a URL, attempting to download...")
264
- try:
265
- response = requests.get(item)
266
- if response.status_code == 200:
267
- with open(filename, 'wb') as f:
268
- f.write(response.content)
269
- logger.info(f"Successfully downloaded audio to {filename}")
270
- return filename
271
- except Exception as e:
272
- logger.error(f"Failed to download from URL {item}: {str(e)}")
273
- continue
274
-
275
- # If result is a direct URL string
276
- if isinstance(result, str) and result.startswith('http'):
277
- logger.info("Result is a direct URL, attempting to download...")
278
- try:
279
- response = requests.get(result)
280
- if response.status_code == 200:
281
- with open(filename, 'wb') as f:
282
- f.write(response.content)
283
- logger.info(f"Successfully downloaded audio to {filename}")
284
  return filename
285
- except Exception as e:
286
- logger.error(f"Failed to download from URL {result}: {str(e)}")
287
- if voice != voice_options[-1]:
288
- continue
289
- else:
290
- raise
291
 
292
- logger.error(f"Unexpected result format from TTS API with voice {voice}")
293
- if voice != voice_options[-1]:
294
- continue
295
- else:
296
- raise Exception("Unexpected result format from TTS API with all voices")
297
 
298
- except Exception as e:
299
- logger.error(f"Error with voice {voice}: {str(e)}")
300
- if voice != voice_options[-1]:
301
- logger.info("Trying next voice option...")
302
- continue
 
303
  else:
304
- logger.error("All voice options failed")
305
- raise Exception(f"TTS failed with all voices. Last error: {str(e)}")
306
-
307
- logger.error("No successful TTS conversion with any voice option")
308
- return f"Text-to-speech error: Failed with all voice options"
 
 
 
 
 
 
 
 
 
 
 
309
 
310
  # Function to upload image and get base64 URL
311
  def upload_image_and_get_url(image_path):
@@ -330,7 +274,28 @@ def process_image(image_path, model_name, language):
330
  product_info = extract_product_info(vlm_description, language)
331
 
332
  # Generate audio for the description
333
- audio_path = text_to_speech(product_info["description"], language)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
334
 
335
  return (
336
  product_info["product_name"],
@@ -340,6 +305,7 @@ def process_image(image_path, model_name, language):
340
  vlm_description # Return the raw VLM description for debugging
341
  )
342
  except Exception as e:
 
343
  return f"Error: {str(e)}", "Error", "Error processing image", None, str(e)
344
 
345
  # Process image from URL
@@ -356,7 +322,28 @@ def process_image_url(image_url, model_name, language):
356
  product_info = extract_product_info(vlm_description, language)
357
 
358
  # Generate audio for the description
359
- audio_path = text_to_speech(product_info["description"], language)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
360
 
361
  return (
362
  product_info["product_name"],
@@ -366,8 +353,50 @@ def process_image_url(image_url, model_name, language):
366
  vlm_description # Return the raw VLM description for debugging
367
  )
368
  except Exception as e:
 
369
  return f"Error: {str(e)}", "Error", "Error processing image URL", None, str(e)
370
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
371
  # Show API status in the interface
372
  def get_api_status():
373
  status_text = f"OpenRouter API Keys: {len(OPENROUTER_API_KEYS)} configured\n"
@@ -445,6 +474,19 @@ with gr.Blocks(title="AI Product Description Generator") as demo:
445
  inputs=[url_input, url_model_dropdown, url_language],
446
  outputs=[url_name_output, url_category_output, url_description_output, url_audio_output, url_vlm_raw_output]
447
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
448
 
449
  # Launch the application
450
  if __name__ == "__main__":
 
8
  from dotenv import load_dotenv
9
  import gradio as gr
10
  from gradio_client import Client
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
  # Load environment variables
13
  load_dotenv()
 
88
  "content": [
89
  {
90
  "type": "text",
91
+ "text": "What is the product in this image? Please provide a detailed description."
92
  },
93
  {
94
  "type": "image_url",
 
99
  ]
100
  }
101
  ]
102
+ }),
103
+ timeout=30 # Set a reasonable timeout
104
  )
105
 
106
  # Check if the request was successful
 
157
  "temperature": random.uniform(0.9, 1),
158
  "max_tokens": 1000,
159
  "response_format": {"type": "json_object"}
160
+ },
161
+ timeout=30 # Set a reasonable timeout
162
  )
163
 
164
  result = response.json()["choices"][0]["message"]["content"]
 
178
 
179
  # Function to generate audio from text
180
  def text_to_speech(message: str, language: str) -> str:
 
 
181
  clean_text = re.sub(r'<[^>]+>', '', message)
182
  clean_text = clean_text.lstrip().replace("\n", " ")
183
 
184
  if len(clean_text) > 500:
185
  clean_text = clean_text[:500] + "..."
 
 
 
186
 
187
  filename = f"audio/audio_{uuid.uuid4().hex}.mp3"
 
188
 
189
  # Determine if text contains Arabic
190
  is_arabic = contains_arabic(clean_text)
 
191
 
192
+ # Adjust emotion for Arabic text
193
+ emotion = DEFAULT_TTS_EMOTION
194
+ if is_arabic:
195
+ emotion = emotion + " Speaking in Egyptian Arabic dialect."
196
 
197
+ try:
198
+ # Call the TTS API
199
+ result = tts_client.predict(
200
+ password=TTS_PASSWORD,
201
+ prompt=clean_text,
202
+ voice="nova",
203
+ emotion=emotion,
204
+ use_random_seed=True,
205
+ specific_seed=random.randint(1, 100000),
206
+ api_name="/text_to_speech_app"
207
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
208
 
209
+ # Handle different response types
210
+ if isinstance(result, tuple):
211
+ # Check if any item in the tuple is a URL or file path
212
+ for item in result:
213
+ if isinstance(item, str):
214
+ if item.startswith('http'):
215
+ # It's a URL, download it
216
+ response = requests.get(item)
217
+ if response.status_code == 200:
218
+ with open(filename, 'wb') as f:
219
+ f.write(response.content)
220
+ return filename
221
+ elif os.path.exists(item) and os.path.isfile(item):
222
+ # It's a file path, copy it
223
+ import shutil
224
+ shutil.copy(item, filename)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225
  return filename
 
 
 
 
 
 
226
 
227
+ # If we got here, we couldn't find a usable audio file in the tuple
228
+ raise Exception(f"No usable audio found in API response tuple: {result}")
 
 
 
229
 
230
+ elif isinstance(result, str):
231
+ # Handle string result (URL or file path)
232
+ if os.path.exists(result):
233
+ # If result is a file path, copy it to our directory
234
+ import shutil
235
+ shutil.copy(result, filename)
236
  else:
237
+ # If result is a URL, download it
238
+ response = requests.get(result)
239
+ if response.status_code == 200:
240
+ with open(filename, 'wb') as f:
241
+ f.write(response.content)
242
+ else:
243
+ raise Exception(f"Failed to download audio from URL: {response.status_code}")
244
+
245
+ return filename
246
+ else:
247
+ # Unknown result type
248
+ raise Exception(f"Unexpected result type from TTS API: {type(result).__name__}")
249
+
250
+ except Exception as e:
251
+ print(f"TTS Error: {str(e)}")
252
+ return f"Text-to-speech error: {str(e)}"
253
 
254
  # Function to upload image and get base64 URL
255
  def upload_image_and_get_url(image_path):
 
274
  product_info = extract_product_info(vlm_description, language)
275
 
276
  # Generate audio for the description
277
+ try:
278
+ audio_path = text_to_speech(product_info["description"], language)
279
+ if audio_path.startswith("Text-to-speech error"):
280
+ print(f"TTS Error: {audio_path}")
281
+ # Return error but continue with other outputs
282
+ return (
283
+ product_info["product_name"],
284
+ product_info["category"],
285
+ product_info["description"],
286
+ None, # No audio
287
+ f"{vlm_description}\n\nTTS Error: {audio_path}"
288
+ )
289
+ except Exception as tts_error:
290
+ print(f"TTS Exception: {str(tts_error)}")
291
+ # Return error but continue with other outputs
292
+ return (
293
+ product_info["product_name"],
294
+ product_info["category"],
295
+ product_info["description"],
296
+ None, # No audio
297
+ f"{vlm_description}\n\nTTS Exception: {str(tts_error)}"
298
+ )
299
 
300
  return (
301
  product_info["product_name"],
 
305
  vlm_description # Return the raw VLM description for debugging
306
  )
307
  except Exception as e:
308
+ print(f"Process Image Error: {str(e)}")
309
  return f"Error: {str(e)}", "Error", "Error processing image", None, str(e)
310
 
311
  # Process image from URL
 
322
  product_info = extract_product_info(vlm_description, language)
323
 
324
  # Generate audio for the description
325
+ try:
326
+ audio_path = text_to_speech(product_info["description"], language)
327
+ if audio_path.startswith("Text-to-speech error"):
328
+ print(f"TTS Error: {audio_path}")
329
+ # Return error but continue with other outputs
330
+ return (
331
+ product_info["product_name"],
332
+ product_info["category"],
333
+ product_info["description"],
334
+ None, # No audio
335
+ f"{vlm_description}\n\nTTS Error: {audio_path}"
336
+ )
337
+ except Exception as tts_error:
338
+ print(f"TTS Exception: {str(tts_error)}")
339
+ # Return error but continue with other outputs
340
+ return (
341
+ product_info["product_name"],
342
+ product_info["category"],
343
+ product_info["description"],
344
+ None, # No audio
345
+ f"{vlm_description}\n\nTTS Exception: {str(tts_error)}"
346
+ )
347
 
348
  return (
349
  product_info["product_name"],
 
353
  vlm_description # Return the raw VLM description for debugging
354
  )
355
  except Exception as e:
356
+ print(f"Process Image URL Error: {str(e)}")
357
  return f"Error: {str(e)}", "Error", "Error processing image URL", None, str(e)
358
 
359
+ # Test TTS API directly
360
def test_tts_api():
    """Call the TTS API once with a fixed sentence and describe what came back.

    Returns:
        A human-readable report beginning with "TTS API Test Successful"
        that lists the result's type and, for tuple or string results, a
        preview of each value and the size of any value that is an existing
        file path. If anything raises, returns "TTS API Test Failed: <error>".
    """
    try:
        outcome = tts_client.predict(
            password=TTS_PASSWORD,
            prompt="This is a test of the text to speech API.",
            voice="nova",
            emotion=DEFAULT_TTS_EMOTION,
            use_random_seed=True,
            specific_seed=random.randint(1, 100000),
            api_name="/text_to_speech_app"
        )

        # Collect the report piece by piece; each piece carries its own
        # leading newline(s), so the pieces are joined without a separator.
        report = [f"Result type: {type(outcome).__name__}"]

        if isinstance(outcome, tuple):
            report.append(f"\nTuple length: {len(outcome)}")
            for idx, entry in enumerate(outcome):
                report.append(f"\n\nItem {idx} type: {type(entry).__name__}")
                if not isinstance(entry, str):
                    report.append(f"\nItem {idx} value: {str(entry)[:500]}...")
                    continue
                report.append(f"\nItem {idx} string value: {entry[:500]}...")
                # Check if it's a file path
                if os.path.exists(entry):
                    report.append(
                        f"\nItem {idx} is an existing file path, size: {os.path.getsize(entry)} bytes"
                    )
        elif isinstance(outcome, str):
            report.append(f"\nResult string length: {len(outcome)}")
            report.append(f"\nResult starts with: {outcome[:100]}...")
            # Check if it's a file path
            if os.path.exists(outcome):
                report.append(
                    f"\nResult is an existing file path, size: {os.path.getsize(outcome)} bytes"
                )

        return "TTS API Test Successful\n" + "".join(report)
    except Exception as exc:
        return f"TTS API Test Failed: {str(exc)}"
399
+
400
  # Show API status in the interface
401
  def get_api_status():
402
  status_text = f"OpenRouter API Keys: {len(OPENROUTER_API_KEYS)} configured\n"
 
474
  inputs=[url_input, url_model_dropdown, url_language],
475
  outputs=[url_name_output, url_category_output, url_description_output, url_audio_output, url_vlm_raw_output]
476
  )
477
+
478
+ with gr.TabItem("Debug Tools"):
479
+ gr.Markdown("## Debug Tools")
480
+ gr.Markdown("Use these tools to test the API connections and diagnose issues.")
481
+
482
+ test_tts_button = gr.Button("Test TTS API")
483
+ tts_test_output = gr.Textbox(label="TTS API Test Results", lines=10)
484
+
485
+ test_tts_button.click(
486
+ fn=test_tts_api,
487
+ inputs=[],
488
+ outputs=[tts_test_output]
489
+ )
490
 
491
  # Launch the application
492
  if __name__ == "__main__":