Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -8,18 +8,6 @@ from PIL import Image
|
|
| 8 |
from dotenv import load_dotenv
|
| 9 |
import gradio as gr
|
| 10 |
from gradio_client import Client
|
| 11 |
-
import logging
|
| 12 |
-
|
| 13 |
-
# Configure logging
|
| 14 |
-
logging.basicConfig(
|
| 15 |
-
level=logging.INFO,
|
| 16 |
-
format='%(asctime)s - %(levelname)s - %(message)s',
|
| 17 |
-
handlers=[
|
| 18 |
-
logging.FileHandler('app.log'),
|
| 19 |
-
logging.StreamHandler()
|
| 20 |
-
]
|
| 21 |
-
)
|
| 22 |
-
logger = logging.getLogger(__name__)
|
| 23 |
|
| 24 |
# Load environment variables
|
| 25 |
load_dotenv()
|
|
@@ -100,7 +88,7 @@ def img_detector(model, image_url):
|
|
| 100 |
"content": [
|
| 101 |
{
|
| 102 |
"type": "text",
|
| 103 |
-
"text": "What is the product in this image? Please provide a
|
| 104 |
},
|
| 105 |
{
|
| 106 |
"type": "image_url",
|
|
@@ -111,7 +99,8 @@ def img_detector(model, image_url):
|
|
| 111 |
]
|
| 112 |
}
|
| 113 |
]
|
| 114 |
-
})
|
|
|
|
| 115 |
)
|
| 116 |
|
| 117 |
# Check if the request was successful
|
|
@@ -168,7 +157,8 @@ def extract_product_info(vlm_description, lang):
|
|
| 168 |
"temperature": random.uniform(0.9, 1),
|
| 169 |
"max_tokens": 1000,
|
| 170 |
"response_format": {"type": "json_object"}
|
| 171 |
-
}
|
|
|
|
| 172 |
)
|
| 173 |
|
| 174 |
result = response.json()["choices"][0]["message"]["content"]
|
|
@@ -188,124 +178,78 @@ def contains_arabic(text):
|
|
| 188 |
|
| 189 |
# Function to generate audio from text
|
| 190 |
def text_to_speech(message: str, language: str) -> str:
|
| 191 |
-
logger.info(f"Starting TTS for message length: {len(message)}, language: {language}")
|
| 192 |
-
|
| 193 |
clean_text = re.sub(r'<[^>]+>', '', message)
|
| 194 |
clean_text = clean_text.lstrip().replace("\n", " ")
|
| 195 |
|
| 196 |
if len(clean_text) > 500:
|
| 197 |
clean_text = clean_text[:500] + "..."
|
| 198 |
-
logger.info("Text truncated to 500 characters")
|
| 199 |
-
|
| 200 |
-
logger.info(f"Clean text for TTS: '{clean_text}'")
|
| 201 |
|
| 202 |
filename = f"audio/audio_{uuid.uuid4().hex}.mp3"
|
| 203 |
-
logger.info(f"Target audio filename: {filename}")
|
| 204 |
|
| 205 |
# Determine if text contains Arabic
|
| 206 |
is_arabic = contains_arabic(clean_text)
|
| 207 |
-
logger.info(f"Text contains Arabic: {is_arabic}")
|
| 208 |
|
| 209 |
-
#
|
| 210 |
-
emotion =
|
| 211 |
-
|
|
|
|
| 212 |
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
logger.info(f" - emotion: {emotion}")
|
| 225 |
-
logger.info(f" - use_random_seed: True")
|
| 226 |
-
logger.info(f" - specific_seed: 12345")
|
| 227 |
-
|
| 228 |
-
# Call the TTS API with simpler parameters
|
| 229 |
-
result = tts_client.predict(
|
| 230 |
-
password=TTS_PASSWORD,
|
| 231 |
-
prompt=clean_text,
|
| 232 |
-
voice=voice,
|
| 233 |
-
emotion=emotion,
|
| 234 |
-
use_random_seed=True,
|
| 235 |
-
specific_seed=12345,
|
| 236 |
-
api_name="/text_to_speech_app"
|
| 237 |
-
)
|
| 238 |
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
logger.error("All voice options failed")
|
| 256 |
-
raise Exception(f"TTS API failed with all voices. Last error: {result[1]}")
|
| 257 |
-
|
| 258 |
-
for i, item in enumerate(result):
|
| 259 |
-
logger.info(f" Tuple item {i}: type={type(item)}, value={item}")
|
| 260 |
-
|
| 261 |
-
if isinstance(item, str):
|
| 262 |
-
if item.startswith('http'):
|
| 263 |
-
logger.info(f"Item {i} is a URL, attempting to download...")
|
| 264 |
-
try:
|
| 265 |
-
response = requests.get(item)
|
| 266 |
-
if response.status_code == 200:
|
| 267 |
-
with open(filename, 'wb') as f:
|
| 268 |
-
f.write(response.content)
|
| 269 |
-
logger.info(f"Successfully downloaded audio to {filename}")
|
| 270 |
-
return filename
|
| 271 |
-
except Exception as e:
|
| 272 |
-
logger.error(f"Failed to download from URL {item}: {str(e)}")
|
| 273 |
-
continue
|
| 274 |
-
|
| 275 |
-
# If result is a direct URL string
|
| 276 |
-
if isinstance(result, str) and result.startswith('http'):
|
| 277 |
-
logger.info("Result is a direct URL, attempting to download...")
|
| 278 |
-
try:
|
| 279 |
-
response = requests.get(result)
|
| 280 |
-
if response.status_code == 200:
|
| 281 |
-
with open(filename, 'wb') as f:
|
| 282 |
-
f.write(response.content)
|
| 283 |
-
logger.info(f"Successfully downloaded audio to {filename}")
|
| 284 |
return filename
|
| 285 |
-
except Exception as e:
|
| 286 |
-
logger.error(f"Failed to download from URL {result}: {str(e)}")
|
| 287 |
-
if voice != voice_options[-1]:
|
| 288 |
-
continue
|
| 289 |
-
else:
|
| 290 |
-
raise
|
| 291 |
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
continue
|
| 295 |
-
else:
|
| 296 |
-
raise Exception("Unexpected result format from TTS API with all voices")
|
| 297 |
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
if
|
| 301 |
-
|
| 302 |
-
|
|
|
|
| 303 |
else:
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 309 |
|
| 310 |
# Function to upload image and get base64 URL
|
| 311 |
def upload_image_and_get_url(image_path):
|
|
@@ -330,7 +274,28 @@ def process_image(image_path, model_name, language):
|
|
| 330 |
product_info = extract_product_info(vlm_description, language)
|
| 331 |
|
| 332 |
# Generate audio for the description
|
| 333 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 334 |
|
| 335 |
return (
|
| 336 |
product_info["product_name"],
|
|
@@ -340,6 +305,7 @@ def process_image(image_path, model_name, language):
|
|
| 340 |
vlm_description # Return the raw VLM description for debugging
|
| 341 |
)
|
| 342 |
except Exception as e:
|
|
|
|
| 343 |
return f"Error: {str(e)}", "Error", "Error processing image", None, str(e)
|
| 344 |
|
| 345 |
# Process image from URL
|
|
@@ -356,7 +322,28 @@ def process_image_url(image_url, model_name, language):
|
|
| 356 |
product_info = extract_product_info(vlm_description, language)
|
| 357 |
|
| 358 |
# Generate audio for the description
|
| 359 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 360 |
|
| 361 |
return (
|
| 362 |
product_info["product_name"],
|
|
@@ -366,8 +353,50 @@ def process_image_url(image_url, model_name, language):
|
|
| 366 |
vlm_description # Return the raw VLM description for debugging
|
| 367 |
)
|
| 368 |
except Exception as e:
|
|
|
|
| 369 |
return f"Error: {str(e)}", "Error", "Error processing image URL", None, str(e)
|
| 370 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 371 |
# Show API status in the interface
|
| 372 |
def get_api_status():
|
| 373 |
status_text = f"OpenRouter API Keys: {len(OPENROUTER_API_KEYS)} configured\n"
|
|
@@ -445,6 +474,19 @@ with gr.Blocks(title="AI Product Description Generator") as demo:
|
|
| 445 |
inputs=[url_input, url_model_dropdown, url_language],
|
| 446 |
outputs=[url_name_output, url_category_output, url_description_output, url_audio_output, url_vlm_raw_output]
|
| 447 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 448 |
|
| 449 |
# Launch the application
|
| 450 |
if __name__ == "__main__":
|
|
|
|
| 8 |
from dotenv import load_dotenv
|
| 9 |
import gradio as gr
|
| 10 |
from gradio_client import Client
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
# Load environment variables
|
| 13 |
load_dotenv()
|
|
|
|
| 88 |
"content": [
|
| 89 |
{
|
| 90 |
"type": "text",
|
| 91 |
+
"text": "What is the product in this image? Please provide a detailed description."
|
| 92 |
},
|
| 93 |
{
|
| 94 |
"type": "image_url",
|
|
|
|
| 99 |
]
|
| 100 |
}
|
| 101 |
]
|
| 102 |
+
}),
|
| 103 |
+
timeout=30 # Set a reasonable timeout
|
| 104 |
)
|
| 105 |
|
| 106 |
# Check if the request was successful
|
|
|
|
| 157 |
"temperature": random.uniform(0.9, 1),
|
| 158 |
"max_tokens": 1000,
|
| 159 |
"response_format": {"type": "json_object"}
|
| 160 |
+
},
|
| 161 |
+
timeout=30 # Set a reasonable timeout
|
| 162 |
)
|
| 163 |
|
| 164 |
result = response.json()["choices"][0]["message"]["content"]
|
|
|
|
| 178 |
|
| 179 |
# Function to generate audio from text
|
| 180 |
def text_to_speech(message: str, language: str) -> str:
|
|
|
|
|
|
|
| 181 |
clean_text = re.sub(r'<[^>]+>', '', message)
|
| 182 |
clean_text = clean_text.lstrip().replace("\n", " ")
|
| 183 |
|
| 184 |
if len(clean_text) > 500:
|
| 185 |
clean_text = clean_text[:500] + "..."
|
|
|
|
|
|
|
|
|
|
| 186 |
|
| 187 |
filename = f"audio/audio_{uuid.uuid4().hex}.mp3"
|
|
|
|
| 188 |
|
| 189 |
# Determine if text contains Arabic
|
| 190 |
is_arabic = contains_arabic(clean_text)
|
|
|
|
| 191 |
|
| 192 |
+
# Adjust emotion for Arabic text
|
| 193 |
+
emotion = DEFAULT_TTS_EMOTION
|
| 194 |
+
if is_arabic:
|
| 195 |
+
emotion = emotion + " Speaking in Egyptian Arabic dialect."
|
| 196 |
|
| 197 |
+
try:
|
| 198 |
+
# Call the TTS API
|
| 199 |
+
result = tts_client.predict(
|
| 200 |
+
password=TTS_PASSWORD,
|
| 201 |
+
prompt=clean_text,
|
| 202 |
+
voice="nova",
|
| 203 |
+
emotion=emotion,
|
| 204 |
+
use_random_seed=True,
|
| 205 |
+
specific_seed=random.randint(1, 100000),
|
| 206 |
+
api_name="/text_to_speech_app"
|
| 207 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 208 |
|
| 209 |
+
# Handle different response types
|
| 210 |
+
if isinstance(result, tuple):
|
| 211 |
+
# Check if any item in the tuple is a URL or file path
|
| 212 |
+
for item in result:
|
| 213 |
+
if isinstance(item, str):
|
| 214 |
+
if item.startswith('http'):
|
| 215 |
+
# It's a URL, download it
|
| 216 |
+
response = requests.get(item)
|
| 217 |
+
if response.status_code == 200:
|
| 218 |
+
with open(filename, 'wb') as f:
|
| 219 |
+
f.write(response.content)
|
| 220 |
+
return filename
|
| 221 |
+
elif os.path.exists(item) and os.path.isfile(item):
|
| 222 |
+
# It's a file path, copy it
|
| 223 |
+
import shutil
|
| 224 |
+
shutil.copy(item, filename)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 225 |
return filename
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 226 |
|
| 227 |
+
# If we got here, we couldn't find a usable audio file in the tuple
|
| 228 |
+
raise Exception(f"No usable audio found in API response tuple: {result}")
|
|
|
|
|
|
|
|
|
|
| 229 |
|
| 230 |
+
elif isinstance(result, str):
|
| 231 |
+
# Handle string result (URL or file path)
|
| 232 |
+
if os.path.exists(result):
|
| 233 |
+
# If result is a file path, copy it to our directory
|
| 234 |
+
import shutil
|
| 235 |
+
shutil.copy(result, filename)
|
| 236 |
else:
|
| 237 |
+
# If result is a URL, download it
|
| 238 |
+
response = requests.get(result)
|
| 239 |
+
if response.status_code == 200:
|
| 240 |
+
with open(filename, 'wb') as f:
|
| 241 |
+
f.write(response.content)
|
| 242 |
+
else:
|
| 243 |
+
raise Exception(f"Failed to download audio from URL: {response.status_code}")
|
| 244 |
+
|
| 245 |
+
return filename
|
| 246 |
+
else:
|
| 247 |
+
# Unknown result type
|
| 248 |
+
raise Exception(f"Unexpected result type from TTS API: {type(result).__name__}")
|
| 249 |
+
|
| 250 |
+
except Exception as e:
|
| 251 |
+
print(f"TTS Error: {str(e)}")
|
| 252 |
+
return f"Text-to-speech error: {str(e)}"
|
| 253 |
|
| 254 |
# Function to upload image and get base64 URL
|
| 255 |
def upload_image_and_get_url(image_path):
|
|
|
|
| 274 |
product_info = extract_product_info(vlm_description, language)
|
| 275 |
|
| 276 |
# Generate audio for the description
|
| 277 |
+
try:
|
| 278 |
+
audio_path = text_to_speech(product_info["description"], language)
|
| 279 |
+
if audio_path.startswith("Text-to-speech error"):
|
| 280 |
+
print(f"TTS Error: {audio_path}")
|
| 281 |
+
# Return error but continue with other outputs
|
| 282 |
+
return (
|
| 283 |
+
product_info["product_name"],
|
| 284 |
+
product_info["category"],
|
| 285 |
+
product_info["description"],
|
| 286 |
+
None, # No audio
|
| 287 |
+
f"{vlm_description}\n\nTTS Error: {audio_path}"
|
| 288 |
+
)
|
| 289 |
+
except Exception as tts_error:
|
| 290 |
+
print(f"TTS Exception: {str(tts_error)}")
|
| 291 |
+
# Return error but continue with other outputs
|
| 292 |
+
return (
|
| 293 |
+
product_info["product_name"],
|
| 294 |
+
product_info["category"],
|
| 295 |
+
product_info["description"],
|
| 296 |
+
None, # No audio
|
| 297 |
+
f"{vlm_description}\n\nTTS Exception: {str(tts_error)}"
|
| 298 |
+
)
|
| 299 |
|
| 300 |
return (
|
| 301 |
product_info["product_name"],
|
|
|
|
| 305 |
vlm_description # Return the raw VLM description for debugging
|
| 306 |
)
|
| 307 |
except Exception as e:
|
| 308 |
+
print(f"Process Image Error: {str(e)}")
|
| 309 |
return f"Error: {str(e)}", "Error", "Error processing image", None, str(e)
|
| 310 |
|
| 311 |
# Process image from URL
|
|
|
|
| 322 |
product_info = extract_product_info(vlm_description, language)
|
| 323 |
|
| 324 |
# Generate audio for the description
|
| 325 |
+
try:
|
| 326 |
+
audio_path = text_to_speech(product_info["description"], language)
|
| 327 |
+
if audio_path.startswith("Text-to-speech error"):
|
| 328 |
+
print(f"TTS Error: {audio_path}")
|
| 329 |
+
# Return error but continue with other outputs
|
| 330 |
+
return (
|
| 331 |
+
product_info["product_name"],
|
| 332 |
+
product_info["category"],
|
| 333 |
+
product_info["description"],
|
| 334 |
+
None, # No audio
|
| 335 |
+
f"{vlm_description}\n\nTTS Error: {audio_path}"
|
| 336 |
+
)
|
| 337 |
+
except Exception as tts_error:
|
| 338 |
+
print(f"TTS Exception: {str(tts_error)}")
|
| 339 |
+
# Return error but continue with other outputs
|
| 340 |
+
return (
|
| 341 |
+
product_info["product_name"],
|
| 342 |
+
product_info["category"],
|
| 343 |
+
product_info["description"],
|
| 344 |
+
None, # No audio
|
| 345 |
+
f"{vlm_description}\n\nTTS Exception: {str(tts_error)}"
|
| 346 |
+
)
|
| 347 |
|
| 348 |
return (
|
| 349 |
product_info["product_name"],
|
|
|
|
| 353 |
vlm_description # Return the raw VLM description for debugging
|
| 354 |
)
|
| 355 |
except Exception as e:
|
| 356 |
+
print(f"Process Image URL Error: {str(e)}")
|
| 357 |
return f"Error: {str(e)}", "Error", "Error processing image URL", None, str(e)
|
| 358 |
|
| 359 |
+
# Test TTS API directly
|
| 360 |
+
def test_tts_api():
|
| 361 |
+
try:
|
| 362 |
+
sample_text = "This is a test of the text to speech API."
|
| 363 |
+
result = tts_client.predict(
|
| 364 |
+
password=TTS_PASSWORD,
|
| 365 |
+
prompt=sample_text,
|
| 366 |
+
voice="nova",
|
| 367 |
+
emotion=DEFAULT_TTS_EMOTION,
|
| 368 |
+
use_random_seed=True,
|
| 369 |
+
specific_seed=random.randint(1, 100000),
|
| 370 |
+
api_name="/text_to_speech_app"
|
| 371 |
+
)
|
| 372 |
+
|
| 373 |
+
# Print detailed information about the result
|
| 374 |
+
result_type = type(result).__name__
|
| 375 |
+
result_info = f"Result type: {result_type}"
|
| 376 |
+
|
| 377 |
+
if isinstance(result, tuple):
|
| 378 |
+
result_info += f"\nTuple length: {len(result)}"
|
| 379 |
+
for i, item in enumerate(result):
|
| 380 |
+
result_info += f"\n\nItem {i} type: {type(item).__name__}"
|
| 381 |
+
if isinstance(item, str):
|
| 382 |
+
result_info += f"\nItem {i} string value: {item[:500]}..."
|
| 383 |
+
# Check if it's a file path
|
| 384 |
+
if os.path.exists(item):
|
| 385 |
+
result_info += f"\nItem {i} is an existing file path, size: {os.path.getsize(item)} bytes"
|
| 386 |
+
else:
|
| 387 |
+
result_info += f"\nItem {i} value: {str(item)[:500]}..."
|
| 388 |
+
elif isinstance(result, str):
|
| 389 |
+
result_info += f"\nResult string length: {len(result)}"
|
| 390 |
+
result_info += f"\nResult starts with: {result[:100]}..."
|
| 391 |
+
|
| 392 |
+
# Check if it's a file path
|
| 393 |
+
if os.path.exists(result):
|
| 394 |
+
result_info += f"\nResult is an existing file path, size: {os.path.getsize(result)} bytes"
|
| 395 |
+
|
| 396 |
+
return f"TTS API Test Successful\n{result_info}"
|
| 397 |
+
except Exception as e:
|
| 398 |
+
return f"TTS API Test Failed: {str(e)}"
|
| 399 |
+
|
| 400 |
# Show API status in the interface
|
| 401 |
def get_api_status():
|
| 402 |
status_text = f"OpenRouter API Keys: {len(OPENROUTER_API_KEYS)} configured\n"
|
|
|
|
| 474 |
inputs=[url_input, url_model_dropdown, url_language],
|
| 475 |
outputs=[url_name_output, url_category_output, url_description_output, url_audio_output, url_vlm_raw_output]
|
| 476 |
)
|
| 477 |
+
|
| 478 |
+
with gr.TabItem("Debug Tools"):
|
| 479 |
+
gr.Markdown("## Debug Tools")
|
| 480 |
+
gr.Markdown("Use these tools to test the API connections and diagnose issues.")
|
| 481 |
+
|
| 482 |
+
test_tts_button = gr.Button("Test TTS API")
|
| 483 |
+
tts_test_output = gr.Textbox(label="TTS API Test Results", lines=10)
|
| 484 |
+
|
| 485 |
+
test_tts_button.click(
|
| 486 |
+
fn=test_tts_api,
|
| 487 |
+
inputs=[],
|
| 488 |
+
outputs=[tts_test_output]
|
| 489 |
+
)
|
| 490 |
|
| 491 |
# Launch the application
|
| 492 |
if __name__ == "__main__":
|