Spaces:

YoussefA7med
/

Goods_Detector_VLM

Runtime error

App Files Community

YoussefA7med committed on Jun 17, 2025

Commit

5f2ffce

verified ·

1 Parent(s): 0cdf4c6

Update app.py

Browse files

Files changed (1) hide show

app.py +160 -118

app.py CHANGED Viewed

@@ -8,18 +8,6 @@ from PIL import Image
 from dotenv import load_dotenv
 import gradio as gr
 from gradio_client import Client
-import logging
-# Configure logging
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(levelname)s - %(message)s',
-    handlers=[
-        logging.FileHandler('app.log'),
-        logging.StreamHandler()
-    ]
-)
-logger = logging.getLogger(__name__)
 # Load environment variables
 load_dotenv()
@@ -100,7 +88,7 @@ def img_detector(model, image_url):
                             "content": [
                                 {
                                     "type": "text",
-                                    "text": "What is the product in this image? Please provide a very detailed description."
                                 },
                                 {
                                     "type": "image_url",
@@ -111,7 +99,8 @@ def img_detector(model, image_url):
                             ]
                         }
                     ]
-                })
             )
             # Check if the request was successful
@@ -168,7 +157,8 @@ def extract_product_info(vlm_description, lang):
                 "temperature": random.uniform(0.9, 1),
                 "max_tokens": 1000,
                 "response_format": {"type": "json_object"}
-            }
         )
         result = response.json()["choices"][0]["message"]["content"]
@@ -188,124 +178,78 @@ def contains_arabic(text):
 # Function to generate audio from text
 def text_to_speech(message: str, language: str) -> str:
-    logger.info(f"Starting TTS for message length: {len(message)}, language: {language}")
     clean_text = re.sub(r'<[^>]+>', '', message)
     clean_text = clean_text.lstrip().replace("\n", " ")
     if len(clean_text) > 500:
         clean_text = clean_text[:500] + "..."
-        logger.info("Text truncated to 500 characters")
-    logger.info(f"Clean text for TTS: '{clean_text}'")
     filename = f"audio/audio_{uuid.uuid4().hex}.mp3"
-    logger.info(f"Target audio filename: {filename}")
     # Determine if text contains Arabic
     is_arabic = contains_arabic(clean_text)
-    logger.info(f"Text contains Arabic: {is_arabic}")
-    # Use a simpler emotion for better compatibility
-    emotion = "neutral" if not is_arabic else "neutral, speaking in Arabic"
-    logger.info(f"TTS emotion: {emotion}")
-    # Try multiple voice options and simpler parameters
-    voice_options = ["nova", "alloy", "echo", "fable", "onyx", "shimmer"]
-    for voice in voice_options:
-        try:
-            # Log TTS API call parameters
-            logger.info(f"Trying TTS API with voice: {voice}")
-            logger.info("Calling TTS API with parameters:")
-            logger.info(f"  - password: {'*' * len(TTS_PASSWORD) if TTS_PASSWORD else 'None'}")
-            logger.info(f"  - prompt: '{clean_text}'")
-            logger.info(f"  - voice: {voice}")
-            logger.info(f"  - emotion: {emotion}")
-            logger.info(f"  - use_random_seed: True")
-            logger.info(f"  - specific_seed: 12345")
-            # Call the TTS API with simpler parameters
-            result = tts_client.predict(
-                password=TTS_PASSWORD,
-                prompt=clean_text,
-                voice=voice,
-                emotion=emotion,
-                use_random_seed=True,
-                specific_seed=12345,
-                api_name="/text_to_speech_app"
-            )
-            # Log detailed result information
-            logger.info(f"TTS API result type: {type(result)}")
-            logger.info(f"TTS API result: {result}")
-            # Handle different response types
-            if isinstance(result, tuple):
-                logger.info(f"Result is tuple with {len(result)} items")
-                # Check if this is an error response
-                if len(result) == 2 and result[0] is None and isinstance(result[1], str):
-                    if "error" in result[1].lower() or "try again" in result[1].lower():
-                        logger.warning(f"TTS API returned error with voice {voice}: {result[1]}")
-                        if voice != voice_options[-1]:  # Not the last voice to try
-                            logger.info(f"Trying next voice option...")
-                            continue
-                        else:
-                            logger.error("All voice options failed")
-                            raise Exception(f"TTS API failed with all voices. Last error: {result[1]}")
-                for i, item in enumerate(result):
-                    logger.info(f"  Tuple item {i}: type={type(item)}, value={item}")
-                    if isinstance(item, str):
-                        if item.startswith('http'):
-                            logger.info(f"Item {i} is a URL, attempting to download...")
-                            try:
-                                response = requests.get(item)
-                                if response.status_code == 200:
-                                    with open(filename, 'wb') as f:
-                                        f.write(response.content)
-                                    logger.info(f"Successfully downloaded audio to {filename}")
-                                    return filename
-                            except Exception as e:
-                                logger.error(f"Failed to download from URL {item}: {str(e)}")
-                                continue
-            # If result is a direct URL string
-            if isinstance(result, str) and result.startswith('http'):
-                logger.info("Result is a direct URL, attempting to download...")
-                try:
-                    response = requests.get(result)
-                    if response.status_code == 200:
-                        with open(filename, 'wb') as f:
-                            f.write(response.content)
-                        logger.info(f"Successfully downloaded audio to {filename}")
                         return filename
-                except Exception as e:
-                    logger.error(f"Failed to download from URL {result}: {str(e)}")
-                    if voice != voice_options[-1]:
-                        continue
-                    else:
-                        raise
-            logger.error(f"Unexpected result format from TTS API with voice {voice}")
-            if voice != voice_options[-1]:
-                continue
-            else:
-                raise Exception("Unexpected result format from TTS API with all voices")
-        except Exception as e:
-            logger.error(f"Error with voice {voice}: {str(e)}")
-            if voice != voice_options[-1]:
-                logger.info("Trying next voice option...")
-                continue
             else:
-                logger.error("All voice options failed")
-                raise Exception(f"TTS failed with all voices. Last error: {str(e)}")
-    logger.error("No successful TTS conversion with any voice option")
-    return f"Text-to-speech error: Failed with all voice options"
 # Function to upload image and get base64 URL
 def upload_image_and_get_url(image_path):
@@ -330,7 +274,28 @@ def process_image(image_path, model_name, language):
         product_info = extract_product_info(vlm_description, language)
         # Generate audio for the description
-        audio_path = text_to_speech(product_info["description"], language)
         return (
             product_info["product_name"],
@@ -340,6 +305,7 @@ def process_image(image_path, model_name, language):
             vlm_description  # Return the raw VLM description for debugging
         )
     except Exception as e:
         return f"Error: {str(e)}", "Error", "Error processing image", None, str(e)
 # Process image from URL
@@ -356,7 +322,28 @@ def process_image_url(image_url, model_name, language):
         product_info = extract_product_info(vlm_description, language)
         # Generate audio for the description
-        audio_path = text_to_speech(product_info["description"], language)
         return (
             product_info["product_name"],
@@ -366,8 +353,50 @@ def process_image_url(image_url, model_name, language):
             vlm_description  # Return the raw VLM description for debugging
         )
     except Exception as e:
         return f"Error: {str(e)}", "Error", "Error processing image URL", None, str(e)
 # Show API status in the interface
 def get_api_status():
     status_text = f"OpenRouter API Keys: {len(OPENROUTER_API_KEYS)} configured\n"
@@ -445,6 +474,19 @@ with gr.Blocks(title="AI Product Description Generator") as demo:
                 inputs=[url_input, url_model_dropdown, url_language],
                 outputs=[url_name_output, url_category_output, url_description_output, url_audio_output, url_vlm_raw_output]
             )
 # Launch the application
 if __name__ == "__main__":

 from dotenv import load_dotenv
 import gradio as gr
 from gradio_client import Client
 # Load environment variables
 load_dotenv()
                             "content": [
                                 {
                                     "type": "text",
+                                    "text": "What is the product in this image? Please provide a detailed description."
                                 },
                                 {
                                     "type": "image_url",
                             ]
                         }
                     ]
+                }),
+                timeout=30  # Set a reasonable timeout
             )
             # Check if the request was successful
                 "temperature": random.uniform(0.9, 1),
                 "max_tokens": 1000,
                 "response_format": {"type": "json_object"}
+            },
+            timeout=30  # Set a reasonable timeout
         )
         result = response.json()["choices"][0]["message"]["content"]
 # Function to generate audio from text
 def text_to_speech(message: str, language: str) -> str:
     clean_text = re.sub(r'<[^>]+>', '', message)
     clean_text = clean_text.lstrip().replace("\n", " ")
     if len(clean_text) > 500:
         clean_text = clean_text[:500] + "..."
     filename = f"audio/audio_{uuid.uuid4().hex}.mp3"
     # Determine if text contains Arabic
     is_arabic = contains_arabic(clean_text)
+    # Adjust emotion for Arabic text
+    emotion = DEFAULT_TTS_EMOTION
+    if is_arabic:
+        emotion = emotion + " Speaking in Egyptian Arabic dialect."
+    try:
+        # Call the TTS API
+        result = tts_client.predict(
+            password=TTS_PASSWORD,
+            prompt=clean_text,
+            voice="nova",
+            emotion=emotion,
+            use_random_seed=True,
+            specific_seed=random.randint(1, 100000),
+            api_name="/text_to_speech_app"
+        )
+        # Handle different response types
+        if isinstance(result, tuple):
+            # Check if any item in the tuple is a URL or file path
+            for item in result:
+                if isinstance(item, str):
+                    if item.startswith('http'):
+                        # It's a URL, download it
+                        response = requests.get(item)
+                        if response.status_code == 200:
+                            with open(filename, 'wb') as f:
+                                f.write(response.content)
+                            return filename
+                    elif os.path.exists(item) and os.path.isfile(item):
+                        # It's a file path, copy it
+                        import shutil
+                        shutil.copy(item, filename)
                         return filename
+            # If we got here, we couldn't find a usable audio file in the tuple
+            raise Exception(f"No usable audio found in API response tuple: {result}")
+        elif isinstance(result, str):
+            # Handle string result (URL or file path)
+            if os.path.exists(result):
+                # If result is a file path, copy it to our directory
+                import shutil
+                shutil.copy(result, filename)
             else:
+                # If result is a URL, download it
+                response = requests.get(result)
+                if response.status_code == 200:
+                    with open(filename, 'wb') as f:
+                        f.write(response.content)
+                else:
+                    raise Exception(f"Failed to download audio from URL: {response.status_code}")
+            return filename
+        else:
+            # Unknown result type
+            raise Exception(f"Unexpected result type from TTS API: {type(result).__name__}")
+    except Exception as e:
+        print(f"TTS Error: {str(e)}")
+        return f"Text-to-speech error: {str(e)}"
 # Function to upload image and get base64 URL
 def upload_image_and_get_url(image_path):
         product_info = extract_product_info(vlm_description, language)
         # Generate audio for the description
+        try:
+            audio_path = text_to_speech(product_info["description"], language)
+            if audio_path.startswith("Text-to-speech error"):
+                print(f"TTS Error: {audio_path}")
+                # Return error but continue with other outputs
+                return (
+                    product_info["product_name"],
+                    product_info["category"],
+                    product_info["description"],
+                    None,  # No audio
+                    f"{vlm_description}\n\nTTS Error: {audio_path}"
+                )
+        except Exception as tts_error:
+            print(f"TTS Exception: {str(tts_error)}")
+            # Return error but continue with other outputs
+            return (
+                product_info["product_name"],
+                product_info["category"],
+                product_info["description"],
+                None,  # No audio
+                f"{vlm_description}\n\nTTS Exception: {str(tts_error)}"
+            )
         return (
             product_info["product_name"],
             vlm_description  # Return the raw VLM description for debugging
         )
     except Exception as e:
+        print(f"Process Image Error: {str(e)}")
         return f"Error: {str(e)}", "Error", "Error processing image", None, str(e)
 # Process image from URL
         product_info = extract_product_info(vlm_description, language)
         # Generate audio for the description
+        try:
+            audio_path = text_to_speech(product_info["description"], language)
+            if audio_path.startswith("Text-to-speech error"):
+                print(f"TTS Error: {audio_path}")
+                # Return error but continue with other outputs
+                return (
+                    product_info["product_name"],
+                    product_info["category"],
+                    product_info["description"],
+                    None,  # No audio
+                    f"{vlm_description}\n\nTTS Error: {audio_path}"
+                )
+        except Exception as tts_error:
+            print(f"TTS Exception: {str(tts_error)}")
+            # Return error but continue with other outputs
+            return (
+                product_info["product_name"],
+                product_info["category"],
+                product_info["description"],
+                None,  # No audio
+                f"{vlm_description}\n\nTTS Exception: {str(tts_error)}"
+            )
         return (
             product_info["product_name"],
             vlm_description  # Return the raw VLM description for debugging
         )
     except Exception as e:
+        print(f"Process Image URL Error: {str(e)}")
         return f"Error: {str(e)}", "Error", "Error processing image URL", None, str(e)
+# Test TTS API directly
+def test_tts_api():
+    try:
+        sample_text = "This is a test of the text to speech API."
+        result = tts_client.predict(
+            password=TTS_PASSWORD,
+            prompt=sample_text,
+            voice="nova",
+            emotion=DEFAULT_TTS_EMOTION,
+            use_random_seed=True,
+            specific_seed=random.randint(1, 100000),
+            api_name="/text_to_speech_app"
+        )
+        # Print detailed information about the result
+        result_type = type(result).__name__
+        result_info = f"Result type: {result_type}"
+        if isinstance(result, tuple):
+            result_info += f"\nTuple length: {len(result)}"
+            for i, item in enumerate(result):
+                result_info += f"\n\nItem {i} type: {type(item).__name__}"
+                if isinstance(item, str):
+                    result_info += f"\nItem {i} string value: {item[:500]}..."
+                    # Check if it's a file path
+                    if os.path.exists(item):
+                        result_info += f"\nItem {i} is an existing file path, size: {os.path.getsize(item)} bytes"
+                else:
+                    result_info += f"\nItem {i} value: {str(item)[:500]}..."
+        elif isinstance(result, str):
+            result_info += f"\nResult string length: {len(result)}"
+            result_info += f"\nResult starts with: {result[:100]}..."
+            # Check if it's a file path
+            if os.path.exists(result):
+                result_info += f"\nResult is an existing file path, size: {os.path.getsize(result)} bytes"
+        return f"TTS API Test Successful\n{result_info}"
+    except Exception as e:
+        return f"TTS API Test Failed: {str(e)}"
 # Show API status in the interface
 def get_api_status():
     status_text = f"OpenRouter API Keys: {len(OPENROUTER_API_KEYS)} configured\n"
                 inputs=[url_input, url_model_dropdown, url_language],
                 outputs=[url_name_output, url_category_output, url_description_output, url_audio_output, url_vlm_raw_output]
             )
+        with gr.TabItem("Debug Tools"):
+            gr.Markdown("## Debug Tools")
+            gr.Markdown("Use these tools to test the API connections and diagnose issues.")
+            test_tts_button = gr.Button("Test TTS API")
+            tts_test_output = gr.Textbox(label="TTS API Test Results", lines=10)
+            test_tts_button.click(
+                fn=test_tts_api,
+                inputs=[],
+                outputs=[tts_test_output]
+            )
 # Launch the application
 if __name__ == "__main__":