TTS_API_Image

Sleeping

App Files Community

khurrameycon committed on Apr 6, 2025

Commit

3a240c4

verified ·

1 Parent(s): 5c6448d

Update app.py

Browse files

Files changed (1) hide show

app.py +64 -32

app.py CHANGED Viewed

@@ -116,8 +116,17 @@ logger = logging.getLogger(__name__)
 class TextImageRequest(BaseModel):
     text: Optional[str] = None
     image_base64: Optional[str] = None
-    voice: str = "af_heart"
     speed: float = 1.0
 class AudioResponse(BaseModel):
     status: str
@@ -145,31 +154,47 @@ def llm_chat_response(text, image_base64=None):
         logger.info("Initializing InferenceClient...")
         client = InferenceClient(
-            provider="hf-inference",  # Updated to the provider shown in the sample
             api_key=HF_TOKEN
         )
-        # System message for better context
-        system_message = "You are a helpful assistant that provides concise responses."
         try:
             if image_base64:
                 logger.info("Processing request with image")
                 messages = [
-                    {"role": "system", "content": system_message},
-                    {"role": "user", "content": [
-                        {"type": "text", "text": text if text else "Describe what you see in the image in one line only"},
-                        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}}
-                    ]}
                 ]
             else:
                 logger.info("Processing text-only request")
                 messages = [
-                    {"role": "system", "content": system_message},
-                    {"role": "user", "content": text + " Describe in one line only."}
                 ]
             logger.info("Sending request to model...")
             completion = client.chat.completions.create(
                 model="meta-llama/Llama-3.2-11B-Vision-Instruct",
                 messages=messages,
@@ -178,27 +203,29 @@ def llm_chat_response(text, image_base64=None):
             logger.info(f"Received response from model")
-            # Simplified response handling based on the sample code
-            if not completion.choices or len(completion.choices) == 0:
-                logger.error("No choices returned from model.")
-                raise HTTPException(status_code=500, detail="Model returned no choices.")
-            # Extract the content directly using the expected format
             try:
-                # Get message from first choice
-                message = completion.choices[0].message
-                # Extract content from message
-                if hasattr(message, "content"):
-                    return message.content
-                elif isinstance(message, dict) and "content" in message:
-                    return message["content"]
-                else:
-                    logger.error(f"Unexpected message format: {message}")
-                    raise HTTPException(status_code=500, detail="Unexpected message format from model")
             except Exception as e:
                 logger.error(f"Error extracting message content: {str(e)}")
-                raise HTTPException(status_code=500, detail=f"Failed to extract response content: {str(e)}")
         except Exception as e:
             logger.error(f"Error during model inference: {str(e)}")
@@ -250,12 +277,17 @@ async def generate_audio(request: TextImageRequest):
         text_reply = llm_chat_response(user_text, request.image_base64)
         logger.info(f"LLM response: {text_reply}")
         # Generate audio
-        logger.info(f"Generating audio using voice={request.voice}, speed={request.speed}")
         try:
             generator = pipeline(
                 text_reply,
-                voice=request.voice,
                 speed=request.speed,
                 split_pattern=r'\n+'
             )
@@ -273,7 +305,7 @@ async def generate_audio(request: TextImageRequest):
                 # Convert to 16-bit signed integers
                 pcm_data = (audio_numpy * 32767).astype(np.int16)
-                # Convert to bytes (automatically uses row-major order)
                 raw_audio = pcm_data.tobytes()
                 # Return PCM data with minimal necessary headers

 # Request payload for the audio endpoint: optional prompt text, an optional
 # base64-encoded image, the voice to synthesize with, and a speed value.
 # `text` and `image_base64` both default to None, so either may be omitted.
 class TextImageRequest(BaseModel):
     text: Optional[str] = None
     image_base64: Optional[str] = None
+    voice: str = "af_heart"  # Default voice that we know exists
     speed: float = 1.0
     # NOTE(review): AVAILABLE_VOICES below has no type annotation. Assuming
     # BaseModel is pydantic's, an un-annotated class attribute is either
     # rejected at class-definition time or treated as a model field that a
     # client could override, depending on the pydantic version. Consider
     # annotating it as a ClassVar -- TODO confirm against the installed version.
+    # List of known available voices - update this based on what's actually available
+    AVAILABLE_VOICES = ["af_heart"]  # Add more voices as they become available
     # validate_voice is a plain method with no validator decorator, so it only
     # takes effect where a caller invokes it explicitly. It does not modify
     # self.voice; it returns the voice that should be used.
+    # Validate that the voice exists
+    def validate_voice(self):
         # Unknown voices fall back to the literal "af_heart", which matches the
         # field default above; keep the two in sync if the default changes.
+        if self.voice not in self.AVAILABLE_VOICES:
+            return "af_heart"  # Default to a voice we know exists
+        return self.voice
 class AudioResponse(BaseModel):
     status: str
         logger.info("Initializing InferenceClient...")
         client = InferenceClient(
+            provider="together",  # Using the provider shown in the sample
             api_key=HF_TOKEN
         )
         try:
+            # IMPORTANT: Following exactly the format from the sample code
             if image_base64:
                 logger.info("Processing request with image")
+                prompt = text if text else "Describe this image in one sentence."
                 messages = [
+                    {
+                        "role": "user",
+                        "content": [
+                            {
+                                "type": "text",
+                                "text": prompt
+                            },
+                            {
+                                "type": "image_url",
+                                "image_url": {
+                                    "url": f"data:image/jpeg;base64,{image_base64}"
+                                }
+                            }
+                        ]
+                    }
                 ]
             else:
                 logger.info("Processing text-only request")
                 messages = [
+                    {
+                        "role": "user",
+                        "content": text + " Describe in one line only."
+                    }
                 ]
             logger.info("Sending request to model...")
+            # Log the exact message structure we're sending
+            logger.info(f"Message structure: {messages}")
+            # Use the exact model name and parameters from the sample
             completion = client.chat.completions.create(
                 model="meta-llama/Llama-3.2-11B-Vision-Instruct",
                 messages=messages,
             logger.info(f"Received response from model")
+            # Very simple response handling exactly like the sample code
+            logger.info(f"Model response received: {completion}")
             try:
+                # Extract response using the exact approach from the sample code
+                response = completion.choices[0].message.content
+                logger.info(f"Extracted response content: {response}")
+                return response
             except Exception as e:
                 logger.error(f"Error extracting message content: {str(e)}")
+                logger.error(f"Attempting alternative extraction method...")
+                # Fallback method if the above fails
+                try:
+                    if hasattr(completion.choices[0], "message"):
+                        if hasattr(completion.choices[0].message, "content"):
+                            return completion.choices[0].message.content
+                    # Last resort - try accessing as dictionary
+                    return completion.choices[0]["message"]["content"]
+                except Exception as e2:
+                    logger.error(f"All extraction methods failed: {str(e2)}")
+                    return "I couldn't process that input. Please try again with a different query."
         except Exception as e:
             logger.error(f"Error during model inference: {str(e)}")
         text_reply = llm_chat_response(user_text, request.image_base64)
         logger.info(f"LLM response: {text_reply}")
+        # Validate voice parameter
+        validated_voice = request.validate_voice()
+        if validated_voice != request.voice:
+            logger.warning(f"Requested voice '{request.voice}' not available, using '{validated_voice}' instead")
         # Generate audio
+        logger.info(f"Generating audio using voice={validated_voice}, speed={request.speed}")
         try:
             generator = pipeline(
                 text_reply,
+                voice=validated_voice,
                 speed=request.speed,
                 split_pattern=r'\n+'
             )
                 # Convert to 16-bit signed integers
                 pcm_data = (audio_numpy * 32767).astype(np.int16)
+                # Convert to bytes (automatically uses row-major order)
                 raw_audio = pcm_data.tobytes()
                 # Return PCM data with minimal necessary headers