Update app.py

app.py CHANGED
@@ -94,20 +94,19 @@
 
 # return Response("No audio generated", status_code=400)
 
-from fastapi import FastAPI, Response, HTTPException
-from fastapi.responses import
+from fastapi import FastAPI, Response, HTTPException, Request
+from fastapi.responses import JSONResponse
+from fastapi.staticfiles import StaticFiles
 from kokoro import KPipeline
-import soundfile as sf
 import os
 import numpy as np
 import torch
 from huggingface_hub import InferenceClient
 from pydantic import BaseModel
 import base64
-from io import BytesIO
-from PIL import Image
 import logging
 from typing import Optional
+import uuid
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
@@ -118,14 +117,13 @@ class TextImageRequest(BaseModel):
     image_base64: Optional[str] = None
     voice: str = "af_heart"  # Default voice that we know exists
     speed: float = 1.0
-
+
     # List of known available voices - update this based on what's actually available
     AVAILABLE_VOICES = ["af_heart"]  # Add more voices as they become available
-
-    # Validate that the voice exists
+
    def validate_voice(self):
         if self.voice not in self.AVAILABLE_VOICES:
-            return "af_heart"  # Default to a
+            return "af_heart"  # Default to a known available voice
         return self.voice
 
 class AudioResponse(BaseModel):
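
A quick sketch of what the fallback above does in practice (hypothetical request; "af_bella" stands in for any voice missing from AVAILABLE_VOICES):

# Hypothetical usage of the model defined above.
req = TextImageRequest(text="hello", voice="af_bella")  # not in AVAILABLE_VOICES
assert req.validate_voice() == "af_heart"               # silently falls back
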
@@ -143,8 +141,14 @@ app = FastAPI(
     version="1.0.0"
 )
 
+# Create and mount static images directory so images are accessible via URL
+STATIC_DIR = "static_images"
+if not os.path.exists(STATIC_DIR):
+    os.makedirs(STATIC_DIR)
+app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")
+
 def llm_chat_response(text, image_base64=None):
-    """
+    """Get responses from LLM with text and optionally an image input."""
     try:
         HF_TOKEN = os.getenv("HF_TOKEN")
         logger.info("Checking HF_TOKEN...")
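
One caveat worth keeping in mind: the mounted /static route only helps if the URL later handed to the inference provider is publicly reachable. A sketch of that assumption (BASE_URL is read in llm_chat_response below; the hostname is hypothetical):

import os

# On a deployed Space this should be the public host, e.g. https://your-space.hf.space
# (hypothetical value); with the default http://localhost:8000 an external
# inference provider cannot fetch the uploaded image.
base_url = os.getenv("BASE_URL", "http://localhost:8000")
print(f"{base_url}/static/<uuid>.jpg")  # shape of the URLs served by app.mount above
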
@@ -154,98 +158,92 @@ def llm_chat_response(text, image_base64=None):
 
         logger.info("Initializing InferenceClient...")
         client = InferenceClient(
-            provider="
+            provider="hf-inference",  # Using correct provider as per sample
             api_key=HF_TOKEN
         )
 
-
-
-
-
-
-
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "text",
-                        "text": prompt
-                    },
-                    {
-                        "type": "image_url",
-                        "image_url": {
-                            "url": f"data:image/jpeg;base64,{image_base64}"
-                        }
-                    }
-                ]
-            }
-        ]
-        else:
-            logger.info("Processing text-only request")
-            messages = [
-                {
-                    "role": "user",
-                    "content": text + " Describe in one line only."
-                }
-            ]
-
-        logger.info("Sending request to model...")
-        # Log the exact message structure we're sending
-        logger.info(f"Message structure: {messages}")
-
-        # Use the exact model name and parameters from the sample
-        completion = client.chat.completions.create(
-            model="meta-llama/Llama-3.2-11B-Vision-Instruct",
-            messages=messages,
-            max_tokens=500
-        )
-
-        logger.info(f"Received response from model")
-
-        # Very simple response handling exactly like the sample code
-        logger.info(f"Model response received: {completion}")
-
-        try:
-
-            response = completion.choices[0].message.content
-            logger.info(f"Extracted response content: {response}")
-            return response
-        except Exception as e:
-            logger.error(f"Error
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        except Exception as e:
-            logger.error(f"Error
-
-
+        if image_base64:
+            logger.info("Processing request with image")
+            # Save the base64 image to the static folder
+            filename = f"{uuid.uuid4()}.jpg"
+            image_path = os.path.join(STATIC_DIR, filename)
+            try:
+                image_data = base64.b64decode(image_base64)
+            except Exception as e:
+                logger.error(f"Error decoding base64 image: {str(e)}")
+                raise HTTPException(status_code=400, detail="Invalid base64 image data")
+            with open(image_path, "wb") as f:
+                f.write(image_data)
+            # Construct image URL (assumes BASE_URL environment variable or defaults to localhost)
+            base_url = os.getenv("BASE_URL", "http://localhost:8000")
+            image_url = f"{base_url}/static/{filename}"
+            prompt = text if text else "Describe this image in one sentence."
+            messages = [
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": prompt
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": image_url
+                            }
+                        }
+                    ]
+                }
+            ]
+        else:
+            logger.info("Processing text-only request")
+            messages = [
+                {
+                    "role": "user",
+                    "content": text + " Describe in one line only."
+                }
+            ]
+
+        logger.info("Sending request to model...")
+        logger.info(f"Message structure: {messages}")
+
+        completion = client.chat.completions.create(
+            model="meta-llama/Llama-3.2-11B-Vision-Instruct",
+            messages=messages,
+            max_tokens=500
+        )
+
+        logger.info("Received response from model")
+        logger.info(f"Model response received: {completion}")
+
+        try:
+            response = completion.choices[0].message.content
+            logger.info(f"Extracted response content: {response}")
+            return response
+        except Exception as e:
+            logger.error(f"Error extracting message content: {str(e)}")
+            try:
+                if hasattr(completion.choices[0], "message") and hasattr(completion.choices[0].message, "content"):
+                    return completion.choices[0].message.content
+                return completion.choices[0]["message"]["content"]
+            except Exception as e2:
+                logger.error(f"All extraction methods failed: {str(e2)}")
+                return "I couldn't process that input. Please try again with a different query."
 
     except Exception as e:
         logger.error(f"Error in llm_chat_response: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))
 
-# Initialize pipeline once at startup
+# Initialize the audio generation pipeline once at startup
 try:
     logger.info("Initializing KPipeline...")
     pipeline = KPipeline(lang_code='a')
     logger.info("KPipeline initialized successfully")
 except Exception as e:
     logger.error(f"Failed to initialize KPipeline: {str(e)}")
-    #
+    # The app starts regardless but logs the error
 
-@app.post("/generate",
+@app.post("/generate", responses={
     200: {"content": {"application/octet-stream": {}}},
     400: {"model": ErrorResponse},
     500: {"model": ErrorResponse}
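
For reference, the client side has to produce the image_base64 payload that llm_chat_response decodes above; a minimal sketch using only the standard library (the file name is hypothetical):

import base64

with open("photo.jpg", "rb") as f:  # hypothetical input file
    image_base64 = base64.b64encode(f.read()).decode("utf-8")
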
@@ -254,14 +252,12 @@ async def generate_audio(request: TextImageRequest):
     """
     Generate audio from text and optionally analyze an image.
 
-    - If text is provided,
-    - If image is provided,
-    -
+    - If text is provided, it is used as input.
+    - If an image is provided (base64), it is saved and a URL is generated for processing.
+    - The LLM response is then converted to speech.
     """
     try:
-        logger.info(
-
-        # If no text is provided but image is provided, use default prompt
+        logger.info("Received audio generation request")
         user_text = request.text if request.text is not None else ""
         if not user_text and request.image_base64:
             user_text = "Describe what you see in the image"
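
An end-to-end sketch of calling this endpoint, assuming the third-party requests package and a server on localhost:8000 (not part of the commit):

import requests

payload = {
    "text": "What is in this picture?",
    "image_base64": image_base64,  # from the encoding sketch above; omit for text-only
    "voice": "af_heart",
    "speed": 1.0,
}
resp = requests.post("http://localhost:8000/generate", json=payload)
resp.raise_for_status()
raw_pcm = resp.content  # raw 16-bit PCM; see the response headers in the hunk below
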
@@ -272,17 +268,14 @@
                 content={"error": "Request must include either text or image_base64"}
             )
 
-        # Generate response using text and image if provided
         logger.info("Getting LLM response...")
         text_reply = llm_chat_response(user_text, request.image_base64)
         logger.info(f"LLM response: {text_reply}")
 
-        # Validate voice parameter
         validated_voice = request.validate_voice()
         if validated_voice != request.voice:
             logger.warning(f"Requested voice '{request.voice}' not available, using '{validated_voice}' instead")
 
-        # Generate audio
         logger.info(f"Generating audio using voice={validated_voice}, speed={request.speed}")
         try:
             generator = pipeline(
@@ -292,28 +285,20 @@
                 split_pattern=r'\n+'
             )
 
-            # Process only the first segment for demo
             for i, (gs, ps, audio) in enumerate(generator):
                 logger.info(f"Audio generated successfully: segment {i}")
-
                 # Convert PyTorch tensor to NumPy array
                 audio_numpy = audio.cpu().numpy()
-
-                # Convert to 16-bit PCM
-                # Ensure the audio is in the range [-1, 1]
+                # Clip values to range [-1, 1] and convert to 16-bit PCM
                 audio_numpy = np.clip(audio_numpy, -1, 1)
-                # Convert to 16-bit signed integers
                 pcm_data = (audio_numpy * 32767).astype(np.int16)
-
-                # Convert to bytes (automatically uses row-major order)
                 raw_audio = pcm_data.tobytes()
 
-                # Return PCM data with minimal necessary headers
                 return Response(
                     content=raw_audio,
                     media_type="application/octet-stream",
                     headers={
-                        "Content-Disposition":
+                        "Content-Disposition": 'attachment; filename="output.pcm"',
                         "X-Sample-Rate": "24000",
                         "X-Bits-Per-Sample": "16",
                         "X-Endianness": "little"
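
Because the endpoint returns headerless PCM, the client has to apply the X-* headers itself. A sketch that wraps the bytes in a WAV container using only the standard library (mono is an assumption; the API does not state a channel count):

import wave

with wave.open("output.wav", "wb") as wav:
    wav.setnchannels(1)       # assumed mono
    wav.setsampwidth(2)       # X-Bits-Per-Sample: 16
    wav.setframerate(24000)   # X-Sample-Rate: 24000
    wav.writeframes(raw_pcm)  # int16 samples are little-endian, matching X-Endianness
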
@@ -342,18 +327,18 @@
 
 @app.get("/")
 async def root():
-    return {"message": "Welcome to the Text-to-Speech API with Vision Support. Use POST /generate
+    return {"message": "Welcome to the Text-to-Speech API with Vision Support. Use POST /generate with 'text' and optionally 'image_base64' for queries."}
 
 @app.exception_handler(404)
-async def not_found_handler(request, exc):
+async def not_found_handler(request: Request, exc):
     return JSONResponse(
         status_code=404,
         content={"error": "Endpoint not found. Please use POST /generate for queries."}
     )
 
 @app.exception_handler(405)
-async def method_not_allowed_handler(request, exc):
+async def method_not_allowed_handler(request: Request, exc):
     return JSONResponse(
         status_code=405,
         content={"error": "Method not allowed. Please check the API documentation."}
-)
+    )
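
The custom handlers can be smoke-tested the same way (again assuming the requests package and a server on localhost:8000):

import requests

r = requests.get("http://localhost:8000/nonexistent")  # hypothetical path
assert r.status_code == 404
assert r.json()["error"].startswith("Endpoint not found")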