TTS_API_Image_fallback

Sleeping

App Files Files Community

khurrameycon committed on Apr 6, 2025

Commit

b2195c3

verified ·

1 Parent(s): 63d6fee

Update app.py

Browse files

Files changed (1) hide show

app.py +169 -71

app.py CHANGED Viewed

@@ -94,43 +94,68 @@
 #     return Response("No audio generated", status_code=400)
 import os
-import logging
 import base64
 from typing import Optional
-from fastapi import FastAPI, HTTPException
-from fastapi.responses import JSONResponse
-from pydantic import BaseModel
-from huggingface_hub import InferenceClient
-from requests.exceptions import HTTPError
 import uuid
 # Set up logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 # Initialize FastAPI app
 app = FastAPI(
-    title="LLM Chat API",
-    description="API for getting chat responses from Llama model (supports text and image input)",
     version="1.0.0"
 )
-# Directory to save images
-STATIC_DIR = "static_images"
-if not os.path.exists(STATIC_DIR):
-    os.makedirs(STATIC_DIR)
-# Pydantic models
-class ChatRequest(BaseModel):
-    text: str
-    image_url: Optional[str] = None  # In this updated version, this field is expected to be a base64 encoded image
-class ChatResponse(BaseModel):
-    response: str
-    status: str
-def llm_chat_response(text: str, image_base64: Optional[str] = None) -> str:
     try:
         HF_TOKEN = os.getenv("HF_TOKEN")
         logger.info("Checking HF_TOKEN...")
@@ -140,41 +165,32 @@ def llm_chat_response(text: str, image_base64: Optional[str] = None) -> str:
         logger.info("Initializing InferenceClient...")
         client = InferenceClient(
-            provider="hf-inference",  # Updated provider
             api_key=HF_TOKEN
         )
-        # Build the messages payload.
-        # For text-only queries, append a default instruction.
         message_content = [{
             "type": "text",
             "text": text + ("" if image_base64 else " describe in one line only")
         }]
         if image_base64:
-            logger.info("Saving base64 encoded image to file...")
-            # Decode and save the image locally
-            filename = f"{uuid.uuid4()}.jpg"
-            image_path = os.path.join(STATIC_DIR, filename)
-            try:
-                image_data = base64.b64decode(image_base64)
-            except Exception as e:
-                logger.error(f"Error decoding image: {str(e)}")
-                raise HTTPException(status_code=400, detail="Invalid base64 image data")
-            with open(image_path, "wb") as f:
-                f.write(image_data)
-            # Construct public URL for the saved image.
-            # Set BASE_URL to your public URL if needed.
-            base_url = os.getenv("BASE_URL", "http://localhost:8000")
-            public_image_url = f"{base_url}/{STATIC_DIR}/{filename}"
-            logger.info(f"Using saved image URL: {public_image_url}")
             message_content.append({
                 "type": "image_url",
-                "image_url": {"url": public_image_url}
             })
         messages = [{
             "role": "user",
             "content": message_content
@@ -187,23 +203,19 @@ def llm_chat_response(text: str, image_base64: Optional[str] = None) -> str:
                 messages=messages,
                 max_tokens=500
             )
-        except HTTPError as http_err:
-            logger.error(f"HTTP error occurred: {http_err.response.text}")
-            raise HTTPException(status_code=500, detail=http_err.response.text)
-        logger.info(f"Raw model response: {completion}")
-        if getattr(completion, "error", None):
-            error_details = completion.error
-            error_message = error_details.get("message", "Unknown error")
-            logger.error(f"Model returned error: {error_message}")
-            raise HTTPException(status_code=500, detail=f"Model returned error: {error_message}")
         if not completion.choices or len(completion.choices) == 0:
             logger.error("No choices returned from model.")
             raise HTTPException(status_code=500, detail="Model returned no choices.")
-        # Extract the response message from the first choice.
         choice = completion.choices[0]
         response_message = None
         if hasattr(choice, "message"):
@@ -226,35 +238,122 @@ def llm_chat_response(text: str, image_base64: Optional[str] = None) -> str:
             raise HTTPException(status_code=500, detail="Model message did not include content.")
         return content
     except Exception as e:
         logger.error(f"Error in llm_chat_response: {str(e)}")
-        raise HTTPException(status_code=500, detail=str(e))
-@app.post("/chat", response_model=ChatResponse)
-async def chat(request: ChatRequest):
     try:
-        logger.info(f"Received chat request with text: {request.text}")
-        if request.image_url:
-            logger.info("Image data provided.")
-        response = llm_chat_response(request.text, request.image_url)
-        return ChatResponse(response=response, status="success")
-    except HTTPException as he:
-        logger.error(f"HTTP Exception in chat endpoint: {str(he)}")
-        raise he
     except Exception as e:
-        logger.error(f"Unexpected error in chat endpoint: {str(e)}")
-        raise HTTPException(status_code=500, detail=str(e))
 @app.get("/")
 async def root():
-    return {"message": "Welcome to the LLM Chat API. Use POST /chat endpoint with 'text' and optionally 'image_url' (base64 encoded) for queries."}
 @app.exception_handler(404)
 async def not_found_handler(request, exc):
     return JSONResponse(
         status_code=404,
-        content={"error": "Endpoint not found. Please use POST /chat for queries."}
     )
 @app.exception_handler(405)
@@ -262,5 +361,4 @@ async def method_not_allowed_handler(request, exc):
     return JSONResponse(
         status_code=405,
         content={"error": "Method not allowed. Please check the API documentation."}
-    )

 #     return Response("No audio generated", status_code=400)
+from fastapi import FastAPI, Response, HTTPException
+from fastapi.responses import FileResponse, JSONResponse
+from kokoro import KPipeline
+import soundfile as sf
 import os
+import numpy as np
+import torch
+from huggingface_hub import InferenceClient
+from pydantic import BaseModel
 import base64
+from io import BytesIO
+from PIL import Image
+import logging
 from typing import Optional
 import uuid
+import pathlib
 # Set up logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
+# Create a directory for temporary image storage
+TEMP_DIR = pathlib.Path("./temp_images")
+TEMP_DIR.mkdir(exist_ok=True)
+class TextImageRequest(BaseModel):
+    text: Optional[str] = None
+    image_base64: Optional[str] = None
+    voice: str = "af_heart"
+    speed: float = 1.0
+class AudioResponse(BaseModel):
+    status: str
+    message: str
 # Initialize FastAPI app
 app = FastAPI(
+    title="Text-to-Speech API with Vision Support",
+    description="API for generating speech from text with optional image analysis",
     version="1.0.0"
 )
+def save_base64_image(image_base64):
+    """Save base64 image to a temporary file and return the file path"""
+    try:
+        # Generate a unique filename
+        filename = f"{uuid.uuid4()}.jpg"
+        filepath = TEMP_DIR / filename
+        # Decode and save the image
+        image_data = base64.b64decode(image_base64)
+        with open(filepath, "wb") as f:
+            f.write(image_data)
+        # Return the file URL (using file:// protocol)
+        return f"file://{filepath.absolute()}"
+    except Exception as e:
+        logger.error(f"Error saving base64 image: {str(e)}")
+        raise HTTPException(status_code=400, detail=f"Invalid base64 image data: {str(e)}")
+def llm_chat_response(text, image_base64=None):
+    """Function to get responses from LLM with text and optionally image input."""
     try:
         HF_TOKEN = os.getenv("HF_TOKEN")
         logger.info("Checking HF_TOKEN...")
         logger.info("Initializing InferenceClient...")
         client = InferenceClient(
+            provider="sambanova",  # Using sambanova as in your working example
             api_key=HF_TOKEN
         )
+        # Build the messages payload using the format from your working example
         message_content = [{
             "type": "text",
             "text": text + ("" if image_base64 else " describe in one line only")
         }]
         if image_base64:
+            logger.info("Processing base64 image...")
+            # Save the base64 image to a file and get the file URL
+            image_url = save_base64_image(image_base64)
+            logger.info(f"Image saved at: {image_url}")
+            # Create data URI
+            data_uri = f"data:image/jpeg;base64,{image_base64}"
+            # Add image to message content
             message_content.append({
                 "type": "image_url",
+                "image_url": {"url": data_uri}
             })
+        # Construct the messages array exactly as in your working example
         messages = [{
             "role": "user",
             "content": message_content
                 messages=messages,
                 max_tokens=500
             )
+        except Exception as http_err:
+            # Log HTTP errors from the request
+            logger.error(f"HTTP error occurred: {str(http_err)}")
+            raise HTTPException(status_code=500, detail=str(http_err))
+        logger.info(f"Raw model response received")
+        # Extract the response using the same method as your working code
         if not completion.choices or len(completion.choices) == 0:
             logger.error("No choices returned from model.")
             raise HTTPException(status_code=500, detail="Model returned no choices.")
+        # Extract the response message from the first choice
         choice = completion.choices[0]
         response_message = None
         if hasattr(choice, "message"):
             raise HTTPException(status_code=500, detail="Model message did not include content.")
         return content
     except Exception as e:
         logger.error(f"Error in llm_chat_response: {str(e)}")
+        # Fallback response in case of error
+        return "I couldn't process that input. Please try again with a different image or text query."
+# Initialize pipeline once at startup
+try:
+    logger.info("Initializing KPipeline...")
+    pipeline = KPipeline(lang_code='a')
+    logger.info("KPipeline initialized successfully")
+except Exception as e:
+    logger.error(f"Failed to initialize KPipeline: {str(e)}")
+    # We'll let the app start anyway, but log the error
+@app.post("/generate")
+async def generate_audio(request: TextImageRequest):
+    """
+    Generate audio from text and optionally analyze an image.
+    - If text is provided, uses that as input
+    - If image is provided, analyzes the image
+    - Converts the LLM response to speech using the specified voice and speed
+    """
     try:
+        logger.info(f"Received audio generation request")
+        # If no text is provided but image is provided, use default prompt
+        user_text = request.text if request.text is not None else ""
+        if not user_text and request.image_base64:
+            user_text = "Describe what you see in the image"
+        elif not user_text and not request.image_base64:
+            logger.error("Neither text nor image provided in request")
+            return JSONResponse(
+                status_code=400,
+                content={"error": "Request must include either text or image_base64"}
+            )
+        # Generate response using text and image if provided
+        logger.info("Getting LLM response...")
+        text_reply = llm_chat_response(user_text, request.image_base64)
+        logger.info(f"LLM response: {text_reply}")
+        # Generate audio
+        logger.info(f"Generating audio using voice={request.voice}, speed={request.speed}")
+        try:
+            generator = pipeline(
+                text_reply,
+                voice=request.voice,
+                speed=request.speed,
+                split_pattern=r'\n+'
+            )
+            # Process only the first segment for demo
+            for i, (gs, ps, audio) in enumerate(generator):
+                logger.info(f"Audio generated successfully: segment {i}")
+                # Convert PyTorch tensor to NumPy array
+                audio_numpy = audio.cpu().numpy()
+                # Convert to 16-bit PCM
+                # Ensure the audio is in the range [-1, 1]
+                audio_numpy = np.clip(audio_numpy, -1, 1)
+                # Convert to 16-bit signed integers
+                pcm_data = (audio_numpy * 32767).astype(np.int16)
+                # Convert to bytes (automatically uses row-major order)
+                raw_audio = pcm_data.tobytes()
+                # Return PCM data with minimal necessary headers
+                return Response(
+                    content=raw_audio,
+                    media_type="application/octet-stream",
+                    headers={
+                        "Content-Disposition": f'attachment; filename="output.pcm"',
+                        "X-Sample-Rate": "24000",
+                        "X-Bits-Per-Sample": "16",
+                        "X-Endianness": "little"
+                    }
+                )
+            logger.error("No audio segments generated")
+            return JSONResponse(
+                status_code=400,
+                content={"error": "No audio generated", "detail": "The pipeline did not produce any audio"}
+            )
+        except Exception as e:
+            logger.error(f"Error generating audio: {str(e)}")
+            return JSONResponse(
+                status_code=500,
+                content={"error": "Audio generation failed", "detail": str(e)}
+            )
     except Exception as e:
+        logger.error(f"Unexpected error in generate_audio endpoint: {str(e)}")
+        return JSONResponse(
+            status_code=500,
+            content={"error": "Internal server error", "detail": str(e)}
+        )
 @app.get("/")
 async def root():
+    return {"message": "Welcome to the Text-to-Speech API with Vision Support. Use POST /generate endpoint with 'text' and optionally 'image_base64' for queries."}
+# Cleanup function to periodically remove old temporary images
+@app.on_event("startup")
+async def startup_event():
+    # You could add scheduled tasks here to clean up old images
+    pass
 @app.exception_handler(404)
 async def not_found_handler(request, exc):
     return JSONResponse(
         status_code=404,
+        content={"error": "Endpoint not found. Please use POST /generate for queries."}
     )
 @app.exception_handler(405)
     return JSONResponse(
         status_code=405,
         content={"error": "Method not allowed. Please check the API documentation."}
+    )