Update app.py
app.py CHANGED
@@ -95,9 +95,8 @@
 # return Response("No audio generated", status_code=400)


-
-from fastapi import FastAPI, Response
-from fastapi.responses import FileResponse
+from fastapi import FastAPI, Response, HTTPException
+from fastapi.responses import FileResponse, JSONResponse
 from kokoro import KPipeline
 import soundfile as sf
 import os
@@ -108,98 +107,230 @@ from pydantic import BaseModel
 import base64
 from io import BytesIO
 from PIL import Image
+import logging
+from typing import Optional
+
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)

 class TextImageRequest(BaseModel):
-    text: str = None
-    image_base64: str = None
+    text: Optional[str] = None
+    image_base64: Optional[str] = None
     voice: str = "af_heart"
     speed: float = 1.0

+class AudioResponse(BaseModel):
+    status: str
+    message: str
+
+class ErrorResponse(BaseModel):
+    error: str
+    detail: Optional[str] = None
+
+# Initialize FastAPI app
+app = FastAPI(
+    title="Text-to-Speech API with Vision Support",
+    description="API for generating speech from text with optional image analysis",
+    version="1.0.0"
+)
+
 def llm_chat_response(text, image_base64=None):
-
-    client = InferenceClient(api_key=HF_TOKEN)
-
-    # For image + text requests, we need to use the conversational format
-    # with proper message structure
-    system_message = "You are a helpful assistant that provides concise responses."
-
+    """Function to get responses from LLM with text and optionally image input."""
     try:
-
-
-
-
-
-                    {"type": "image", "source": {"data": f"data:image/jpeg;base64,{image_base64}"}}
-                ]}
-            ]
-        else:
-            messages = [
-                {"role": "system", "content": system_message},
-                {"role": "user", "content": text + " Describe in one line only."}
-            ]
+        HF_TOKEN = os.getenv("HF_TOKEN")
+        logger.info("Checking HF_TOKEN...")
+        if not HF_TOKEN:
+            logger.error("HF_TOKEN not found in environment variables")
+            raise HTTPException(status_code=500, detail="HF_TOKEN not configured")

-
-
-
-
-            max_tokens=500
+        logger.info("Initializing InferenceClient...")
+        client = InferenceClient(
+            provider="sambanova",  # Specify provider if needed
+            api_key=HF_TOKEN
         )

-
+        # System message for better context
+        system_message = "You are a helpful assistant that provides concise responses."
+
+        try:
+            if image_base64:
+                logger.info("Processing request with image")
+                messages = [
+                    {"role": "system", "content": system_message},
+                    {"role": "user", "content": [
+                        {"type": "text", "text": text if text else "Describe what you see in the image in one line only"},
+                        {"type": "image", "source": {"data": f"data:image/jpeg;base64,{image_base64}"}}
+                    ]}
+                ]
+            else:
+                logger.info("Processing text-only request")
+                messages = [
+                    {"role": "system", "content": system_message},
+                    {"role": "user", "content": text + " Describe in one line only."}
+                ]
+
+            logger.info("Sending request to model...")
+            completion = client.chat.completions.create(
+                model="meta-llama/Llama-3.2-11B-Vision-Instruct",
+                messages=messages,
+                max_tokens=500
+            )
+
+            logger.info(f"Received response from model")
+
+            # Handle potential different response formats
+            if not completion.choices or len(completion.choices) == 0:
+                logger.error("No choices returned from model.")
+                raise HTTPException(status_code=500, detail="Model returned no choices.")
+
+            # Extract the response message from the first choice
+            choice = completion.choices[0]
+            response_message = None
+
+            if hasattr(choice, "message"):
+                response_message = choice.message
+            elif isinstance(choice, dict):
+                response_message = choice.get("message")
+
+            if not response_message:
+                logger.error(f"Response message is empty: {choice}")
+                raise HTTPException(status_code=500, detail="Model response did not include a message.")
+
+            content = None
+            if isinstance(response_message, dict):
+                content = response_message.get("content")
+            if content is None and hasattr(response_message, "content"):
+                content = response_message.content
+
+            if not content:
+                logger.error(f"Message content is missing: {response_message}")
+                raise HTTPException(status_code=500, detail="Model message did not include content.")
+
+            return content
+
+        except Exception as e:
+            logger.error(f"Error during model inference: {str(e)}")
+            # Fallback response in case of error
+            return "I couldn't process that input. Please try again with a different image or text query."
+
     except Exception as e:
-
-
-        return "I couldn't process that input. Please try again with a different image or text query."
+        logger.error(f"Error in llm_chat_response: {str(e)}")
+        raise HTTPException(status_code=500, detail=str(e))

-app = FastAPI()
 # Initialize pipeline once at startup
-
+try:
+    logger.info("Initializing KPipeline...")
+    pipeline = KPipeline(lang_code='a')
+    logger.info("KPipeline initialized successfully")
+except Exception as e:
+    logger.error(f"Failed to initialize KPipeline: {str(e)}")
+    # We'll let the app start anyway, but log the error

-@app.post("/generate")
+@app.post("/generate", response_model=None, responses={
+    200: {"content": {"application/octet-stream": {}}},
+    400: {"model": ErrorResponse},
+    500: {"model": ErrorResponse}
+})
 async def generate_audio(request: TextImageRequest):
-
-
-    if user_text is None and request.image_base64:
-        user_text = "Describe what you see in the image"
-    elif user_text is None:
-        user_text = ""
-
-    # Generate response using text and image if provided
-    text_reply = llm_chat_response(user_text, request.image_base64)
-
-    # Generate audio
-    generator = pipeline(
-        text_reply,
-        voice=request.voice,
-        speed=request.speed,
-        split_pattern=r'\n+'
-    )
+    """
+    Generate audio from text and optionally analyze an image.

-
-
-
-
-
-
-        # Ensure the audio is in the range [-1, 1]
-        audio_numpy = np.clip(audio_numpy, -1, 1)
+    - If text is provided, uses that as input
+    - If image is provided, analyzes the image
+    - Converts the LLM response to speech using the specified voice and speed
+    """
+    try:
+        logger.info(f"Received audio generation request")

-        #
-
+        # If no text is provided but image is provided, use default prompt
+        user_text = request.text if request.text is not None else ""
+        if not user_text and request.image_base64:
+            user_text = "Describe what you see in the image"
+        elif not user_text and not request.image_base64:
+            logger.error("Neither text nor image provided in request")
+            return JSONResponse(
+                status_code=400,
+                content={"error": "Request must include either text or image_base64"}
+            )

-        #
-
+        # Generate response using text and image if provided
+        logger.info("Getting LLM response...")
+        text_reply = llm_chat_response(user_text, request.image_base64)
+        logger.info(f"LLM response: {text_reply}")

-        #
-
-
-
-
-
-
-
-
-
-
-
+        # Generate audio
+        logger.info(f"Generating audio using voice={request.voice}, speed={request.speed}")
+        try:
+            generator = pipeline(
+                text_reply,
+                voice=request.voice,
+                speed=request.speed,
+                split_pattern=r'\n+'
+            )
+
+            # Process only the first segment for demo
+            for i, (gs, ps, audio) in enumerate(generator):
+                logger.info(f"Audio generated successfully: segment {i}")
+
+                # Convert PyTorch tensor to NumPy array
+                audio_numpy = audio.cpu().numpy()
+
+                # Convert to 16-bit PCM
+                # Ensure the audio is in the range [-1, 1]
+                audio_numpy = np.clip(audio_numpy, -1, 1)
+                # Convert to 16-bit signed integers
+                pcm_data = (audio_numpy * 32767).astype(np.int16)
+
+                # Convert to bytes (automatically uses row-major order)
+                raw_audio = pcm_data.tobytes()
+
+                # Return PCM data with minimal necessary headers
+                return Response(
+                    content=raw_audio,
+                    media_type="application/octet-stream",
+                    headers={
+                        "Content-Disposition": f'attachment; filename="output.pcm"',
+                        "X-Sample-Rate": "24000",
+                        "X-Bits-Per-Sample": "16",
+                        "X-Endianness": "little"
+                    }
+                )
+
+            logger.error("No audio segments generated")
+            return JSONResponse(
+                status_code=400,
+                content={"error": "No audio generated", "detail": "The pipeline did not produce any audio"}
+            )
+
+        except Exception as e:
+            logger.error(f"Error generating audio: {str(e)}")
+            return JSONResponse(
+                status_code=500,
+                content={"error": "Audio generation failed", "detail": str(e)}
+            )

-
+    except Exception as e:
+        logger.error(f"Unexpected error in generate_audio endpoint: {str(e)}")
+        return JSONResponse(
+            status_code=500,
+            content={"error": "Internal server error", "detail": str(e)}
+        )
+
+@app.get("/")
+async def root():
+    return {"message": "Welcome to the Text-to-Speech API with Vision Support. Use POST /generate endpoint with 'text' and optionally 'image_base64' for queries."}
+
+@app.exception_handler(404)
+async def not_found_handler(request, exc):
+    return JSONResponse(
+        status_code=404,
+        content={"error": "Endpoint not found. Please use POST /generate for queries."}
+    )
+
+@app.exception_handler(405)
+async def method_not_allowed_handler(request, exc):
+    return JSONResponse(
+        status_code=405,
+        content={"error": "Method not allowed. Please check the API documentation."}
+    )
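
For reference, a minimal client sketch for the updated /generate endpoint (not part of the commit). It assumes the app is reachable at http://localhost:7860 (adjust the URL for your deployment), uses the requests library, and treats photo.jpg as a placeholder input file. The response body is raw 16-bit little-endian PCM, so the sketch wraps it in a WAV container using the X-Sample-Rate header the handler sets; the headers carry no channel count, so mono (Kokoro's usual output) is assumed.

# client_example.py -- hypothetical usage sketch, not part of the commit
import base64
import wave

import requests

API_URL = "http://localhost:7860/generate"  # assumed local deployment

payload = {"text": "What is in this picture?", "voice": "af_heart", "speed": 1.0}

# Optional: attach an image; the endpoint also accepts image-only requests.
with open("photo.jpg", "rb") as f:  # placeholder file name
    payload["image_base64"] = base64.b64encode(f.read()).decode("utf-8")

resp = requests.post(API_URL, json=payload, timeout=120)
resp.raise_for_status()

# Wrap the raw PCM bytes in a WAV container so standard players can open it.
sample_rate = int(resp.headers.get("X-Sample-Rate", "24000"))
with wave.open("output.wav", "wb") as wav:
    wav.setnchannels(1)    # mono assumed; the server does not send a channel count
    wav.setsampwidth(2)    # 16 bits per sample, matching X-Bits-Per-Sample
    wav.setframerate(sample_rate)
    wav.writeframes(resp.content)
print(f"Wrote output.wav at {sample_rate} Hz")

Returning headerless PCM with X-* metadata headers keeps the server response minimal and leaves the container choice to the client; the trade-off is that every client must do this wrapping step itself.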