Update app.py
app.py CHANGED
@@ -95,190 +95,172 @@
 # return Response("No audio generated", status_code=400)

 import os
-import uuid
-import base64
 import logging
-
+import base64
+from typing import Optional
+from fastapi import FastAPI, HTTPException
 from fastapi.responses import JSONResponse
-from fastapi.staticfiles import StaticFiles
 from pydantic import BaseModel
-from typing import Optional, ClassVar, List
 from huggingface_hub import InferenceClient
-
-import
-from kokoro import KPipeline  # Your audio generation pipeline
+from requests.exceptions import HTTPError
+import uuid

 # Set up logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

-#
+# Initialize FastAPI app
 app = FastAPI(
-    title="
-    description="
+    title="LLM Chat API",
+    description="API for getting chat responses from Llama model (supports text and image input)",
     version="1.0.0"
 )

-#
+# Directory to save images
 STATIC_DIR = "static_images"
 if not os.path.exists(STATIC_DIR):
     os.makedirs(STATIC_DIR)
-app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")
-
-# Pydantic model for request
-class TextImageRequest(BaseModel):
-    text: Optional[str] = None
-    image_base64: Optional[str] = None
-    voice: str = "af_heart"  # Default voice
-    speed: float = 1.0
-
-    # Use ClassVar so that Pydantic doesn't treat this as a model field.
-    AVAILABLE_VOICES: ClassVar[List[str]] = ["af_heart"]

-
-
-
-
+# Pydantic models
+class ChatRequest(BaseModel):
+    text: str
+    image_url: Optional[str] = None  # In this updated version, this field is expected to be a base64 encoded image

-
-
-
-    detail: Optional[str] = None
+class ChatResponse(BaseModel):
+    response: str
+    status: str

-def llm_chat_response(
-    HF_TOKEN = os.getenv("HF_TOKEN")
-    logger.info("Checking HF_TOKEN...")
-    if not HF_TOKEN:
-        logger.error("HF_TOKEN not configured")
-        raise HTTPException(status_code=500, detail="HF_TOKEN not configured")
-
-    logger.info("Initializing InferenceClient...")
-    client = InferenceClient(
-        provider="hf-inference",
-        api_key=HF_TOKEN
-    )
-
-    # Save the base64-encoded image locally
-    filename = f"{uuid.uuid4()}.jpg"
-    image_path = os.path.join(STATIC_DIR, filename)
+def llm_chat_response(text: str, image_base64: Optional[str] = None) -> str:
     try:
-
-
-
-
-
-
-
-
-
-
-        base_url = os.getenv("BASE_URL", "http://localhost:8000")
-        image_url = f"{base_url}/static/{filename}"
-
-        # Build the message payload exactly as in the reference:
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "text",
-                        "text": prompt
-                    },
-                    {
-                        "type": "image_url",
-                        "image_url": {
-                            "url": image_url
-                        }
-                    }
-                ]
-            }
-        ]
-        logger.info(f"Message structure: {messages}")
-
-        try:
-            completion = client.chat.completions.create(
-                model="meta-llama/Llama-3.2-11B-Vision-Instruct",
-                messages=messages,
-                max_tokens=500,
+        HF_TOKEN = os.getenv("HF_TOKEN")
+        logger.info("Checking HF_TOKEN...")
+        if not HF_TOKEN:
+            logger.error("HF_TOKEN not found in environment variables")
+            raise HTTPException(status_code=500, detail="HF_TOKEN not configured")
+
+        logger.info("Initializing InferenceClient...")
+        client = InferenceClient(
+            provider="hf-inference",  # Updated provider
+            api_key=HF_TOKEN
         )
-
-
-
+
+        # Build the messages payload.
+        # For text-only queries, append a default instruction.
+        message_content = [{
+            "type": "text",
+            "text": text + ("" if image_base64 else " describe in one line only")
+        }]
+
+        if image_base64:
+            logger.info("Saving base64 encoded image to file...")
+            # Decode and save the image locally
+            filename = f"{uuid.uuid4()}.jpg"
+            image_path = os.path.join(STATIC_DIR, filename)
+            try:
+                image_data = base64.b64decode(image_base64)
+            except Exception as e:
+                logger.error(f"Error decoding image: {str(e)}")
+                raise HTTPException(status_code=400, detail="Invalid base64 image data")
+            with open(image_path, "wb") as f:
+                f.write(image_data)
+
+            # Construct public URL for the saved image.
+            # Set BASE_URL to your public URL if needed.
+            base_url = os.getenv("BASE_URL", "http://localhost:8000")
+            public_image_url = f"{base_url}/{STATIC_DIR}/{filename}"
+            logger.info(f"Using saved image URL: {public_image_url}")
+
+            message_content.append({
+                "type": "image_url",
+                "image_url": {"url": public_image_url}
+            })
+
+        messages = [{
+            "role": "user",
+            "content": message_content
+        }]
+
+        logger.info("Sending request to model...")
+        try:
+            completion = client.chat.completions.create(
+                model="meta-llama/Llama-3.2-11B-Vision-Instruct",
+                messages=messages,
+                max_tokens=500
+            )
+        except HTTPError as http_err:
+            logger.error(f"HTTP error occurred: {http_err.response.text}")
+            raise HTTPException(status_code=500, detail=http_err.response.text)
+
+        logger.info(f"Raw model response: {completion}")
+
+        if getattr(completion, "error", None):
+            error_details = completion.error
+            error_message = error_details.get("message", "Unknown error")
+            logger.error(f"Model returned error: {error_message}")
+            raise HTTPException(status_code=500, detail=f"Model returned error: {error_message}")
+
+        if not completion.choices or len(completion.choices) == 0:
+            logger.error("No choices returned from model.")
+            raise HTTPException(status_code=500, detail="Model returned no choices.")
+
+        # Extract the response message from the first choice.
+        choice = completion.choices[0]
+        response_message = None
+        if hasattr(choice, "message"):
+            response_message = choice.message
+        elif isinstance(choice, dict):
+            response_message = choice.get("message")
+
+        if not response_message:
+            logger.error(f"Response message is empty: {choice}")
+            raise HTTPException(status_code=500, detail="Model response did not include a message.")
+
+        content = None
+        if isinstance(response_message, dict):
+            content = response_message.get("content")
+        if content is None and hasattr(response_message, "content"):
+            content = response_message.content
+
+        if not content:
+            logger.error(f"Message content is missing: {response_message}")
+            raise HTTPException(status_code=500, detail="Model message did not include content.")
+
+        return content
+
     except Exception as e:
-        logger.error(f"Error
+        logger.error(f"Error in llm_chat_response: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))

-
-
-    logger.info("Initializing KPipeline...")
-    pipeline = KPipeline(lang_code='a')
-    logger.info("KPipeline initialized successfully")
-except Exception as e:
-    logger.error(f"Failed to initialize KPipeline: {str(e)}")
-    # The API will run but audio generation will fail if the pipeline is not ready.
-
-@app.post("/generate", responses={
-    200: {"content": {"application/octet-stream": {}}},
-    400: {"model": ErrorResponse},
-    500: {"model": ErrorResponse}
-})
-async def generate_audio(request: TextImageRequest):
-    """
-    Generate audio from a multimodal (text+image) input.
-    This model requires an image input.
-    """
-    logger.info("Received generation request")
-
-    # The model requires an image; if missing, return an error.
-    if not request.image_base64:
-        raise HTTPException(status_code=400, detail="This model requires an image input.")
-
-    prompt = request.text if request.text else "Describe this image in one sentence."
-    logger.info("Calling the LLM model")
-    text_reply = llm_chat_response(prompt, request.image_base64)
-    logger.info(f"LLM response: {text_reply}")
-
-    validated_voice = request.validate_voice()
-    if validated_voice != request.voice:
-        logger.warning(f"Voice '{request.voice}' not available; using '{validated_voice}' instead")
-
-    # Convert the text reply to audio using the KPipeline.
-    logger.info(f"Generating audio using voice={validated_voice}, speed={request.speed}")
+@app.post("/chat", response_model=ChatResponse)
+async def chat(request: ChatRequest):
     try:
-
-
-
-
-
-
-
-
-        audio_numpy = np.clip(audio_numpy, -1, 1)
-        pcm_data = (audio_numpy * 32767).astype(np.int16)
-        raw_audio = pcm_data.tobytes()
-        return Response(
-            content=raw_audio,
-            media_type="application/octet-stream",
-            headers={
-                "Content-Disposition": 'attachment; filename="output.pcm"',
-                "X-Sample-Rate": "24000",
-                "X-Bits-Per-Sample": "16",
-                "X-Endianness": "little"
-            }
-        )
-        raise HTTPException(status_code=400, detail="No audio segments generated.")
+        logger.info(f"Received chat request with text: {request.text}")
+        if request.image_url:
+            logger.info("Image data provided.")
+        response = llm_chat_response(request.text, request.image_url)
+        return ChatResponse(response=response, status="success")
+    except HTTPException as he:
+        logger.error(f"HTTP Exception in chat endpoint: {str(he)}")
+        raise he
     except Exception as e:
-        logger.error(f"
+        logger.error(f"Unexpected error in chat endpoint: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))

 @app.get("/")
 async def root():
-    return {"message": "Welcome to the
+    return {"message": "Welcome to the LLM Chat API. Use POST /chat endpoint with 'text' and optionally 'image_url' (base64 encoded) for queries."}

 @app.exception_handler(404)
-async def not_found_handler(request
-    return JSONResponse(
+async def not_found_handler(request, exc):
+    return JSONResponse(
+        status_code=404,
+        content={"error": "Endpoint not found. Please use POST /chat for queries."}
+    )

 @app.exception_handler(405)
-async def method_not_allowed_handler(request
-    return JSONResponse(
+async def method_not_allowed_handler(request, exc):
+    return JSONResponse(
+        status_code=405,
+        content={"error": "Method not allowed. Please check the API documentation."}
+    )
+
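
For quick manual testing of the updated endpoint, here is a minimal client sketch. The /chat path, the "text" and "image_url" request fields, the ChatResponse shape, and the default http://localhost:8000 base URL all come from the diff above; the file name photo.jpg and the use of the requests library are illustrative assumptions, not part of the commit.

# Minimal client sketch for the updated /chat endpoint.
import base64
import requests

# photo.jpg is a hypothetical local file; any JPEG works.
with open("photo.jpg", "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode("utf-8")

resp = requests.post(
    "http://localhost:8000/chat",
    json={"text": "What is in this picture?", "image_url": image_b64},
)
resp.raise_for_status()
# Per the ChatResponse model in the diff, the body has "response" and "status".
print(resp.json()["response"])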
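
One follow-up worth noting: the update deletes the old app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static") line but still builds public_image_url as {base_url}/{STATIC_DIR}/{filename}, so as committed nothing serves the saved images at that URL. If the inference provider actually has to fetch the image, a mount matching the new URL shape would be needed; the following is a sketch of one possible fix, my assumption rather than part of this commit:

# Sketch: serve STATIC_DIR at the path embedded in public_image_url,
# i.e. {base_url}/static_images/{filename}. Assumes app and STATIC_DIR
# are defined as in the updated app.py above.
from fastapi.staticfiles import StaticFiles

app.mount(f"/{STATIC_DIR}", StaticFiles(directory=STATIC_DIR), name="static_images")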