Update app.py
app.py CHANGED
@@ -106,7 +106,7 @@ from typing import Optional, ClassVar, List
 from huggingface_hub import InferenceClient
 import numpy as np
 import torch
-from kokoro import KPipeline #
+from kokoro import KPipeline  # Your audio generation pipeline
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
@@ -115,7 +115,7 @@ logger = logging.getLogger(__name__)
 # Create FastAPI app
 app = FastAPI(
     title="Text-to-Speech API with Vision Support",
-    description="This API uses meta-llama/Llama-3.2-11B-Vision-Instruct
+    description="This API uses meta-llama/Llama-3.2-11B-Vision-Instruct which requires an image input.",
     version="1.0.0"
 )
 
@@ -140,17 +140,12 @@ class TextImageRequest(BaseModel):
             return "af_heart"
         return self.voice
 
-#
-class AudioResponse(BaseModel):
-    status: str
-    message: str
-
+# Pydantic model for error responses
 class ErrorResponse(BaseModel):
     error: str
     detail: Optional[str] = None
 
-
-def llm_chat_response(text: str, image_base64: str) -> str:
+def llm_chat_response(prompt: str, image_base64: str) -> str:
     HF_TOKEN = os.getenv("HF_TOKEN")
     logger.info("Checking HF_TOKEN...")
     if not HF_TOKEN:
@@ -163,7 +158,7 @@ def llm_chat_response(text: str, image_base64: str) -> str:
         api_key=HF_TOKEN
     )
 
-    # Save the base64-encoded image locally
+    # Save the base64-encoded image locally
     filename = f"{uuid.uuid4()}.jpg"
     image_path = os.path.join(STATIC_DIR, filename)
     try:
@@ -176,19 +171,25 @@ def llm_chat_response(text: str, image_base64: str) -> str:
             f.write(image_data)
 
     # Construct the public URL for the saved image.
-    # BASE_URL
+    # Set BASE_URL to your public URL if needed.
    base_url = os.getenv("BASE_URL", "http://localhost:8000")
     image_url = f"{base_url}/static/{filename}"
 
-    # Build the message exactly as in the reference
-    # This model requires a list with two items: one for text and one for the image.
-    prompt = text if text else "Describe this image in one sentence."
+    # Build the message payload exactly as in the reference:
     messages = [
         {
             "role": "user",
             "content": [
-                {
-
+                {
+                    "type": "text",
+                    "text": prompt
+                },
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": image_url
+                    }
+                }
             ]
         }
     ]
@@ -198,7 +199,7 @@ def llm_chat_response(text: str, image_base64: str) -> str:
         completion = client.chat.completions.create(
             model="meta-llama/Llama-3.2-11B-Vision-Instruct",
             messages=messages,
-            max_tokens=500
+            max_tokens=500,
         )
         response = completion.choices[0].message.content
         logger.info(f"Extracted response: {response}")
@@ -207,14 +208,14 @@ def llm_chat_response(text: str, image_base64: str) -> str:
         logger.error(f"Error during model inference: {str(e)}")
         raise HTTPException(status_code=500, detail=str(e))
 
-# Initialize audio generation pipeline (
+# Initialize the audio generation pipeline (KPipeline)
 try:
     logger.info("Initializing KPipeline...")
     pipeline = KPipeline(lang_code='a')
     logger.info("KPipeline initialized successfully")
 except Exception as e:
     logger.error(f"Failed to initialize KPipeline: {str(e)}")
-    # The API
+    # The API will run but audio generation will fail if the pipeline is not ready.
 
 @app.post("/generate", responses={
     200: {"content": {"application/octet-stream": {}}},
@@ -224,44 +225,37 @@ except Exception as e:
 async def generate_audio(request: TextImageRequest):
     """
     Generate audio from a multimodal (text+image) input.
-    This model
+    This model requires an image input.
     """
     logger.info("Received generation request")
-
+
+    # The model requires an image; if missing, return an error.
     if not request.image_base64:
         raise HTTPException(status_code=400, detail="This model requires an image input.")
 
-
-    user_text = request.text if request.text else "Describe this image in one sentence."
-
-    # Get the LLM's response
+    prompt = request.text if request.text else "Describe this image in one sentence."
     logger.info("Calling the LLM model")
-    text_reply = llm_chat_response(
+    text_reply = llm_chat_response(prompt, request.image_base64)
     logger.info(f"LLM response: {text_reply}")
 
-    # Validate voice parameter (if needed for audio generation)
     validated_voice = request.validate_voice()
     if validated_voice != request.voice:
         logger.warning(f"Voice '{request.voice}' not available; using '{validated_voice}' instead")
 
-    # Convert the text reply to audio using
+    # Convert the text reply to audio using the KPipeline.
     logger.info(f"Generating audio using voice={validated_voice}, speed={request.speed}")
     try:
-        # Generate audio segments (assumes pipeline yields segments)
         generator = pipeline(
             text_reply,
             voice=validated_voice,
             speed=request.speed,
             split_pattern=r'\n+'
         )
-        for
-            logger.info(f"Audio generated, segment {i}")
-            # Convert audio tensor to 16-bit PCM bytes
+        for _, _, audio in generator:
             audio_numpy = audio.cpu().numpy()
             audio_numpy = np.clip(audio_numpy, -1, 1)
             pcm_data = (audio_numpy * 32767).astype(np.int16)
             raw_audio = pcm_data.tobytes()
-
             return Response(
                 content=raw_audio,
                 media_type="application/octet-stream",
@@ -279,7 +273,7 @@ async def generate_audio(request: TextImageRequest):
 
 @app.get("/")
 async def root():
-    return {"message": "Welcome
+    return {"message": "Welcome to the Text-to-Speech API with Vision Support. Use POST /generate with text and image_base64."}
 
 @app.exception_handler(404)
 async def not_found_handler(request: Request, exc):
@@ -288,4 +282,3 @@ async def not_found_handler(request: Request, exc):
 @app.exception_handler(405)
 async def method_not_allowed_handler(request: Request, exc):
     return JSONResponse(status_code=405, content={"error": "Method not allowed."})
-
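For reference, a call to the updated endpoint might look like the following. This is a minimal sketch, not part of the commit: it assumes the Space is reachable at the default BASE_URL (http://localhost:8000) and that TextImageRequest exposes the text, image_base64, voice, and speed fields used by the handler above.

import base64
import requests

# Encode a local image and POST it with a prompt to /generate.
with open("example.jpg", "rb") as f:
    image_base64 = base64.b64encode(f.read()).decode("utf-8")

payload = {
    "text": "Describe this image in one sentence.",
    "image_base64": image_base64,
    "voice": "af_heart",
    "speed": 1.0,
}
resp = requests.post("http://localhost:8000/generate", json=payload)
resp.raise_for_status()

# The endpoint returns raw 16-bit PCM bytes (no container header).
with open("reply.pcm", "wb") as f:
    f.write(resp.content)

Note that generate_audio returns from inside the for loop, so only the first audio segment the pipeline yields is sent back; a multi-segment reply would need concatenation or streaming.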
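Because the response is raw PCM, a client has to supply the sample rate itself before the audio can be played. Below is a minimal sketch using Python's standard wave module; the 24 kHz mono format is an assumption about the Kokoro pipeline's output, not something stated in this diff.

import wave

SAMPLE_RATE = 24000  # assumed Kokoro output rate; adjust if your pipeline differs

def pcm_to_wav(pcm_path: str, wav_path: str, sample_rate: int = SAMPLE_RATE) -> None:
    """Wrap raw 16-bit mono PCM bytes in a WAV container for playback."""
    with open(pcm_path, "rb") as f:
        pcm = f.read()
    with wave.open(wav_path, "wb") as w:
        w.setnchannels(1)   # mono
        w.setsampwidth(2)   # 16-bit samples, matching np.int16 in app.py
        w.setframerate(sample_rate)
        w.writeframes(pcm)

pcm_to_wav("reply.pcm", "reply.wav")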