Spaces:

vinzcyun
/

tts

Sleeping

App Files Files Community

vinzcyun committed on Aug 16, 2025

Commit

747a950

verified ·

1 Parent(s): 8a7188c

Update main.py

Browse files

Files changed (1) hide show

main.py +117 -36

main.py CHANGED Viewed

@@ -1,11 +1,13 @@
-from fastapi import FastAPI, HTTPException
 from fastapi.responses import StreamingResponse
-from pydantic import BaseModel, Field
-from typing import Literal, Optional
 from google import genai
 from google.genai import types
 import io, wave, base64
 import logging
 # Set up logging to see more details about errors
 logging.basicConfig(level=logging.INFO)
@@ -14,78 +16,160 @@ logger = logging.getLogger(__name__)
 app = FastAPI(title="OpenAI-compatible TTS (Gemini via google-genai)")
 class OpenAITTSRequest(BaseModel):
-    # OpenAI-style (để tương thích client)
-    model: str = Field(..., description="OpenAI-style model (chỉ để tương thích)")
-    voice: str = Field(..., description="Tên giọng TTS (Gemini prebuilt voice)")
     input: str = Field(..., description="Văn bản cần đọc")
     response_format: Optional[Literal["wav", "pcm"]] = Field(default="wav", description="Định dạng output")
-    # Alternative field name for compatibility
     format: Optional[Literal["wav", "pcm"]] = Field(default=None, description="Định dạng output (alternative)")
-    # Thông tin Gemini do user cung cấp (bắt buộc)
-    gemini_api_key: str = Field(..., description="Google API key cho Gemini")
-    gemini_model: str = Field(..., description="Tên model Gemini TTS (vd: gemini-2.5-flash-preview-tts)")
-    # Optional OpenAI compatibility fields
     speed: Optional[float] = Field(default=1.0, ge=0.25, le=4.0, description="Tốc độ giọng nói")
-SR = 24000  # Gemini TTS trả PCM s16le 24kHz mono
 def pcm_to_wav_bytes(pcm: bytes, sr: int = SR) -> bytes:
     buf = io.BytesIO()
     with wave.open(buf, "wb") as wf:
         wf.setnchannels(1)
-        wf.setsampwidth(2)      # 16-bit (sampwidth=2 bytes)
         wf.setframerate(sr)
         wf.writeframes(pcm)
     return buf.getvalue()
 @app.post("/v1/audio/speech")
-async def audio_speech(body: OpenAITTSRequest):
-    # Log the incoming request for debugging
-    logger.info(f"Received TTS request: model={body.model}, voice={body.voice}")
-    # Determine output format - check both fields for compatibility
-    output_format = body.format or body.response_format or "wav"
     # Validate input text
-    if not body.input or not body.input.strip():
         raise HTTPException(status_code=400, detail="Input text cannot be empty")
-    # Khởi tạo client với API key do user cung cấp (không lưu trữ)
     try:
-        client = genai.Client(api_key=body.gemini_api_key)
     except Exception as e:
         logger.error(f"Failed to initialize GenAI client: {e}")
         raise HTTPException(status_code=400, detail=f"Không khởi tạo được Google GenAI client: {e!s}")
-    # Cấu hình TTS theo SDK chính thức
     config = types.GenerateContentConfig(
         response_modalities=["AUDIO"],
         speech_config=types.SpeechConfig(
             voice_config=types.VoiceConfig(
                 prebuilt_voice_config=types.PrebuiltVoiceConfig(
-                    voice_name=body.voice
                 )
             )
         )
     )
     try:
-        logger.info(f"Calling Gemini TTS with model: {body.gemini_model}")
         resp = client.models.generate_content(
-            model=body.gemini_model,
-            contents=body.input,
             config=config
         )
     except Exception as e:
         logger.error(f"Gemini TTS API error: {e}")
-        # Forward lỗi từ SDK/Upstream
         raise HTTPException(status_code=502, detail=f"Lỗi gọi Gemini TTS: {e!s}")
     # Lấy dữ liệu audio
     try:
         inline = resp.candidates[0].content.parts[0].inline_data
-        data = inline.data  # có thể là bytes hoặc base64 str (tuỳ version SDK)
     except (IndexError, AttributeError) as e:
         logger.error(f"Failed to extract audio data: {e}")
         raise HTTPException(status_code=500, detail="Không tìm thấy audio trong phản hồi Gemini")
@@ -93,7 +177,6 @@ async def audio_speech(body: OpenAITTSRequest):
     if isinstance(data, (bytes, bytearray)):
         pcm = bytes(data)
     else:
-        # fallback: nếu SDK trả base64 string
         try:
             pcm = base64.b64decode(data)
         except Exception as e:
@@ -114,16 +197,14 @@ async def audio_speech(body: OpenAITTSRequest):
             headers={"Content-Disposition": 'inline; filename="speech.wav"'}
         )
-@app.exception_handler(422)
-async def validation_exception_handler(request, exc):
-    logger.error(f"Validation error: {exc}")
-    return HTTPException(status_code=422, detail=f"Validation error: {exc}")
 @app.get("/")
 def root():
     return {
         "ok": True,
-        "usage": "POST /v1/audio/speech với {model, voice, input, response_format(wav|pcm), gemini_api_key, gemini_model}",
         "example": {
             "model": "tts-1",
             "voice": "en-US-Journey-F",

+from fastapi import FastAPI, HTTPException, Request
 from fastapi.responses import StreamingResponse
+from fastapi.exceptions import RequestValidationError
+from pydantic import BaseModel, Field, ValidationError
+from typing import Literal, Optional, Any
 from google import genai
 from google.genai import types
 import io, wave, base64
 import logging
+import json
 # Set up logging to see more details about errors
 logging.basicConfig(level=logging.INFO)
 app = FastAPI(title="OpenAI-compatible TTS (Gemini via google-genai)")
 class OpenAITTSRequest(BaseModel):
     # Request payload for the speech endpoint. Only `input` and
     # `gemini_api_key` are required; every other field has a default.

     # --- OpenAI-style fields (kept for client compatibility) ---
     model: Optional[str] = Field(
         "tts-1",
         description="OpenAI-style model",
     )
     voice: Optional[str] = Field(
         "en-US-Journey-F",
         description="Tên giọng TTS",
     )
     input: str = Field(
         ...,
         description="Văn bản cần đọc",
     )
     response_format: Optional[Literal["wav", "pcm"]] = Field(
         "wav",
         description="Định dạng output",
     )
     # Alternative spelling of `response_format`.
     format: Optional[Literal["wav", "pcm"]] = Field(
         None,
         description="Định dạng output (alternative)",
     )
     speed: Optional[float] = Field(
         1.0,
         ge=0.25,
         le=4.0,
         description="Tốc độ giọng nói",
     )

     # --- Gemini settings ---
     gemini_api_key: str = Field(
         ...,
         description="Google API key cho Gemini",
     )
     gemini_model: Optional[str] = Field(
         "gemini-2.0-flash-exp",
         description="Tên model Gemini TTS",
     )

     class Config:
         # Accept unknown fields so clients that send extra keys are not
         # rejected during validation.
         extra = "allow"
# Default sample rate in Hz. An earlier revision of this file notes that
# Gemini TTS returns 16-bit little-endian mono PCM at 24 kHz.
SR = 24000


def pcm_to_wav_bytes(pcm: bytes, sr: int = SR, channels: int = 1, sampwidth: int = 2) -> bytes:
    """Wrap raw PCM frames in a WAV container and return the file as bytes.

    Args:
        pcm: Raw PCM sample data (may be empty, which yields a header-only file).
        sr: Sample rate in Hz written to the WAV header.
        channels: Number of interleaved channels; defaults to 1 (mono).
        sampwidth: Bytes per sample; defaults to 2 (16-bit).

    Returns:
        The complete WAV file contents.

    Raises:
        wave.Error: If the ``wave`` module rejects a header parameter
            (for example a non-positive channel count or sample rate).
    """
    buf = io.BytesIO()
    # The wave writer does not close a file object it did not open itself,
    # so the buffer is still readable after the ``with`` block ends.
    with wave.open(buf, "wb") as wf:
        wf.setnchannels(channels)
        wf.setsampwidth(sampwidth)
        wf.setframerate(sr)
        wf.writeframes(pcm)
    return buf.getvalue()
@app.exception_handler(RequestValidationError)
async def validation_exception_handler(request: Request, exc: RequestValidationError):
    """Log a failed request validation and answer with a 422 JSON payload.

    Args:
        request: The incoming request whose payload failed validation.
        exc: The validation error raised while parsing the request.

    Returns:
        A JSON response with status 422. Its ``detail`` object carries the
        error list and the body text that was received (``None`` when the
        body was empty or could not be read).
    """
    # Imported locally so the handler brings its own response helpers into
    # scope regardless of what the top of the file imports.
    from fastapi.encoders import jsonable_encoder
    from fastapi.responses import JSONResponse

    received_body = None
    try:
        raw_body = await request.body()
    except Exception as e:
        # Best effort: failing to read the body must not prevent the 422 reply.
        logger.error(f"Could not read request body: {e}")
    else:
        if raw_body:
            # errors="replace" keeps a non-UTF-8 body from crashing the handler.
            received_body = raw_body.decode("utf-8", errors="replace")
        logger.error(f"Request body size: {len(raw_body)} bytes")
        try:
            parsed = json.loads(received_body or "")
        except ValueError:
            # The raw text is deliberately not logged: it may contain the API key.
            logger.error("Body is not valid JSON")
        else:
            if isinstance(parsed, dict):
                # Mask the API key before the payload reaches the logs.
                parsed = {
                    key: ("***" if key == "gemini_api_key" else value)
                    for key, value in parsed.items()
                }
            logger.error(f"Parsed JSON: {parsed}")

    errors = exc.errors()
    # Log only location/message/type; an error entry may also echo the
    # submitted input, which can include the API key.
    summary = [
        {"loc": err.get("loc"), "msg": err.get("msg"), "type": err.get("type")}
        for err in errors
    ]
    logger.error(f"Validation error details: {summary}")

    # An exception handler must return a response object; an HTTPException
    # instance is an exception, not a response.
    return JSONResponse(
        status_code=422,
        content=jsonable_encoder(
            {
                "detail": {
                    "error": "Validation failed",
                    "details": errors,
                    "received_body": received_body,
                }
            }
        ),
    )
# Alternative endpoint that accepts any JSON and logs it
@app.post("/v1/audio/speech/debug")
async def audio_speech_debug(request: Request):
    """Check whether a request body would pass validation, without calling Gemini.

    Args:
        request: The raw incoming request; its body is read and parsed here.

    Returns:
        A dict describing the outcome: ``{"status": "success", ...}`` when the
        body builds an ``OpenAITTSRequest``, otherwise a dict with ``error``
        set to ``invalid_json``, ``validation_failed`` or ``unknown_error``
        and the reason under ``details``.
    """
    body = await request.body()
    content_type = request.headers.get("content-type", "")
    logger.info(f"Debug endpoint - Content-Type: {content_type}")
    # Only the size is logged: the raw text may contain the caller's API key.
    logger.info(f"Debug endpoint - Raw body size: {len(body)} bytes")

    try:
        json_data = json.loads(body.decode('utf-8'))
    except (UnicodeDecodeError, json.JSONDecodeError) as e:
        # A body that is not UTF-8 is reported like any other unparsable body
        # instead of escaping as an unhandled error.
        logger.error(f"Debug endpoint - JSON decode error: {e}")
        return {"error": "invalid_json", "details": str(e)}

    if isinstance(json_data, dict):
        # Mask the API key before the payload reaches the logs.
        loggable = {
            key: ("***" if key == "gemini_api_key" else value)
            for key, value in json_data.items()
        }
    else:
        loggable = json_data
    logger.info(f"Debug endpoint - Parsed JSON: {loggable}")

    # Try to create the model manually
    try:
        request_model = OpenAITTSRequest(**json_data)
    except ValidationError as ve:
        errors = ve.errors()
        # Log only location/message/type; an error entry may echo the input.
        summary = [
            {"loc": err.get("loc"), "msg": err.get("msg"), "type": err.get("type")}
            for err in errors
        ]
        logger.error(f"Debug endpoint - Validation error: {summary}")
        return {"error": "validation_failed", "details": errors}
    except Exception as e:
        # Reached, for example, when the JSON value is not an object and
        # therefore cannot be unpacked into keyword arguments.
        logger.error(f"Debug endpoint - Other error: {e}")
        return {"error": "unknown_error", "details": str(e)}

    # Log the non-secret fields only, never the model repr with the API key.
    logger.info(
        "Debug endpoint - Model created successfully: "
        f"model={request_model.model!r} voice={request_model.voice!r} "
        f"response_format={request_model.response_format!r} "
        f"format={request_model.format!r} speed={request_model.speed!r} "
        f"gemini_model={request_model.gemini_model!r}"
    )
    return {"status": "success", "message": "Request would be processed normally"}
 @app.post("/v1/audio/speech")
+async def audio_speech(request: Request):
+    # Log incoming request
+    logger.info(f"Headers: {dict(request.headers)}")
+    # Read raw body first
+    body = await request.body()
+    logger.info(f"Raw body: {body.decode('utf-8')}")
+    try:
+        # Parse JSON manually first
+        json_data = json.loads(body.decode('utf-8'))
+        logger.info(f"Parsed JSON keys: {list(json_data.keys())}")
+        # Create Pydantic model
+        body_model = OpenAITTSRequest(**json_data)
+    except json.JSONDecodeError as e:
+        logger.error(f"JSON decode error: {e}")
+        raise HTTPException(status_code=400, detail=f"Invalid JSON: {e}")
+    except ValidationError as e:
+        logger.error(f"Pydantic validation error: {e.errors()}")
+        raise HTTPException(status_code=422, detail={"validation_errors": e.errors()})
+    except Exception as e:
+        logger.error(f"Unexpected error during parsing: {e}")
+        raise HTTPException(status_code=400, detail=f"Request parsing error: {e}")
+    # Determine output format
+    output_format = body_model.format or body_model.response_format or "wav"
     # Validate input text
+    if not body_model.input or not body_model.input.strip():
         raise HTTPException(status_code=400, detail="Input text cannot be empty")
+    # Khởi tạo client với API key
     try:
+        client = genai.Client(api_key=body_model.gemini_api_key)
     except Exception as e:
         logger.error(f"Failed to initialize GenAI client: {e}")
         raise HTTPException(status_code=400, detail=f"Không khởi tạo được Google GenAI client: {e!s}")
+    # Cấu hình TTS
     config = types.GenerateContentConfig(
         response_modalities=["AUDIO"],
         speech_config=types.SpeechConfig(
             voice_config=types.VoiceConfig(
                 prebuilt_voice_config=types.PrebuiltVoiceConfig(
+                    voice_name=body_model.voice
                 )
             )
         )
     )
     try:
+        logger.info(f"Calling Gemini TTS with model: {body_model.gemini_model}")
         resp = client.models.generate_content(
+            model=body_model.gemini_model,
+            contents=body_model.input,
             config=config
         )
     except Exception as e:
         logger.error(f"Gemini TTS API error: {e}")
         raise HTTPException(status_code=502, detail=f"Lỗi gọi Gemini TTS: {e!s}")
     # Lấy dữ liệu audio
     try:
         inline = resp.candidates[0].content.parts[0].inline_data
+        data = inline.data
     except (IndexError, AttributeError) as e:
         logger.error(f"Failed to extract audio data: {e}")
         raise HTTPException(status_code=500, detail="Không tìm thấy audio trong phản hồi Gemini")
     if isinstance(data, (bytes, bytearray)):
         pcm = bytes(data)
     else:
         try:
             pcm = base64.b64decode(data)
         except Exception as e:
             headers={"Content-Disposition": 'inline; filename="speech.wav"'}
         )
 @app.get("/")
 def root():
     return {
         "ok": True,
+        "usage": "POST /v1/audio/speech",
+        "debug_endpoint": "/v1/audio/speech/debug (để test request format)",
+        "required_fields": ["input", "gemini_api_key"],
+        "optional_fields": ["model", "voice", "response_format", "gemini_model", "speed"],
         "example": {
             "model": "tts-1",
             "voice": "en-US-Journey-F",