# livekit-tts / server.py
# Last commit: "Update server.py" (778ccba, verified) by abc1181
from fastapi import FastAPI, Request
from fastapi.responses import StreamingResponse, JSONResponse
from kokoro import KPipeline
import soundfile as sf
import io
import uvicorn
import langdetect
# ASGI application object; the route decorators below register handlers on it.
app = FastAPI()
# Pre-load both pipelines at import time so there's no model-loading delay
# mid-conversation. These module-level instances are shared by every request.
pipeline_en = KPipeline(lang_code='a') # English / Hinglish
pipeline_hi = KPipeline(lang_code='h') # Hindi
def detect_pipeline(text: str) -> KPipeline:
    """Choose the Kokoro pipeline matching the language detected in *text*.

    Args:
        text: The text that is about to be synthesized.

    Returns:
        ``pipeline_hi`` when langdetect reports ``'hi'``; ``pipeline_en`` for
        every other detected language (English and Hinglish included) and
        whenever detection raises.
    """
    try:
        is_hindi = langdetect.detect(text) == 'hi'
    except Exception:
        # Detection is best-effort: any failure falls back to English.
        is_hindi = False
    return pipeline_hi if is_hindi else pipeline_en
def detect_voice(text: str) -> str:
    """Select the Kokoro voice name matching the language detected in *text*.

    Args:
        text: The text that is about to be synthesized.

    Returns:
        ``"hf_alpha"`` (the Hindi voice) when langdetect reports ``'hi'``;
        ``"af_heart"`` (the English voice) for every other detected language
        and whenever detection raises.
    """
    fallback_voice = "af_heart"
    try:
        detected = langdetect.detect(text)
    except Exception:
        # Detection is best-effort: any failure falls back to the English voice.
        return fallback_voice
    return "hf_alpha" if detected == 'hi' else fallback_voice
# Health checks
@app.get("/")
@app.get("/v1")
async def root():
    """Liveness probe served on both ``/`` and ``/v1``.

    Returns:
        A JSON-serializable dict with the service status, the service name
        and the language codes this server advertises.
    """
    payload = {"status": "ok", "service": "kokoro-tts"}
    payload["languages"] = ["en", "hi"]
    return payload
# Main TTS endpoint
@app.post("/v1/audio/speech")
async def tts(request: Request):
    """Synthesize the ``input`` text of the JSON request body as MP3 audio.

    The voice is auto-detected from the text, and the pipeline is derived
    from that voice so the language is detected exactly once per request.
    Every audio segment the pipeline yields is encoded into a single MP3.

    Args:
        request: Incoming request whose JSON body carries the text under
            the ``"input"`` key.

    Returns:
        A ``StreamingResponse`` with media type ``audio/mpeg`` on success;
        a JSON error with status 400 when ``input`` is missing or empty;
        a JSON error with status 500 when synthesis fails or yields no audio.
    """
    try:
        data = await request.json()
        text = data.get("input", "")
        if not text:
            return JSONResponse({"error": "No input text"}, status_code=400)

        # NOTE: the voice is always auto-detected from the text; a "voice"
        # field in the request body is currently ignored.
        voice = detect_voice(text)
        # Derive the pipeline from the chosen voice instead of running
        # language detection a second time: two independent detections of
        # the same text are not guaranteed to agree, which could pair the
        # Hindi voice with the English pipeline (or the reverse).
        pipeline = pipeline_hi if voice == "hf_alpha" else pipeline_en

        out = io.BytesIO()
        wrote_audio = False
        # The pipeline is a generator and may yield several segments; write
        # all of them into one MP3 stream so longer text is not truncated.
        with sf.SoundFile(out, mode='w', samplerate=24000, channels=1,
                          format='mp3') as mp3_file:
            for _, _, audio in pipeline(text, voice=voice, speed=1):
                mp3_file.write(audio)
                wrote_audio = True

        if not wrote_audio:
            # Without this guard the handler would fall through and send an
            # empty 200 response.
            return JSONResponse({"error": "No audio generated"}, status_code=500)

        out.seek(0)
        return StreamingResponse(out, media_type="audio/mpeg")
    except Exception as e:
        return JSONResponse({"error": str(e)}, status_code=500)
@app.post("/v1/audio/speech/test")
async def tts_test(request: Request):
    """Experimental endpoint: synthesize text with a blended Hindi/English voice.

    The English (``af_heart``) and Hindi (``hf_alpha``) voice tensors are
    linearly interpolated and the result is fed to the English pipeline.

    Args:
        request: Incoming request whose JSON body carries the text under
            ``"input"`` and an optional numeric ``"ratio"`` (default 0.5);
            0 gives the pure English voice, 1 the pure Hindi voice.

    Returns:
        A ``StreamingResponse`` with media type ``audio/mpeg`` on success;
        a JSON error with status 400 for missing text or a non-numeric
        ratio; a JSON error with status 500 when synthesis fails or yields
        no audio.
    """
    try:
        # Imported locally: this module has no top-level torch import, so
        # the bare name used by torch.lerp below was otherwise undefined.
        import torch

        data = await request.json()
        text = data.get("input", "")
        if not text:
            return JSONResponse({"error": "No input text"}, status_code=400)
        try:
            ratio = float(data.get("ratio", 0.5))  # blend ratio from request
        except (TypeError, ValueError):
            return JSONResponse({"error": "ratio must be a number"}, status_code=400)

        # load_voice() loads the voice on first use and caches it; indexing
        # pipeline.voices directly raises KeyError until that voice has been
        # used at least once.
        hindi_voice = pipeline_hi.load_voice("hf_alpha")
        english_voice = pipeline_en.load_voice("af_heart")
        blended_voice = torch.lerp(english_voice, hindi_voice, ratio)

        out = io.BytesIO()
        wrote_audio = False
        # The pipeline is a generator and may yield several segments; write
        # all of them into one MP3 stream so longer text is not truncated.
        with sf.SoundFile(out, mode='w', samplerate=24000, channels=1,
                          format='mp3') as mp3_file:
            for _, _, audio in pipeline_en(text, voice=blended_voice, speed=1):
                mp3_file.write(audio)
                wrote_audio = True

        if not wrote_audio:
            return JSONResponse({"error": "No audio generated"}, status_code=500)

        out.seek(0)
        return StreamingResponse(out, media_type="audio/mpeg")
    except Exception as e:
        return JSONResponse({"error": str(e)}, status_code=500)
# Script entry point: serve the app on all interfaces, port 8000.
if __name__ == "__main__":
    uvicorn.run(app, port=8000, host="0.0.0.0")