"""Kokoro TTS HTTP service exposing an OpenAI-style ``/v1/audio/speech`` endpoint.

English (including Hinglish) and Hindi input are routed to separate Kokoro
pipelines, each paired with a matching voice.
"""

import io

import langdetect
import soundfile as sf
import torch
import uvicorn
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse, StreamingResponse
from kokoro import KPipeline

SAMPLE_RATE = 24000  # sample rate passed to soundfile when encoding the audio
ENGLISH_VOICE = "af_heart"
HINDI_VOICE = "hf_alpha"

# langdetect is non-deterministic unless seeded; pin the seed so the same text
# is always routed to the same pipeline/voice.
langdetect.DetectorFactory.seed = 0

app = FastAPI()

# Pre-load both pipelines at startup so there's no delay mid-conversation.
pipeline_en = KPipeline(lang_code='a')  # English / Hinglish
pipeline_hi = KPipeline(lang_code='h')  # Hindi


def _is_hindi(text: str) -> bool:
    """Return True when langdetect classifies *text* as Hindi.

    Any detection failure (e.g. text with no usable features) is treated as
    "not Hindi" so callers fall back to the English pipeline and voice.
    """
    try:
        return langdetect.detect(text) == 'hi'
    except Exception:
        # Deliberate best-effort: default to English on any detector failure.
        return False


def detect_pipeline(text: str) -> KPipeline:
    """Return the Kokoro pipeline matching the detected language of *text*.

    Hindi text gets the Hindi pipeline; everything else (English, Hinglish,
    or undetectable input) gets the English pipeline.
    """
    return pipeline_hi if _is_hindi(text) else pipeline_en


def detect_voice(text: str) -> str:
    """Return the voice name matching the detected language of *text*.

    Hindi text gets ``hf_alpha``; everything else gets ``af_heart``.
    """
    return HINDI_VOICE if _is_hindi(text) else ENGLISH_VOICE


def _synthesize_mp3(pipeline: KPipeline, text: str, voice) -> io.BytesIO:
    """Run *pipeline* over *text* and return the complete audio as an MP3 buffer.

    The pipeline yields one audio segment per chunk of text; all segments are
    concatenated so that multi-line or long input is not cut off after the
    first segment.

    Args:
        pipeline: Kokoro pipeline to synthesize with.
        text: Text to speak.
        voice: Voice name or voice tensor, forwarded to the pipeline as-is.

    Returns:
        A ``BytesIO`` positioned at offset 0 containing the MP3 data.

    Raises:
        RuntimeError: If the pipeline produced no audio at all.
    """
    segments = [
        torch.as_tensor(audio).detach().cpu().flatten()
        for _, _, audio in pipeline(text, voice=voice, speed=1)
        if audio is not None
    ]
    if not segments:
        raise RuntimeError("No audio generated")

    out = io.BytesIO()
    sf.write(out, torch.cat(segments).numpy(), SAMPLE_RATE, format='mp3')
    out.seek(0)
    return out


# Health checks
@app.get("/")
@app.get("/v1")
async def root():
    """Report service status and the supported languages."""
    return {"status": "ok", "service": "kokoro-tts", "languages": ["en", "hi"]}


# Main TTS endpoint
@app.post("/v1/audio/speech")
async def tts(request: Request):
    """Synthesize the JSON body's ``input`` text and return it as MP3 audio.

    Responds with 400 when ``input`` is missing or empty and with 500 (plus
    the error message) when anything else fails.
    """
    try:
        data = await request.json()
        text = data.get("input", "")
        if not text:
            return JSONResponse({"error": "No input text"}, status_code=400)

        # Detect the language once so the voice and the pipeline always agree.
        # The voice is chosen automatically; a "voice" field in the request
        # body is not read.
        if _is_hindi(text):
            pipeline, voice = pipeline_hi, HINDI_VOICE
        else:
            pipeline, voice = pipeline_en, ENGLISH_VOICE

        out = _synthesize_mp3(pipeline, text, voice)
        return StreamingResponse(out, media_type="audio/mpeg")
    except Exception as e:
        return JSONResponse({"error": str(e)}, status_code=500)


@app.post("/v1/audio/speech/test")
async def tts_test(request: Request):
    """Synthesize ``input`` with a blend of the English and Hindi voices.

    The optional ``ratio`` field (default 0.5) is the interpolation weight
    handed to ``torch.lerp``: 0 yields the English voice, 1 the Hindi voice.
    Audio is always generated with the English pipeline.

    Responds with 400 when ``input`` is missing/empty or ``ratio`` is not a
    number, and with 500 (plus the error message) when anything else fails.
    """
    try:
        data = await request.json()
        text = data.get("input", "")
        if not text:
            return JSONResponse({"error": "No input text"}, status_code=400)

        try:
            ratio = float(data.get("ratio", 0.5))  # blend ratio from request
        except (TypeError, ValueError):
            return JSONResponse(
                {"error": "ratio must be a number"}, status_code=400
            )

        # load_voice() fetches the voice if it has not been used yet; indexing
        # the pipeline's `voices` dict directly raises KeyError until then.
        hindi_voice = pipeline_hi.load_voice(HINDI_VOICE)
        english_voice = pipeline_en.load_voice(ENGLISH_VOICE)
        blended_voice = torch.lerp(english_voice, hindi_voice, ratio)

        out = _synthesize_mp3(pipeline_en, text, blended_voice)
        return StreamingResponse(out, media_type="audio/mpeg")
    except Exception as e:
        return JSONResponse({"error": str(e)}, status_code=500)


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)