| import os |
| import time |
| import asyncio |
| from pathlib import Path |
| from typing import Optional |
| import numpy as np |
| import soundfile as sf |
| import torch |
| from fastapi import FastAPI, HTTPException, BackgroundTasks |
| from fastapi.responses import FileResponse, JSONResponse, StreamingResponse |
| from fastapi.staticfiles import StaticFiles |
| from fastapi.middleware.cors import CORSMiddleware |
| from pydantic import BaseModel |
| import logging |
| import io |
|
|
# Basic logging to stdout; module-level logger per stdlib convention.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)



# FastAPI application serving the Kokoro-82M text-to-speech model.
app = FastAPI(
    title="Kokoro TTS API",
    description="Text-to-Speech API powered by Kokoro-82M",
    version="1.0.0",
)


# Wide-open CORS: any origin/method/header may call this API.
# NOTE(review): fine for a demo; tighten allow_origins before exposing publicly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)



# Directory for the bundled web UI; created on startup if missing and
# mounted under /static (the root route serves static/index.html from it).
STATIC_DIR = Path("static")
STATIC_DIR.mkdir(exist_ok=True)


app.mount("/static", StaticFiles(directory=str(STATIC_DIR)), name="static")



# Pick GPU when available; `device` is read by load_model() and synthesis.
CUDA_AVAILABLE = torch.cuda.is_available()
device = "cuda" if CUDA_AVAILABLE else "cpu"
logger.info(f"Device: {device}")


# Populated by load_model() at startup. `model` stays None if loading fails,
# which /health reports and /tts turns into a 503.
model = None
pipelines = {}
|
|
def load_model():
    """Load the Kokoro model and one G2P pipeline per supported language.

    Best-effort by design: any failure is logged rather than raised, so the
    server still starts and /health reports model_loaded=False. Individual
    pipeline failures are tolerated — only that language becomes unavailable.
    """
    global model, pipelines
    try:
        # Imported lazily so the module is importable without kokoro installed.
        from kokoro import KPipeline, KModel
        logger.info("Loading Kokoro model...")
        model = KModel(repo_id="hexgrad/Kokoro-82M").to(device).eval()


        # One pipeline per Kokoro language code (a=en-US, b=en-GB, e=es, f=fr,
        # h=hi, i=it, j=ja, p=pt, z=zh — matching the "code" field in VOICES).
        # model=False: the pipeline does text processing only; the shared
        # KModel above is invoked directly during synthesis.
        lang_codes = ["a", "b", "e", "f", "h", "i", "j", "p", "z"]
        for code in lang_codes:
            try:
                pipelines[code] = KPipeline(lang_code=code, model=False)
            except Exception as e:
                logger.warning(f"Pipeline '{code}' failed: {e}")


        # Pin the pronunciation of the word "kokoro" in both English lexicons
        # (US and GB phoneme variants) so the model says its own name right.
        if "a" in pipelines:
            pipelines["a"].g2p.lexicon.golds["kokoro"] = "kˈOkəɹO"
        if "b" in pipelines:
            pipelines["b"].g2p.lexicon.golds["kokoro"] = "kˈQkəɹQ"


        logger.info(f"Model loaded. Pipelines: {list(pipelines.keys())}")
    except Exception as e:
        # Deliberate broad catch: keep the server up even if the model is absent.
        logger.error(f"Model load failed: {e}")
|
|
| |
@app.on_event("startup")
async def startup_event():
    """Load the Kokoro model in a worker thread so startup does not block the event loop.

    Model loading is CPU/IO heavy and synchronous; run_in_executor keeps the
    server responsive (e.g. /health answers while the model is still loading).
    """
    # FIX: asyncio.get_event_loop() is deprecated inside a coroutine since
    # Python 3.10 — get_running_loop() is the correct call here.
    loop = asyncio.get_running_loop()
    await loop.run_in_executor(None, load_model)
|
|
| |
def _voice_entry(label: str, lang: str, gender: str, flag: str, code: str) -> dict:
    """Build one voice-metadata record for the VOICES catalog."""
    return {"label": label, "lang": lang, "gender": gender, "flag": flag, "code": code}


# (label, BCP-47-ish language tag, gender, flag emoji, Kokoro pipeline code)
# for every voice this server exposes, keyed by Kokoro voice id.
_VOICE_SPECS = {
    # American English (pipeline code "a")
    "af_heart":      ("Heart",      "en-US", "female", "🇺🇸", "a"),
    "af_bella":      ("Bella",      "en-US", "female", "🇺🇸", "a"),
    "af_nicole":     ("Nicole",     "en-US", "female", "🇺🇸", "a"),
    "af_aoede":      ("Aoede",      "en-US", "female", "🇺🇸", "a"),
    "af_kore":       ("Kore",       "en-US", "female", "🇺🇸", "a"),
    "af_sarah":      ("Sarah",      "en-US", "female", "🇺🇸", "a"),
    "af_nova":       ("Nova",       "en-US", "female", "🇺🇸", "a"),
    "af_sky":        ("Sky",        "en-US", "female", "🇺🇸", "a"),
    "af_river":      ("River",      "en-US", "female", "🇺🇸", "a"),
    "am_michael":    ("Michael",    "en-US", "male",   "🇺🇸", "a"),
    "am_fenrir":     ("Fenrir",     "en-US", "male",   "🇺🇸", "a"),
    "am_puck":       ("Puck",       "en-US", "male",   "🇺🇸", "a"),
    "am_echo":       ("Echo",       "en-US", "male",   "🇺🇸", "a"),
    "am_eric":       ("Eric",       "en-US", "male",   "🇺🇸", "a"),
    "am_liam":       ("Liam",       "en-US", "male",   "🇺🇸", "a"),
    "am_adam":       ("Adam",       "en-US", "male",   "🇺🇸", "a"),
    # British English (pipeline code "b")
    "bf_emma":       ("Emma",       "en-GB", "female", "🇬🇧", "b"),
    "bf_isabella":   ("Isabella",   "en-GB", "female", "🇬🇧", "b"),
    "bf_alice":      ("Alice",      "en-GB", "female", "🇬🇧", "b"),
    "bf_lily":       ("Lily",       "en-GB", "female", "🇬🇧", "b"),
    "bm_george":     ("George",     "en-GB", "male",   "🇬🇧", "b"),
    "bm_fable":      ("Fable",      "en-GB", "male",   "🇬🇧", "b"),
    "bm_lewis":      ("Lewis",      "en-GB", "male",   "🇬🇧", "b"),
    "bm_daniel":     ("Daniel",     "en-GB", "male",   "🇬🇧", "b"),
    # Spanish (pipeline code "e")
    "ef_dora":       ("Dora",       "es",    "female", "🇪🇸", "e"),
    "em_alex":       ("Alex",       "es",    "male",   "🇪🇸", "e"),
    # French (pipeline code "f")
    "ff_siwis":      ("Siwis",      "fr",    "female", "🇫🇷", "f"),
    # Hindi (pipeline code "h")
    "hf_alpha":      ("Alpha",      "hi",    "female", "🇮🇳", "h"),
    "hf_beta":       ("Beta",       "hi",    "female", "🇮🇳", "h"),
    "hm_omega":      ("Omega",      "hi",    "male",   "🇮🇳", "h"),
    "hm_psi":        ("Psi",        "hi",    "male",   "🇮🇳", "h"),
    # Italian (pipeline code "i")
    "if_sara":       ("Sara",       "it",    "female", "🇮🇹", "i"),
    "im_nicola":     ("Nicola",     "it",    "male",   "🇮🇹", "i"),
    # Japanese (pipeline code "j")
    "jf_alpha":      ("Alpha",      "ja",    "female", "🇯🇵", "j"),
    "jf_gongitsune": ("Gongitsune", "ja",    "female", "🇯🇵", "j"),
    "jf_nezumi":     ("Nezumi",     "ja",    "female", "🇯🇵", "j"),
    "jm_kumo":       ("Kumo",       "ja",    "male",   "🇯🇵", "j"),
    # Brazilian Portuguese (pipeline code "p")
    "pf_dora":       ("Dora",       "pt",    "female", "🇧🇷", "p"),
    "pm_alex":       ("Alex",       "pt",    "male",   "🇧🇷", "p"),
    # Mandarin Chinese (pipeline code "z")
    "zf_xiaobei":    ("Xiaobei",    "zh",    "female", "🇨🇳", "z"),
    "zf_xiaoxiao":   ("Xiaoxiao",   "zh",    "female", "🇨🇳", "z"),
    "zm_yunjian":    ("Yunjian",    "zh",    "male",   "🇨🇳", "z"),
    "zm_yunxi":      ("Yunxi",      "zh",    "male",   "🇨🇳", "z"),
}

# Public catalog: same dict-of-dicts shape the rest of the module consumes.
VOICES = {voice_id: _voice_entry(*spec) for voice_id, spec in _VOICE_SPECS.items()}
|
|
| |
class TTSRequest(BaseModel):
    """Request body for POST /tts."""

    # Text to synthesize; the endpoint rejects whitespace-only input with 400.
    text: str
    # Voice id — must be a key of VOICES (see GET /voices).
    voice: str = "af_heart"
    # Speech speed multiplier; the endpoint accepts 0.5–2.0 inclusive.
    speed: float = 1.0
    # Requested container: "wav" or "mp3".
    output_format: str = "wav"
|
|
|
|
| |
def _synthesize_to_bytes(text: str, voice: str, speed: float, output_format: str) -> tuple[bytes, float]:
    """Synchronously synthesize `text` and return (encoded_audio_bytes, duration_seconds).

    Runs in a thread-pool executor (see /tts) because Kokoro inference is blocking.

    Raises:
        RuntimeError: model not loaded, or synthesis produced no audio.
        ValueError: unknown voice id, or its language pipeline is unavailable.
    """
    if model is None:
        raise RuntimeError("Model not loaded yet")
    voice_info = VOICES.get(voice)
    if not voice_info:
        raise ValueError(f"Unknown voice: {voice}")
    pipeline = pipelines.get(voice_info["code"])
    if not pipeline:
        raise ValueError(f"No pipeline for lang code: {voice_info['code']}")
    voice_pack = pipeline.load_voice(voice)
    all_audio = []
    # Inference only — no_grad avoids building autograd state per chunk.
    with torch.no_grad():
        # Pipeline splits the text on blank lines; each chunk yields a phoneme
        # sequence `ps` that the shared model turns into audio.
        for _, ps, _ in pipeline(text, voice, speed, split_pattern=r"\n+"):
            # Reference style vector indexed by phoneme-sequence length
            # (Kokoro voice-pack convention — TODO confirm against kokoro docs).
            ref_s = voice_pack[len(ps) - 1].to(device)
            all_audio.append(model(ps, ref_s, speed).cpu().numpy())
    if not all_audio:
        raise RuntimeError("No audio generated")
    final_audio = np.concatenate(all_audio)
    duration = len(final_audio) / 24000  # Kokoro outputs 24 kHz mono audio
    buf = io.BytesIO()
    # BUG FIX: output_format was previously ignored — WAV bytes were produced
    # even when "mp3" was requested (and then served as audio/mpeg). Honor it.
    # NOTE(review): MP3 writing needs soundfile>=0.12 with libsndfile>=1.1.0 —
    # confirm in the deployment environment.
    sf.write(buf, final_audio, 24000, format="MP3" if output_format == "mp3" else "WAV")
    return buf.getvalue(), duration
|
|
| |
|
|
@app.get("/")
async def root():
    """Serve the single-page UI.

    CONSISTENCY FIX: build the path from STATIC_DIR instead of repeating the
    "static" literal, so the route stays correct if the directory ever moves.
    """
    return FileResponse(str(STATIC_DIR / "index.html"))
|
|
@app.get("/health")
async def health():
    """Report readiness: whether the model is loaded, the device in use,
    CUDA availability, and which language pipelines came up."""
    return dict(
        status="ok",
        model_loaded=model is not None,
        device=device,
        cuda=CUDA_AVAILABLE,
        pipelines=list(pipelines.keys()),
    )
|
|
@app.get("/voices")
async def list_voices():
    """List voices whose language pipeline actually loaded, flat and grouped by language."""
    # Keep only voices backed by a working pipeline.
    available = {}
    for voice_id, info in VOICES.items():
        if info["code"] in pipelines:
            available[voice_id] = info

    # Group by language tag; each entry carries its id alongside the metadata.
    grouped = {}
    for voice_id, info in available.items():
        grouped.setdefault(info["lang"], []).append({"id": voice_id, **info})

    return {"voices": available, "grouped": grouped, "total": len(available)}
|
|
@app.post("/tts")
async def text_to_speech(request: TTSRequest):
    """Synthesize speech from text and stream the audio back.

    Validates the request (400 on bad input, 503 while the model loads),
    runs the blocking synthesis in a thread-pool executor, and returns the
    audio as an attachment with the duration in an X-Duration-Seconds header.
    """
    if not request.text.strip():
        raise HTTPException(400, "text cannot be empty")
    if request.voice not in VOICES:
        # FIX: was an f-string with no placeholders (ruff F541).
        raise HTTPException(400, "Unknown voice. GET /voices for list.")
    if not 0.5 <= request.speed <= 2.0:
        raise HTTPException(400, "speed must be between 0.5 and 2.0")
    if request.output_format not in ("wav", "mp3"):
        raise HTTPException(400, "output_format must be wav or mp3")
    if model is None:
        raise HTTPException(503, "Model is still loading, please retry in a moment")

    try:
        # FIX: get_event_loop() is deprecated inside a coroutine (Py 3.10+).
        loop = asyncio.get_running_loop()
        audio_bytes, duration = await loop.run_in_executor(
            None,
            lambda: _synthesize_to_bytes(request.text, request.voice, request.speed, request.output_format),
        )
    except Exception as e:
        # logger.exception keeps the traceback; chain the cause for debugging.
        logger.exception("TTS error")
        raise HTTPException(500, str(e)) from e

    fmt = request.output_format
    return StreamingResponse(
        io.BytesIO(audio_bytes),
        media_type="audio/mpeg" if fmt == "mp3" else "audio/wav",
        headers={
            "Content-Disposition": f'attachment; filename="kokoro_{request.voice}.{fmt}"',
            "X-Duration-Seconds": str(round(duration, 2)),
        },
    )
|
|
|
|