File size: 2,306 Bytes
eca1fe4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import io
import numpy as np
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
import soundfile as sf
from kokoro import KPipeline

app = FastAPI(title="Kokoro TTS API")

# Allow requests from any origin (your Vercel frontend)
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["POST", "GET"],
    allow_headers=["*"],
)

# Load the pipeline once at startup — reused for all requests
# 'a' = American English. Add more pipelines if you want other languages.
print("Loading Kokoro TTS pipeline...")
pipeline = KPipeline(lang_code='a')
print("Kokoro TTS pipeline ready!")


class TTSRequest(BaseModel):
    text: str
    voice: str = "af_bella"
    speed: float = 1.0


@app.get("/")
def root():
    return {"status": "ok", "service": "Kokoro TTS API"}


@app.get("/health")
def health():
    return {"status": "healthy"}


@app.post("/tts")
async def tts(request: TTSRequest):
    if not request.text or len(request.text.strip()) == 0:
        raise HTTPException(status_code=400, detail="Text is required")

    if len(request.text) > 5000:
        raise HTTPException(status_code=400, detail="Text too long (max 5000 chars per request)")

    try:
        # Generate audio chunks from Kokoro
        audio_chunks = []
        for _gs, _ps, audio in pipeline(
            request.text,
            voice=request.voice,
            speed=request.speed
        ):
            audio_chunks.append(audio)

        if not audio_chunks:
            raise HTTPException(status_code=500, detail="No audio generated")

        # Combine all chunks into a single float32 array
        combined = np.concatenate(audio_chunks)

        # Encode as WAV and return as a streaming binary response
        buf = io.BytesIO()
        sf.write(buf, combined, samplerate=24000, format="WAV", subtype="PCM_16")
        buf.seek(0)

        return StreamingResponse(
            buf,
            media_type="audio/wav",
            headers={"Content-Disposition": "attachment; filename=audio.wav"}
        )

    except HTTPException:
        raise
    except Exception as e:
        print(f"TTS Error: {e}")
        raise HTTPException(status_code=500, detail=str(e))