File size: 1,510 Bytes
97f0fe1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import base64
import io
import os
import tempfile

from fastapi import FastAPI, HTTPException
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
import torchaudio as ta
from chatterbox.tts import ChatterboxTTS

# Application object; the route decorators further down register handlers on it.
app = FastAPI()

# Load model once at startup
# This runs at import time, so importing the module blocks until the model is
# constructed. The single `model` instance (CPU device) is then reused by every
# call to the /synthesize handler in this process.
print("Loading ChatterboxTTS model...")
model = ChatterboxTTS.from_pretrained(device="cpu")
print("Model ready.")


# Request body for the /synthesize endpoint.
#
# `text` is the only required field. The three float settings are forwarded
# unchanged as keyword arguments to `model.generate`; their valid ranges and
# exact effect are defined by ChatterboxTTS, not by this file.
class TTSRequest(BaseModel):
    text: str                           # text handed to model.generate()
    ref_audio: str | None = None        # base64-encoded audio file
    exaggeration: float = 0.5           # passed through as `exaggeration=`
    cfg_weight: float = 0.5             # passed through as `cfg_weight=`
    temperature: float = 0.8            # passed through as `temperature=`


@app.get("/health")
def health():
    """Liveness probe: always answers with a fixed ``status: ok`` payload."""
    payload = dict(status="ok")
    return payload


@app.post("/synthesize")
def synthesize(req: TTSRequest):
    """Generate speech for ``req.text`` and return it as a WAV response.

    When ``req.ref_audio`` is set it is base64-decoded and written to a
    temporary ``.wav`` file whose path is passed to the model as
    ``audio_prompt_path``. The file is removed once generation has finished,
    whether it succeeded or raised.

    Args:
        req: Parsed request body with the text, the optional reference audio
            and the generation settings forwarded to ``model.generate``.

    Returns:
        A ``StreamingResponse`` with media type ``audio/wav`` holding the
        generated waveform saved at the model's sample rate (``model.sr``).

    Raises:
        HTTPException: With status 400 when ``ref_audio`` cannot be decoded
            as base64 (previously this surfaced as an unhandled 500).
    """
    ref_path = None

    try:
        # Write ref audio to a temp file if provided
        if req.ref_audio:
            try:
                audio_bytes = base64.b64decode(req.ref_audio)
            except ValueError as exc:
                # binascii.Error (bad padding / bad characters) is a
                # ValueError subclass; this is a client error, not a server fault.
                raise HTTPException(
                    status_code=400,
                    detail="ref_audio is not valid base64",
                ) from exc

            # delete=False because the file is handed to the model by path and
            # must outlive this handle; the `with` guarantees it is flushed and
            # closed before generation starts.
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
                # Record the path before writing so a failed write is still
                # cleaned up by the `finally` below.
                ref_path = tmp.name
                tmp.write(audio_bytes)

        wav = model.generate(
            req.text,
            audio_prompt_path=ref_path,
            exaggeration=req.exaggeration,
            cfg_weight=req.cfg_weight,
            temperature=req.temperature,
        )
    finally:
        if ref_path is not None:
            try:
                os.unlink(ref_path)
            except FileNotFoundError:
                # Already gone; nothing left to clean up.
                pass

    # Write wav to buffer and return as audio/wav
    buf = io.BytesIO()
    ta.save(buf, wav, model.sr, format="wav")
    buf.seek(0)

    return StreamingResponse(buf, media_type="audio/wav")