"""FastAPI service exposing ChatterboxTTS speech synthesis over HTTP.

Endpoints:
    GET  /health      -- liveness probe.
    POST /synthesize  -- synthesize speech from text, returns a WAV stream.
"""

import base64
import binascii
import io
import os
import tempfile

from fastapi import FastAPI, HTTPException
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
import torchaudio as ta

from chatterbox.tts import ChatterboxTTS

app = FastAPI()

# Load the model once at import time so every request reuses the same weights.
print("Loading ChatterboxTTS model...")
model = ChatterboxTTS.from_pretrained(device="cpu")
print("Model ready.")


class TTSRequest(BaseModel):
    """Payload for POST /synthesize."""

    text: str
    # Optional base64-encoded reference audio used as a voice prompt.
    ref_audio: str | None = None
    exaggeration: float = 0.5
    cfg_weight: float = 0.5
    temperature: float = 0.8


@app.get("/health")
def health():
    """Liveness probe; returns a static OK status."""
    return {"status": "ok"}


@app.post("/synthesize")
def synthesize(req: TTSRequest):
    """Generate speech for ``req.text`` and stream it back as ``audio/wav``.

    Raises:
        HTTPException(400): if ``text`` is blank or ``ref_audio`` is not
            valid base64.
    """
    if not req.text.strip():
        raise HTTPException(status_code=400, detail="text must be non-empty")

    ref_path = None
    if req.ref_audio:
        # Malformed base64 is a client error, not a server crash.
        try:
            audio_bytes = base64.b64decode(req.ref_audio)
        except (binascii.Error, ValueError):
            raise HTTPException(
                status_code=400, detail="ref_audio is not valid base64"
            )
        # delete=False: the path must outlive this handle so model.generate
        # can reopen the file by name; we unlink it ourselves below.
        tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        try:
            tmp.write(audio_bytes)
        finally:
            # Close even if the write fails, so the handle never leaks.
            tmp.close()
        ref_path = tmp.name

    try:
        wav = model.generate(
            req.text,
            audio_prompt_path=ref_path,
            exaggeration=req.exaggeration,
            cfg_weight=req.cfg_weight,
            temperature=req.temperature,
        )
    finally:
        # Always remove the temp prompt file, even when generation fails.
        if ref_path and os.path.exists(ref_path):
            os.unlink(ref_path)

    # Serialize the waveform to an in-memory WAV and stream it to the client.
    buf = io.BytesIO()
    ta.save(buf, wav, model.sr, format="wav")
    buf.seek(0)
    return StreamingResponse(buf, media_type="audio/wav")