Deploy: CPU-optimized TTS with critical thread safety fixes
Browse files- Dockerfile +30 -0
- README.md +112 -7
- __pycache__/app.cpython-311.pyc +0 -0
- app.py +705 -0
- requirements.txt +29 -0
Dockerfile
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim

# Cache locations under /data (persistent on HF Spaces) and conservative
# CPU threading knobs for a 2 vCPU free-tier machine.
# NOTE(review): OMP_NUM_THREADS=1 here, while app.py falls back to "2" when
# the variable is unset — the container therefore runs with 1 torch thread.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    HF_HOME=/data/hf \
    TRANSFORMERS_CACHE=/data/hf/transformers \
    TORCH_HOME=/data/hf/torch \
    COQUI_TOS_AGREED=1 \
    OMP_NUM_THREADS=1 \
    MKL_NUM_THREADS=1 \
    NUMEXPR_NUM_THREADS=2 \
    KMP_BLOCKTIME=1 \
    KMP_AFFINITY="granularity=fine,compact,1,0"

# System deps: ffmpeg/libsndfile1 for audio I/O, git + build-essential for
# pip packages that build from source.
RUN apt-get update && apt-get install -y --no-install-recommends \
    git \
    ffmpeg \
    libsndfile1 \
    build-essential \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Install Python dependencies before copying the app to maximize layer caching.
COPY requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r /app/requirements.txt

COPY app.py /app/app.py

# HF Spaces routes external traffic to port 7860; one worker keeps RAM bounded.
EXPOSE 7860
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
|
README.md
CHANGED
|
@@ -1,10 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
---
|
| 9 |
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# HF Spaces CPU TTS API (Docker)
|
| 2 |
+
|
| 3 |
+
API-only Space with **separate endpoints** for:
|
| 4 |
+
|
| 5 |
+
- **XTTS v2** (voice cloning with an uploaded reference clip; Polish by default)
|
| 6 |
+
- **Parler-TTS mini multilingual v1.1** (fast, high-quality Polish TTS; style controlled by text description)
|
| 7 |
+
- **Piper** (backup, local voices; bring your own `.onnx` voice files)
|
| 8 |
+
|
| 9 |
+
Runs on **HF Spaces free CPU (2 vCPU / 16GB RAM)** with CPU-friendly defaults:
|
| 10 |
+
- **Chunking** (sentence-based) to avoid timeouts on long text
|
| 11 |
+
- **Streaming** via SSE (each chunk returned as a standalone WAV)
|
| 12 |
+
- Optional **torch.compile** and optional **dynamic int8 quantization** hooks
|
| 13 |
+
|
| 14 |
---
|
| 15 |
+
|
| 16 |
+
## Endpoints
|
| 17 |
+
|
| 18 |
+
### Health
|
| 19 |
+
- `GET /health`
|
| 20 |
+
|
| 21 |
+
### XTTS v2
|
| 22 |
+
- `POST /v1/xtts/synthesize` (multipart/form-data; WAV bytes)
|
| 23 |
+
- `POST /v1/xtts/stream` (SSE; base64 WAV chunks)
|
| 24 |
+
|
| 25 |
+
### Parler
|
| 26 |
+
- `POST /v1/parler/synthesize` (JSON; WAV bytes)
|
| 27 |
+
- `POST /v1/parler/stream` (JSON; SSE base64 WAV chunks)
|
| 28 |
+
|
| 29 |
+
### Piper
|
| 30 |
+
- `GET /v1/piper/voices`
|
| 31 |
+
- `POST /v1/piper/synthesize` (JSON; WAV bytes)
|
| 32 |
+
|
| 33 |
+
OpenAPI docs:
|
| 34 |
+
- `/docs`
|
| 35 |
+
|
| 36 |
---
|
| 37 |
|
| 38 |
+
## Usage examples
|
| 39 |
+
|
| 40 |
+
### XTTS voice cloning (file upload)
|
| 41 |
+
```bash
|
| 42 |
+
curl -X POST "http://localhost:7860/v1/xtts/synthesize" \
|
| 43 |
+
-F "text=Cześć! To jest test głosu." \
|
| 44 |
+
-F "language=pl" \
|
| 45 |
+
-F "chunking=true" \
|
| 46 |
+
-F "speaker_wav=@reference.wav" \
|
| 47 |
+
--output out.wav
|
| 48 |
+
```
|
| 49 |
+
|
| 50 |
+
### XTTS streaming (SSE)
|
| 51 |
+
This streams **multiple WAV chunks** (base64) as events. Your client should decode each `wav_b64` and play/append.
|
| 52 |
+
```bash
|
| 53 |
+
curl -N -X POST "http://localhost:7860/v1/xtts/stream" \
|
| 54 |
+
-H "Content-Type: application/json" \
|
| 55 |
+
-d '{"text":"Cześć! To jest dłuższy tekst. Druga fraza. Trzecia fraza.","language":"pl","chunking":true}'
|
| 56 |
+
```
|
| 57 |
+
|
| 58 |
+
### Parler synth
|
| 59 |
+
```bash
|
| 60 |
+
curl -X POST "http://localhost:7860/v1/parler/synthesize" \
|
| 61 |
+
-H "Content-Type: application/json" \
|
| 62 |
+
-d '{
|
| 63 |
+
"text":"Cześć! To Parler w języku polskim.",
|
| 64 |
+
"description":"A calm female Polish voice, close-mic, warm tone, subtle smile, studio quality."
|
| 65 |
+
}' \
|
| 66 |
+
--output parler.wav
|
| 67 |
+
```
|
| 68 |
+
|
| 69 |
+
### Piper voices + synth
|
| 70 |
+
```bash
|
| 71 |
+
curl "http://localhost:7860/v1/piper/voices"
|
| 72 |
+
|
| 73 |
+
curl -X POST "http://localhost:7860/v1/piper/synthesize" \
|
| 74 |
+
-H "Content-Type: application/json" \
|
| 75 |
+
-d '{"text":"To jest Piper jako kopia zapasowa.","voice_id":"pl_PL-gosia-medium"}' \
|
| 76 |
+
--output piper.wav
|
| 77 |
+
```
|
| 78 |
+
|
| 79 |
+
---
|
| 80 |
+
|
| 81 |
+
## Environment variables (important knobs)
|
| 82 |
+
|
| 83 |
+
### XTTS
|
| 84 |
+
- `XTTS_MODEL_NAME` (default: `tts_models/multilingual/multi-dataset/xtts_v2`)
|
| 85 |
+
- `XTTS_DEFAULT_LANGUAGE` (default: `pl`)
|
| 86 |
+
- `XTTS_TORCH_COMPILE=1` to attempt `torch.compile()` (best-effort)
|
| 87 |
+
- `XTTS_DYNAMIC_INT8=1` to attempt dynamic int8 quantization (best-effort)
|
| 88 |
+
|
| 89 |
+
### Parler
|
| 90 |
+
- `PARLER_MODEL_NAME` (default: `parler-tts/parler-tts-mini-multilingual-v1.1`)
|
| 91 |
+
- `PARLER_DEFAULT_DESCRIPTION` (default is neutral Polish)
|
| 92 |
+
- `PARLER_SEED` (default: `0`)
|
| 93 |
+
- `PARLER_TORCH_COMPILE=1` (best-effort)
|
| 94 |
+
- `PARLER_DYNAMIC_INT8=1` (best-effort)
|
| 95 |
+
|
| 96 |
+
### Chunking / joining
|
| 97 |
+
- `CHUNK_MAX_CHARS` (default: 260)
|
| 98 |
+
- `CHUNK_MAX_WORDS` (default: 40)
|
| 99 |
+
- `CHUNK_MAX_SENTENCES` (default: 8) — caps the number of generated chunks per request; text beyond the cap is dropped to avoid CPU timeouts
|
| 100 |
+
- `JOIN_SILENCE_MS` (default: 60)
|
| 101 |
+
|
| 102 |
+
### Piper
|
| 103 |
+
Bring your own Piper `.onnx` voices:
|
| 104 |
+
- Put voice files in `/data/piper` (auto-scanned) **OR**
|
| 105 |
+
- Set `PIPER_VOICES_JSON='{"voice_id":"/data/piper/voice.onnx"}'`
|
| 106 |
+
- Optionally set `PIPER_VOICES_DIR` (default: `/data/piper`)
|
| 107 |
+
|
| 108 |
+
---
|
| 109 |
+
|
| 110 |
+
## Notes on “streaming”
|
| 111 |
+
XTTS and Parler streaming here is implemented by:
|
| 112 |
+
1) **Sentence chunking** (fast + stable on CPU)
|
| 113 |
+
2) Returning each chunk as its own **WAV** event over SSE
|
| 114 |
+
|
| 115 |
+
This avoids needing the full WAV length upfront and prevents long-run timeouts on free Spaces CPU.
|
__pycache__/app.cpython-311.pyc
ADDED
|
Binary file (38.9 kB). View file
|
|
|
app.py
ADDED
|
@@ -0,0 +1,705 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
HF Spaces (Docker) CPU-only TTS API
|
| 3 |
+
- Separate endpoints per service: XTTS v2, Parler-TTS mini multilingual, Piper.
|
| 4 |
+
- CPU-friendly defaults for 2 vCPU / 16 GB RAM:
|
| 5 |
+
- Sentence chunking (default ON)
|
| 6 |
+
- Streaming via SSE (each chunk returned as standalone WAV)
|
| 7 |
+
- Optional torch.compile, optional dynamic int8 quantization hooks
|
| 8 |
+
Notes:
|
| 9 |
+
- This repo is intentionally API-only (no Gradio UI).
|
| 10 |
+
- HF Spaces expects a web server on port 7860.
|
| 11 |
+
"""
|
| 12 |
+
from __future__ import annotations
|
| 13 |
+
|
| 14 |
+
import asyncio
|
| 15 |
+
import base64
|
| 16 |
+
import io
|
| 17 |
+
import json
|
| 18 |
+
import os
|
| 19 |
+
import re
|
| 20 |
+
import tempfile
|
| 21 |
+
import threading
|
| 22 |
+
import time
|
| 23 |
+
from dataclasses import dataclass
|
| 24 |
+
from functools import lru_cache
|
| 25 |
+
from typing import Dict, Generator, Iterable, List, Optional, Tuple
|
| 26 |
+
|
| 27 |
+
import numpy as np
|
| 28 |
+
import soundfile as sf
|
| 29 |
+
import torch
|
| 30 |
+
from fastapi import FastAPI, File, Form, HTTPException, UploadFile
|
| 31 |
+
from fastapi.responses import Response, StreamingResponse
|
| 32 |
+
from pydantic import BaseModel, Field
|
| 33 |
+
|
| 34 |
+
# --- Optional deps (import lazily where possible) ---
|
| 35 |
+
# XTTS (Coqui TTS)
|
| 36 |
+
from TTS.api import TTS
|
| 37 |
+
|
| 38 |
+
# Parler-TTS (transformers)
|
| 39 |
+
from transformers import AutoTokenizer, set_seed
|
| 40 |
+
try:
|
| 41 |
+
from parler_tts import ParlerTTSForConditionalGeneration
|
| 42 |
+
except Exception:
|
| 43 |
+
ParlerTTSForConditionalGeneration = None # type: ignore
|
| 44 |
+
|
| 45 |
+
# Piper fallback
|
| 46 |
+
try:
|
| 47 |
+
from piper.voice import PiperVoice
|
| 48 |
+
except Exception:
|
| 49 |
+
PiperVoice = None # type: ignore
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
# -----------------------
|
| 53 |
+
# Settings / knobs
|
| 54 |
+
# -----------------------
|
| 55 |
+
def _env_bool(name: str, default: bool = False) -> bool:
|
| 56 |
+
v = os.getenv(name)
|
| 57 |
+
if v is None:
|
| 58 |
+
return default
|
| 59 |
+
return v.strip().lower() in {"1", "true", "yes", "y", "on"}
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
@dataclass(frozen=True)
class Settings:
    """Immutable runtime configuration, resolved from environment variables
    once at import time (env changes after startup have no effect)."""
    # XTTS v2
    xtts_model_name: str = os.getenv("XTTS_MODEL_NAME", "tts_models/multilingual/multi-dataset/xtts_v2")
    xtts_default_language: str = os.getenv("XTTS_DEFAULT_LANGUAGE", "pl")
    xtts_torch_compile: bool = _env_bool("XTTS_TORCH_COMPILE", False)
    xtts_dynamic_int8: bool = _env_bool("XTTS_DYNAMIC_INT8", False)

    # Parler
    parler_model_name: str = os.getenv("PARLER_MODEL_NAME", "parler-tts/parler-tts-mini-multilingual-v1.1")
    parler_default_description: str = os.getenv(
        "PARLER_DEFAULT_DESCRIPTION",
        # Keep this neutral; users can override per-request.
        "A clear, natural, studio-recorded voice speaking Polish with steady pacing.",
    )
    parler_seed: int = int(os.getenv("PARLER_SEED", "0"))
    parler_torch_compile: bool = _env_bool("PARLER_TORCH_COMPILE", False)
    parler_dynamic_int8: bool = _env_bool("PARLER_DYNAMIC_INT8", False)

    # Piper
    # Provide a JSON mapping voice_id -> model_path; or mount voices into /data/piper and auto-scan.
    piper_voices_json: str = os.getenv("PIPER_VOICES_JSON", "")
    piper_voices_dir: str = os.getenv("PIPER_VOICES_DIR", "/data/piper")

    # Chunking / streaming defaults
    chunk_max_chars: int = int(os.getenv("CHUNK_MAX_CHARS", "260"))
    chunk_max_words: int = int(os.getenv("CHUNK_MAX_WORDS", "40"))
    chunk_max_sentences: int = int(os.getenv("CHUNK_MAX_SENTENCES", "8"))  # cap chunks to avoid CPU timeouts
    join_silence_ms: int = int(os.getenv("JOIN_SILENCE_MS", "60"))

    # Runtime
    # NOTE(review): the Dockerfile exports OMP_NUM_THREADS=1, so this fallback
    # of "2" only applies when running outside that image — confirm intended.
    num_threads: int = int(os.getenv("OMP_NUM_THREADS", "2"))  # set via Dockerfile already
    request_timeout_s: int = int(os.getenv("REQUEST_TIMEOUT_S", "240"))


S = Settings()

# Conservative CPU threading.
torch.set_num_threads(S.num_threads)
torch.set_num_interop_threads(max(1, S.num_threads // 2))
|
| 102 |
+
|
| 103 |
+
# -----------------------
|
| 104 |
+
# Utilities
|
| 105 |
+
# -----------------------
|
| 106 |
+
# Splits after sentence-ish punctuation (. ! ? : ;) or on newline runs.
_SENT_SPLIT_RE = re.compile(r"(?<=[\.\!\?\:\;])\s+|\n+")
_WS_RE = re.compile(r"\s+")

def normalize_text(text: str) -> str:
    """Collapse every whitespace run to a single space and trim both ends."""
    return _WS_RE.sub(" ", text).strip()
|
| 113 |
+
|
| 114 |
+
def split_text_into_chunks(
    text: str,
    max_chars: int = S.chunk_max_chars,
    max_words: int = S.chunk_max_words,
    max_sentences: int = S.chunk_max_sentences,
) -> List[str]:
    """
    Chunking strategy:
    1) Sentence split (cheap, deterministic).
    2) Merge sentences into chunks constrained by max_chars and max_words.

    Notes:
    - Despite its name, ``max_sentences`` caps the number of *chunks* produced;
      once the cap is reached the remaining sentences are silently dropped
      (deliberate truncation to avoid CPU timeouts on free Spaces).
    - A single sentence longer than ``max_chars``/``max_words`` is still kept
      whole — limits apply when merging, not when splitting.
    - Defaults are bound from ``S`` at import time.
    """
    text = normalize_text(text)
    if not text:
        return []

    # Sentence list with empties removed (regex may produce blank splits).
    sents = [s.strip() for s in _SENT_SPLIT_RE.split(text) if s.strip()]
    chunks: List[str] = []
    cur: List[str] = []
    cur_chars = 0
    cur_words = 0

    def flush():
        # Emit the accumulated sentences as one chunk and reset the counters.
        nonlocal cur, cur_chars, cur_words
        if cur:
            chunks.append(" ".join(cur).strip())
        cur = []
        cur_chars = 0
        cur_words = 0

    for sent in sents:
        w = sent.split()
        sent_words = len(w)
        sent_chars = len(sent)
        # Flush *before* appending, so limits bound the chunk being built.
        if (cur_chars + sent_chars > max_chars) or (cur_words + sent_words > max_words):
            flush()
        cur.append(sent)
        cur_chars += sent_chars + 1  # +1 accounts for the joining space
        cur_words += sent_words
        # Chunk-count cap: stop early and drop the rest of the text.
        if max_sentences and len(chunks) + (1 if cur else 0) >= max_sentences:
            flush()
            break

    flush()
    return chunks
|
| 158 |
+
|
| 159 |
+
def wav_bytes_from_audio(audio: np.ndarray, sr: int) -> bytes:
    """Encode a waveform as an in-memory 16-bit PCM WAV file and return its bytes."""
    waveform = np.asarray(audio, dtype=np.float32)
    out = io.BytesIO()
    sf.write(out, waveform, sr, format="WAV", subtype="PCM_16")
    return out.getvalue()
|
| 164 |
+
|
| 165 |
+
def concat_audio(chunks: List[np.ndarray], sr: int, silence_ms: int = S.join_silence_ms) -> np.ndarray:
    """Join waveform chunks into one float32 array, inserting *silence_ms* of
    silence between consecutive chunks (never after the last one).

    An empty list yields a single-sample zero buffer; a single chunk is
    returned as-is (cast to float32).
    """
    if not chunks:
        return np.zeros((1,), dtype=np.float32)
    if len(chunks) == 1:
        return np.asarray(chunks[0], dtype=np.float32)

    gap = None
    if silence_ms > 0:
        gap = np.zeros((int(sr * (silence_ms / 1000.0)),), dtype=np.float32)

    pieces: List[np.ndarray] = []
    last = len(chunks) - 1
    for idx, part in enumerate(chunks):
        pieces.append(np.asarray(part, dtype=np.float32))
        if gap is not None and idx != last:
            pieces.append(gap)
    return np.concatenate(pieces, axis=0)
|
| 178 |
+
|
| 179 |
+
def b64encode_bytes(b: bytes) -> str:
    """Return *b* as an ASCII base64 string (used for SSE audio payloads)."""
    encoded = base64.b64encode(b)
    return encoded.decode("ascii")
|
| 181 |
+
|
| 182 |
+
def safe_filename(prefix: str = "audio", ext: str = ".wav") -> str:
    """Build a timestamped file name like ``audio_1700000000000.wav``.

    Millisecond resolution makes collisions unlikely but not impossible for
    concurrent callers.
    """
    millis = int(time.time() * 1000)
    return "".join((prefix, "_", str(millis), ext))
|
| 184 |
+
|
| 185 |
+
def _filter_kwargs(fn, kwargs: Dict) -> Dict:
|
| 186 |
+
"""
|
| 187 |
+
Accepts a kwargs dict, returns a subset that the callable supports.
|
| 188 |
+
This keeps the API stable even if underlying libs differ by version.
|
| 189 |
+
"""
|
| 190 |
+
import inspect
|
| 191 |
+
try:
|
| 192 |
+
sig = inspect.signature(fn)
|
| 193 |
+
except Exception:
|
| 194 |
+
return kwargs
|
| 195 |
+
accepted = set(sig.parameters.keys())
|
| 196 |
+
return {k: v for k, v in kwargs.items() if k in accepted}
|
| 197 |
+
|
| 198 |
+
# -----------------------
|
| 199 |
+
# Model manager (lazy + locked)
|
| 200 |
+
# -----------------------
|
| 201 |
+
class _Locks:
    # Process-wide locks, shared via class attributes (the class is never
    # instantiated). Load locks guard lazy construction; *_infer locks
    # serialize inference on the single shared model instance.
    xtts = threading.Lock()
    xtts_infer = threading.Lock()  # Protect XTTS inference
    parler = threading.Lock()
    parler_infer = threading.Lock()  # Protect Parler inference
    piper = threading.Lock()
|
| 207 |
+
|
| 208 |
+
class ModelManager:
    """Lazy, lock-guarded loader/cache for the three TTS backends.

    Models are constructed on first use (inside the corresponding _Locks
    lock) and cached for the lifetime of the process. compile/quantize
    steps are best-effort: any failure falls back to the plain fp32 model.
    """

    def __init__(self) -> None:
        self._xtts: Optional[TTS] = None        # cached Coqui XTTS wrapper
        self._parler = None                     # cached Parler model
        self._parler_tok = None                 # cached Parler tokenizer
        self._piper_voices: Dict[str, str] = {} # voice_id -> .onnx path registry
        self._piper_loaded: Dict[str, "PiperVoice"] = {}  # type: ignore

    def _maybe_torch_compile(self, module: torch.nn.Module) -> torch.nn.Module:
        """Best-effort torch.compile; returns the module unchanged on failure
        or when running a torch build without compile support."""
        if not hasattr(torch, "compile"):
            return module
        try:
            return torch.compile(module)  # type: ignore
        except Exception:
            return module

    def _maybe_dynamic_int8(self, module: torch.nn.Module) -> torch.nn.Module:
        """
        Dynamic int8 quantization is most useful for Linear-heavy CPU graphs.
        This is a best-effort hook: if it fails, we keep fp32.
        """
        try:
            from torch.ao.quantization import quantize_dynamic
            return quantize_dynamic(module, {torch.nn.Linear}, dtype=torch.qint8)
        except Exception:
            return module

    def get_xtts(self) -> TTS:
        """Return the (lazily created) XTTS instance.

        The load lock is held for the whole first-time download/construction,
        so concurrent first callers block until the model is ready.
        """
        with _Locks.xtts:
            if self._xtts is None:
                tts = TTS(model_name=S.xtts_model_name, progress_bar=False, gpu=False)
                # Best-effort: compile/quantize inner model if accessible.
                try:
                    # Some versions expose synthesizer.tts_model
                    inner = getattr(getattr(tts, "synthesizer", None), "tts_model", None)
                    if isinstance(inner, torch.nn.Module):
                        # Quantize first, then compile; each step re-assigns the
                        # (possibly wrapped) module back onto the synthesizer.
                        if S.xtts_dynamic_int8:
                            inner = self._maybe_dynamic_int8(inner)
                            tts.synthesizer.tts_model = inner
                        if S.xtts_torch_compile:
                            inner = self._maybe_torch_compile(inner)
                            tts.synthesizer.tts_model = inner
                except Exception:
                    pass
                self._xtts = tts
            return self._xtts

    def get_parler(self):
        """Return (model, tokenizer) for Parler-TTS, loading them on first use.

        Raises RuntimeError when the parler_tts package is unavailable.
        """
        with _Locks.parler:
            if ParlerTTSForConditionalGeneration is None:
                raise RuntimeError("parler_tts is not installed or failed to import.")
            if self._parler is None or self._parler_tok is None:
                tok = AutoTokenizer.from_pretrained(S.parler_model_name)
                model = ParlerTTSForConditionalGeneration.from_pretrained(S.parler_model_name).to("cpu")
                model.eval()
                # Best-effort compile/quantize
                if isinstance(model, torch.nn.Module):
                    if S.parler_dynamic_int8:
                        model = self._maybe_dynamic_int8(model)
                    if S.parler_torch_compile:
                        model = self._maybe_torch_compile(model)
                self._parler = model
                self._parler_tok = tok
            return self._parler, self._parler_tok

    def _load_piper_registry(self) -> Dict[str, str]:
        """Build voice_id -> model path map from PIPER_VOICES_JSON plus an
        auto-scan of the voices directory. JSON entries win over scanned ones
        (setdefault). All failures are swallowed: registry is best-effort."""
        reg: Dict[str, str] = {}
        if S.piper_voices_json:
            try:
                reg.update(json.loads(S.piper_voices_json))
            except Exception:
                pass
        # Auto-scan dir for *.onnx
        try:
            if os.path.isdir(S.piper_voices_dir):
                for fn in os.listdir(S.piper_voices_dir):
                    if fn.endswith(".onnx"):
                        voice_id = os.path.splitext(fn)[0]
                        reg.setdefault(voice_id, os.path.join(S.piper_voices_dir, fn))
        except Exception:
            pass
        return reg

    def list_piper_voices(self) -> Dict[str, str]:
        """Return a copy of the voice registry, populating it on first call.

        NOTE: the registry is cached after the first non-empty scan; voices
        added to the directory later are only picked up if the first scan
        found nothing.
        """
        with _Locks.piper:
            if not self._piper_voices:
                self._piper_voices = self._load_piper_registry()
            return dict(self._piper_voices)

    def get_piper(self, voice_id: str):
        """Return a loaded PiperVoice for *voice_id*, loading and caching it.

        Raises RuntimeError when piper-tts is missing and KeyError for an
        unknown voice_id.
        """
        if PiperVoice is None:
            raise RuntimeError("piper-tts is not installed or failed to import.")
        with _Locks.piper:
            if not self._piper_voices:
                self._piper_voices = self._load_piper_registry()
            if voice_id not in self._piper_voices:
                raise KeyError(f"Unknown Piper voice_id '{voice_id}'.")
            if voice_id not in self._piper_loaded:
                self._piper_loaded[voice_id] = PiperVoice.load(self._piper_voices[voice_id])
            return self._piper_loaded[voice_id]


# Single process-wide instance shared by all endpoints.
MM = ModelManager()
|
| 311 |
+
|
| 312 |
+
# -----------------------
|
| 313 |
+
# Request models
|
| 314 |
+
# -----------------------
|
| 315 |
+
class XttsJsonRequest(BaseModel):
    """JSON body for XTTS endpoints (the multipart form mirrors these fields)."""
    text: str = Field(..., min_length=1)
    language: str = Field(default=S.xtts_default_language)
    # speed/prosody knobs (best-effort; filtered by signature)
    speed: float = Field(default=1.0, ge=0.5, le=2.0)
    temperature: float = Field(default=0.7, ge=0.0, le=1.5)
    length_penalty: float = Field(default=1.0, ge=0.5, le=2.0)
    # chunking knobs
    chunking: bool = True
    chunk_max_chars: int = Field(default=S.chunk_max_chars, ge=80, le=2000)
    chunk_max_words: int = Field(default=S.chunk_max_words, ge=10, le=300)
    # 0 disables the chunk-count cap (see split_text_into_chunks)
    chunk_max_sentences: int = Field(default=S.chunk_max_sentences, ge=0, le=100)
    join_silence_ms: int = Field(default=S.join_silence_ms, ge=0, le=500)
|
| 328 |
+
|
| 329 |
+
class ParlerJsonRequest(BaseModel):
    """JSON body for Parler endpoints; `description` is the style prompt."""
    text: str = Field(..., min_length=1)
    description: str = Field(default=S.parler_default_description)
    # seed == 0 means "do not reseed" (see _parler_synthesize_chunks)
    seed: int = Field(default=S.parler_seed, ge=0, le=2**31-1)
    temperature: float = Field(default=0.7, ge=0.0, le=1.5)
    max_new_tokens: int = Field(default=512, ge=64, le=2048)
    chunking: bool = True
    chunk_max_chars: int = Field(default=S.chunk_max_chars, ge=80, le=2000)
    chunk_max_words: int = Field(default=S.chunk_max_words, ge=10, le=300)
    chunk_max_sentences: int = Field(default=S.chunk_max_sentences, ge=0, le=100)
    join_silence_ms: int = Field(default=S.join_silence_ms, ge=0, le=500)
|
| 340 |
+
|
| 341 |
+
class PiperJsonRequest(BaseModel):
    """JSON body for Piper synthesis; voice_id must exist in the registry."""
    text: str = Field(..., min_length=1)
    voice_id: str = Field(..., min_length=1)
    # Optional tuning (depends on Piper voice config support)
    length_scale: float = Field(default=1.0, ge=0.5, le=2.0)
    noise_scale: float = Field(default=0.667, ge=0.0, le=2.0)
    noise_w: float = Field(default=0.8, ge=0.0, le=2.0)
|
| 348 |
+
|
| 349 |
+
# -----------------------
|
| 350 |
+
# Inference functions
|
| 351 |
+
# -----------------------
|
| 352 |
+
def _xtts_synthesize_chunks(
    text: str,
    language: str,
    speaker_wav_path: Optional[str],
    speed: float,
    temperature: float,
    length_penalty: float,
    chunking: bool,
    chunk_max_chars: int,
    chunk_max_words: int,
    chunk_max_sentences: int,
) -> Tuple[int, List[np.ndarray]]:
    """
    Returns (sample_rate, [audio_chunks...]) where each chunk is a waveform.

    Raises ValueError when the text normalizes to empty. Unsupported kwargs
    (per the installed Coqui TTS version) are silently dropped via
    _filter_kwargs. Inference for all chunks happens under a single lock,
    so one long request serializes other XTTS requests.
    """
    tts = MM.get_xtts()
    segments = [normalize_text(text)] if not chunking else split_text_into_chunks(text, chunk_max_chars, chunk_max_words, max_sentences=chunk_max_sentences)
    if not segments:
        raise ValueError("Empty text after normalization.")
    chunks: List[np.ndarray] = []

    # Protect inference with lock (PyTorch models are not thread-safe for concurrent inference)
    with _Locks.xtts_infer:
        for seg in segments:
            kwargs = {
                "text": seg,
                "speaker_wav": speaker_wav_path,
                "language": language,
                "speed": speed,
                "temperature": temperature,
                "length_penalty": length_penalty,
            }
            kwargs = _filter_kwargs(tts.tts, kwargs)
            audio = tts.tts(**kwargs)  # numpy array
            chunks.append(np.asarray(audio, dtype=np.float32))

    # Best-effort sample rate extraction
    sr = getattr(getattr(tts, "synthesizer", None), "output_sample_rate", None)
    if not isinstance(sr, int):
        # XTTS commonly uses 24000
        sr = 24000
    return sr, chunks
|
| 394 |
+
|
| 395 |
+
def _parler_synthesize_chunks(
    text: str,
    description: str,
    seed: int,
    temperature: float,
    max_new_tokens: int,
    chunking: bool,
    chunk_max_chars: int,
    chunk_max_words: int,
    chunk_max_sentences: int,
) -> Tuple[int, List[np.ndarray]]:
    """
    Synthesize *text* with Parler-TTS, optionally split into sentence chunks.

    Returns (sample_rate, [waveform, ...]) with one float32 waveform per chunk.
    Raises ValueError when the text normalizes to empty and RuntimeError when
    generate() does not expose audio_values (library version mismatch).
    NOTE: seed == 0 (the default) leaves the RNG state untouched.
    """
    model, tok = MM.get_parler()
    if seed:
        set_seed(seed)

    segments = [normalize_text(text)] if not chunking else split_text_into_chunks(text, chunk_max_chars, chunk_max_words, max_sentences=chunk_max_sentences)
    if not segments:
        raise ValueError("Empty text after normalization.")

    chunks: List[np.ndarray] = []
    sr = getattr(model.config, "sampling_rate", 24000)

    # PERF: the style description is identical for every segment — tokenize it
    # once here instead of re-encoding it on every loop iteration.
    input_ids = tok(description, return_tensors="pt").input_ids

    # Protect inference with lock (PyTorch models are not thread-safe for concurrent inference)
    with _Locks.parler_infer:
        for seg in segments:
            prompt_ids = tok(seg, return_tensors="pt").input_ids
            gen_kwargs = {
                "input_ids": input_ids,
                "prompt_input_ids": prompt_ids,
                "temperature": temperature,
                "max_new_tokens": max_new_tokens,
            }
            # Drop kwargs this parler_tts version doesn't accept.
            gen_kwargs = _filter_kwargs(model.generate, gen_kwargs)
            out = model.generate(**gen_kwargs)
            # transformers audio generation outputs: audio_values
            audio = getattr(out, "audio_values", None)
            if audio is None:
                raise RuntimeError("Parler-TTS generate() did not return audio_values. Check transformers/parler_tts versions.")
            wav = audio.cpu().numpy().squeeze()
            chunks.append(np.asarray(wav, dtype=np.float32))

    return int(sr), chunks
|
| 438 |
+
|
| 439 |
+
def _piper_synthesize(text: str, voice_id: str, length_scale: float, noise_scale: float, noise_w: float) -> Tuple[int, np.ndarray]:
    """Synthesize *text* with a cached Piper voice.

    Returns (sample_rate, float32 waveform scaled to [-1, 1)).
    Raises KeyError for unknown voice_id, RuntimeError if piper-tts is missing.
    NOTE(review): assumes PiperVoice.synthesize() returns int16 samples
    directly; some piper-tts releases stream raw bytes instead — confirm
    against the pinned piper-tts version in requirements.txt.
    """
    voice = MM.get_piper(voice_id)
    audio = voice.synthesize(text, length_scale=length_scale, noise_scale=noise_scale, noise_w=noise_w)
    # PiperVoice returns int16 pcm at 22050/16000 depending on voice
    sr = getattr(voice, "sample_rate", 22050)
    audio = np.asarray(audio, dtype=np.float32) / 32768.0
    return int(sr), audio
|
| 446 |
+
|
| 447 |
+
# -----------------------
|
| 448 |
+
# Warmup & monitoring
|
| 449 |
+
# -----------------------
|
| 450 |
+
import logging
|
| 451 |
+
|
| 452 |
+
logger = logging.getLogger(__name__)

# Track warmup status
_warmup_complete = {"parler": False, "xtts": False}


def _warmup_parler_sync() -> float:
    """Blocking warmup body: load Parler-TTS and run one tiny generate() so
    the model graph is warm. Returns elapsed seconds."""
    start = time.time()
    model, tok = MM.get_parler()
    # Run tiny inference to compile model graph
    input_ids = tok("A neutral voice", return_tensors="pt").input_ids
    prompt_ids = tok("Test", return_tensors="pt").input_ids
    _ = model.generate(input_ids=input_ids, prompt_input_ids=prompt_ids, max_new_tokens=50)
    return time.time() - start


async def _warmup_task():
    """Pre-load Parler-TTS model on startup (non-blocking background task)."""
    try:
        logger.info("🔥 Warming up Parler-TTS model...")
        # BUGFIX: the original ran the blocking model load + generate() directly
        # in this coroutine, freezing the event loop (and every request) for the
        # whole warmup despite being advertised as non-blocking. Run the
        # blocking work in a worker thread instead.
        elapsed = await asyncio.to_thread(_warmup_parler_sync)
        _warmup_complete["parler"] = True
        logger.info(f"✅ Parler-TTS ready! (warmup took {elapsed:.1f}s)")
    except Exception as e:
        logger.error(f"❌ Warmup failed: {e}", exc_info=True)
        logger.warning("⚠️ First request will be slow (~60s)")
|
| 480 |
+
|
| 481 |
+
# -----------------------
|
| 482 |
+
# FastAPI app
|
| 483 |
+
# -----------------------
|
| 484 |
+
app = FastAPI(title="TTS API (XTTS v2 / Parler / Piper)", version="1.1.0")


@app.on_event("startup")
async def warmup():
    """Launch warmup in background to not block Space readiness check."""
    # BUGFIX: asyncio holds only a weak reference to tasks; discarding the
    # create_task() result lets the warmup task be garbage-collected mid-run
    # (per the asyncio docs). Keep a strong reference on app.state.
    app.state.warmup_task = asyncio.create_task(_warmup_task())
|
| 491 |
+
|
| 492 |
+
|
| 493 |
+
@app.get("/health")
def health():
    """Liveness/readiness probe.

    Reports warmup progress, which models are currently cached in memory,
    which backends are importable, and the chunking defaults in effect.
    """
    models_loaded = {
        "xtts": MM._xtts is not None,
        "parler": MM._parler is not None,
        "piper": len(MM._piper_loaded) > 0,
    }
    services = {
        "xtts": True,
        "parler": ParlerTTSForConditionalGeneration is not None,
        "piper": PiperVoice is not None,
    }
    defaults = {
        "chunk_max_chars": S.chunk_max_chars,
        "chunk_max_words": S.chunk_max_words,
        "join_silence_ms": S.join_silence_ms,
    }
    return {
        "status": "ok",
        "warmup_complete": _warmup_complete,
        "models_loaded": models_loaded,
        "services": services,
        "defaults": defaults,
    }
|
| 514 |
+
|
| 515 |
+
|
| 516 |
+
@app.get("/v1/piper/voices")
def list_piper_voices():
    """Enumerate the Piper voices known to the model manager."""
    try:
        voices = MM.list_piper_voices()
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
    return {"voices": voices}
|
| 522 |
+
|
| 523 |
+
|
| 524 |
+
# -------- XTTS endpoints (separate service) --------
@app.post("/v1/xtts/synthesize")
async def xtts_synthesize(
    # multipart for true voice cloning upload
    text: str = Form(...),
    language: str = Form(S.xtts_default_language),
    speed: float = Form(1.0),
    temperature: float = Form(0.7),
    length_penalty: float = Form(1.0),
    chunking: bool = Form(True),
    chunk_max_chars: int = Form(S.chunk_max_chars),
    chunk_max_words: int = Form(S.chunk_max_words),
    chunk_max_sentences: int = Form(S.chunk_max_sentences),
    join_silence_ms: int = Form(S.join_silence_ms),
    speaker_wav: Optional[UploadFile] = File(None),
):
    """
    Returns a single WAV file.
    If speaker_wav is provided, XTTS does voice cloning (zero-shot, best with 6-10s clean Polish reference).
    """
    speaker_path = None
    try:
        if speaker_wav is not None:
            # Persist the uploaded reference clip: XTTS needs a file path.
            ext = os.path.splitext(speaker_wav.filename or "ref.wav")[1] or ".wav"
            ref_bytes = await speaker_wav.read()
            with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp:
                tmp.write(ref_bytes)
                speaker_path = tmp.name

        def work():
            # Heavy synthesis; executed off the event loop via asyncio.to_thread.
            sr, chunks = _xtts_synthesize_chunks(
                text=text,
                language=language,
                speaker_wav_path=speaker_path,
                speed=float(speed),
                temperature=float(temperature),
                length_penalty=float(length_penalty),
                chunking=bool(chunking),
                chunk_max_chars=int(chunk_max_chars),
                chunk_max_words=int(chunk_max_words),
                chunk_max_sentences=int(chunk_max_sentences),
            )
            return sr, concat_audio(chunks, sr, silence_ms=int(join_silence_ms))

        sr, audio = await asyncio.to_thread(work)
        return Response(
            content=wav_bytes_from_audio(audio, sr),
            media_type="audio/wav",
            headers={"X-Sample-Rate": str(sr)},
        )

    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        # Best-effort removal of the temp reference clip.
        if speaker_path and os.path.exists(speaker_path):
            try:
                os.remove(speaker_path)
            except Exception:
                pass
|
| 579 |
+
|
| 580 |
+
|
| 581 |
+
@app.post("/v1/xtts/stream")
async def xtts_stream(req: XttsJsonRequest, speaker_wav: Optional[UploadFile] = File(None)):
    """
    Streams chunk-by-chunk via Server-Sent Events (SSE).
    Each SSE event contains base64-encoded standalone WAV for that chunk.

    Fix: the reference-clip temp file was previously deleted in a ``finally``
    clause that ran as soon as the StreamingResponse object was returned —
    i.e. BEFORE the generator (which reads the file during synthesis) had
    executed, so voice cloning always saw a missing file. Cleanup now happens
    in the generator's own ``finally``, after streaming completes or fails.
    """
    speaker_path = None
    if speaker_wav is not None:
        try:
            with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(speaker_wav.filename or "ref.wav")[1] or ".wav") as tmp:
                tmp.write(await speaker_wav.read())
                speaker_path = tmp.name
        except Exception:
            # If the upload failed mid-write, don't leak a partial temp file.
            if speaker_path and os.path.exists(speaker_path):
                try:
                    os.remove(speaker_path)
                except Exception:
                    pass
            raise

    def gen() -> Generator[bytes, None, None]:
        try:
            sr, chunks = _xtts_synthesize_chunks(
                text=req.text,
                language=req.language,
                speaker_wav_path=speaker_path,
                speed=req.speed,
                temperature=req.temperature,
                length_penalty=req.length_penalty,
                chunking=req.chunking,
                chunk_max_chars=req.chunk_max_chars,
                chunk_max_words=req.chunk_max_words,
                chunk_max_sentences=req.chunk_max_sentences,
            )
            for i, ch in enumerate(chunks):
                wav = wav_bytes_from_audio(ch, sr)
                payload = {"index": i, "sample_rate": sr, "wav_b64": b64encode_bytes(wav), "is_last": False}
                yield f"data: {json.dumps(payload)}\n\n".encode("utf-8")
            yield f"data: {json.dumps({'is_last': True})}\n\n".encode("utf-8")
        except Exception as e:
            # Errors are reported in-band as a terminal SSE event.
            err = {"error": str(e), "is_last": True}
            yield f"data: {json.dumps(err)}\n\n".encode("utf-8")
        finally:
            # Remove the cloned-voice reference only after streaming finished.
            if speaker_path and os.path.exists(speaker_path):
                try:
                    os.remove(speaker_path)
                except Exception:
                    pass

    return StreamingResponse(gen(), media_type="text/event-stream")
|
| 625 |
+
|
| 626 |
+
|
| 627 |
+
# -------- Parler endpoints (separate service) --------
@app.post("/v1/parler/synthesize")
async def parler_synthesize(req: ParlerJsonRequest):
    """
    Returns a single WAV file. Style/emotion are controlled via the `description` field.
    """
    try:
        def work():
            # Synthesize all chunks, then join with the requested silence gap.
            sr, pieces = _parler_synthesize_chunks(
                text=req.text,
                description=req.description,
                seed=req.seed,
                temperature=req.temperature,
                max_new_tokens=req.max_new_tokens,
                chunking=req.chunking,
                chunk_max_chars=req.chunk_max_chars,
                chunk_max_words=req.chunk_max_words,
                chunk_max_sentences=req.chunk_max_sentences,
            )
            return sr, concat_audio(pieces, sr, silence_ms=req.join_silence_ms)

        sr, audio = await asyncio.to_thread(work)
        wav = wav_bytes_from_audio(audio, sr)
        headers = {"X-Sample-Rate": str(sr)}
        return Response(content=wav, media_type="audio/wav", headers=headers)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
| 654 |
+
|
| 655 |
+
|
| 656 |
+
@app.post("/v1/parler/stream")
async def parler_stream(req: ParlerJsonRequest):
    """
    Streams chunk-by-chunk via SSE (each chunk is a standalone WAV).
    """
    def sse(obj) -> bytes:
        # One SSE "data:" frame per JSON payload.
        return f"data: {json.dumps(obj)}\n\n".encode("utf-8")

    def gen() -> Generator[bytes, None, None]:
        try:
            sr, chunks = _parler_synthesize_chunks(
                text=req.text,
                description=req.description,
                seed=req.seed,
                temperature=req.temperature,
                max_new_tokens=req.max_new_tokens,
                chunking=req.chunking,
                chunk_max_chars=req.chunk_max_chars,
                chunk_max_words=req.chunk_max_words,
                chunk_max_sentences=req.chunk_max_sentences,
            )
            for i, ch in enumerate(chunks):
                wav = wav_bytes_from_audio(ch, sr)
                yield sse({"index": i, "sample_rate": sr, "wav_b64": b64encode_bytes(wav), "is_last": False})
            yield sse({'is_last': True})
        except Exception as e:
            # Errors are reported in-band as a terminal SSE event.
            yield sse({"error": str(e), "is_last": True})

    return StreamingResponse(gen(), media_type="text/event-stream")
|
| 684 |
+
|
| 685 |
+
|
| 686 |
+
# -------- Piper endpoints (separate service) --------
@app.post("/v1/piper/synthesize")
async def piper_synthesize(req: PiperJsonRequest):
    """
    Returns a single WAV file from a specified Piper voice.
    Provide the voice .onnx files via:
      - PIPER_VOICES_JSON='{"pl_PL-gosia-medium": "/data/piper/pl_PL-gosia-medium.onnx", ...}'
      - or mount them into /data/piper and call GET /v1/piper/voices.
    """
    try:
        # Blocking synthesis runs in a worker thread.
        sr, audio = await asyncio.to_thread(
            _piper_synthesize, req.text, req.voice_id, req.length_scale, req.noise_scale, req.noise_w
        )
        wav = wav_bytes_from_audio(audio, sr)
        response = Response(
            content=wav,
            media_type="audio/wav",
            headers={"X-Sample-Rate": str(sr), "X-Voice-Id": req.voice_id},
        )
    except KeyError as e:
        # Unknown voice_id → 404.
        raise HTTPException(status_code=404, detail=str(e))
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
    return response
|
requirements.txt
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Core API
|
| 2 |
+
fastapi>=0.110
|
| 3 |
+
uvicorn[standard]>=0.27
|
| 4 |
+
pydantic>=2.6
|
| 5 |
+
python-multipart>=0.0.9
|
| 6 |
+
|
| 7 |
+
# Audio + numerics
|
| 8 |
+
numpy>=1.24
|
| 9 |
+
scipy>=1.10
|
| 10 |
+
soundfile>=0.12
|
| 11 |
+
|
| 12 |
+
# XTTS v2 (Coqui)
|
| 13 |
+
TTS>=0.22.0
|
| 14 |
+
|
| 15 |
+
# Parler-TTS library (install from git - validated by working Space)
|
| 16 |
+
git+https://github.com/huggingface/parler-tts.git@main
|
| 17 |
+
|
| 18 |
+
# Transformers - use range for flexibility (working Space uses unpinned)
|
| 19 |
+
transformers>=4.39,<4.50
|
| 20 |
+
|
| 21 |
+
# Core dependencies
|
| 22 |
+
accelerate>=0.27
|
| 23 |
+
safetensors>=0.4
|
| 24 |
+
|
| 25 |
+
# CRITICAL: Required by Parler tokenizer
|
| 26 |
+
sentencepiece>=0.1.99
|
| 27 |
+
|
| 28 |
+
# Optional: Piper fallback (leave installed, or remove if you truly do not want it)
|
| 29 |
+
piper-tts>=1.2.0
|