Spaces:
Sleeping
Sleeping
Divax
committed on
Commit
·
94fd4b0
1
Parent(s):
71905d8
test
Browse files
- Dockerfile +13 -19
- Dockerfile.coqui +0 -51
- README.md +64 -288
- README_coqui.md +0 -351
- app.py +0 -414
- app_config.py +0 -54
- client_example.py +0 -269
- requirements.txt +8 -13
- requirements_coqui.txt +0 -12
- start_c3po_api.py +17 -136
- startup.py +0 -120
- test.py +0 -144
- test_build.py +69 -0
- test_coqui_api.py +0 -146
- test_coqui_tts.py +0 -99
- test_kokoro_install.py +0 -86
Dockerfile
CHANGED
|
@@ -1,13 +1,12 @@
|
|
| 1 |
-
FROM python:3.11
|
| 2 |
|
| 3 |
# Set up a new user named "user" with user ID 1000
|
| 4 |
RUN useradd -m -u 1000 user
|
| 5 |
|
| 6 |
-
# Install system dependencies
|
| 7 |
RUN apt-get update && apt-get install -y \
|
| 8 |
git \
|
| 9 |
git-lfs \
|
| 10 |
-
espeak-ng \
|
| 11 |
ffmpeg \
|
| 12 |
&& rm -rf /var/lib/apt/lists/*
|
| 13 |
|
|
@@ -17,35 +16,30 @@ RUN git lfs install
|
|
| 17 |
# Switch to the "user" user
|
| 18 |
USER user
|
| 19 |
|
| 20 |
-
# Set
|
| 21 |
ENV HOME=/home/user \
|
| 22 |
PATH=/home/user/.local/bin:$PATH \
|
| 23 |
COQUI_TOS_AGREED=1 \
|
| 24 |
-
|
| 25 |
-
FORCE_CPU=true \
|
| 26 |
-
CUDA_VISIBLE_DEVICES=""
|
| 27 |
|
| 28 |
-
# Set the working directory
|
| 29 |
WORKDIR $HOME/app
|
| 30 |
|
| 31 |
-
#
|
| 32 |
RUN pip install --no-cache-dir --upgrade pip
|
| 33 |
|
| 34 |
-
# Copy
|
| 35 |
COPY --chown=user requirements.txt .
|
| 36 |
RUN pip install --no-cache-dir -r requirements.txt
|
| 37 |
|
| 38 |
-
#
|
| 39 |
-
RUN python -
|
| 40 |
|
| 41 |
-
#
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
# Copy the current directory contents into the container at $HOME/app setting the owner to the user
|
| 45 |
-
COPY --chown=user . $HOME/app
|
| 46 |
|
| 47 |
# Expose the port
|
| 48 |
EXPOSE 7860
|
| 49 |
|
| 50 |
-
# Start the API
|
| 51 |
-
CMD ["uvicorn", "
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
|
| 3 |
# Set up a new user named "user" with user ID 1000
|
| 4 |
RUN useradd -m -u 1000 user
|
| 5 |
|
| 6 |
+
# Install only essential system dependencies
|
| 7 |
RUN apt-get update && apt-get install -y \
|
| 8 |
git \
|
| 9 |
git-lfs \
|
|
|
|
| 10 |
ffmpeg \
|
| 11 |
&& rm -rf /var/lib/apt/lists/*
|
| 12 |
|
|
|
|
| 16 |
# Switch to the "user" user
|
| 17 |
USER user
|
| 18 |
|
| 19 |
+
# Set environment variables
|
| 20 |
ENV HOME=/home/user \
|
| 21 |
PATH=/home/user/.local/bin:$PATH \
|
| 22 |
COQUI_TOS_AGREED=1 \
|
| 23 |
+
HF_HUB_DISABLE_TELEMETRY=1
|
|
|
|
|
|
|
| 24 |
|
| 25 |
+
# Set the working directory
|
| 26 |
WORKDIR $HOME/app
|
| 27 |
|
| 28 |
+
# Upgrade pip
|
| 29 |
RUN pip install --no-cache-dir --upgrade pip
|
| 30 |
|
| 31 |
+
# Copy and install requirements
|
| 32 |
COPY --chown=user requirements.txt .
|
| 33 |
RUN pip install --no-cache-dir -r requirements.txt
|
| 34 |
|
| 35 |
+
# Pre-download the C-3PO model to speed up startup
|
| 36 |
+
RUN python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='Borcherding/XTTS-v2_C3PO', local_dir='./models/XTTS-v2_C3PO', local_dir_use_symlinks=False)"
|
| 37 |
|
| 38 |
+
# Copy the API file
|
| 39 |
+
COPY --chown=user coqui_api.py .
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
# Expose the port
|
| 42 |
EXPOSE 7860
|
| 43 |
|
| 44 |
+
# Start the C-3PO TTS API
|
| 45 |
+
CMD ["uvicorn", "coqui_api:app", "--host", "0.0.0.0", "--port", "7860"]
|
Dockerfile.coqui
DELETED
|
@@ -1,51 +0,0 @@
|
|
| 1 |
-
FROM python:3.11
|
| 2 |
-
|
| 3 |
-
# Set up a new user named "user" with user ID 1000
|
| 4 |
-
RUN useradd -m -u 1000 user
|
| 5 |
-
|
| 6 |
-
# Install system dependencies as root
|
| 7 |
-
RUN apt-get update && apt-get install -y \
|
| 8 |
-
git \
|
| 9 |
-
git-lfs \
|
| 10 |
-
espeak-ng \
|
| 11 |
-
ffmpeg \
|
| 12 |
-
libsndfile1 \
|
| 13 |
-
&& rm -rf /var/lib/apt/lists/*
|
| 14 |
-
|
| 15 |
-
# Initialize git lfs
|
| 16 |
-
RUN git lfs install
|
| 17 |
-
|
| 18 |
-
# Switch to the "user" user
|
| 19 |
-
USER user
|
| 20 |
-
|
| 21 |
-
# Set home to the user's home directory
|
| 22 |
-
ENV HOME=/home/user \
|
| 23 |
-
PATH=/home/user/.local/bin:$PATH \
|
| 24 |
-
COQUI_TOS_AGREED=1 \
|
| 25 |
-
HF_HUB_DISABLE_TELEMETRY=1 \
|
| 26 |
-
HF_HOME=/home/user/.cache/huggingface
|
| 27 |
-
|
| 28 |
-
# Set the working directory to the user's home directory
|
| 29 |
-
WORKDIR $HOME/app
|
| 30 |
-
|
| 31 |
-
# Upgrade pip
|
| 32 |
-
RUN pip install --no-cache-dir --upgrade pip
|
| 33 |
-
|
| 34 |
-
# Install PyTorch with CPU support for Hugging Face Spaces
|
| 35 |
-
RUN pip install --no-cache-dir torch torchaudio --index-url https://download.pytorch.org/whl/cpu
|
| 36 |
-
|
| 37 |
-
# Copy requirements and install dependencies
|
| 38 |
-
COPY --chown=user requirements.txt .
|
| 39 |
-
RUN pip install --no-cache-dir -r requirements.txt
|
| 40 |
-
|
| 41 |
-
# Copy the API file
|
| 42 |
-
COPY --chown=user coqui_api.py .
|
| 43 |
-
|
| 44 |
-
# Create necessary directories
|
| 45 |
-
RUN mkdir -p $HOME/.cache $HOME/app/models
|
| 46 |
-
|
| 47 |
-
# Expose the port
|
| 48 |
-
EXPOSE 7860
|
| 49 |
-
|
| 50 |
-
# Start the Coqui TTS API
|
| 51 |
-
CMD ["uvicorn", "coqui_api:app", "--host", "0.0.0.0", "--port", "7860"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
README.md
CHANGED
|
@@ -1,334 +1,110 @@
|
|
| 1 |
-
|
| 2 |
-
title: XTTS C3PO Voice Cloning API
|
| 3 |
-
emoji: 🤖
|
| 4 |
-
colorFrom: indigo
|
| 5 |
-
colorTo: yellow
|
| 6 |
-
sdk: docker
|
| 7 |
-
pinned: false
|
| 8 |
-
---
|
| 9 |
|
| 10 |
-
|
| 11 |
|
| 12 |
-
|
| 13 |
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
-
|
| 17 |
-
- **Custom Voice Cloning**: Upload your own reference audio for voice cloning
|
| 18 |
-
- **Multilingual Support**: 16+ languages with C3PO voice
|
| 19 |
-
- **No Upload Required**: Use C3PO voice without any file uploads
|
| 20 |
-
- **RESTful API**: Clean API with automatic documentation
|
| 21 |
-
- **Docker Support**: Optimized for Hugging Face Spaces deployment
|
| 22 |
-
- **PyTorch 2.6 Compatible**: Includes compatibility fixes
|
| 23 |
-
|
| 24 |
-
## About the C3PO Model
|
| 25 |
-
|
| 26 |
-
This API uses the XTTS-v2 C3PO model from [Borcherding/XTTS-v2_C3PO](https://huggingface.co/Borcherding/XTTS-v2_C3PO), which provides the iconic voice of C-3PO from Star Wars. The model supports:
|
| 27 |
-
|
| 28 |
-
- High-quality C3PO voice synthesis
|
| 29 |
-
- Multilingual C3PO speech (16+ languages)
|
| 30 |
-
- Custom voice cloning capabilities
|
| 31 |
-
- Real-time speech generation
|
| 32 |
-
|
| 33 |
-
## Quick Start
|
| 34 |
-
|
| 35 |
-
### Using C3PO Voice (No Upload Required)
|
| 36 |
|
|
|
|
| 37 |
```bash
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
-F "language=en" \
|
| 41 |
-
--output c3po_speech.wav
|
| 42 |
-
```
|
| 43 |
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
```bash
|
| 47 |
-
curl -X POST "http://localhost:7860/tts" \
|
| 48 |
-
-F "text=This will be spoken in your custom voice!" \
|
| 49 |
-
-F "language=en" \
|
| 50 |
-
-F "speaker_file=@your_reference_voice.wav" \
|
| 51 |
-
--output custom_speech.wav
|
| 52 |
```
|
| 53 |
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
### C3PO Voice Only
|
| 57 |
-
- **POST** `/tts-c3po` - Generate speech using C3PO voice (no file upload needed)
|
| 58 |
-
- **Parameters:**
|
| 59 |
-
- `text` (form): Text to convert to speech (max 500 characters)
|
| 60 |
-
- `language` (form): Language code (default: "en")
|
| 61 |
-
- `no_lang_auto_detect` (form): Disable automatic language detection
|
| 62 |
-
|
| 63 |
-
### Voice Cloning with Fallback
|
| 64 |
-
- **POST** `/tts` - Convert text to speech with optional custom voice
|
| 65 |
-
- **Parameters:**
|
| 66 |
-
- `text` (form): Text to convert to speech (max 500 characters)
|
| 67 |
-
- `language` (form): Language code (default: "en")
|
| 68 |
-
- `voice_cleanup` (form): Apply audio cleanup to reference voice
|
| 69 |
-
- `no_lang_auto_detect` (form): Disable automatic language detection
|
| 70 |
-
- `speaker_file` (file, optional): Reference speaker audio file (uses C3PO if not provided)
|
| 71 |
-
|
| 72 |
-
### JSON API
|
| 73 |
-
- **POST** `/tts-json` - Convert text to speech using JSON request body
|
| 74 |
-
- **Body:** JSON object with `text`, `language`, `voice_cleanup`, `no_lang_auto_detect`
|
| 75 |
-
- **File:** `speaker_file` (optional) - Reference speaker audio file
|
| 76 |
-
|
| 77 |
-
### Information Endpoints
|
| 78 |
-
- **GET** `/health` - Check API status, device info, and supported languages
|
| 79 |
-
- **GET** `/languages` - Get list of supported languages
|
| 80 |
-
- **GET** `/docs` - Interactive API documentation (Swagger UI)
|
| 81 |
-
|
| 82 |
-
## Usage Examples
|
| 83 |
-
|
| 84 |
-
### Python - C3PO Voice
|
| 85 |
-
|
| 86 |
-
```python
|
| 87 |
-
import requests
|
| 88 |
-
|
| 89 |
-
# Generate C3PO speech
|
| 90 |
-
url = "http://localhost:7860/tts-c3po"
|
| 91 |
-
data = {
|
| 92 |
-
"text": "Hello there! I am C-3PO, human-cyborg relations.",
|
| 93 |
-
"language": "en"
|
| 94 |
-
}
|
| 95 |
-
|
| 96 |
-
response = requests.post(url, data=data)
|
| 97 |
-
|
| 98 |
-
if response.status_code == 200:
|
| 99 |
-
with open("c3po_speech.wav", "wb") as f:
|
| 100 |
-
f.write(response.content)
|
| 101 |
-
print("C3PO speech generated!")
|
| 102 |
-
```
|
| 103 |
-
|
| 104 |
-
### Python - Custom Voice with C3PO Fallback
|
| 105 |
-
|
| 106 |
-
```python
|
| 107 |
-
import requests
|
| 108 |
-
|
| 109 |
-
url = "http://localhost:7860/tts"
|
| 110 |
-
data = {
|
| 111 |
-
"text": "This will use C3PO voice if no speaker file is provided.",
|
| 112 |
-
"language": "en"
|
| 113 |
-
}
|
| 114 |
-
|
| 115 |
-
# No speaker_file provided - will use C3PO voice
|
| 116 |
-
response = requests.post(url, data=data)
|
| 117 |
-
|
| 118 |
-
if response.status_code == 200:
|
| 119 |
-
with open("speech_output.wav", "wb") as f:
|
| 120 |
-
f.write(response.content)
|
| 121 |
-
```
|
| 122 |
-
|
| 123 |
-
### Multilingual C3PO
|
| 124 |
-
|
| 125 |
-
```python
|
| 126 |
-
# C3PO speaking Spanish
|
| 127 |
-
data = {
|
| 128 |
-
"text": "Hola, soy C-3PO. Domino más de seis millones de formas de comunicación.",
|
| 129 |
-
"language": "es"
|
| 130 |
-
}
|
| 131 |
-
response = requests.post("http://localhost:7860/tts-c3po", data=data)
|
| 132 |
-
```
|
| 133 |
-
|
| 134 |
-
## Supported Languages
|
| 135 |
-
|
| 136 |
-
The C3PO model supports all XTTS-v2 languages:
|
| 137 |
-
|
| 138 |
-
- **en** - English
|
| 139 |
-
- **es** - Spanish
|
| 140 |
-
- **fr** - French
|
| 141 |
-
- **de** - German
|
| 142 |
-
- **it** - Italian
|
| 143 |
-
- **pt** - Portuguese (Brazilian)
|
| 144 |
-
- **pl** - Polish
|
| 145 |
-
- **tr** - Turkish
|
| 146 |
-
- **ru** - Russian
|
| 147 |
-
- **nl** - Dutch
|
| 148 |
-
- **cs** - Czech
|
| 149 |
-
- **ar** - Arabic
|
| 150 |
-
- **zh-cn** - Mandarin Chinese
|
| 151 |
-
- **ja** - Japanese
|
| 152 |
-
- **ko** - Korean
|
| 153 |
-
- **hu** - Hungarian
|
| 154 |
-
- **hi** - Hindi
|
| 155 |
-
|
| 156 |
-
## Setup
|
| 157 |
-
|
| 158 |
-
### CPU-Only Installation (Recommended for most users)
|
| 159 |
-
|
| 160 |
-
For CPU-only usage (no GPU required):
|
| 161 |
-
```bash
|
| 162 |
-
# Ubuntu/Debian
|
| 163 |
-
sudo apt-get install espeak-ng ffmpeg git git-lfs
|
| 164 |
-
|
| 165 |
-
# macOS
|
| 166 |
-
brew install espeak ffmpeg git git-lfs
|
| 167 |
-
```
|
| 168 |
-
|
| 169 |
-
2. **Install CPU-only PyTorch and dependencies:**
|
| 170 |
```bash
|
| 171 |
-
#
|
| 172 |
-
chmod +x install_cpu.sh
|
| 173 |
-
./install_cpu.sh
|
| 174 |
-
|
| 175 |
-
# Option 2: Manual installation
|
| 176 |
-
pip install torch torchaudio --index-url https://download.pytorch.org/whl/cpu
|
| 177 |
pip install -r requirements.txt
|
| 178 |
-
python -m unidic download
|
| 179 |
-
```
|
| 180 |
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
export FORCE_CPU=true
|
| 184 |
-
export CUDA_VISIBLE_DEVICES=""
|
| 185 |
```
|
| 186 |
|
| 187 |
-
|
| 188 |
-
```bash
|
| 189 |
-
uvicorn app:app --host 0.0.0.0 --port 7860
|
| 190 |
-
```
|
| 191 |
|
| 192 |
-
|
| 193 |
|
| 194 |
-
|
| 195 |
-
- Automatic C3PO model downloading
|
| 196 |
-
- Proper user permissions (user ID 1000)
|
| 197 |
-
- PyTorch 2.6 compatibility fixes
|
| 198 |
-
- COQUI license agreement handling
|
| 199 |
-
|
| 200 |
-
### Local Development
|
| 201 |
-
|
| 202 |
-
1. **Install system dependencies:**
|
| 203 |
```bash
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
brew install espeak ffmpeg git git-lfs
|
| 209 |
```
|
| 210 |
|
| 211 |
-
|
| 212 |
```bash
|
| 213 |
-
|
| 214 |
-
|
|
|
|
|
|
|
| 215 |
```
|
| 216 |
|
| 217 |
-
|
| 218 |
```bash
|
| 219 |
-
|
|
|
|
|
|
|
|
|
|
| 220 |
```
|
| 221 |
|
| 222 |
-
|
| 223 |
```bash
|
| 224 |
-
|
| 225 |
```
|
| 226 |
|
| 227 |
-
|
| 228 |
|
| 229 |
-
|
| 230 |
-
# Build and run
|
| 231 |
-
docker build -t xtts-c3po-api .
|
| 232 |
-
docker run -p 7860:7860 xtts-c3po-api
|
| 233 |
-
```
|
| 234 |
|
| 235 |
-
##
|
| 236 |
|
| 237 |
-
|
| 238 |
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
5. **Speaker**: Single speaker, clear pronunciation
|
| 244 |
|
| 245 |
-
##
|
| 246 |
|
| 247 |
-
|
| 248 |
-
- **Voice**: C3PO from Star Wars
|
| 249 |
-
- **Source**: [Borcherding/XTTS-v2_C3PO](https://huggingface.co/Borcherding/XTTS-v2_C3PO)
|
| 250 |
-
- **Languages**: 16+ supported
|
| 251 |
-
- **License**: CPML (Coqui Public Model License)
|
| 252 |
|
| 253 |
-
## Testing
|
| 254 |
|
| 255 |
-
Run the test suite:
|
| 256 |
```bash
|
| 257 |
-
#
|
| 258 |
-
python
|
| 259 |
-
|
| 260 |
-
# Test API endpoints
|
| 261 |
-
python client_example.py
|
| 262 |
-
```
|
| 263 |
-
|
| 264 |
-
## Environment Variables
|
| 265 |
-
|
| 266 |
-
Automatically configured:
|
| 267 |
-
- `COQUI_TOS_AGREED=1` - Agrees to CPML license
|
| 268 |
-
- `NUMBA_DISABLE_JIT=1` - Disables Numba JIT compilation
|
| 269 |
-
|
| 270 |
-
## API Response Examples
|
| 271 |
-
|
| 272 |
-
### Health Check Response
|
| 273 |
-
```json
|
| 274 |
-
{
|
| 275 |
-
"status": "healthy",
|
| 276 |
-
"device": "cuda",
|
| 277 |
-
"model": "XTTS-v2 C3PO",
|
| 278 |
-
"default_voice": "C3PO",
|
| 279 |
-
"supported_languages": ["en", "es", "fr", ...]
|
| 280 |
-
}
|
| 281 |
-
```
|
| 282 |
-
|
| 283 |
-
### Languages Response
|
| 284 |
-
```json
|
| 285 |
-
{
|
| 286 |
-
"languages": ["en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh-cn", "ja", "ko", "hu", "hi"]
|
| 287 |
-
}
|
| 288 |
```
|
| 289 |
|
| 290 |
-
##
|
| 291 |
-
|
| 292 |
-
### CPU Performance
|
| 293 |
-
When running on CPU:
|
| 294 |
-
- Speech generation will be slower than GPU (30-60 seconds vs 3-5 seconds)
|
| 295 |
-
- Memory usage is lower (2-4GB RAM vs 4-8GB VRAM)
|
| 296 |
-
- No CUDA installation required
|
| 297 |
-
- Works on any system with sufficient RAM
|
| 298 |
-
|
| 299 |
-
### PyTorch Loading Issues
|
| 300 |
-
The API includes fixes for PyTorch 2.6's `weights_only=True` default. If you encounter loading issues, ensure the compatibility fix is applied.
|
| 301 |
-
|
| 302 |
-
### Model Download Issues
|
| 303 |
-
If the C3PO model fails to download:
|
| 304 |
-
1. Check internet connection
|
| 305 |
-
2. Verify git and git-lfs are installed
|
| 306 |
-
3. Manually clone: `git clone https://huggingface.co/Borcherding/XTTS-v2_C3PO XTTS-v2_C3PO`
|
| 307 |
-
|
| 308 |
-
### Audio Quality Issues
|
| 309 |
-
- Use high-quality reference audio for custom voices
|
| 310 |
-
- Enable `voice_cleanup` for noisy reference audio
|
| 311 |
-
- Ensure reference audio is 3-10 seconds long
|
| 312 |
|
| 313 |
-
|
| 314 |
-
- **CPU Mode**: Requires 2-4GB RAM, works on most modern computers
|
| 315 |
-
- **GPU Mode**: Requires 4GB+ VRAM for optimal performance
|
| 316 |
-
- Reduce text length for batch processing
|
| 317 |
-
- Use CPU mode with `FORCE_CPU=true` environment variable
|
| 318 |
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
1. Set environment variables: `export FORCE_CPU=true CUDA_VISIBLE_DEVICES=""`
|
| 322 |
-
2. Install CPU-only PyTorch: `pip install torch torchaudio --index-url https://download.pytorch.org/whl/cpu`
|
| 323 |
-
3. Restart the API after setting environment variables
|
| 324 |
|
| 325 |
-
##
|
| 326 |
|
| 327 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 328 |
|
| 329 |
-
## Credits
|
| 330 |
|
| 331 |
-
-
|
| 332 |
-
-
|
| 333 |
-
-
|
| 334 |
|
|
|
|
| 1 |
+
# 🤖 C-3PO TTS API
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
+
A FastAPI-based text-to-speech service using the **C-3PO fine-tuned XTTS v2 model** from [Borcherding/XTTS-v2_C3PO](https://huggingface.co/Borcherding/XTTS-v2_C3PO) for authentic C-3PO voice synthesis.
|
| 4 |
|
| 5 |
+
## ✨ Features
|
| 6 |
|
| 7 |
+
- 🤖 **Authentic C-3PO Voice**: XTTS v2 model fine-tuned on 20 unique C-3PO voice lines
|
| 8 |
+
- 🌍 **17+ Languages**: Multilingual support while maintaining C-3PO characteristics
|
| 9 |
+
- 🎭 **Voice Cloning**: Optional custom voice cloning capabilities
|
| 10 |
+
- 🚀 **FastAPI**: Modern API with automatic documentation
|
| 11 |
+
- 🐳 **Docker Ready**: Containerized for easy deployment
|
| 12 |
|
| 13 |
+
## 🚀 Quick Start
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
+
### Docker Deployment
|
| 16 |
```bash
|
| 17 |
+
# Build the container
|
| 18 |
+
docker build -t c3po-tts .
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
+
# Run the container
|
| 21 |
+
docker run -p 7860:7860 c3po-tts
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
```
|
| 23 |
|
| 24 |
+
### Local Development
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
```bash
|
| 26 |
+
# Install dependencies
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
pip install -r requirements.txt
|
|
|
|
|
|
|
| 28 |
|
| 29 |
+
# Run the API
|
| 30 |
+
python coqui_api.py
|
|
|
|
|
|
|
| 31 |
```
|
| 32 |
|
| 33 |
+
The API will be available at `http://localhost:7860`
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
+
## 📡 API Endpoints
|
| 36 |
|
| 37 |
+
### C-3PO Text-to-Speech
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
```bash
|
| 39 |
+
curl -X POST "http://localhost:7860/tts-c3po" \
|
| 40 |
+
-F "text=I am C-3PO, human-cyborg relations." \
|
| 41 |
+
-F "language=en" \
|
| 42 |
+
--output c3po_voice.wav
|
|
|
|
| 43 |
```
|
| 44 |
|
| 45 |
+
### General Text-to-Speech (with C-3PO voice by default)
|
| 46 |
```bash
|
| 47 |
+
curl -X POST "http://localhost:7860/tts" \
|
| 48 |
+
-F "text=The odds of successfully navigating an asteroid field are approximately 3,720 to 1." \
|
| 49 |
+
-F "language=en" \
|
| 50 |
+
--output c3po_output.wav
|
| 51 |
```
|
| 52 |
|
| 53 |
+
### JSON API
|
| 54 |
```bash
|
| 55 |
+
curl -X POST "http://localhost:7860/tts-json" \
|
| 56 |
+
-H "Content-Type: application/json" \
|
| 57 |
+
-d '{"text": "R2-D2, you know better than to trust a strange computer!", "language": "en"}' \
|
| 58 |
+
--output c3po_json.wav
|
| 59 |
```
|
| 60 |
|
| 61 |
+
### Health Check
|
| 62 |
```bash
|
| 63 |
+
curl http://localhost:7860/health
|
| 64 |
```
|
| 65 |
|
| 66 |
+
## 🌍 Supported Languages
|
| 67 |
|
| 68 |
+
English, Spanish, French, German, Italian, Portuguese, Polish, Turkish, Russian, Dutch, Czech, Arabic, Chinese, Japanese, Hungarian, Korean, Hindi
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
|
| 70 |
+
## 🎨 Example C-3PO Phrases
|
| 71 |
|
| 72 |
+
Perfect texts for demonstrating C-3PO's voice:
|
| 73 |
|
| 74 |
+
- "I am C-3PO, human-cyborg relations."
|
| 75 |
+
- "The odds of successfully navigating an asteroid field are approximately 3,720 to 1."
|
| 76 |
+
- "R2-D2, you know better than to trust a strange computer!"
|
| 77 |
+
- "Oh my! How interesting!"
|
|
|
|
| 78 |
|
| 79 |
+
## 📖 API Documentation
|
| 80 |
|
| 81 |
+
Visit `http://localhost:7860/docs` for interactive API documentation.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
|
| 83 |
+
## 🧪 Testing
|
| 84 |
|
|
|
|
| 85 |
```bash
|
| 86 |
+
# Run the C-3PO test suite
|
| 87 |
+
python test_c3po_model.py
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
```
|
| 89 |
|
| 90 |
+
## 🔧 Configuration
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
|
| 92 |
+
The API automatically downloads the C-3PO model on first run (the Docker image pre-downloads it at build time to speed up startup). Environment variables:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
|
| 94 |
+
- `COQUI_TOS_AGREED=1`: Accepts Coqui TTS terms
|
| 95 |
+
- `HF_HUB_DISABLE_TELEMETRY=1`: Disables telemetry
|
|
|
|
|
|
|
|
|
|
| 96 |
|
| 97 |
+
## 📦 Files
|
| 98 |
|
| 99 |
+
- `coqui_api.py`: Main C-3PO TTS API
|
| 100 |
+
- `test_c3po_model.py`: Test suite for C-3PO functionality
|
| 101 |
+
- `start_c3po_api.py`: Startup script with dependency checks
|
| 102 |
+
- `Dockerfile`: Container configuration
|
| 103 |
+
- `requirements.txt`: Python dependencies
|
| 104 |
|
| 105 |
+
## 🎭 Credits
|
| 106 |
|
| 107 |
+
- [C-3PO Fine-tuned Model](https://huggingface.co/Borcherding/XTTS-v2_C3PO) by Borcherding
|
| 108 |
+
- [Coqui TTS](https://github.com/coqui-ai/TTS) - The underlying TTS engine
|
| 109 |
+
- [FastAPI](https://fastapi.tiangolo.com/) - Web framework
|
| 110 |
|
README_coqui.md
DELETED
|
@@ -1,351 +0,0 @@
|
|
| 1 |
-
# 🤖 Coqui TTS C-3PO API for Hugging Face Spaces
|
| 2 |
-
|
| 3 |
-
A FastAPI-based text-to-speech service using the Coqui TTS library with the **C-3PO fine-tuned XTTS v2 model** from [Borcherding/XTTS-v2_C3PO](https://huggingface.co/Borcherding/XTTS-v2_C3PO) for authentic C-3PO voice synthesis.
|
| 4 |
-
|
| 5 |
-
## ✨ Features
|
| 6 |
-
|
| 7 |
-
- 🤖 **C-3PO Voice**: Authentic C-3PO voice using fine-tuned XTTS v2 model
|
| 8 |
-
- 🎯 **Text-to-Speech**: Convert text to natural-sounding speech
|
| 9 |
-
- 🎭 **Voice Cloning**: Clone any voice from a reference audio sample
|
| 10 |
-
- 🌍 **Multilingual**: Support for 17+ languages with C-3PO voice characteristics
|
| 11 |
-
- 🚀 **FastAPI**: Modern, fast API with automatic documentation
|
| 12 |
-
- 🐳 **Docker Ready**: Containerized for easy deployment
|
| 13 |
-
- ☁️ **Hugging Face Spaces**: Optimized for HF Spaces deployment
|
| 14 |
-
|
| 15 |
-
## 🎭 C-3PO Model Information
|
| 16 |
-
|
| 17 |
-
This API uses the fine-tuned C-3PO voice model from [Borcherding/XTTS-v2_C3PO](https://huggingface.co/Borcherding/XTTS-v2_C3PO), which features:
|
| 18 |
-
|
| 19 |
-
- **Fine-tuned on 20 unique C-3PO voice lines** from Star Wars
|
| 20 |
-
- **Multi-lingual support** (17 languages) while maintaining C-3PO's distinctive voice
|
| 21 |
-
- **Emotion & Style Transfer** capturing C-3PO's formal, protocol droid characteristics
|
| 22 |
-
- **High-Quality Audio** output at 24kHz sampling rate
|
| 23 |
-
|
| 24 |
-
## 📡 API Endpoints
|
| 25 |
-
|
| 26 |
-
### 1. Health Check
|
| 27 |
-
```bash
|
| 28 |
-
GET /health
|
| 29 |
-
```
|
| 30 |
-
Returns API status, model information, and C-3PO voice availability.
|
| 31 |
-
|
| 32 |
-
### 2. List Models
|
| 33 |
-
```bash
|
| 34 |
-
GET /models
|
| 35 |
-
```
|
| 36 |
-
Returns available TTS models.
|
| 37 |
-
|
| 38 |
-
### 3. C-3PO Text-to-Speech (Dedicated)
|
| 39 |
-
```bash
|
| 40 |
-
POST /tts-c3po
|
| 41 |
-
```
|
| 42 |
-
**Parameters:**
|
| 43 |
-
- `text` (string): Text to convert to C-3PO voice (2-500 characters)
|
| 44 |
-
- `language` (string): Language code (default: "en")
|
| 45 |
-
|
| 46 |
-
**Example using curl:**
|
| 47 |
-
```bash
|
| 48 |
-
curl -X POST "http://localhost:7860/tts-c3po" \
|
| 49 |
-
-F "text=I am C-3PO, human-cyborg relations." \
|
| 50 |
-
-F "language=en" \
|
| 51 |
-
--output c3po_voice.wav
|
| 52 |
-
```
|
| 53 |
-
|
| 54 |
-
### 4. General Text-to-Speech
|
| 55 |
-
```bash
|
| 56 |
-
POST /tts
|
| 57 |
-
```
|
| 58 |
-
**Parameters:**
|
| 59 |
-
- `text` (string): Text to convert to speech (2-500 characters)
|
| 60 |
-
- `language` (string): Language code (default: "en")
|
| 61 |
-
- `speaker_file` (file, optional): Reference audio for voice cloning
|
| 62 |
-
- `use_c3po_voice` (boolean): Use C-3PO voice if no speaker file provided (default: true)
|
| 63 |
-
|
| 64 |
-
**Example using curl:**
|
| 65 |
-
```bash
|
| 66 |
-
# C-3PO voice (default)
|
| 67 |
-
curl -X POST "http://localhost:7860/tts" \
|
| 68 |
-
-F "text=The odds of successfully navigating an asteroid field are approximately 3,720 to 1." \
|
| 69 |
-
-F "language=en" \
|
| 70 |
-
--output c3po_output.wav
|
| 71 |
-
|
| 72 |
-
# Custom voice cloning
|
| 73 |
-
curl -X POST "http://localhost:7860/tts" \
|
| 74 |
-
-F "text=This will sound like the reference voice." \
|
| 75 |
-
-F "language=en" \
|
| 76 |
-
-F "speaker_file=@reference_voice.wav" \
|
| 77 |
-
-F "use_c3po_voice=false" \
|
| 78 |
-
--output cloned_voice.wav
|
| 79 |
-
```
|
| 80 |
-
|
| 81 |
-
### 5. JSON TTS (C-3PO Voice)
|
| 82 |
-
```bash
|
| 83 |
-
POST /tts-json
|
| 84 |
-
```
|
| 85 |
-
**JSON Body:**
|
| 86 |
-
```json
|
| 87 |
-
{
|
| 88 |
-
"text": "R2-D2, you know better than to trust a strange computer!",
|
| 89 |
-
"language": "en"
|
| 90 |
-
}
|
| 91 |
-
```
|
| 92 |
-
|
| 93 |
-
## 🚀 Deployment on Hugging Face Spaces
|
| 94 |
-
|
| 95 |
-
### Step 1: Create a new Space
|
| 96 |
-
1. Go to [Hugging Face Spaces](https://huggingface.co/spaces)
|
| 97 |
-
2. Click "Create new Space"
|
| 98 |
-
3. Choose "Docker" as the SDK
|
| 99 |
-
4. Set your space name and visibility
|
| 100 |
-
|
| 101 |
-
### Step 2: Add files to your Space
|
| 102 |
-
Upload these files to your Hugging Face Space repository:
|
| 103 |
-
|
| 104 |
-
```
|
| 105 |
-
your-space/
|
| 106 |
-
├── coqui_api.py # Main API file with C-3PO integration
|
| 107 |
-
├── requirements.txt # Dependencies (includes huggingface_hub)
|
| 108 |
-
├── Dockerfile.coqui # Docker configuration
|
| 109 |
-
├── test_c3po_model.py # Test script for C-3PO functionality
|
| 110 |
-
└── README.md # This file
|
| 111 |
-
```
|
| 112 |
-
|
| 113 |
-
### Step 3: Configure your Space
|
| 114 |
-
Rename the files in your Space:
|
| 115 |
-
- `Dockerfile.coqui` → `Dockerfile`
|
| 116 |
-
|
| 117 |
-
### Step 4: Deploy
|
| 118 |
-
Your Space will automatically build and deploy. The build process may take 15-20 minutes as it downloads the C-3PO fine-tuned model from Hugging Face.
|
| 119 |
-
|
| 120 |
-
## 💻 Local Development
|
| 121 |
-
|
| 122 |
-
### Requirements
|
| 123 |
-
- Python 3.11+
|
| 124 |
-
- PyTorch
|
| 125 |
-
- Coqui TTS library
|
| 126 |
-
- Hugging Face Hub
|
| 127 |
-
|
| 128 |
-
### Installation
|
| 129 |
-
```bash
|
| 130 |
-
# Clone the repository
|
| 131 |
-
git clone <your-repo>
|
| 132 |
-
cd <your-repo>
|
| 133 |
-
|
| 134 |
-
# Install dependencies
|
| 135 |
-
pip install -r requirements.txt
|
| 136 |
-
|
| 137 |
-
# Run the API
|
| 138 |
-
python coqui_api.py
|
| 139 |
-
```
|
| 140 |
-
|
| 141 |
-
The API will be available at `http://localhost:7860`
|
| 142 |
-
|
| 143 |
-
### Testing
|
| 144 |
-
```bash
|
| 145 |
-
# Run the C-3PO model test suite
|
| 146 |
-
python test_c3po_model.py
|
| 147 |
-
|
| 148 |
-
# Run the general test client
|
| 149 |
-
python test_coqui_api.py
|
| 150 |
-
```
|
| 151 |
-
|
| 152 |
-
## 🎪 Usage Examples
|
| 153 |
-
|
| 154 |
-
### Python Client - C-3PO Voice
|
| 155 |
-
```python
|
| 156 |
-
import requests
|
| 157 |
-
|
| 158 |
-
# C-3PO voice synthesis
|
| 159 |
-
data = {"text": "I am C-3PO, human-cyborg relations.", "language": "en"}
|
| 160 |
-
response = requests.post("http://localhost:7860/tts-c3po", data=data)
|
| 161 |
-
|
| 162 |
-
with open("c3po_output.wav", "wb") as f:
|
| 163 |
-
f.write(response.content)
|
| 164 |
-
|
| 165 |
-
# JSON API
|
| 166 |
-
import json
|
| 167 |
-
headers = {'Content-Type': 'application/json'}
|
| 168 |
-
data = {"text": "The odds are approximately 3,720 to 1!", "language": "en"}
|
| 169 |
-
response = requests.post("http://localhost:7860/tts-json", json=data, headers=headers)
|
| 170 |
-
|
| 171 |
-
with open("c3po_json.wav", "wb") as f:
|
| 172 |
-
f.write(response.content)
|
| 173 |
-
```
|
| 174 |
-
|
| 175 |
-
### JavaScript/Web - C-3PO Voice
|
| 176 |
-
```javascript
|
| 177 |
-
// C-3PO voice synthesis
|
| 178 |
-
const formData = new FormData();
|
| 179 |
-
formData.append('text', 'Oh my! How interesting!');
|
| 180 |
-
formData.append('language', 'en');
|
| 181 |
-
|
| 182 |
-
fetch('http://localhost:7860/tts-c3po', {
|
| 183 |
-
method: 'POST',
|
| 184 |
-
body: formData
|
| 185 |
-
})
|
| 186 |
-
.then(response => response.blob())
|
| 187 |
-
.then(blob => {
|
| 188 |
-
const url = URL.createObjectURL(blob);
|
| 189 |
-
const audio = new Audio(url);
|
| 190 |
-
audio.play();
|
| 191 |
-
});
|
| 192 |
-
|
| 193 |
-
// JSON API
|
| 194 |
-
fetch('http://localhost:7860/tts-json', {
|
| 195 |
-
method: 'POST',
|
| 196 |
-
headers: {'Content-Type': 'application/json'},
|
| 197 |
-
body: JSON.stringify({
|
| 198 |
-
text: 'R2-D2, you know better than to trust a strange computer!',
|
| 199 |
-
language: 'en'
|
| 200 |
-
})
|
| 201 |
-
})
|
| 202 |
-
.then(response => response.blob())
|
| 203 |
-
.then(blob => {
|
| 204 |
-
const url = URL.createObjectURL(blob);
|
| 205 |
-
const audio = new Audio(url);
|
| 206 |
-
audio.play();
|
| 207 |
-
});
|
| 208 |
-
```
|
| 209 |
-
|
| 210 |
-
## 🎨 C-3PO Voice Examples
|
| 211 |
-
|
| 212 |
-
Perfect texts for demonstrating C-3PO's voice characteristics:
|
| 213 |
-
|
| 214 |
-
```bash
|
| 215 |
-
# Classic C-3PO phrases
|
| 216 |
-
curl -X POST "http://localhost:7860/tts-c3po" \
|
| 217 |
-
-F "text=I am C-3PO, human-cyborg relations." \
|
| 218 |
-
-F "language=en" --output c3po_intro.wav
|
| 219 |
-
|
| 220 |
-
curl -X POST "http://localhost:7860/tts-c3po" \
|
| 221 |
-
-F "text=The odds of successfully navigating an asteroid field are approximately 3,720 to 1." \
|
| 222 |
-
-F "language=en" --output c3po_odds.wav
|
| 223 |
-
|
| 224 |
-
curl -X POST "http://localhost:7860/tts-c3po" \
|
| 225 |
-
-F "text=R2-D2, you know better than to trust a strange computer!" \
|
| 226 |
-
-F "language=en" --output c3po_r2d2.wav
|
| 227 |
-
|
| 228 |
-
curl -X POST "http://localhost:7860/tts-c3po" \
|
| 229 |
-
-F "text=Oh my! How interesting!" \
|
| 230 |
-
-F "language=en" --output c3po_oh_my.wav
|
| 231 |
-
```
|
| 232 |
-
|
| 233 |
-
## 🌍 Multilingual C-3PO Support
|
| 234 |
-
|
| 235 |
-
The C-3PO model maintains its distinctive voice characteristics across multiple languages:
|
| 236 |
-
|
| 237 |
-
```python
|
| 238 |
-
# Multilingual examples
|
| 239 |
-
languages = [
|
| 240 |
-
("Hello, I am C-3PO", "en"),
|
| 241 |
-
("Hola, soy C-3PO", "es"),
|
| 242 |
-
("Bonjour, je suis C-3PO", "fr"),
|
| 243 |
-
("Guten Tag, ich bin C-3PO", "de"),
|
| 244 |
-
("Ciao, sono C-3PO", "it"),
|
| 245 |
-
("Olá, eu sou C-3PO", "pt")
|
| 246 |
-
]
|
| 247 |
-
|
| 248 |
-
for text, lang in languages:
|
| 249 |
-
response = requests.post("http://localhost:7860/tts-c3po",
|
| 250 |
-
data={"text": text, "language": lang})
|
| 251 |
-
with open(f"c3po_{lang}.wav", "wb") as f:
|
| 252 |
-
f.write(response.content)
|
| 253 |
-
```
|
| 254 |
-
|
| 255 |
-
## 🔧 Voice Cloning Guide
|
| 256 |
-
|
| 257 |
-
1. **Prepare Reference Audio:**
|
| 258 |
-
- Duration: 5-10 seconds (optimal)
|
| 259 |
-
- Format: WAV, MP3, or M4A
|
| 260 |
-
- Quality: Clear speech, minimal background noise
|
| 261 |
-
- Content: Natural speaking, preferably in target language
|
| 262 |
-
|
| 263 |
-
2. **API Request:**
|
| 264 |
-
```bash
|
| 265 |
-
curl -X POST "http://your-space.hf.space/tts" \
|
| 266 |
-
-F "text=Your text to synthesize" \
|
| 267 |
-
-F "language=en" \
|
| 268 |
-
-F "speaker_file=@your_reference.wav" \
|
| 269 |
-
--output result.wav
|
| 270 |
-
```
|
| 271 |
-
|
| 272 |
-
3. **Tips for Best Results:**
|
| 273 |
-
- Use high-quality reference audio
|
| 274 |
-
- Match the language of reference and target text
|
| 275 |
-
- Keep text length reasonable (under 500 characters)
|
| 276 |
-
- Experiment with different reference samples
|
| 277 |
-
|
| 278 |
-
## Supported Languages
|
| 279 |
-
|
| 280 |
-
The XTTS v2 model supports multiple languages including:
|
| 281 |
-
- English (en)
|
| 282 |
-
- Spanish (es)
|
| 283 |
-
- French (fr)
|
| 284 |
-
- German (de)
|
| 285 |
-
- Italian (it)
|
| 286 |
-
- Portuguese (pt)
|
| 287 |
-
- Polish (pl)
|
| 288 |
-
- Turkish (tr)
|
| 289 |
-
- Russian (ru)
|
| 290 |
-
- Dutch (nl)
|
| 291 |
-
- Czech (cs)
|
| 292 |
-
- Arabic (ar)
|
| 293 |
-
- Chinese (zh-cn)
|
| 294 |
-
- Japanese (ja)
|
| 295 |
-
- Hungarian (hu)
|
| 296 |
-
- Korean (ko)
|
| 297 |
-
|
| 298 |
-
## Troubleshooting
|
| 299 |
-
|
| 300 |
-
### Common Issues
|
| 301 |
-
|
| 302 |
-
1. **Model Download Errors:**
|
| 303 |
-
- The first run downloads ~1.7GB model files
|
| 304 |
-
- Ensure stable internet connection
|
| 305 |
-
- Check Hugging Face Spaces logs
|
| 306 |
-
|
| 307 |
-
2. **Audio Quality Issues:**
|
| 308 |
-
- Use high-quality reference audio for voice cloning
|
| 309 |
-
- Ensure reference audio matches target language
|
| 310 |
-
- Try different reference samples
|
| 311 |
-
|
| 312 |
-
3. **Memory Issues on HF Spaces:**
|
| 313 |
-
- The model requires significant memory
|
| 314 |
-
- Consider upgrading to a higher-tier Space if needed
|
| 315 |
-
|
| 316 |
-
4. **API Timeouts:**
|
| 317 |
-
- Initial model loading takes time
|
| 318 |
-
- Subsequent requests are faster
|
| 319 |
-
- Consider warming up the model with a test request
|
| 320 |
-
|
| 321 |
-
### Environment Variables
|
| 322 |
-
|
| 323 |
-
- `COQUI_TOS_AGREED=1`: Accepts Coqui TTS terms of service
|
| 324 |
-
- `HF_HUB_DISABLE_TELEMETRY=1`: Disables telemetry
|
| 325 |
-
- `TORCH_HOME`: PyTorch cache directory
|
| 326 |
-
|
| 327 |
-
## API Documentation
|
| 328 |
-
|
| 329 |
-
Once deployed, visit your Space URL and add `/docs` to access the interactive API documentation:
|
| 330 |
-
```
|
| 331 |
-
https://your-username-your-space-name.hf.space/docs
|
| 332 |
-
```
|
| 333 |
-
|
| 334 |
-
## Contributing
|
| 335 |
-
|
| 336 |
-
1. Fork the repository
|
| 337 |
-
2. Create a feature branch
|
| 338 |
-
3. Make your changes
|
| 339 |
-
4. Test thoroughly
|
| 340 |
-
5. Submit a pull request
|
| 341 |
-
|
| 342 |
-
## License
|
| 343 |
-
|
| 344 |
-
This project uses the Coqui TTS library. Please check [Coqui TTS license](https://github.com/coqui-ai/TTS) for usage terms.
|
| 345 |
-
|
| 346 |
-
## Credits
|
| 347 |
-
|
| 348 |
-
- [Coqui TTS](https://github.com/coqui-ai/TTS) - The underlying TTS engine
|
| 349 |
-
- [XTTS v2](https://arxiv.org/abs/2309.11321) - The voice cloning model
|
| 350 |
-
- [FastAPI](https://fastapi.tiangolo.com/) - Web framework
|
| 351 |
-
- [Hugging Face Spaces](https://huggingface.co/spaces) - Deployment platform
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app.py
DELETED
|
@@ -1,414 +0,0 @@
|
|
| 1 |
-
# Import configuration first to setup environment
|
| 2 |
-
import app_config
|
| 3 |
-
|
| 4 |
-
import os
|
| 5 |
-
import sys
|
| 6 |
-
import io
|
| 7 |
-
import subprocess
|
| 8 |
-
import uuid
|
| 9 |
-
import time
|
| 10 |
-
import torch
|
| 11 |
-
import torchaudio
|
| 12 |
-
import tempfile
|
| 13 |
-
import logging
|
| 14 |
-
from typing import Optional
|
| 15 |
-
|
| 16 |
-
# Fix PyTorch weights_only issue for XTTS
|
| 17 |
-
import torch.serialization
|
| 18 |
-
from TTS.tts.configs.xtts_config import XttsConfig
|
| 19 |
-
torch.serialization.add_safe_globals([XttsConfig])
|
| 20 |
-
|
| 21 |
-
# Set environment variables
|
| 22 |
-
os.environ["COQUI_TOS_AGREED"] = "1"
|
| 23 |
-
os.environ["NUMBA_DISABLE_JIT"] = "1"
|
| 24 |
-
|
| 25 |
-
# Force CPU usage if specified
|
| 26 |
-
if os.environ.get("FORCE_CPU", "false").lower() == "true":
|
| 27 |
-
os.environ["CUDA_VISIBLE_DEVICES"] = ""
|
| 28 |
-
|
| 29 |
-
from fastapi import FastAPI, HTTPException, UploadFile, File, Form
|
| 30 |
-
from fastapi.responses import FileResponse
|
| 31 |
-
from pydantic import BaseModel
|
| 32 |
-
import langid
|
| 33 |
-
from scipy.io.wavfile import write
|
| 34 |
-
from pydub import AudioSegment
|
| 35 |
-
|
| 36 |
-
from TTS.api import TTS
|
| 37 |
-
from TTS.tts.configs.xtts_config import XttsConfig
|
| 38 |
-
from TTS.tts.models.xtts import Xtts
|
| 39 |
-
from TTS.utils.generic_utils import get_user_data_dir
|
| 40 |
-
|
| 41 |
-
# Configure logging
|
| 42 |
-
logging.basicConfig(level=logging.INFO)
|
| 43 |
-
logger = logging.getLogger(__name__)
|
| 44 |
-
|
| 45 |
-
app = FastAPI(title="XTTS C3PO API", description="Text-to-Speech API using XTTS-v2 C3PO model", version="1.0.0")
|
| 46 |
-
|
| 47 |
-
class TTSRequest(BaseModel):
|
| 48 |
-
text: str
|
| 49 |
-
language: str = "en"
|
| 50 |
-
voice_cleanup: bool = False
|
| 51 |
-
no_lang_auto_detect: bool = False
|
| 52 |
-
|
| 53 |
-
class XTTSService:
|
| 54 |
-
def __init__(self):
|
| 55 |
-
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 56 |
-
logger.info(f"Using device: {self.device}")
|
| 57 |
-
|
| 58 |
-
# Use the C3PO model path
|
| 59 |
-
self.model_path = "XTTS-v2_C3PO/"
|
| 60 |
-
self.config_path = "XTTS-v2_C3PO/config.json"
|
| 61 |
-
|
| 62 |
-
# Check if model files exist, if not download them
|
| 63 |
-
if not os.path.exists(self.config_path):
|
| 64 |
-
logger.info("C3PO model not found locally, downloading...")
|
| 65 |
-
self._download_c3po_model()
|
| 66 |
-
|
| 67 |
-
# Load configuration
|
| 68 |
-
config = XttsConfig()
|
| 69 |
-
config.load_json(self.config_path)
|
| 70 |
-
|
| 71 |
-
# Initialize and load model
|
| 72 |
-
self.model = Xtts.init_from_config(config)
|
| 73 |
-
self.model.load_checkpoint(
|
| 74 |
-
config,
|
| 75 |
-
checkpoint_path=os.path.join(self.model_path, "model.pth"),
|
| 76 |
-
vocab_path=os.path.join(self.model_path, "vocab.json"),
|
| 77 |
-
eval=True,
|
| 78 |
-
)
|
| 79 |
-
|
| 80 |
-
if self.device == "cuda":
|
| 81 |
-
self.model.cuda()
|
| 82 |
-
|
| 83 |
-
self.supported_languages = config.languages
|
| 84 |
-
logger.info(f"XTTS C3PO model loaded successfully. Supported languages: {self.supported_languages}")
|
| 85 |
-
|
| 86 |
-
# Set default reference audio (C3PO voice)
|
| 87 |
-
self.default_reference = os.path.join(self.model_path, "reference.wav")
|
| 88 |
-
if not os.path.exists(self.default_reference):
|
| 89 |
-
# Look for any reference audio in the model directory
|
| 90 |
-
for file in os.listdir(self.model_path):
|
| 91 |
-
if file.endswith(('.wav', '.mp3', '.m4a')):
|
| 92 |
-
self.default_reference = os.path.join(self.model_path, file)
|
| 93 |
-
break
|
| 94 |
-
else:
|
| 95 |
-
self.default_reference = None
|
| 96 |
-
|
| 97 |
-
if self.default_reference:
|
| 98 |
-
logger.info(f"Default C3PO reference audio: {self.default_reference}")
|
| 99 |
-
else:
|
| 100 |
-
logger.warning("No default reference audio found in C3PO model directory")
|
| 101 |
-
|
| 102 |
-
def _download_c3po_model(self):
|
| 103 |
-
"""Download the C3PO model from Hugging Face"""
|
| 104 |
-
try:
|
| 105 |
-
logger.info("Downloading C3PO model from Hugging Face...")
|
| 106 |
-
subprocess.run([
|
| 107 |
-
"git", "clone",
|
| 108 |
-
"https://huggingface.co/Borcherding/XTTS-v2_C3PO",
|
| 109 |
-
"XTTS-v2_C3PO"
|
| 110 |
-
], check=True)
|
| 111 |
-
logger.info("C3PO model downloaded successfully")
|
| 112 |
-
except subprocess.CalledProcessError as e:
|
| 113 |
-
logger.error(f"Failed to download C3PO model: {e}")
|
| 114 |
-
raise HTTPException(status_code=500, detail="Failed to download C3PO model")
|
| 115 |
-
|
| 116 |
-
def generate_speech(self, text: str, speaker_wav_path: str = None, language: str = "en",
|
| 117 |
-
voice_cleanup: bool = False, no_lang_auto_detect: bool = False) -> str:
|
| 118 |
-
"""Generate speech and return the path to the output file"""
|
| 119 |
-
try:
|
| 120 |
-
# Use default C3PO voice if no speaker file provided
|
| 121 |
-
if speaker_wav_path is None:
|
| 122 |
-
if self.default_reference is None:
|
| 123 |
-
raise HTTPException(status_code=400, detail="No reference audio available. Please upload a speaker file.")
|
| 124 |
-
speaker_wav_path = self.default_reference
|
| 125 |
-
logger.info("Using default C3PO voice")
|
| 126 |
-
|
| 127 |
-
# Validate language
|
| 128 |
-
if language not in self.supported_languages:
|
| 129 |
-
raise HTTPException(status_code=400, detail=f"Language '{language}' not supported. Supported: {self.supported_languages}")
|
| 130 |
-
|
| 131 |
-
# Language detection for longer texts
|
| 132 |
-
if len(text) > 15 and not no_lang_auto_detect:
|
| 133 |
-
language_predicted = langid.classify(text)[0].strip()
|
| 134 |
-
if language_predicted == "zh":
|
| 135 |
-
language_predicted = "zh-cn"
|
| 136 |
-
|
| 137 |
-
if language_predicted != language:
|
| 138 |
-
logger.warning(f"Detected language: {language_predicted}, chosen: {language}")
|
| 139 |
-
|
| 140 |
-
# Text length validation
|
| 141 |
-
if len(text) < 2:
|
| 142 |
-
raise HTTPException(status_code=400, detail="Text too short, please provide longer text")
|
| 143 |
-
|
| 144 |
-
if len(text) > 500: # Increased limit for API
|
| 145 |
-
raise HTTPException(status_code=400, detail="Text too long, maximum 500 characters")
|
| 146 |
-
|
| 147 |
-
# Voice cleanup if requested
|
| 148 |
-
processed_speaker_wav = speaker_wav_path
|
| 149 |
-
if voice_cleanup:
|
| 150 |
-
processed_speaker_wav = self._cleanup_audio(speaker_wav_path)
|
| 151 |
-
|
| 152 |
-
# Generate conditioning latents
|
| 153 |
-
try:
|
| 154 |
-
gpt_cond_latent, speaker_embedding = self.model.get_conditioning_latents(
|
| 155 |
-
audio_path=processed_speaker_wav,
|
| 156 |
-
gpt_cond_len=30,
|
| 157 |
-
gpt_cond_chunk_len=4,
|
| 158 |
-
max_ref_length=60
|
| 159 |
-
)
|
| 160 |
-
except Exception as e:
|
| 161 |
-
logger.error(f"Speaker encoding error: {e}")
|
| 162 |
-
raise HTTPException(status_code=400, detail="Error processing reference audio. Please check the audio file.")
|
| 163 |
-
|
| 164 |
-
# Generate speech
|
| 165 |
-
logger.info("Generating speech...")
|
| 166 |
-
start_time = time.time()
|
| 167 |
-
|
| 168 |
-
out = self.model.inference(
|
| 169 |
-
text,
|
| 170 |
-
language,
|
| 171 |
-
gpt_cond_latent,
|
| 172 |
-
speaker_embedding,
|
| 173 |
-
repetition_penalty=5.0,
|
| 174 |
-
temperature=0.75,
|
| 175 |
-
)
|
| 176 |
-
|
| 177 |
-
inference_time = time.time() - start_time
|
| 178 |
-
logger.info(f"Speech generation completed in {inference_time:.2f} seconds")
|
| 179 |
-
|
| 180 |
-
# Save output
|
| 181 |
-
output_filename = f"xtts_c3po_output_{uuid.uuid4().hex}.wav"
|
| 182 |
-
output_path = os.path.join(tempfile.gettempdir(), output_filename)
|
| 183 |
-
|
| 184 |
-
torchaudio.save(output_path, torch.tensor(out["wav"]).unsqueeze(0), 24000)
|
| 185 |
-
|
| 186 |
-
return output_path
|
| 187 |
-
|
| 188 |
-
except Exception as e:
|
| 189 |
-
logger.error(f"Error generating speech: {e}")
|
| 190 |
-
if isinstance(e, HTTPException):
|
| 191 |
-
raise e
|
| 192 |
-
raise HTTPException(status_code=500, detail=f"Failed to generate speech: {str(e)}")
|
| 193 |
-
|
| 194 |
-
def _cleanup_audio(self, audio_path: str) -> str:
|
| 195 |
-
"""Apply audio cleanup filters"""
|
| 196 |
-
try:
|
| 197 |
-
output_path = audio_path + "_cleaned.wav"
|
| 198 |
-
|
| 199 |
-
# Basic audio cleanup using ffmpeg-python or similar
|
| 200 |
-
# For now, just return the original path
|
| 201 |
-
# You can implement more sophisticated cleanup here
|
| 202 |
-
|
| 203 |
-
return audio_path
|
| 204 |
-
except Exception as e:
|
| 205 |
-
logger.warning(f"Audio cleanup failed: {e}, using original audio")
|
| 206 |
-
return audio_path
|
| 207 |
-
|
| 208 |
-
# Initialize XTTS service
|
| 209 |
-
logger.info("Initializing XTTS C3PO service...")
|
| 210 |
-
tts_service = XTTSService()
|
| 211 |
-
|
| 212 |
-
@app.get("/")
|
| 213 |
-
async def root():
|
| 214 |
-
return {"message": "XTTS C3PO API is running", "status": "healthy", "model": "C3PO"}
|
| 215 |
-
|
| 216 |
-
@app.get("/health")
|
| 217 |
-
async def health_check():
|
| 218 |
-
return {
|
| 219 |
-
"status": "healthy",
|
| 220 |
-
"device": tts_service.device,
|
| 221 |
-
"model": "XTTS-v2 C3PO",
|
| 222 |
-
"supported_languages": tts_service.supported_languages,
|
| 223 |
-
"default_voice": "C3PO" if tts_service.default_reference else "None"
|
| 224 |
-
}
|
| 225 |
-
|
| 226 |
-
@app.get("/languages")
|
| 227 |
-
async def get_languages():
|
| 228 |
-
"""Get list of supported languages"""
|
| 229 |
-
return {"languages": tts_service.supported_languages}
|
| 230 |
-
|
| 231 |
-
@app.post("/tts")
|
| 232 |
-
async def text_to_speech(
|
| 233 |
-
text: str = Form(...),
|
| 234 |
-
language: str = Form("en"),
|
| 235 |
-
voice_cleanup: bool = Form(False),
|
| 236 |
-
no_lang_auto_detect: bool = Form(False),
|
| 237 |
-
speaker_file: UploadFile = File(None)
|
| 238 |
-
):
|
| 239 |
-
"""
|
| 240 |
-
Convert text to speech using XTTS C3PO voice cloning
|
| 241 |
-
|
| 242 |
-
- **text**: The text to convert to speech (max 500 characters)
|
| 243 |
-
- **language**: Language code (default: "en")
|
| 244 |
-
- **voice_cleanup**: Apply audio cleanup to reference voice
|
| 245 |
-
- **no_lang_auto_detect**: Disable automatic language detection
|
| 246 |
-
- **speaker_file**: Reference speaker audio file (optional, uses C3PO voice if not provided)
|
| 247 |
-
"""
|
| 248 |
-
|
| 249 |
-
if not text.strip():
|
| 250 |
-
raise HTTPException(status_code=400, detail="Text cannot be empty")
|
| 251 |
-
|
| 252 |
-
speaker_temp_path = None
|
| 253 |
-
|
| 254 |
-
try:
|
| 255 |
-
# Handle speaker file if provided
|
| 256 |
-
if speaker_file is not None:
|
| 257 |
-
# Validate file type
|
| 258 |
-
if not speaker_file.content_type.startswith('audio/'):
|
| 259 |
-
raise HTTPException(status_code=400, detail="Speaker file must be an audio file")
|
| 260 |
-
|
| 261 |
-
# Save uploaded speaker file temporarily
|
| 262 |
-
speaker_temp_path = os.path.join(tempfile.gettempdir(), f"speaker_{uuid.uuid4().hex}.wav")
|
| 263 |
-
|
| 264 |
-
with open(speaker_temp_path, "wb") as buffer:
|
| 265 |
-
content = await speaker_file.read()
|
| 266 |
-
buffer.write(content)
|
| 267 |
-
|
| 268 |
-
# Generate speech (will use C3PO voice if no speaker file provided)
|
| 269 |
-
output_path = tts_service.generate_speech(
|
| 270 |
-
text,
|
| 271 |
-
speaker_temp_path,
|
| 272 |
-
language,
|
| 273 |
-
voice_cleanup,
|
| 274 |
-
no_lang_auto_detect
|
| 275 |
-
)
|
| 276 |
-
|
| 277 |
-
# Clean up temporary speaker file
|
| 278 |
-
if speaker_temp_path and os.path.exists(speaker_temp_path):
|
| 279 |
-
try:
|
| 280 |
-
os.remove(speaker_temp_path)
|
| 281 |
-
except:
|
| 282 |
-
pass
|
| 283 |
-
|
| 284 |
-
# Return the generated audio file
|
| 285 |
-
voice_type = "custom" if speaker_file else "c3po"
|
| 286 |
-
return FileResponse(
|
| 287 |
-
output_path,
|
| 288 |
-
media_type="audio/wav",
|
| 289 |
-
filename=f"xtts_{voice_type}_output_{uuid.uuid4().hex}.wav",
|
| 290 |
-
headers={"Content-Disposition": "attachment"}
|
| 291 |
-
)
|
| 292 |
-
|
| 293 |
-
except Exception as e:
|
| 294 |
-
# Clean up files in case of error
|
| 295 |
-
if speaker_temp_path and os.path.exists(speaker_temp_path):
|
| 296 |
-
try:
|
| 297 |
-
os.remove(speaker_temp_path)
|
| 298 |
-
except:
|
| 299 |
-
pass
|
| 300 |
-
|
| 301 |
-
logger.error(f"Error in TTS endpoint: {e}")
|
| 302 |
-
if isinstance(e, HTTPException):
|
| 303 |
-
raise e
|
| 304 |
-
raise HTTPException(status_code=500, detail=str(e))
|
| 305 |
-
|
| 306 |
-
@app.post("/tts-json")
|
| 307 |
-
async def text_to_speech_json(
|
| 308 |
-
request: TTSRequest,
|
| 309 |
-
speaker_file: UploadFile = File(None)
|
| 310 |
-
):
|
| 311 |
-
"""
|
| 312 |
-
Convert text to speech using JSON request body
|
| 313 |
-
|
| 314 |
-
- **request**: TTSRequest containing text, language, and options
|
| 315 |
-
- **speaker_file**: Reference speaker audio file (optional, uses C3PO voice if not provided)
|
| 316 |
-
"""
|
| 317 |
-
|
| 318 |
-
if not request.text.strip():
|
| 319 |
-
raise HTTPException(status_code=400, detail="Text cannot be empty")
|
| 320 |
-
|
| 321 |
-
speaker_temp_path = None
|
| 322 |
-
|
| 323 |
-
try:
|
| 324 |
-
# Handle speaker file if provided
|
| 325 |
-
if speaker_file is not None:
|
| 326 |
-
# Validate file type
|
| 327 |
-
if not speaker_file.content_type.startswith('audio/'):
|
| 328 |
-
raise HTTPException(status_code=400, detail="Speaker file must be an audio file")
|
| 329 |
-
|
| 330 |
-
# Save uploaded speaker file temporarily
|
| 331 |
-
speaker_temp_path = os.path.join(tempfile.gettempdir(), f"speaker_{uuid.uuid4().hex}.wav")
|
| 332 |
-
|
| 333 |
-
with open(speaker_temp_path, "wb") as buffer:
|
| 334 |
-
content = await speaker_file.read()
|
| 335 |
-
buffer.write(content)
|
| 336 |
-
|
| 337 |
-
# Generate speech
|
| 338 |
-
output_path = tts_service.generate_speech(
|
| 339 |
-
request.text,
|
| 340 |
-
speaker_temp_path,
|
| 341 |
-
request.language,
|
| 342 |
-
request.voice_cleanup,
|
| 343 |
-
request.no_lang_auto_detect
|
| 344 |
-
)
|
| 345 |
-
|
| 346 |
-
# Clean up temporary speaker file
|
| 347 |
-
if speaker_temp_path and os.path.exists(speaker_temp_path):
|
| 348 |
-
try:
|
| 349 |
-
os.remove(speaker_temp_path)
|
| 350 |
-
except:
|
| 351 |
-
pass
|
| 352 |
-
|
| 353 |
-
# Return the generated audio file
|
| 354 |
-
voice_type = "custom" if speaker_file else "c3po"
|
| 355 |
-
return FileResponse(
|
| 356 |
-
output_path,
|
| 357 |
-
media_type="audio/wav",
|
| 358 |
-
filename=f"xtts_{voice_type}_{request.language}_{uuid.uuid4().hex}.wav",
|
| 359 |
-
headers={"Content-Disposition": "attachment"}
|
| 360 |
-
)
|
| 361 |
-
|
| 362 |
-
except Exception as e:
|
| 363 |
-
# Clean up files in case of error
|
| 364 |
-
if speaker_temp_path and os.path.exists(speaker_temp_path):
|
| 365 |
-
try:
|
| 366 |
-
os.remove(speaker_temp_path)
|
| 367 |
-
except:
|
| 368 |
-
pass
|
| 369 |
-
|
| 370 |
-
logger.error(f"Error in TTS JSON endpoint: {e}")
|
| 371 |
-
if isinstance(e, HTTPException):
|
| 372 |
-
raise e
|
| 373 |
-
raise HTTPException(status_code=500, detail=str(e))
|
| 374 |
-
|
| 375 |
-
@app.post("/tts-c3po")
|
| 376 |
-
async def text_to_speech_c3po_only(
|
| 377 |
-
text: str = Form(...),
|
| 378 |
-
language: str = Form("en"),
|
| 379 |
-
no_lang_auto_detect: bool = Form(False)
|
| 380 |
-
):
|
| 381 |
-
"""
|
| 382 |
-
Convert text to speech using C3PO voice only (no file upload needed)
|
| 383 |
-
|
| 384 |
-
- **text**: The text to convert to speech (max 500 characters)
|
| 385 |
-
- **language**: Language code (default: "en")
|
| 386 |
-
- **no_lang_auto_detect**: Disable automatic language detection
|
| 387 |
-
"""
|
| 388 |
-
|
| 389 |
-
if not text.strip():
|
| 390 |
-
raise HTTPException(status_code=400, detail="Text cannot be empty")
|
| 391 |
-
|
| 392 |
-
try:
|
| 393 |
-
# Generate speech using C3PO voice
|
| 394 |
-
output_path = tts_service.generate_speech(
|
| 395 |
-
text,
|
| 396 |
-
None, # Use default C3PO voice
|
| 397 |
-
language,
|
| 398 |
-
False, # No voice cleanup needed for default voice
|
| 399 |
-
no_lang_auto_detect
|
| 400 |
-
)
|
| 401 |
-
|
| 402 |
-
# Return the generated audio file
|
| 403 |
-
return FileResponse(
|
| 404 |
-
output_path,
|
| 405 |
-
media_type="audio/wav",
|
| 406 |
-
filename=f"c3po_voice_{uuid.uuid4().hex}.wav",
|
| 407 |
-
headers={"Content-Disposition": "attachment"}
|
| 408 |
-
)
|
| 409 |
-
|
| 410 |
-
except Exception as e:
|
| 411 |
-
logger.error(f"Error in C3PO TTS endpoint: {e}")
|
| 412 |
-
if isinstance(e, HTTPException):
|
| 413 |
-
raise e
|
| 414 |
-
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app_config.py
DELETED
|
@@ -1,54 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Configuration for Kokoro TTS API, especially for Hugging Face Spaces deployment.
|
| 3 |
-
"""
|
| 4 |
-
|
| 5 |
-
import os
|
| 6 |
-
import tempfile
|
| 7 |
-
import logging
|
| 8 |
-
|
| 9 |
-
# Configure logging
|
| 10 |
-
logging.basicConfig(level=logging.INFO)
|
| 11 |
-
logger = logging.getLogger(__name__)
|
| 12 |
-
|
| 13 |
-
def setup_hf_cache():
|
| 14 |
-
"""Setup cache environment variables for Hugging Face Spaces"""
|
| 15 |
-
# Use user's home directory for cache
|
| 16 |
-
home_dir = os.path.expanduser("~")
|
| 17 |
-
cache_dir = os.path.join(home_dir, ".cache")
|
| 18 |
-
|
| 19 |
-
cache_settings = {
|
| 20 |
-
'HF_HOME': cache_dir,
|
| 21 |
-
'TRANSFORMERS_CACHE': cache_dir,
|
| 22 |
-
'HF_HUB_CACHE': cache_dir,
|
| 23 |
-
'TORCH_HOME': cache_dir,
|
| 24 |
-
'NUMBA_CACHE_DIR': os.path.join(cache_dir, 'numba'),
|
| 25 |
-
'NUMBA_DISABLE_JIT': '1',
|
| 26 |
-
'HF_HUB_DISABLE_TELEMETRY': '1'
|
| 27 |
-
}
|
| 28 |
-
|
| 29 |
-
# Set environment variables
|
| 30 |
-
for key, value in cache_settings.items():
|
| 31 |
-
os.environ[key] = value
|
| 32 |
-
logger.info(f"Set {key} to {value}")
|
| 33 |
-
|
| 34 |
-
# Create cache directories
|
| 35 |
-
cache_dirs = [cache_dir, os.path.join(cache_dir, 'numba')]
|
| 36 |
-
for cache_path in cache_dirs:
|
| 37 |
-
try:
|
| 38 |
-
os.makedirs(cache_path, exist_ok=True)
|
| 39 |
-
logger.info(f"Created cache directory: {cache_path}")
|
| 40 |
-
except Exception as e:
|
| 41 |
-
logger.warning(f"Could not create {cache_path}: {e}")
|
| 42 |
-
|
| 43 |
-
logger.info("Cache environment setup completed")
|
| 44 |
-
|
| 45 |
-
def get_temp_dir():
|
| 46 |
-
"""Get a writable temporary directory"""
|
| 47 |
-
return tempfile.gettempdir()
|
| 48 |
-
|
| 49 |
-
def is_hf_spaces():
|
| 50 |
-
"""Check if running on Hugging Face Spaces"""
|
| 51 |
-
return os.environ.get('SPACE_ID') is not None
|
| 52 |
-
|
| 53 |
-
# Initialize cache setup
|
| 54 |
-
setup_hf_cache()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
client_example.py
DELETED
|
@@ -1,269 +0,0 @@
|
|
| 1 |
-
import requests
|
| 2 |
-
import os
|
| 3 |
-
|
| 4 |
-
def test_c3po_voice():
|
| 5 |
-
"""Test the C3PO voice without uploading any files"""
|
| 6 |
-
|
| 7 |
-
# API endpoint for C3PO voice only
|
| 8 |
-
url = "http://localhost:7860/tts-c3po"
|
| 9 |
-
|
| 10 |
-
# Text to convert to speech
|
| 11 |
-
text = "Hello there! I am C-3PO, human-cyborg relations. How may I assist you today?"
|
| 12 |
-
|
| 13 |
-
# Prepare the request data
|
| 14 |
-
data = {
|
| 15 |
-
"text": text,
|
| 16 |
-
"language": "en",
|
| 17 |
-
"no_lang_auto_detect": False
|
| 18 |
-
}
|
| 19 |
-
|
| 20 |
-
try:
|
| 21 |
-
print("Testing C3PO voice...")
|
| 22 |
-
print(f"Text: {text}")
|
| 23 |
-
|
| 24 |
-
response = requests.post(url, data=data)
|
| 25 |
-
|
| 26 |
-
if response.status_code == 200:
|
| 27 |
-
# Save the generated audio
|
| 28 |
-
output_filename = "c3po_voice_sample.wav"
|
| 29 |
-
with open(output_filename, "wb") as f:
|
| 30 |
-
f.write(response.content)
|
| 31 |
-
print(f"Success! C3PO voice sample saved as {output_filename}")
|
| 32 |
-
else:
|
| 33 |
-
print(f"Error: {response.status_code}")
|
| 34 |
-
print(response.text)
|
| 35 |
-
|
| 36 |
-
except requests.exceptions.ConnectionError:
|
| 37 |
-
print("Error: Could not connect to the API. Make sure the server is running on http://localhost:7860")
|
| 38 |
-
except Exception as e:
|
| 39 |
-
print(f"Error: {e}")
|
| 40 |
-
|
| 41 |
-
def test_xtts_with_custom_voice():
|
| 42 |
-
"""Example of using XTTS with custom voice upload"""
|
| 43 |
-
|
| 44 |
-
# API endpoint
|
| 45 |
-
url = "http://localhost:7860/tts"
|
| 46 |
-
|
| 47 |
-
# Text to convert to speech
|
| 48 |
-
text = "This is a test of XTTS voice cloning with a custom reference voice."
|
| 49 |
-
|
| 50 |
-
# Path to your speaker reference audio file
|
| 51 |
-
speaker_file_path = "reference.wav" # Update this path to your reference audio
|
| 52 |
-
|
| 53 |
-
# Check if speaker file exists
|
| 54 |
-
if not os.path.exists(speaker_file_path):
|
| 55 |
-
print(f"Custom voice test skipped: Speaker file not found at {speaker_file_path}")
|
| 56 |
-
print("To test custom voice cloning:")
|
| 57 |
-
print("1. Record 3-10 seconds of clear speech")
|
| 58 |
-
print("2. Save as 'reference.wav' in this directory")
|
| 59 |
-
print("3. Run this test again")
|
| 60 |
-
return
|
| 61 |
-
|
| 62 |
-
# Prepare the request data
|
| 63 |
-
data = {
|
| 64 |
-
"text": text,
|
| 65 |
-
"language": "en",
|
| 66 |
-
"voice_cleanup": False,
|
| 67 |
-
"no_lang_auto_detect": False
|
| 68 |
-
}
|
| 69 |
-
|
| 70 |
-
files = {
|
| 71 |
-
"speaker_file": open(speaker_file_path, "rb")
|
| 72 |
-
}
|
| 73 |
-
|
| 74 |
-
try:
|
| 75 |
-
print("Testing XTTS with custom voice...")
|
| 76 |
-
print(f"Text: {text}")
|
| 77 |
-
print(f"Speaker file: {speaker_file_path}")
|
| 78 |
-
|
| 79 |
-
response = requests.post(url, data=data, files=files)
|
| 80 |
-
|
| 81 |
-
if response.status_code == 200:
|
| 82 |
-
# Save the generated audio
|
| 83 |
-
output_filename = "custom_voice_clone.wav"
|
| 84 |
-
with open(output_filename, "wb") as f:
|
| 85 |
-
f.write(response.content)
|
| 86 |
-
print(f"Success! Custom voice clone saved as {output_filename}")
|
| 87 |
-
else:
|
| 88 |
-
print(f"Error: {response.status_code}")
|
| 89 |
-
print(response.text)
|
| 90 |
-
|
| 91 |
-
except requests.exceptions.ConnectionError:
|
| 92 |
-
print("Error: Could not connect to the API. Make sure the server is running on http://localhost:7860")
|
| 93 |
-
except Exception as e:
|
| 94 |
-
print(f"Error: {e}")
|
| 95 |
-
finally:
|
| 96 |
-
files["speaker_file"].close()
|
| 97 |
-
|
| 98 |
-
def test_xtts_fallback_to_c3po():
|
| 99 |
-
"""Test XTTS endpoint without speaker file (should use C3PO voice)"""
|
| 100 |
-
|
| 101 |
-
# API endpoint
|
| 102 |
-
url = "http://localhost:7860/tts"
|
| 103 |
-
|
| 104 |
-
# Text to convert to speech
|
| 105 |
-
text = "When no custom voice is provided, I will speak in the C3PO voice by default."
|
| 106 |
-
|
| 107 |
-
# Prepare the request data (no speaker file)
|
| 108 |
-
data = {
|
| 109 |
-
"text": text,
|
| 110 |
-
"language": "en",
|
| 111 |
-
"voice_cleanup": False,
|
| 112 |
-
"no_lang_auto_detect": False
|
| 113 |
-
}
|
| 114 |
-
|
| 115 |
-
try:
|
| 116 |
-
print("Testing XTTS fallback to C3PO voice...")
|
| 117 |
-
print(f"Text: {text}")
|
| 118 |
-
|
| 119 |
-
response = requests.post(url, data=data)
|
| 120 |
-
|
| 121 |
-
if response.status_code == 200:
|
| 122 |
-
# Save the generated audio
|
| 123 |
-
output_filename = "xtts_c3po_fallback.wav"
|
| 124 |
-
with open(output_filename, "wb") as f:
|
| 125 |
-
f.write(response.content)
|
| 126 |
-
print(f"Success! XTTS with C3PO fallback saved as {output_filename}")
|
| 127 |
-
else:
|
| 128 |
-
print(f"Error: {response.status_code}")
|
| 129 |
-
print(response.text)
|
| 130 |
-
|
| 131 |
-
except requests.exceptions.ConnectionError:
|
| 132 |
-
print("Error: Could not connect to the API. Make sure the server is running on http://localhost:7860")
|
| 133 |
-
except Exception as e:
|
| 134 |
-
print(f"Error: {e}")
|
| 135 |
-
|
| 136 |
-
def test_multilingual_c3po():
|
| 137 |
-
"""Test C3PO voice in different languages"""
|
| 138 |
-
|
| 139 |
-
# API endpoint for C3PO voice only
|
| 140 |
-
url = "http://localhost:7860/tts-c3po"
|
| 141 |
-
|
| 142 |
-
# Test different languages
|
| 143 |
-
test_cases = [
|
| 144 |
-
("en", "Hello, I am C-3PO. I am fluent in over six million forms of communication."),
|
| 145 |
-
("es", "Hola, soy C-3PO. Domino más de seis millones de formas de comunicación."),
|
| 146 |
-
("fr", "Bonjour, je suis C-3PO. Je maîtrise plus de six millions de formes de communication."),
|
| 147 |
-
("de", "Hallo, ich bin C-3PO. Ich beherrsche über sechs Millionen Kommunikationsformen."),
|
| 148 |
-
]
|
| 149 |
-
|
| 150 |
-
for language, text in test_cases:
|
| 151 |
-
data = {
|
| 152 |
-
"text": text,
|
| 153 |
-
"language": language,
|
| 154 |
-
"no_lang_auto_detect": True # Force the specified language
|
| 155 |
-
}
|
| 156 |
-
|
| 157 |
-
try:
|
| 158 |
-
print(f"Testing C3PO voice in {language.upper()}...")
|
| 159 |
-
print(f"Text: {text}")
|
| 160 |
-
|
| 161 |
-
response = requests.post(url, data=data)
|
| 162 |
-
|
| 163 |
-
if response.status_code == 200:
|
| 164 |
-
# Save the generated audio
|
| 165 |
-
output_filename = f"c3po_voice_{language}.wav"
|
| 166 |
-
with open(output_filename, "wb") as f:
|
| 167 |
-
f.write(response.content)
|
| 168 |
-
print(f"Success! C3PO {language} voice saved as {output_filename}")
|
| 169 |
-
else:
|
| 170 |
-
print(f"Error: {response.status_code}")
|
| 171 |
-
print(response.text)
|
| 172 |
-
|
| 173 |
-
except requests.exceptions.ConnectionError:
|
| 174 |
-
print("Error: Could not connect to the API. Make sure the server is running on http://localhost:7860")
|
| 175 |
-
except Exception as e:
|
| 176 |
-
print(f"Error: {e}")
|
| 177 |
-
|
| 178 |
-
print() # Add spacing between tests
|
| 179 |
-
|
| 180 |
-
def get_supported_languages():
|
| 181 |
-
"""Get list of supported languages"""
|
| 182 |
-
try:
|
| 183 |
-
response = requests.get("http://localhost:7860/languages")
|
| 184 |
-
if response.status_code == 200:
|
| 185 |
-
languages = response.json()
|
| 186 |
-
print("Supported languages:", languages["languages"])
|
| 187 |
-
return languages["languages"]
|
| 188 |
-
else:
|
| 189 |
-
print("Failed to get languages:", response.status_code)
|
| 190 |
-
return []
|
| 191 |
-
except requests.exceptions.ConnectionError:
|
| 192 |
-
print("API is not running. Start it with: uvicorn app:app --host 0.0.0.0 --port 7860")
|
| 193 |
-
return []
|
| 194 |
-
|
| 195 |
-
def check_api_health():
|
| 196 |
-
"""Check if the API is running"""
|
| 197 |
-
try:
|
| 198 |
-
response = requests.get("http://localhost:7860/health")
|
| 199 |
-
if response.status_code == 200:
|
| 200 |
-
health_info = response.json()
|
| 201 |
-
print("API Health Check:")
|
| 202 |
-
print(f" Status: {health_info['status']}")
|
| 203 |
-
print(f" Device: {health_info['device']}")
|
| 204 |
-
print(f" Model: {health_info['model']}")
|
| 205 |
-
print(f" Default Voice: {health_info['default_voice']}")
|
| 206 |
-
print(f" Languages: {len(health_info['supported_languages'])} supported")
|
| 207 |
-
return True
|
| 208 |
-
else:
|
| 209 |
-
print("API health check failed:", response.status_code)
|
| 210 |
-
return False
|
| 211 |
-
except requests.exceptions.ConnectionError:
|
| 212 |
-
print("API is not running. Start it with: uvicorn app:app --host 0.0.0.0 --port 7860")
|
| 213 |
-
return False
|
| 214 |
-
|
| 215 |
-
def create_sample_reference():
|
| 216 |
-
"""Instructions for creating a reference audio file"""
|
| 217 |
-
print("\n" + "="*50)
|
| 218 |
-
print("REFERENCE AUDIO SETUP")
|
| 219 |
-
print("="*50)
|
| 220 |
-
print("To use XTTS voice cloning, you need a reference audio file:")
|
| 221 |
-
print("1. Record 3-10 seconds of clear speech")
|
| 222 |
-
print("2. Save as WAV format (recommended)")
|
| 223 |
-
print("3. Ensure good audio quality (no background noise)")
|
| 224 |
-
print("4. Place the file in the same directory as this script")
|
| 225 |
-
print("5. Update the 'speaker_file_path' variable in the functions above")
|
| 226 |
-
print("\nExample recording text:")
|
| 227 |
-
print("'Hello, this is my voice. I'm recording this sample for voice cloning.'")
|
| 228 |
-
print("="*50)
|
| 229 |
-
|
| 230 |
-
if __name__ == "__main__":
|
| 231 |
-
print("XTTS C3PO API Client Example")
|
| 232 |
-
print("=" * 40)
|
| 233 |
-
|
| 234 |
-
# First check if API is running
|
| 235 |
-
if check_api_health():
|
| 236 |
-
print()
|
| 237 |
-
|
| 238 |
-
# Get supported languages
|
| 239 |
-
languages = get_supported_languages()
|
| 240 |
-
print()
|
| 241 |
-
|
| 242 |
-
# Test C3PO voice (no file upload needed)
|
| 243 |
-
print("1. Testing C3PO voice (no upload required)...")
|
| 244 |
-
test_c3po_voice()
|
| 245 |
-
print()
|
| 246 |
-
|
| 247 |
-
# Test XTTS fallback to C3PO
|
| 248 |
-
print("2. Testing XTTS endpoint without speaker file (C3PO fallback)...")
|
| 249 |
-
test_xtts_fallback_to_c3po()
|
| 250 |
-
print()
|
| 251 |
-
|
| 252 |
-
# Test custom voice if reference file exists
|
| 253 |
-
print("3. Testing custom voice cloning...")
|
| 254 |
-
test_xtts_with_custom_voice()
|
| 255 |
-
print()
|
| 256 |
-
|
| 257 |
-
# Test multilingual C3PO
|
| 258 |
-
print("4. Testing multilingual C3PO voice...")
|
| 259 |
-
test_multilingual_c3po()
|
| 260 |
-
|
| 261 |
-
print("All tests completed!")
|
| 262 |
-
print("\nGenerated files:")
|
| 263 |
-
for file in os.listdir("."):
|
| 264 |
-
if file.endswith(".wav") and ("c3po" in file or "custom" in file or "xtts" in file):
|
| 265 |
-
print(f" - {file}")
|
| 266 |
-
|
| 267 |
-
else:
|
| 268 |
-
print("\nPlease start the API server first:")
|
| 269 |
-
print("uvicorn app:app --host 0.0.0.0 --port 7860")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requirements.txt
CHANGED
|
@@ -1,13 +1,8 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
num2words>=0.5.14
|
| 10 |
-
pysbd>=0.3.4
|
| 11 |
-
tqdm>=4.64.1
|
| 12 |
-
coqui-tts == 0.26.2
|
| 13 |
-
huggingface_hub>=0.17.0
|
|
|
|
| 1 |
+
fastapi>=0.104.1
|
| 2 |
+
uvicorn>=0.24.0
|
| 3 |
+
python-multipart>=0.0.6
|
| 4 |
+
torch>=2.0.0
|
| 5 |
+
torchaudio>=2.0.0
|
| 6 |
+
coqui-tts>=0.22.0
|
| 7 |
+
huggingface_hub>=0.17.0
|
| 8 |
+
pydantic>=2.0.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requirements_coqui.txt
DELETED
|
@@ -1,12 +0,0 @@
|
|
| 1 |
-
fastapi>=0.104.1
|
| 2 |
-
uvicorn[standard]>=0.24.0
|
| 3 |
-
python-multipart>=0.0.6
|
| 4 |
-
coqui-tts==0.26.2
|
| 5 |
-
torch>=2.0.0
|
| 6 |
-
torchaudio>=2.0.0
|
| 7 |
-
numpy>=1.24.0
|
| 8 |
-
scipy>=1.11.0
|
| 9 |
-
pydub>=0.25.1
|
| 10 |
-
librosa>=0.10.0
|
| 11 |
-
soundfile>=0.12.1
|
| 12 |
-
typing-extensions>=4.8.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
start_c3po_api.py
CHANGED
|
@@ -1,171 +1,52 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
-
|
| 4 |
-
Handles model download, initialization, and server startup
|
| 5 |
"""
|
| 6 |
|
| 7 |
import os
|
| 8 |
import sys
|
| 9 |
-
import subprocess
|
| 10 |
import logging
|
| 11 |
-
import time
|
| 12 |
-
from pathlib import Path
|
| 13 |
|
| 14 |
# Configure logging
|
| 15 |
-
logging.basicConfig(
|
| 16 |
-
level=logging.INFO,
|
| 17 |
-
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
| 18 |
-
)
|
| 19 |
logger = logging.getLogger(__name__)
|
| 20 |
|
| 21 |
-
def check_dependencies():
|
| 22 |
-
"""Check if all required dependencies are installed"""
|
| 23 |
-
logger.info("🔍 Checking dependencies...")
|
| 24 |
-
|
| 25 |
-
try:
|
| 26 |
-
import torch
|
| 27 |
-
import TTS
|
| 28 |
-
import fastapi
|
| 29 |
-
import huggingface_hub
|
| 30 |
-
logger.info("✅ All core dependencies found")
|
| 31 |
-
return True
|
| 32 |
-
except ImportError as e:
|
| 33 |
-
logger.error(f"❌ Missing dependency: {e}")
|
| 34 |
-
logger.info("💡 Install with: pip install -r requirements.txt")
|
| 35 |
-
return False
|
| 36 |
-
|
| 37 |
-
def check_gpu():
|
| 38 |
-
"""Check GPU availability"""
|
| 39 |
-
try:
|
| 40 |
-
import torch
|
| 41 |
-
if torch.cuda.is_available():
|
| 42 |
-
gpu_name = torch.cuda.get_device_name(0)
|
| 43 |
-
logger.info(f"🎮 GPU available: {gpu_name}")
|
| 44 |
-
return True
|
| 45 |
-
else:
|
| 46 |
-
logger.info("💻 No GPU available, using CPU")
|
| 47 |
-
return False
|
| 48 |
-
except Exception as e:
|
| 49 |
-
logger.warning(f"⚠️ GPU check failed: {e}")
|
| 50 |
-
return False
|
| 51 |
-
|
| 52 |
-
def check_disk_space():
|
| 53 |
-
"""Check available disk space for model download"""
|
| 54 |
-
try:
|
| 55 |
-
import shutil
|
| 56 |
-
free_space = shutil.disk_usage('.').free / (1024**3) # GB
|
| 57 |
-
|
| 58 |
-
if free_space < 5:
|
| 59 |
-
logger.warning(f"⚠️ Low disk space: {free_space:.1f}GB available")
|
| 60 |
-
logger.warning("💽 C-3PO model requires ~2GB space")
|
| 61 |
-
else:
|
| 62 |
-
logger.info(f"💾 Disk space: {free_space:.1f}GB available")
|
| 63 |
-
|
| 64 |
-
return free_space > 2
|
| 65 |
-
except Exception as e:
|
| 66 |
-
logger.warning(f"⚠️ Disk space check failed: {e}")
|
| 67 |
-
return True
|
| 68 |
-
|
| 69 |
def setup_environment():
|
| 70 |
-
"""Set up environment variables"""
|
| 71 |
os.environ["COQUI_TOS_AGREED"] = "1"
|
| 72 |
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
|
| 73 |
-
|
| 74 |
-
# Create models directory
|
| 75 |
-
models_dir = Path("./models")
|
| 76 |
-
models_dir.mkdir(exist_ok=True)
|
| 77 |
-
|
| 78 |
logger.info("🌍 Environment configured")
|
| 79 |
|
| 80 |
-
def
|
| 81 |
-
"""
|
| 82 |
-
logger.info("
|
| 83 |
-
|
| 84 |
-
try:
|
| 85 |
-
subprocess.check_call([
|
| 86 |
-
sys.executable, "-m", "pip", "install", "-r", "requirements.txt"
|
| 87 |
-
])
|
| 88 |
-
logger.info("✅ Dependencies installed successfully")
|
| 89 |
-
return True
|
| 90 |
-
except subprocess.CalledProcessError as e:
|
| 91 |
-
logger.error(f"❌ Failed to install dependencies: {e}")
|
| 92 |
-
return False
|
| 93 |
-
|
| 94 |
-
def test_model_download():
|
| 95 |
-
"""Test if the C-3PO model can be downloaded"""
|
| 96 |
-
logger.info("🤖 Testing C-3PO model availability...")
|
| 97 |
|
| 98 |
try:
|
| 99 |
-
from huggingface_hub import repo_info
|
| 100 |
-
|
| 101 |
-
# Check if the repo exists and is accessible
|
| 102 |
-
info = repo_info(repo_id="Borcherding/XTTS-v2_C3PO")
|
| 103 |
-
logger.info(f"✅ C-3PO model accessible: {info.id}")
|
| 104 |
-
logger.info(f" Last modified: {info.last_modified}")
|
| 105 |
-
|
| 106 |
-
return True
|
| 107 |
-
except Exception as e:
|
| 108 |
-
logger.error(f"❌ C-3PO model not accessible: {e}")
|
| 109 |
-
return False
|
| 110 |
-
|
| 111 |
-
def start_api_server():
|
| 112 |
-
"""Start the FastAPI server"""
|
| 113 |
-
logger.info("🚀 Starting C-3PO TTS API server...")
|
| 114 |
-
|
| 115 |
-
try:
|
| 116 |
-
# Import and run the API
|
| 117 |
import uvicorn
|
| 118 |
from coqui_api import app
|
| 119 |
|
| 120 |
logger.info("🎭 C-3PO TTS API starting on http://localhost:7860")
|
| 121 |
-
logger.info("📖 API documentation
|
| 122 |
|
| 123 |
-
uvicorn.run(
|
| 124 |
-
app,
|
| 125 |
-
host="0.0.0.0",
|
| 126 |
-
port=7860,
|
| 127 |
-
log_level="info"
|
| 128 |
-
)
|
| 129 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 130 |
except Exception as e:
|
| 131 |
-
logger.error(f"❌ Failed to start API
|
| 132 |
-
|
| 133 |
|
| 134 |
def main():
|
| 135 |
"""Main startup sequence"""
|
| 136 |
-
print("🤖 C-3PO TTS API
|
| 137 |
-
print("=" *
|
| 138 |
|
| 139 |
-
# Step 1: Check dependencies
|
| 140 |
-
if not check_dependencies():
|
| 141 |
-
logger.info("📦 Attempting to install dependencies...")
|
| 142 |
-
if not install_dependencies():
|
| 143 |
-
logger.error("❌ Failed to install dependencies. Exiting.")
|
| 144 |
-
sys.exit(1)
|
| 145 |
-
|
| 146 |
-
# Step 2: Setup environment
|
| 147 |
setup_environment()
|
| 148 |
|
| 149 |
-
# Step 3: Check system resources
|
| 150 |
-
has_gpu = check_gpu()
|
| 151 |
-
has_space = check_disk_space()
|
| 152 |
-
|
| 153 |
-
if not has_space:
|
| 154 |
-
logger.error("❌ Insufficient disk space. Exiting.")
|
| 155 |
-
sys.exit(1)
|
| 156 |
-
|
| 157 |
-
# Step 4: Test model availability
|
| 158 |
-
if not test_model_download():
|
| 159 |
-
logger.warning("⚠️ C-3PO model may not be accessible")
|
| 160 |
-
logger.warning(" The API will fall back to standard XTTS v2")
|
| 161 |
-
|
| 162 |
-
# Step 5: Start the server
|
| 163 |
-
print("\n" + "=" * 50)
|
| 164 |
-
logger.info("🎬 All checks passed! Starting C-3PO TTS API...")
|
| 165 |
-
print("=" * 50)
|
| 166 |
-
|
| 167 |
try:
|
| 168 |
-
|
| 169 |
except KeyboardInterrupt:
|
| 170 |
logger.info("\n🛑 Server stopped by user")
|
| 171 |
except Exception as e:
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
+
Simple startup script for C-3PO TTS API
|
|
|
|
| 4 |
"""
|
| 5 |
|
| 6 |
import os
|
| 7 |
import sys
|
|
|
|
| 8 |
import logging
|
|
|
|
|
|
|
| 9 |
|
| 10 |
# Configure logging
|
| 11 |
+
logging.basicConfig(level=logging.INFO)
|
|
|
|
|
|
|
|
|
|
| 12 |
logger = logging.getLogger(__name__)
|
| 13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
def setup_environment():
|
| 15 |
+
"""Set up required environment variables"""
|
| 16 |
os.environ["COQUI_TOS_AGREED"] = "1"
|
| 17 |
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
logger.info("🌍 Environment configured")
|
| 19 |
|
| 20 |
+
def start_api():
|
| 21 |
+
"""Start the C-3PO TTS API"""
|
| 22 |
+
logger.info("🤖 Starting C-3PO TTS API...")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
import uvicorn
|
| 26 |
from coqui_api import app
|
| 27 |
|
| 28 |
logger.info("🎭 C-3PO TTS API starting on http://localhost:7860")
|
| 29 |
+
logger.info("📖 API documentation: http://localhost:7860/docs")
|
| 30 |
|
| 31 |
+
uvicorn.run(app, host="0.0.0.0", port=7860, log_level="info")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
+
except ImportError as e:
|
| 34 |
+
logger.error(f"❌ Missing dependency: {e}")
|
| 35 |
+
logger.info("💡 Install with: pip install -r requirements.txt")
|
| 36 |
+
sys.exit(1)
|
| 37 |
except Exception as e:
|
| 38 |
+
logger.error(f"❌ Failed to start API: {e}")
|
| 39 |
+
sys.exit(1)
|
| 40 |
|
| 41 |
def main():
|
| 42 |
"""Main startup sequence"""
|
| 43 |
+
print("🤖 C-3PO TTS API")
|
| 44 |
+
print("=" * 30)
|
| 45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
setup_environment()
|
| 47 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
try:
|
| 49 |
+
start_api()
|
| 50 |
except KeyboardInterrupt:
|
| 51 |
logger.info("\n🛑 Server stopped by user")
|
| 52 |
except Exception as e:
|
startup.py
DELETED
|
@@ -1,120 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
"""
|
| 3 |
-
Startup script for Kokoro TTS API on Hugging Face Spaces
|
| 4 |
-
"""
|
| 5 |
-
|
| 6 |
-
import os
|
| 7 |
-
import sys
|
| 8 |
-
import logging
|
| 9 |
-
import subprocess
|
| 10 |
-
|
| 11 |
-
# Configure logging
|
| 12 |
-
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
| 13 |
-
logger = logging.getLogger(__name__)
|
| 14 |
-
|
| 15 |
-
def check_environment():
|
| 16 |
-
"""Check the environment and permissions"""
|
| 17 |
-
logger.info("=== Environment Check ===")
|
| 18 |
-
|
| 19 |
-
# Check if running on HF Spaces
|
| 20 |
-
space_id = os.environ.get('SPACE_ID')
|
| 21 |
-
if space_id:
|
| 22 |
-
logger.info(f"Running on Hugging Face Spaces: {space_id}")
|
| 23 |
-
else:
|
| 24 |
-
logger.info("Not running on Hugging Face Spaces")
|
| 25 |
-
|
| 26 |
-
# Check Python version
|
| 27 |
-
logger.info(f"Python version: {sys.version}")
|
| 28 |
-
|
| 29 |
-
# Check current user and home directory
|
| 30 |
-
logger.info(f"Current user: {os.getenv('USER', 'unknown')}")
|
| 31 |
-
logger.info(f"Home directory: {os.path.expanduser('~')}")
|
| 32 |
-
logger.info(f"Current working directory: {os.getcwd()}")
|
| 33 |
-
|
| 34 |
-
# Check available disk space
|
| 35 |
-
try:
|
| 36 |
-
result = subprocess.run(['df', '-h', '/tmp'], capture_output=True, text=True)
|
| 37 |
-
logger.info(f"Disk space in /tmp:\n{result.stdout}")
|
| 38 |
-
except Exception as e:
|
| 39 |
-
logger.warning(f"Could not check disk space: {e}")
|
| 40 |
-
|
| 41 |
-
# Check write permissions for important directories
|
| 42 |
-
test_dirs = ['/tmp', os.path.expanduser('~'), os.getcwd()]
|
| 43 |
-
for test_dir in test_dirs:
|
| 44 |
-
try:
|
| 45 |
-
test_file = os.path.join(test_dir, 'test_write.tmp')
|
| 46 |
-
with open(test_file, 'w') as f:
|
| 47 |
-
f.write('test')
|
| 48 |
-
os.remove(test_file)
|
| 49 |
-
logger.info(f"✅ Write permission OK: {test_dir}")
|
| 50 |
-
except Exception as e:
|
| 51 |
-
logger.warning(f"❌ Write permission failed: {test_dir} - {e}")
|
| 52 |
-
|
| 53 |
-
def check_dependencies():
|
| 54 |
-
"""Check if required packages are installed"""
|
| 55 |
-
logger.info("=== Checking dependencies ===")
|
| 56 |
-
|
| 57 |
-
required_packages = [
|
| 58 |
-
'kokoro',
|
| 59 |
-
'soundfile',
|
| 60 |
-
'torch',
|
| 61 |
-
'fastapi',
|
| 62 |
-
'uvicorn'
|
| 63 |
-
]
|
| 64 |
-
|
| 65 |
-
for package in required_packages:
|
| 66 |
-
try:
|
| 67 |
-
__import__(package)
|
| 68 |
-
logger.info(f"✅ {package} is available")
|
| 69 |
-
except ImportError:
|
| 70 |
-
logger.error(f"❌ {package} is not available")
|
| 71 |
-
|
| 72 |
-
def test_kokoro():
|
| 73 |
-
"""Test Kokoro TTS functionality"""
|
| 74 |
-
logger.info("=== Testing Kokoro TTS ===")
|
| 75 |
-
|
| 76 |
-
try:
|
| 77 |
-
# Import after setting up environment
|
| 78 |
-
import app_config # This will setup environment
|
| 79 |
-
from kokoro import KPipeline
|
| 80 |
-
|
| 81 |
-
logger.info("Initializing Kokoro pipeline...")
|
| 82 |
-
pipeline = KPipeline(lang_code='a')
|
| 83 |
-
logger.info("✅ Kokoro pipeline initialized successfully")
|
| 84 |
-
|
| 85 |
-
# Test generation
|
| 86 |
-
logger.info("Testing speech generation...")
|
| 87 |
-
text = "Hello, this is a test."
|
| 88 |
-
generator = pipeline(text, voice='af_heart')
|
| 89 |
-
|
| 90 |
-
for i, (gs, ps, audio) in enumerate(generator):
|
| 91 |
-
logger.info(f"✅ Generated audio segment {i}: gs={gs}, ps={ps}, audio shape: {audio.shape}")
|
| 92 |
-
break
|
| 93 |
-
|
| 94 |
-
logger.info("✅ Kokoro TTS test completed successfully")
|
| 95 |
-
return True
|
| 96 |
-
|
| 97 |
-
except Exception as e:
|
| 98 |
-
logger.error(f"❌ Kokoro TTS test failed: {e}")
|
| 99 |
-
import traceback
|
| 100 |
-
logger.error(f"Full traceback: {traceback.format_exc()}")
|
| 101 |
-
return False
|
| 102 |
-
|
| 103 |
-
def main():
|
| 104 |
-
"""Main startup function"""
|
| 105 |
-
logger.info("🚀 Starting Kokoro TTS API setup...")
|
| 106 |
-
|
| 107 |
-
check_environment()
|
| 108 |
-
check_dependencies()
|
| 109 |
-
|
| 110 |
-
if test_kokoro():
|
| 111 |
-
logger.info("🎉 All checks passed! Starting the API...")
|
| 112 |
-
# Import and start the app
|
| 113 |
-
import uvicorn
|
| 114 |
-
uvicorn.run("app:app", host="0.0.0.0", port=7860, log_level="info")
|
| 115 |
-
else:
|
| 116 |
-
logger.error("❌ Setup failed. Please check the logs above.")
|
| 117 |
-
sys.exit(1)
|
| 118 |
-
|
| 119 |
-
if __name__ == "__main__":
|
| 120 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
test.py
DELETED
|
@@ -1,144 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import torch
|
| 3 |
-
import torchaudio
|
| 4 |
-
import subprocess
|
| 5 |
-
|
| 6 |
-
# Set environment variables for CPU-only usage
|
| 7 |
-
os.environ['COQUI_TOS_AGREED'] = '1'
|
| 8 |
-
os.environ['NUMBA_DISABLE_JIT'] = '1'
|
| 9 |
-
os.environ['FORCE_CPU'] = 'true'
|
| 10 |
-
os.environ['CUDA_VISIBLE_DEVICES'] = ''
|
| 11 |
-
|
| 12 |
-
# Fix PyTorch weights_only issue for XTTS
|
| 13 |
-
import torch.serialization
|
| 14 |
-
from TTS.tts.configs.xtts_config import XttsConfig
|
| 15 |
-
torch.serialization.add_safe_globals([XttsConfig])
|
| 16 |
-
|
| 17 |
-
from TTS.api import TTS
|
| 18 |
-
from TTS.tts.configs.xtts_config import XttsConfig
|
| 19 |
-
from TTS.tts.models.xtts import Xtts
|
| 20 |
-
from TTS.utils.generic_utils import get_user_data_dir
|
| 21 |
-
|
| 22 |
-
print("Testing XTTS C3PO voice cloning...")
|
| 23 |
-
|
| 24 |
-
# C3PO model path
|
| 25 |
-
model_path = "XTTS-v2_C3PO/"
|
| 26 |
-
config_path = "XTTS-v2_C3PO/config.json"
|
| 27 |
-
|
| 28 |
-
# Check if model files exist, if not download them
|
| 29 |
-
if not os.path.exists(config_path):
|
| 30 |
-
print("C3PO model not found locally, downloading...")
|
| 31 |
-
try:
|
| 32 |
-
subprocess.run([
|
| 33 |
-
"git", "clone",
|
| 34 |
-
"https://huggingface.co/Borcherding/XTTS-v2_C3PO",
|
| 35 |
-
"XTTS-v2_C3PO"
|
| 36 |
-
], check=True)
|
| 37 |
-
print("C3PO model downloaded successfully")
|
| 38 |
-
except subprocess.CalledProcessError as e:
|
| 39 |
-
print(f"Failed to download C3PO model: {e}")
|
| 40 |
-
exit(1)
|
| 41 |
-
|
| 42 |
-
# Load configuration
|
| 43 |
-
config = XttsConfig()
|
| 44 |
-
config.load_json(config_path)
|
| 45 |
-
|
| 46 |
-
# Initialize and load model
|
| 47 |
-
model = Xtts.init_from_config(config)
|
| 48 |
-
model.load_checkpoint(
|
| 49 |
-
config,
|
| 50 |
-
checkpoint_path=os.path.join(model_path, "model.pth"),
|
| 51 |
-
vocab_path=os.path.join(model_path, "vocab.json"),
|
| 52 |
-
eval=True,
|
| 53 |
-
)
|
| 54 |
-
|
| 55 |
-
device = "cpu" # Force CPU usage
|
| 56 |
-
print(f"C3PO model loaded on {device} (forced CPU mode)")
|
| 57 |
-
|
| 58 |
-
# Text to convert to speech
|
| 59 |
-
text = "Hello there! I am C-3PO, human-cyborg relations. How may I assist you today?"
|
| 60 |
-
|
| 61 |
-
# Look for reference audio in the C3PO model directory
|
| 62 |
-
reference_audio_path = None
|
| 63 |
-
for file in os.listdir(model_path):
|
| 64 |
-
if file.endswith(('.wav', '.mp3', '.m4a')):
|
| 65 |
-
reference_audio_path = os.path.join(model_path, file)
|
| 66 |
-
print(f"Found C3PO reference audio: {file}")
|
| 67 |
-
break
|
| 68 |
-
|
| 69 |
-
# If no reference audio found, create a simple test reference
|
| 70 |
-
if reference_audio_path is None:
|
| 71 |
-
print("No reference audio found in C3PO model, creating test reference...")
|
| 72 |
-
reference_audio_path = "test_reference.wav"
|
| 73 |
-
|
| 74 |
-
# Generate a simple sine wave as placeholder
|
| 75 |
-
import numpy as np
|
| 76 |
-
sample_rate = 24000
|
| 77 |
-
duration = 3 # seconds
|
| 78 |
-
frequency = 440 # Hz
|
| 79 |
-
t = np.linspace(0, duration, int(sample_rate * duration))
|
| 80 |
-
audio_data = 0.3 * np.sin(2 * np.pi * frequency * t)
|
| 81 |
-
|
| 82 |
-
# Save as WAV
|
| 83 |
-
torchaudio.save(reference_audio_path, torch.tensor(audio_data).unsqueeze(0), sample_rate)
|
| 84 |
-
print(f"Test reference audio created: {reference_audio_path}")
|
| 85 |
-
|
| 86 |
-
try:
|
| 87 |
-
# Generate conditioning latents
|
| 88 |
-
print("Processing reference audio...")
|
| 89 |
-
gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
|
| 90 |
-
audio_path=reference_audio_path,
|
| 91 |
-
gpt_cond_len=30,
|
| 92 |
-
gpt_cond_chunk_len=4,
|
| 93 |
-
max_ref_length=60
|
| 94 |
-
)
|
| 95 |
-
|
| 96 |
-
# Generate speech
|
| 97 |
-
print("Generating C3PO speech...")
|
| 98 |
-
out = model.inference(
|
| 99 |
-
text,
|
| 100 |
-
"en", # language
|
| 101 |
-
gpt_cond_latent,
|
| 102 |
-
speaker_embedding,
|
| 103 |
-
repetition_penalty=5.0,
|
| 104 |
-
temperature=0.75,
|
| 105 |
-
)
|
| 106 |
-
|
| 107 |
-
# Save output
|
| 108 |
-
output_path = "c3po_test_output.wav"
|
| 109 |
-
torchaudio.save(output_path, torch.tensor(out["wav"]).unsqueeze(0), 24000)
|
| 110 |
-
print(f"C3PO speech generated successfully! Saved as: {output_path}")
|
| 111 |
-
|
| 112 |
-
# Test multilingual capabilities
|
| 113 |
-
print("\nTesting multilingual C3PO...")
|
| 114 |
-
multilingual_tests = [
|
| 115 |
-
("es", "Hola, soy C-3PO. Domino más de seis millones de formas de comunicación."),
|
| 116 |
-
("fr", "Bonjour, je suis C-3PO. Je maîtrise plus de six millions de formes de communication."),
|
| 117 |
-
("de", "Hallo, ich bin C-3PO. Ich beherrsche über sechs Millionen Kommunikationsformen."),
|
| 118 |
-
]
|
| 119 |
-
|
| 120 |
-
for lang, test_text in multilingual_tests:
|
| 121 |
-
print(f"Generating {lang.upper()} speech...")
|
| 122 |
-
out = model.inference(
|
| 123 |
-
test_text,
|
| 124 |
-
lang,
|
| 125 |
-
gpt_cond_latent,
|
| 126 |
-
speaker_embedding,
|
| 127 |
-
repetition_penalty=5.0,
|
| 128 |
-
temperature=0.75,
|
| 129 |
-
)
|
| 130 |
-
|
| 131 |
-
output_path = f"c3po_test_{lang}.wav"
|
| 132 |
-
torchaudio.save(output_path, torch.tensor(out["wav"]).unsqueeze(0), 24000)
|
| 133 |
-
print(f"C3PO {lang.upper()} speech saved as: {output_path}")
|
| 134 |
-
|
| 135 |
-
except Exception as e:
|
| 136 |
-
print(f"Error during speech generation: {e}")
|
| 137 |
-
import traceback
|
| 138 |
-
traceback.print_exc()
|
| 139 |
-
|
| 140 |
-
print("XTTS C3PO test completed!")
|
| 141 |
-
print("\nGenerated files:")
|
| 142 |
-
for file in os.listdir("."):
|
| 143 |
-
if file.startswith("c3po_test") and file.endswith(".wav"):
|
| 144 |
-
print(f" - {file}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
test_build.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Simple build test for C-3PO TTS API
|
| 4 |
+
Tests if all dependencies can be imported
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
def test_imports():
|
| 8 |
+
"""Test if all required packages can be imported"""
|
| 9 |
+
print("🔍 Testing imports...")
|
| 10 |
+
|
| 11 |
+
try:
|
| 12 |
+
import fastapi
|
| 13 |
+
print("✅ FastAPI")
|
| 14 |
+
|
| 15 |
+
import uvicorn
|
| 16 |
+
print("✅ Uvicorn")
|
| 17 |
+
|
| 18 |
+
import torch
|
| 19 |
+
print("✅ PyTorch")
|
| 20 |
+
|
| 21 |
+
import torchaudio
|
| 22 |
+
print("✅ TorchAudio")
|
| 23 |
+
|
| 24 |
+
import TTS
|
| 25 |
+
print("✅ Coqui TTS")
|
| 26 |
+
|
| 27 |
+
import huggingface_hub
|
| 28 |
+
print("✅ Hugging Face Hub")
|
| 29 |
+
|
| 30 |
+
import pydantic
|
| 31 |
+
print("✅ Pydantic")
|
| 32 |
+
|
| 33 |
+
return True
|
| 34 |
+
|
| 35 |
+
except ImportError as e:
|
| 36 |
+
print(f"❌ Import failed: {e}")
|
| 37 |
+
return False
|
| 38 |
+
|
| 39 |
+
def test_api_creation():
|
| 40 |
+
"""Test if the API can be created without errors"""
|
| 41 |
+
print("\n🚀 Testing API creation...")
|
| 42 |
+
|
| 43 |
+
try:
|
| 44 |
+
from coqui_api import app
|
| 45 |
+
print("✅ API created successfully")
|
| 46 |
+
return True
|
| 47 |
+
except Exception as e:
|
| 48 |
+
print(f"❌ API creation failed: {e}")
|
| 49 |
+
return False
|
| 50 |
+
|
| 51 |
+
def main():
    """Run the build tests and report the overall result.

    Returns:
        0 when both the import check and the API creation check pass,
        1 otherwise, so the value can be used as a process exit status.
    """
    print("🧪 C-3PO TTS Build Test")
    print("=" * 30)

    import_ok = test_imports()
    api_ok = test_api_creation()

    print("\n" + "=" * 30)

    if import_ok and api_ok:
        print("🎉 All tests passed! Ready to deploy.")
        return 0

    print("❌ Some tests failed. Check dependencies.")
    return 1


if __name__ == "__main__":
    # ``exit`` is injected by the ``site`` module for interactive use and is
    # missing under ``python -S`` or in frozen builds; SystemExit always works.
    raise SystemExit(main())
|
test_coqui_api.py
DELETED
|
@@ -1,146 +0,0 @@
|
|
| 1 |
-
import requests
|
| 2 |
-
import os
|
| 3 |
-
import time
|
| 4 |
-
|
| 5 |
-
# API base URL (update this to your deployed Hugging Face Space URL)
|
| 6 |
-
BASE_URL = "http://localhost:7860" # Change to your HF Space URL when deployed
|
| 7 |
-
|
| 8 |
-
def test_health():
    """Call the ``/health`` endpoint and print whether it answered with 200.

    Any exception (connection failure, timeout, invalid JSON body) is caught
    and printed rather than propagated.
    """
    print("🔍 Testing health endpoint...")
    try:
        # Without a timeout, requests waits forever on an unresponsive server.
        response = requests.get(f"{BASE_URL}/health", timeout=10)
        if response.status_code == 200:
            print("✅ Health check passed!")
            print(f"Response: {response.json()}")
        else:
            print(f"❌ Health check failed: {response.status_code}")
            print(f"Response: {response.text}")
    except Exception as e:
        print(f"❌ Health check error: {e}")
|
| 21 |
-
|
| 22 |
-
def test_list_models():
    """Call the ``/models`` endpoint and print the first five model names.

    Any exception (connection failure, timeout, invalid JSON body) is caught
    and printed rather than propagated.
    """
    print("\n🔍 Testing models endpoint...")
    try:
        # Without a timeout, requests waits forever on an unresponsive server.
        response = requests.get(f"{BASE_URL}/models", timeout=10)
        if response.status_code == 200:
            # A missing "models" key is treated as an empty list.
            model_names = response.json().get('models', [])
            print("✅ Models endpoint working!")
            print(f"Found {len(model_names)} models")
            # Show first 5 models
            for i, model in enumerate(model_names[:5], start=1):
                print(f" {i}. {model}")
        else:
            print(f"❌ Models endpoint failed: {response.status_code}")
    except Exception as e:
        print(f"❌ Models endpoint error: {e}")
|
| 38 |
-
|
| 39 |
-
def test_simple_tts():
    """Request speech from ``/tts`` without a reference voice and save it.

    On a 200 response the body is written to ``simple_tts_output.wav``.
    Any exception is caught and printed rather than propagated.
    """
    print("\n🔍 Testing simple TTS...")
    try:
        data = {
            "text": "Hello world! This is a test of Coqui TTS.",
            "language": "en"
        }

        # Synthesis can be slow, so allow a generous but finite wait instead
        # of blocking forever on a stalled server.
        response = requests.post(f"{BASE_URL}/tts", data=data, timeout=300)

        if response.status_code == 200:
            # Save the audio file
            output_file = "simple_tts_output.wav"
            with open(output_file, "wb") as f:
                f.write(response.content)
            print(f"✅ Simple TTS successful! Audio saved to: {output_file}")
            print(f"File size: {len(response.content)} bytes")
        else:
            print(f"❌ Simple TTS failed: {response.status_code}")
            print(f"Response: {response.text}")
    except Exception as e:
        print(f"❌ Simple TTS error: {e}")
|
| 62 |
-
|
| 63 |
-
def test_voice_cloning(speaker_file_path=None):
    """Request cloned speech from ``/tts`` using an uploaded reference file.

    Args:
        speaker_file_path: Path to the reference audio file. When it is
            missing, empty, or does not exist on disk, the test is skipped.

    On a 200 response the body is written to ``voice_cloned_output.wav``.
    Any exception is caught and printed rather than propagated.
    """
    if not speaker_file_path or not os.path.exists(speaker_file_path):
        print("\n⚠️ Skipping voice cloning test - no speaker file provided")
        print(" To test voice cloning, provide a .wav file path")
        return

    print(f"\n🔍 Testing voice cloning with: {speaker_file_path}")
    try:
        data = {
            "text": "This is voice cloning using Coqui TTS. The voice should match the reference audio.",
            "language": "en"
        }

        with open(speaker_file_path, "rb") as f:
            files = {"speaker_file": f}
            # Synthesis can be slow, so allow a generous but finite wait
            # instead of blocking forever on a stalled server.
            response = requests.post(
                f"{BASE_URL}/tts", data=data, files=files, timeout=300
            )

        if response.status_code == 200:
            # Save the cloned audio
            output_file = "voice_cloned_output.wav"
            with open(output_file, "wb") as f:
                f.write(response.content)
            print(f"✅ Voice cloning successful! Audio saved to: {output_file}")
            print(f"File size: {len(response.content)} bytes")
        else:
            print(f"❌ Voice cloning failed: {response.status_code}")
            print(f"Response: {response.text}")
    except Exception as e:
        print(f"❌ Voice cloning error: {e}")
|
| 93 |
-
|
| 94 |
-
def test_json_tts():
    """Request speech from ``/tts-json`` with a JSON body and save it.

    On a 200 response the body is written to ``json_tts_output.wav``.
    Any exception is caught and printed rather than propagated.
    """
    print("\n🔍 Testing JSON TTS endpoint...")
    try:
        data = {
            "text": "This is a JSON request test for Coqui TTS API.",
            "language": "en"
        }

        # ``json=`` serializes the payload and sets the Content-Type header,
        # replacing the manual json.dumps + header pair. The timeout keeps a
        # stalled server from blocking the script forever.
        response = requests.post(
            f"{BASE_URL}/tts-json",
            json=data,
            timeout=300,
        )

        if response.status_code == 200:
            output_file = "json_tts_output.wav"
            with open(output_file, "wb") as f:
                f.write(response.content)
            print(f"✅ JSON TTS successful! Audio saved to: {output_file}")
            print(f"File size: {len(response.content)} bytes")
        else:
            print(f"❌ JSON TTS failed: {response.status_code}")
            print(f"Response: {response.text}")
    except Exception as e:
        print(f"❌ JSON TTS error: {e}")
|
| 122 |
-
|
| 123 |
-
def main():
    """Run every endpoint check in order, then print voice-cloning hints."""
    print("🐸 Testing Coqui TTS API")
    print("=" * 50)

    # Endpoint checks that need no extra input.
    for endpoint_check in (
        test_health,
        test_list_models,
        test_simple_tts,
        test_json_tts,
    ):
        endpoint_check()

    # Voice cloning only runs when a reference recording is configured here.
    speaker_file = None  # Change to your speaker file path
    test_voice_cloning(speaker_file)

    print("\n🎉 API testing completed!")
    for hint in (
        "\nTo test voice cloning:",
        "1. Record a short audio sample (5-10 seconds)",
        "2. Save it as a .wav file",
        "3. Update speaker_file variable with the file path",
        "4. Run the test again",
    ):
        print(hint)


if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
test_coqui_tts.py
DELETED
|
@@ -1,99 +0,0 @@
|
|
| 1 |
-
import torch
|
| 2 |
-
from TTS.api import TTS
|
| 3 |
-
import os
|
| 4 |
-
|
| 5 |
-
def test_coqui_tts():
    """Smoke-test the Coqui TTS library end to end.

    Lists the available models, loads the XTTS v2 model on the selected
    device, prints its preset speakers, and synthesizes a short sentence to
    ``test_output.wav``. Returns early (None) if listing the models or
    loading the model raises.
    """
    # Prefer the GPU when torch reports one.
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda"
    print(f"Using device: {device}")

    # Step 1: enumerate the model catalogue (only the first 10 are printed).
    try:
        print("\n=== Available TTS Models ===")
        catalogue = TTS().list_models()

        print("First 10 available models:")
        for position, model_name in enumerate(catalogue[:10], start=1):
            print(f"{position}. {model_name}")

        hidden_models = len(catalogue) - 10
        if hidden_models > 0:
            print(f"... and {hidden_models} more models")
    except Exception as err:
        print(f"Error listing models: {err}")
        return

    # Step 2: load XTTS v2 and show its preset speakers.
    try:
        print("\n=== Initializing XTTS v2 Model ===")
        tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
        print("XTTS v2 model loaded successfully!")

        print("\n=== Available Speakers ===")
        if not (hasattr(tts, 'speakers') and tts.speakers):
            print("No preset speakers available or speakers list is empty")
        else:
            print("Available speakers:")
            for preset in tts.speakers[:10]:  # Show first 10
                print(f"- {preset}")
            hidden_speakers = len(tts.speakers) - 10
            if hidden_speakers > 0:
                print(f"... and {hidden_speakers} more speakers")
    except Exception as err:
        print(f"Error initializing XTTS v2 model: {err}")
        print("This might be due to model download requirements or missing dependencies")
        return

    # Step 3: synthesize a sentence to disk, with a preset speaker if any.
    try:
        print("\n=== Testing TTS to File ===")
        output_file = "test_output.wav"
        synth_args = {
            "text": "Hello world! This is a test of Coqui TTS library.",
            "language": "en",
            "file_path": output_file,
        }

        if hasattr(tts, 'speakers') and tts.speakers:
            # Use first available speaker
            speaker_name = tts.speakers[0]
            print(f"Using speaker: {speaker_name}")
            synth_args["speaker"] = speaker_name
        else:
            print("No speakers available, trying without speaker specification...")

        tts.tts_to_file(**synth_args)

        if not os.path.exists(output_file):
            print("❌ TTS failed - output file not created")
        else:
            print(f"✅ TTS successful! Audio saved to: {output_file}")
            file_size = os.path.getsize(output_file)
            print(f"File size: {file_size} bytes")
    except Exception as err:
        print(f"Error during TTS generation: {err}")

    # Closing note about voice cloning (informational only).
    for note in (
        "\n=== Voice Cloning Information ===",
        "To test voice cloning, you would need:",
        "1. A reference audio file (speaker_wav parameter)",
        "2. Use tts.tts() method instead of tts_to_file()",
        "Example:",
        'wav = tts.tts(text="Hello!", speaker_wav="reference.wav", language="en")',
    ):
        print(note)


if __name__ == "__main__":
    print("🐸 Testing Coqui TTS Library")
    print("=" * 50)
    test_coqui_tts()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
test_kokoro_install.py
DELETED
|
@@ -1,86 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
"""
|
| 3 |
-
Simple test script to verify Kokoro TTS installation and functionality.
|
| 4 |
-
"""
|
| 5 |
-
|
| 6 |
-
import os
|
| 7 |
-
|
| 8 |
-
# Set basic environment variables
|
| 9 |
-
os.environ['NUMBA_DISABLE_JIT'] = '1'
|
| 10 |
-
|
| 11 |
-
def test_kokoro_import():
    """Report whether ``kokoro``, ``soundfile`` and ``torch`` are importable.

    Returns:
        True when all three imports succeed, False on the first ImportError
        (which is printed).
    """
    try:
        # Imported purely to prove availability; the names are not used.
        from kokoro import KPipeline  # noqa: F401
        import soundfile as sf  # noqa: F401
        import torch  # noqa: F401
    except ImportError as err:
        print(f"❌ Import error: {err}")
        return False

    print("✅ All required packages imported successfully!")
    return True
|
| 22 |
-
|
| 23 |
-
def test_kokoro_pipeline():
    """Report whether a ``KPipeline`` can be constructed with ``lang_code='a'``.

    Returns:
        True when construction succeeds, False when the import or the
        constructor raises (the error is printed).
    """
    try:
        from kokoro import KPipeline

        # Only construction is being checked; the instance is discarded.
        KPipeline(lang_code='a')
    except Exception as err:
        print(f"❌ Pipeline initialization error: {err}")
        return False

    print("✅ Kokoro pipeline initialized successfully!")
    return True
|
| 33 |
-
|
| 34 |
-
def test_kokoro_generation():
    """Generate one speech segment with Kokoro and save it to ``test_kokoro.wav``.

    Returns:
        True if a first audio segment was produced and written to disk,
        False if any step raised or the pipeline yielded no segments at all
        (previously an empty result was reported as a success).
    """
    try:
        from kokoro import KPipeline
        import soundfile as sf

        pipeline = KPipeline(lang_code='a')
        text = "Hello, this is a test of Kokoro TTS."

        generator = pipeline(text, voice='af_heart')

        # Only the first segment is needed to prove that generation works.
        first_segment = next(iter(generator), None)
        if first_segment is None:
            print("❌ Speech generation error: pipeline produced no audio segments")
            return False

        gs, ps, audio = first_segment
        print(f"✅ Generated audio segment 0: gs={gs}, ps={ps}")
        # Save test audio
        # NOTE(review): 24000 is presumably Kokoro's output sample rate — confirm.
        sf.write('test_kokoro.wav', audio, 24000)
        print("✅ Test audio saved as 'test_kokoro.wav'")
        return True
    except Exception as e:
        print(f"❌ Speech generation error: {e}")
        return False
|
| 56 |
-
|
| 57 |
-
def main():
    """Run all Kokoro installation checks and summarize the results.

    Returns:
        0 when every check passes, 1 otherwise, so the script's exit status
        reflects the outcome instead of always being 0.
    """
    print("🎤 Testing Kokoro TTS Installation")
    print("=" * 40)

    tests = [
        ("Import Test", test_kokoro_import),
        ("Pipeline Test", test_kokoro_pipeline),
        ("Generation Test", test_kokoro_generation),
    ]

    passed = 0
    total = len(tests)

    for test_name, test_func in tests:
        print(f"\n🔍 Running {test_name}...")
        if test_func():
            passed += 1
        else:
            print(f"❌ {test_name} failed!")

    print(f"\n📊 Results: {passed}/{total} tests passed")

    if passed == total:
        print("🎉 All tests passed! Kokoro TTS is ready to use.")
        return 0

    print("⚠️ Some tests failed. Please check the installation.")
    return 1


if __name__ == "__main__":
    # Propagate the result as the process exit status.
    raise SystemExit(main())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|