Spaces:
Sleeping
Sleeping
Divax
committed on
Commit
·
71905d8
1
Parent(s):
6a83fff
test
Browse files- Dockerfile.coqui +51 -0
- README_coqui.md +351 -0
- coqui_api.py +372 -0
- requirements.txt +13 -11
- requirements_coqui.txt +12 -0
- start_c3po_api.py +176 -0
- test_c3po_model.py +214 -0
- test_coqui_api.py +146 -0
- test_coqui_tts.py +99 -0
Dockerfile.coqui
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Image for running the Coqui TTS C-3PO API (coqui_api.py) on Hugging Face Spaces.
FROM python:3.11

# Set up a new user named "user" with user ID 1000
RUN useradd -m -u 1000 user

# Install system dependencies as root
RUN apt-get update && apt-get install -y \
    git \
    git-lfs \
    espeak-ng \
    ffmpeg \
    libsndfile1 \
    && rm -rf /var/lib/apt/lists/*

# Initialize git lfs.
# --system writes the LFS filters to the system-wide git config; without it
# they would be written to root's own global config, which the unprivileged
# "user" account selected below never reads.
RUN git lfs install --system

# Switch to the "user" user
USER user

# Set home to the user's home directory
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH \
    COQUI_TOS_AGREED=1 \
    HF_HUB_DISABLE_TELEMETRY=1 \
    HF_HOME=/home/user/.cache/huggingface

# Set the working directory to the user's home directory
WORKDIR $HOME/app

# Upgrade pip
RUN pip install --no-cache-dir --upgrade pip

# Install PyTorch with CPU support for Hugging Face Spaces
RUN pip install --no-cache-dir torch torchaudio --index-url https://download.pytorch.org/whl/cpu

# Copy requirements and install dependencies
COPY --chown=user requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the API file
COPY --chown=user coqui_api.py .

# Create necessary directories
RUN mkdir -p $HOME/.cache $HOME/app/models

# Expose the port
EXPOSE 7860

# Start the Coqui TTS API
CMD ["uvicorn", "coqui_api:app", "--host", "0.0.0.0", "--port", "7860"]
|
README_coqui.md
ADDED
|
@@ -0,0 +1,351 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 🤖 Coqui TTS C-3PO API for Hugging Face Spaces
|
| 2 |
+
|
| 3 |
+
A FastAPI-based text-to-speech service using the Coqui TTS library with the **C-3PO fine-tuned XTTS v2 model** from [Borcherding/XTTS-v2_C3PO](https://huggingface.co/Borcherding/XTTS-v2_C3PO) for authentic C-3PO voice synthesis.
|
| 4 |
+
|
| 5 |
+
## ✨ Features
|
| 6 |
+
|
| 7 |
+
- 🤖 **C-3PO Voice**: Authentic C-3PO voice using fine-tuned XTTS v2 model
|
| 8 |
+
- 🎯 **Text-to-Speech**: Convert text to natural-sounding speech
|
| 9 |
+
- 🎭 **Voice Cloning**: Clone any voice from a reference audio sample
|
| 10 |
+
- 🌍 **Multilingual**: Support for 17 languages with C-3PO voice characteristics
|
| 11 |
+
- 🚀 **FastAPI**: Modern, fast API with automatic documentation
|
| 12 |
+
- 🐳 **Docker Ready**: Containerized for easy deployment
|
| 13 |
+
- ☁️ **Hugging Face Spaces**: Optimized for HF Spaces deployment
|
| 14 |
+
|
| 15 |
+
## 🎭 C-3PO Model Information
|
| 16 |
+
|
| 17 |
+
This API uses the fine-tuned C-3PO voice model from [Borcherding/XTTS-v2_C3PO](https://huggingface.co/Borcherding/XTTS-v2_C3PO), which features:
|
| 18 |
+
|
| 19 |
+
- **Fine-tuned on 20 unique C-3PO voice lines** from Star Wars
|
| 20 |
+
- **Multi-lingual support** (17 languages) while maintaining C-3PO's distinctive voice
|
| 21 |
+
- **Emotion & Style Transfer** capturing C-3PO's formal, protocol droid characteristics
|
| 22 |
+
- **High-Quality Audio** output at 24kHz sampling rate
|
| 23 |
+
|
| 24 |
+
## 📡 API Endpoints
|
| 25 |
+
|
| 26 |
+
### 1. Health Check
|
| 27 |
+
```bash
|
| 28 |
+
GET /health
|
| 29 |
+
```
|
| 30 |
+
Returns API status, model information, and C-3PO voice availability.
|
| 31 |
+
|
| 32 |
+
### 2. List Models
|
| 33 |
+
```bash
|
| 34 |
+
GET /models
|
| 35 |
+
```
|
| 36 |
+
Returns available TTS models.
|
| 37 |
+
|
| 38 |
+
### 3. C-3PO Text-to-Speech (Dedicated)
|
| 39 |
+
```bash
|
| 40 |
+
POST /tts-c3po
|
| 41 |
+
```
|
| 42 |
+
**Parameters:**
|
| 43 |
+
- `text` (string): Text to convert to C-3PO voice (2-500 characters)
|
| 44 |
+
- `language` (string): Language code (default: "en")
|
| 45 |
+
|
| 46 |
+
**Example using curl:**
|
| 47 |
+
```bash
|
| 48 |
+
curl -X POST "http://localhost:7860/tts-c3po" \
|
| 49 |
+
-F "text=I am C-3PO, human-cyborg relations." \
|
| 50 |
+
-F "language=en" \
|
| 51 |
+
--output c3po_voice.wav
|
| 52 |
+
```
|
| 53 |
+
|
| 54 |
+
### 4. General Text-to-Speech
|
| 55 |
+
```bash
|
| 56 |
+
POST /tts
|
| 57 |
+
```
|
| 58 |
+
**Parameters:**
|
| 59 |
+
- `text` (string): Text to convert to speech (2-500 characters)
|
| 60 |
+
- `language` (string): Language code (default: "en")
|
| 61 |
+
- `speaker_file` (file, optional): Reference audio for voice cloning
|
| 62 |
+
- `use_c3po_voice` (boolean): Use C-3PO voice if no speaker file provided (default: true)
|
| 63 |
+
|
| 64 |
+
**Example using curl:**
|
| 65 |
+
```bash
|
| 66 |
+
# C-3PO voice (default)
|
| 67 |
+
curl -X POST "http://localhost:7860/tts" \
|
| 68 |
+
-F "text=The odds of successfully navigating an asteroid field are approximately 3,720 to 1." \
|
| 69 |
+
-F "language=en" \
|
| 70 |
+
--output c3po_output.wav
|
| 71 |
+
|
| 72 |
+
# Custom voice cloning
|
| 73 |
+
curl -X POST "http://localhost:7860/tts" \
|
| 74 |
+
-F "text=This will sound like the reference voice." \
|
| 75 |
+
-F "language=en" \
|
| 76 |
+
-F "speaker_file=@reference_voice.wav" \
|
| 77 |
+
-F "use_c3po_voice=false" \
|
| 78 |
+
--output cloned_voice.wav
|
| 79 |
+
```
|
| 80 |
+
|
| 81 |
+
### 5. JSON TTS (C-3PO Voice)
|
| 82 |
+
```bash
|
| 83 |
+
POST /tts-json
|
| 84 |
+
```
|
| 85 |
+
**JSON Body:**
|
| 86 |
+
```json
|
| 87 |
+
{
|
| 88 |
+
"text": "R2-D2, you know better than to trust a strange computer!",
|
| 89 |
+
"language": "en"
|
| 90 |
+
}
|
| 91 |
+
```
|
| 92 |
+
|
| 93 |
+
## 🚀 Deployment on Hugging Face Spaces
|
| 94 |
+
|
| 95 |
+
### Step 1: Create a new Space
|
| 96 |
+
1. Go to [Hugging Face Spaces](https://huggingface.co/spaces)
|
| 97 |
+
2. Click "Create new Space"
|
| 98 |
+
3. Choose "Docker" as the SDK
|
| 99 |
+
4. Set your space name and visibility
|
| 100 |
+
|
| 101 |
+
### Step 2: Add files to your Space
|
| 102 |
+
Upload these files to your Hugging Face Space repository:
|
| 103 |
+
|
| 104 |
+
```
|
| 105 |
+
your-space/
|
| 106 |
+
├── coqui_api.py # Main API file with C-3PO integration
|
| 107 |
+
├── requirements.txt # Dependencies (includes huggingface_hub)
|
| 108 |
+
├── Dockerfile.coqui # Docker configuration
|
| 109 |
+
├── test_c3po_model.py # Test script for C-3PO functionality
|
| 110 |
+
└── README.md # This file
|
| 111 |
+
```
|
| 112 |
+
|
| 113 |
+
### Step 3: Configure your Space
|
| 114 |
+
Rename the files in your Space:
|
| 115 |
+
- `Dockerfile.coqui` → `Dockerfile`
|
| 116 |
+
|
| 117 |
+
### Step 4: Deploy
|
| 118 |
+
Your Space will automatically build and deploy. The build may take 15-20 minutes. The C-3PO fine-tuned model is not part of the image: it is downloaded from Hugging Face when the API starts, so the first start-up takes additional time.
|
| 119 |
+
|
| 120 |
+
## 💻 Local Development
|
| 121 |
+
|
| 122 |
+
### Requirements
|
| 123 |
+
- Python 3.11+
|
| 124 |
+
- PyTorch
|
| 125 |
+
- Coqui TTS library
|
| 126 |
+
- Hugging Face Hub
|
| 127 |
+
|
| 128 |
+
### Installation
|
| 129 |
+
```bash
|
| 130 |
+
# Clone the repository
|
| 131 |
+
git clone <your-repo>
|
| 132 |
+
cd <your-repo>
|
| 133 |
+
|
| 134 |
+
# Install dependencies
|
| 135 |
+
pip install -r requirements.txt
|
| 136 |
+
|
| 137 |
+
# Run the API
|
| 138 |
+
python coqui_api.py
|
| 139 |
+
```
|
| 140 |
+
|
| 141 |
+
The API will be available at `http://localhost:7860`
|
| 142 |
+
|
| 143 |
+
### Testing
|
| 144 |
+
```bash
|
| 145 |
+
# Run the C-3PO model test suite
|
| 146 |
+
python test_c3po_model.py
|
| 147 |
+
|
| 148 |
+
# Run the general test client
|
| 149 |
+
python test_coqui_api.py
|
| 150 |
+
```
|
| 151 |
+
|
| 152 |
+
## 🎪 Usage Examples
|
| 153 |
+
|
| 154 |
+
### Python Client - C-3PO Voice
|
| 155 |
+
```python
|
| 156 |
+
import requests
|
| 157 |
+
|
| 158 |
+
# C-3PO voice synthesis
|
| 159 |
+
data = {"text": "I am C-3PO, human-cyborg relations.", "language": "en"}
|
| 160 |
+
response = requests.post("http://localhost:7860/tts-c3po", data=data)
|
| 161 |
+
|
| 162 |
+
with open("c3po_output.wav", "wb") as f:
|
| 163 |
+
f.write(response.content)
|
| 164 |
+
|
| 165 |
+
# JSON API
|
| 166 |
+
import json
|
| 167 |
+
headers = {'Content-Type': 'application/json'}
|
| 168 |
+
data = {"text": "The odds are approximately 3,720 to 1!", "language": "en"}
|
| 169 |
+
response = requests.post("http://localhost:7860/tts-json", json=data, headers=headers)
|
| 170 |
+
|
| 171 |
+
with open("c3po_json.wav", "wb") as f:
|
| 172 |
+
f.write(response.content)
|
| 173 |
+
```
|
| 174 |
+
|
| 175 |
+
### JavaScript/Web - C-3PO Voice
|
| 176 |
+
```javascript
|
| 177 |
+
// C-3PO voice synthesis
|
| 178 |
+
const formData = new FormData();
|
| 179 |
+
formData.append('text', 'Oh my! How interesting!');
|
| 180 |
+
formData.append('language', 'en');
|
| 181 |
+
|
| 182 |
+
fetch('http://localhost:7860/tts-c3po', {
|
| 183 |
+
method: 'POST',
|
| 184 |
+
body: formData
|
| 185 |
+
})
|
| 186 |
+
.then(response => response.blob())
|
| 187 |
+
.then(blob => {
|
| 188 |
+
const url = URL.createObjectURL(blob);
|
| 189 |
+
const audio = new Audio(url);
|
| 190 |
+
audio.play();
|
| 191 |
+
});
|
| 192 |
+
|
| 193 |
+
// JSON API
|
| 194 |
+
fetch('http://localhost:7860/tts-json', {
|
| 195 |
+
method: 'POST',
|
| 196 |
+
headers: {'Content-Type': 'application/json'},
|
| 197 |
+
body: JSON.stringify({
|
| 198 |
+
text: 'R2-D2, you know better than to trust a strange computer!',
|
| 199 |
+
language: 'en'
|
| 200 |
+
})
|
| 201 |
+
})
|
| 202 |
+
.then(response => response.blob())
|
| 203 |
+
.then(blob => {
|
| 204 |
+
const url = URL.createObjectURL(blob);
|
| 205 |
+
const audio = new Audio(url);
|
| 206 |
+
audio.play();
|
| 207 |
+
});
|
| 208 |
+
```
|
| 209 |
+
|
| 210 |
+
## 🎨 C-3PO Voice Examples
|
| 211 |
+
|
| 212 |
+
Perfect texts for demonstrating C-3PO's voice characteristics:
|
| 213 |
+
|
| 214 |
+
```bash
|
| 215 |
+
# Classic C-3PO phrases
|
| 216 |
+
curl -X POST "http://localhost:7860/tts-c3po" \
|
| 217 |
+
-F "text=I am C-3PO, human-cyborg relations." \
|
| 218 |
+
-F "language=en" --output c3po_intro.wav
|
| 219 |
+
|
| 220 |
+
curl -X POST "http://localhost:7860/tts-c3po" \
|
| 221 |
+
-F "text=The odds of successfully navigating an asteroid field are approximately 3,720 to 1." \
|
| 222 |
+
-F "language=en" --output c3po_odds.wav
|
| 223 |
+
|
| 224 |
+
curl -X POST "http://localhost:7860/tts-c3po" \
|
| 225 |
+
-F "text=R2-D2, you know better than to trust a strange computer!" \
|
| 226 |
+
-F "language=en" --output c3po_r2d2.wav
|
| 227 |
+
|
| 228 |
+
curl -X POST "http://localhost:7860/tts-c3po" \
|
| 229 |
+
-F "text=Oh my! How interesting!" \
|
| 230 |
+
-F "language=en" --output c3po_oh_my.wav
|
| 231 |
+
```
|
| 232 |
+
|
| 233 |
+
## 🌍 Multilingual C-3PO Support
|
| 234 |
+
|
| 235 |
+
The C-3PO model maintains its distinctive voice characteristics across multiple languages:
|
| 236 |
+
|
| 237 |
+
```python
|
| 238 |
+
# Multilingual examples
|
| 239 |
+
languages = [
|
| 240 |
+
("Hello, I am C-3PO", "en"),
|
| 241 |
+
("Hola, soy C-3PO", "es"),
|
| 242 |
+
("Bonjour, je suis C-3PO", "fr"),
|
| 243 |
+
("Guten Tag, ich bin C-3PO", "de"),
|
| 244 |
+
("Ciao, sono C-3PO", "it"),
|
| 245 |
+
("Olá, eu sou C-3PO", "pt")
|
| 246 |
+
]
|
| 247 |
+
|
| 248 |
+
for text, lang in languages:
|
| 249 |
+
response = requests.post("http://localhost:7860/tts-c3po",
|
| 250 |
+
data={"text": text, "language": lang})
|
| 251 |
+
with open(f"c3po_{lang}.wav", "wb") as f:
|
| 252 |
+
f.write(response.content)
|
| 253 |
+
```
|
| 254 |
+
|
| 255 |
+
## 🔧 Voice Cloning Guide
|
| 256 |
+
|
| 257 |
+
1. **Prepare Reference Audio:**
|
| 258 |
+
- Duration: 5-10 seconds (optimal)
|
| 259 |
+
- Format: WAV, MP3, or M4A
|
| 260 |
+
- Quality: Clear speech, minimal background noise
|
| 261 |
+
- Content: Natural speaking, preferably in target language
|
| 262 |
+
|
| 263 |
+
2. **API Request:**
|
| 264 |
+
```bash
|
| 265 |
+
curl -X POST "https://your-space.hf.space/tts" \
|
| 266 |
+
-F "text=Your text to synthesize" \
|
| 267 |
+
-F "language=en" \
|
| 268 |
+
-F "speaker_file=@your_reference.wav" \
|
| 269 |
+
--output result.wav
|
| 270 |
+
```
|
| 271 |
+
|
| 272 |
+
3. **Tips for Best Results:**
|
| 273 |
+
- Use high-quality reference audio
|
| 274 |
+
- Match the language of reference and target text
|
| 275 |
+
- Keep text length reasonable (under 500 characters)
|
| 276 |
+
- Experiment with different reference samples
|
| 277 |
+
|
| 278 |
+
## Supported Languages
|
| 279 |
+
|
| 280 |
+
The XTTS v2 model supports multiple languages including:
|
| 281 |
+
- English (en)
|
| 282 |
+
- Spanish (es)
|
| 283 |
+
- French (fr)
|
| 284 |
+
- German (de)
|
| 285 |
+
- Italian (it)
|
| 286 |
+
- Portuguese (pt)
|
| 287 |
+
- Polish (pl)
|
| 288 |
+
- Turkish (tr)
|
| 289 |
+
- Russian (ru)
|
| 290 |
+
- Dutch (nl)
|
| 291 |
+
- Czech (cs)
|
| 292 |
+
- Arabic (ar)
|
| 293 |
+
- Chinese (zh-cn)
|
| 294 |
+
- Japanese (ja)
|
| 295 |
+
- Hungarian (hu)
|
| 296 |
+
- Korean (ko)
|
| 297 |
+
|
| 298 |
+
## Troubleshooting
|
| 299 |
+
|
| 300 |
+
### Common Issues
|
| 301 |
+
|
| 302 |
+
1. **Model Download Errors:**
|
| 303 |
+
- The first run downloads ~1.7GB model files
|
| 304 |
+
- Ensure stable internet connection
|
| 305 |
+
- Check Hugging Face Spaces logs
|
| 306 |
+
|
| 307 |
+
2. **Audio Quality Issues:**
|
| 308 |
+
- Use high-quality reference audio for voice cloning
|
| 309 |
+
- Ensure reference audio matches target language
|
| 310 |
+
- Try different reference samples
|
| 311 |
+
|
| 312 |
+
3. **Memory Issues on HF Spaces:**
|
| 313 |
+
- The model requires significant memory
|
| 314 |
+
- Consider upgrading to a higher-tier Space if needed
|
| 315 |
+
|
| 316 |
+
4. **API Timeouts:**
|
| 317 |
+
- Initial model loading takes time
|
| 318 |
+
- Subsequent requests are faster
|
| 319 |
+
- Consider warming up the model with a test request
|
| 320 |
+
|
| 321 |
+
### Environment Variables
|
| 322 |
+
|
| 323 |
+
- `COQUI_TOS_AGREED=1`: Accepts Coqui TTS terms of service
|
| 324 |
+
- `HF_HUB_DISABLE_TELEMETRY=1`: Disables telemetry
|
| 325 |
+
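- `HF_HOME`: Hugging Face cache directory (set to `/home/user/.cache/huggingface` in `Dockerfile.coqui`)
- `TORCH_HOME`: PyTorch cache directory (optional; not set by `Dockerfile.coqui`)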
|
| 326 |
+
|
| 327 |
+
## API Documentation
|
| 328 |
+
|
| 329 |
+
Once deployed, visit your Space URL and add `/docs` to access the interactive API documentation:
|
| 330 |
+
```
|
| 331 |
+
https://your-username-your-space-name.hf.space/docs
|
| 332 |
+
```
|
| 333 |
+
|
| 334 |
+
## Contributing
|
| 335 |
+
|
| 336 |
+
1. Fork the repository
|
| 337 |
+
2. Create a feature branch
|
| 338 |
+
3. Make your changes
|
| 339 |
+
4. Test thoroughly
|
| 340 |
+
5. Submit a pull request
|
| 341 |
+
|
| 342 |
+
## License
|
| 343 |
+
|
| 344 |
+
This project uses the Coqui TTS library. Please check [Coqui TTS license](https://github.com/coqui-ai/TTS) for usage terms.
|
| 345 |
+
|
| 346 |
+
## Credits
|
| 347 |
+
|
| 348 |
+
- [Coqui TTS](https://github.com/coqui-ai/TTS) - The underlying TTS engine
|
| 349 |
+
- [XTTS v2](https://arxiv.org/abs/2309.11321) - The voice cloning model
|
| 350 |
+
- [FastAPI](https://fastapi.tiangolo.com/) - Web framework
|
| 351 |
+
- [Hugging Face Spaces](https://huggingface.co/spaces) - Deployment platform
|
coqui_api.py
ADDED
|
@@ -0,0 +1,372 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import torch
|
| 3 |
+
import tempfile
|
| 4 |
+
import uuid
|
| 5 |
+
import logging
|
| 6 |
+
from typing import Optional
|
| 7 |
+
from huggingface_hub import snapshot_download
|
| 8 |
+
|
| 9 |
+
from fastapi import FastAPI, HTTPException, UploadFile, File, Form
|
| 10 |
+
from fastapi.responses import FileResponse
|
| 11 |
+
from pydantic import BaseModel
|
| 12 |
+
from TTS.api import TTS
|
| 13 |
+
|
| 14 |
+
# Set environment variables for Coqui TTS
|
| 15 |
+
# Accept the Coqui terms of service through the environment so the TTS
# library does not need interactive confirmation.
os.environ["COQUI_TOS_AGREED"] = "1"

# Module-wide logging at INFO level; every component below logs through `logger`.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# The FastAPI application object that the route decorators below attach to.
app = FastAPI(
    title="Coqui TTS C-3PO API",
    description="Text-to-Speech API using Coqui TTS with C-3PO fine-tuned voice model",
    version="1.0.0",
)
|
| 26 |
+
|
| 27 |
+
class TTSRequest(BaseModel):
    """JSON request body: the text to speak and the language to speak it in."""

    # Text to synthesize.
    text: str
    # Language code passed through to the TTS model; English by default.
    language: str = "en"
|
| 30 |
+
|
| 31 |
+
class CoquiTTSService:
    """Load an XTTS v2 model and synthesize speech into temporary WAV files.

    On construction the service tries to download and load the C-3PO
    fine-tuned checkpoint from Hugging Face. If any step of that fails, it
    falls back to the stock XTTS v2 model.

    Attributes set by ``__init__``:
        device: ``"cuda"`` when torch reports a GPU, otherwise ``"cpu"``.
        tts: The loaded ``TTS`` instance.
        model_path: Local directory of the downloaded C-3PO snapshot, or
            ``None`` when the fallback model was loaded after an error.
        default_speaker: First preset speaker reported by the model, or
            ``None`` when the model exposes none.
    """

    # File names probed (in this order) inside the model directory when
    # looking for a C-3PO reference recording.
    _REFERENCE_AUDIO_NAMES = (
        "reference.wav", "speaker.wav", "c3po.wav",
        "sample.wav", "reference_audio.wav",
    )

    def __init__(self):
        """Download and load the C-3PO model, falling back to stock XTTS v2.

        Raises:
            Exception: Whatever the fallback model load raised, when both the
                C-3PO model and the fallback model fail to load.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Using device: {self.device}")

        # Download and initialize the C-3PO fine-tuned model
        try:
            logger.info("Downloading C-3PO fine-tuned XTTS model from Hugging Face...")

            # Download the model files from Hugging Face
            model_path = snapshot_download(
                repo_id="Borcherding/XTTS-v2_C3PO",
                local_dir="./models/XTTS-v2_C3PO",
                local_dir_use_symlinks=False
            )

            logger.info(f"Model downloaded to: {model_path}")

            # Initialize TTS with the downloaded C-3PO model
            config_path = os.path.join(model_path, "config.json")

            if os.path.exists(config_path):
                logger.info("Loading C-3PO fine-tuned model...")
                self.tts = TTS(
                    model_path=model_path,
                    config_path=config_path,
                    progress_bar=False,
                    gpu=torch.cuda.is_available()
                ).to(self.device)
                logger.info("C-3PO fine-tuned model loaded successfully!")
            else:
                # Without a config the checkpoint cannot be loaded, so use the
                # stock model. The snapshot directory is still remembered
                # below because it may contain a reference recording.
                logger.info("Config not found in downloaded model, loading stock XTTS v2 model instead...")
                self.tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(self.device)
                logger.info("Fallback XTTS v2 model loaded!")

            # Store model path for reference audio
            self.model_path = model_path

            # Check for speakers
            speakers = getattr(self.tts, "speakers", None)
            if speakers:
                logger.info(f"Available speakers: {len(speakers)}")
                self.default_speaker = speakers[0]
            else:
                logger.info("No preset speakers available - voice cloning mode")
                self.default_speaker = None

        except Exception as e:
            logger.error(f"Failed to load C-3PO model: {e}")
            logger.info("Falling back to standard XTTS v2 model...")
            try:
                self.tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(self.device)
                self.model_path = None
                self.default_speaker = None
                logger.info("Fallback XTTS v2 model loaded!")
            except Exception as fallback_error:
                logger.error(f"Fallback model also failed: {fallback_error}")
                # Bare raise keeps the original traceback intact.
                raise

    def get_c3po_reference_audio(self):
        """Return the path of a C-3PO reference recording, or ``None``.

        The model directory is searched for a fixed list of candidate file
        names; the first one that exists wins. ``None`` is returned when no
        model directory is known or none of the candidates exists.
        """
        if not self.model_path:
            return None

        # Look for reference audio files in the model directory
        for ref_file in self._REFERENCE_AUDIO_NAMES:
            ref_path = os.path.join(self.model_path, ref_file)
            if os.path.exists(ref_path):
                logger.info(f"Found C-3PO reference audio: {ref_path}")
                return ref_path

        return None

    def generate_speech(self, text: str, speaker_wav_path: Optional[str] = None,
                        language: str = "en", use_c3po_voice: bool = True) -> str:
        """Synthesize ``text`` and return the path of the resulting WAV file.

        Speaker selection, in priority order:
          1. ``speaker_wav_path`` if given (voice cloning from that recording);
          2. the C-3PO reference recording, if ``use_c3po_voice`` is true and
             one is found in the model directory;
          3. the model's first preset speaker, if any;
          4. no speaker at all (left to the model to accept or reject).

        Args:
            text: Text to speak; must be 2-500 characters long.
            speaker_wav_path: Optional path to a reference recording to clone.
            language: Language code handed to the model.
            use_c3po_voice: Whether to use the C-3PO reference recording when
                no ``speaker_wav_path`` is supplied.

        Returns:
            Path of a newly written WAV file in the system temp directory.
            The caller owns the file and is responsible for deleting it.

        Raises:
            HTTPException: 400 for text outside the length limits; 500 when
                synthesis fails or no output file was produced.
        """
        try:
            # Validate text length
            if len(text) < 2:
                raise HTTPException(status_code=400, detail="Text too short")
            if len(text) > 500:
                raise HTTPException(status_code=400, detail="Text too long (max 500 characters)")

            # Generate unique output filename
            output_filename = f"c3po_tts_output_{uuid.uuid4().hex}.wav"
            output_path = os.path.join(tempfile.gettempdir(), output_filename)

            # Determine which speaker to use
            final_speaker_wav = speaker_wav_path

            # If no speaker provided and C-3PO voice requested, try to use reference audio
            if not final_speaker_wav and use_c3po_voice:
                c3po_ref = self.get_c3po_reference_audio()
                if c3po_ref:
                    final_speaker_wav = c3po_ref
                    logger.info("Using C-3PO reference audio for voice synthesis")

            if final_speaker_wav:
                # Voice cloning mode.
                # Let the library write the file, exactly like the other two
                # branches: it stamps the WAV with the model's own output
                # sample rate. Saving the samples by hand at a hard-coded
                # 22050 Hz mislabelled XTTS v2's 24 kHz output, which made
                # playback slower and lower-pitched than intended.
                logger.info("Generating speech with voice cloning...")
                self.tts.tts_to_file(
                    text=text,
                    speaker_wav=final_speaker_wav,
                    language=language,
                    file_path=output_path
                )

            elif self.default_speaker:
                # Use preset speaker
                logger.info(f"Generating speech with preset speaker: {self.default_speaker}")
                self.tts.tts_to_file(
                    text=text,
                    speaker=self.default_speaker,
                    language=language,
                    file_path=output_path
                )
            else:
                # Try without speaker (some models support this)
                logger.info("Generating speech without specific speaker...")
                self.tts.tts_to_file(
                    text=text,
                    language=language,
                    file_path=output_path
                )

            if not os.path.exists(output_path):
                raise HTTPException(status_code=500, detail="Failed to generate audio file")

            logger.info(f"Speech generated successfully: {output_path}")
            return output_path

        except Exception as e:
            logger.error(f"Error generating speech: {e}")
            if isinstance(e, HTTPException):
                # Already carries the right status code; pass it through.
                raise
            raise HTTPException(status_code=500, detail=f"Speech generation failed: {str(e)}") from e
|
| 178 |
+
|
| 179 |
+
# Initialize TTS service
|
| 180 |
+
# Build the single shared service instance at import time. A failure is
# logged rather than raised, leaving `tts_service` as None so the endpoints
# can report that the service is unavailable.
logger.info("Initializing Coqui TTS service...")
try:
    tts_service = CoquiTTSService()
except Exception as e:
    logger.error(f"Failed to initialize TTS service: {e}")
    tts_service = None
else:
    logger.info("TTS service initialized successfully")
|
| 187 |
+
|
| 188 |
+
@app.get("/")
async def root():
    """Describe the API and say whether the TTS service was initialized."""
    # "error" means the module-level service construction failed.
    if tts_service:
        status = "healthy"
    else:
        status = "error"

    info = {
        "message": "Coqui TTS C-3PO API",
        "status": status,
        "model": "XTTS v2",
        "voice_cloning": True,
    }
    return info
|
| 197 |
+
|
| 198 |
+
@app.get("/health")
async def health_check():
    """Report service status, device, speaker and C-3PO voice availability.

    Raises:
        HTTPException: 503 when the TTS service failed to initialize.
    """
    if not tts_service:
        raise HTTPException(status_code=503, detail="TTS service not available")

    # The C-3PO voice counts as available when a reference recording is found.
    reference_audio = tts_service.get_c3po_reference_audio()

    report = {
        "status": "healthy",
        "device": tts_service.device,
        "model": "C-3PO Fine-tuned XTTS v2 (Coqui TTS)",
        "default_speaker": tts_service.default_speaker,
        "voice_cloning_available": True,
        "c3po_voice_available": reference_audio is not None,
        "model_path": getattr(tts_service, 'model_path', None),
    }
    return report
|
| 215 |
+
|
| 216 |
+
@app.post("/tts")
async def text_to_speech(
    text: str = Form(...),
    language: str = Form("en"),
    speaker_file: UploadFile = File(None),
    use_c3po_voice: bool = Form(True)
):
    """
    Convert text to speech using Coqui TTS

    - **text**: Text to convert to speech (2-500 characters)
    - **language**: Language code (default: "en")
    - **speaker_file**: Reference audio file for voice cloning (optional)
    - **use_c3po_voice**: Use C-3PO voice if no speaker file provided (default: True)

    Returns the generated audio as a WAV download. Responds with 400 for
    empty text or a non-audio speaker file, 503 when the TTS service is not
    available, and 500 for any other failure.
    """
    # Imported here so this handler brings its own dependency into scope.
    from fastapi import BackgroundTasks

    if not tts_service:
        raise HTTPException(status_code=503, detail="TTS service not available")

    if not text.strip():
        raise HTTPException(status_code=400, detail="Text cannot be empty")

    speaker_temp_path = None

    try:
        # Handle speaker file if provided
        if speaker_file is not None:
            if not speaker_file.content_type or not speaker_file.content_type.startswith('audio/'):
                raise HTTPException(status_code=400, detail="Speaker file must be an audio file")

            # Save uploaded file temporarily
            speaker_temp_path = os.path.join(
                tempfile.gettempdir(),
                f"speaker_{uuid.uuid4().hex}.wav"
            )

            content = await speaker_file.read()
            with open(speaker_temp_path, "wb") as buffer:
                buffer.write(content)

            logger.info(f"Speaker file saved: {speaker_temp_path}")

        # Generate speech
        output_path = tts_service.generate_speech(text, speaker_temp_path, language, use_c3po_voice)

    except HTTPException as e:
        # Already carries the intended status code; log and pass it through.
        logger.error(f"Error in TTS endpoint: {e}")
        raise
    except Exception as e:
        logger.error(f"Error in TTS endpoint: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        # The uploaded reference is only needed during synthesis; remove it on
        # success and on failure alike. Removal is best-effort: a leftover
        # temp file must not turn a good response into an error.
        if speaker_temp_path and os.path.exists(speaker_temp_path):
            try:
                os.remove(speaker_temp_path)
            except OSError as cleanup_error:
                logger.warning(f"Could not remove speaker file {speaker_temp_path}: {cleanup_error}")

    # Return the generated audio
    voice_type = "custom" if speaker_file else ("c3po" if use_c3po_voice else "default")

    # Delete the generated WAV once the response has been sent; otherwise
    # every request leaves a file behind in the temp directory.
    cleanup = BackgroundTasks()
    cleanup.add_task(os.remove, output_path)

    # No explicit Content-Disposition header: FileResponse derives
    # `attachment; filename="..."` from `filename`, and an explicit header
    # would take precedence and drop the file name.
    return FileResponse(
        output_path,
        media_type="audio/wav",
        filename=f"c3po_tts_{voice_type}_{uuid.uuid4().hex}.wav",
        background=cleanup
    )
|
| 288 |
+
|
| 289 |
+
@app.post("/tts-c3po")
async def text_to_speech_c3po(
    text: str = Form(...),
    language: str = Form("en")
):
    """
    Convert text to speech using C-3PO voice specifically

    - **text**: Text to convert to speech (2-500 characters)
    - **language**: Language code (default: "en")

    Returns the generated audio as a WAV download. Responds with 400 for
    empty text, 503 when the TTS service or the C-3PO reference audio is not
    available, and 500 for any other failure.
    """
    # Imported here so this handler brings its own dependency into scope.
    from fastapi import BackgroundTasks

    if not tts_service:
        raise HTTPException(status_code=503, detail="TTS service not available")

    if not text.strip():
        raise HTTPException(status_code=400, detail="Text cannot be empty")

    # Check if C-3PO voice is available
    c3po_ref = tts_service.get_c3po_reference_audio()
    if not c3po_ref:
        raise HTTPException(status_code=503, detail="C-3PO reference audio not available")

    try:
        # Generate speech with C-3PO voice
        output_path = tts_service.generate_speech(text, None, language, use_c3po_voice=True)
    except HTTPException as e:
        # Already carries the intended status code; log and pass it through.
        logger.error(f"Error in C-3PO TTS endpoint: {e}")
        raise
    except Exception as e:
        logger.error(f"Error in C-3PO TTS endpoint: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e

    # Delete the generated WAV once the response has been sent; otherwise
    # every request leaves a file behind in the temp directory.
    cleanup = BackgroundTasks()
    cleanup.add_task(os.remove, output_path)

    # No explicit Content-Disposition header: FileResponse derives
    # `attachment; filename="..."` from `filename`, and an explicit header
    # would take precedence and drop the file name.
    return FileResponse(
        output_path,
        media_type="audio/wav",
        filename=f"c3po_voice_{uuid.uuid4().hex}.wav",
        background=cleanup
    )
|
| 327 |
+
|
| 328 |
+
@app.post("/tts-json")
async def text_to_speech_json(request: TTSRequest):
    """
    Convert text to speech using JSON request with C-3PO voice

    - **request**: TTSRequest containing text and language
    """
    # Guard clauses: service must be initialised and the text must be non-blank.
    if not tts_service:
        raise HTTPException(status_code=503, detail="TTS service not available")

    if not request.text.strip():
        raise HTTPException(status_code=400, detail="Text cannot be empty")

    try:
        # Generate speech with C-3PO voice by default
        output_path = tts_service.generate_speech(request.text, None, request.language, use_c3po_voice=True)

        # NOTE: no explicit Content-Disposition header is passed. FileResponse
        # builds 'attachment; filename="..."' itself from `filename`, but only
        # when the caller has not already supplied that header; passing a bare
        # "attachment" header would silently drop the download filename.
        return FileResponse(
            output_path,
            media_type="audio/wav",
            filename=f"c3po_tts_{request.language}_{uuid.uuid4().hex}.wav",
        )

    except HTTPException:
        # Deliberate HTTP errors pass through untouched (and are not logged
        # as server faults).
        raise
    except Exception as e:
        logger.error(f"Error in TTS JSON endpoint: {e}")
        # Chain the cause so the original traceback is kept for debugging.
        raise HTTPException(status_code=500, detail=str(e)) from e
|
| 357 |
+
|
| 358 |
+
@app.get("/models")
async def list_models():
    """List available TTS models"""
    try:
        # A throwaway TTS object is created solely to query the model catalogue.
        catalogue = TTS().list_models()
        # Only the first 20 entries are returned to keep the payload small.
        return {"models": catalogue[:20]}
    except Exception as err:
        logger.error(f"Error listing models: {err}")
        raise HTTPException(status_code=500, detail="Failed to list models")
|
| 369 |
+
|
| 370 |
+
if __name__ == "__main__":
    # Direct execution: serve the FastAPI app on all interfaces, port 7860.
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)
|
requirements.txt
CHANGED
|
@@ -1,11 +1,13 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
|
|
|
|
|
|
|
|
| 1 |
+
SpeechRecognition>=3.8.1
|
| 2 |
+
gtts>=2.3.2
|
| 3 |
+
openai-whisper>=20240930
|
| 4 |
+
pygame>=2.5.2
|
| 5 |
+
anyascii>=0.3.0
|
| 6 |
+
einops>=0.6.0
|
| 7 |
+
encodec>=0.1.1
|
| 8 |
+
inflect>=5.6.0
|
| 9 |
+
num2words>=0.5.14
|
| 10 |
+
pysbd>=0.3.4
|
| 11 |
+
tqdm>=4.64.1
|
| 12 |
+
coqui-tts == 0.26.2
|
| 13 |
+
huggingface_hub>=0.17.0
|
requirements_coqui.txt
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi>=0.104.1
|
| 2 |
+
uvicorn[standard]>=0.24.0
|
| 3 |
+
python-multipart>=0.0.6
|
| 4 |
+
coqui-tts==0.26.2
|
| 5 |
+
torch>=2.0.0
|
| 6 |
+
torchaudio>=2.0.0
|
| 7 |
+
numpy>=1.24.0
|
| 8 |
+
scipy>=1.11.0
|
| 9 |
+
pydub>=0.25.1
|
| 10 |
+
librosa>=0.10.0
|
| 11 |
+
soundfile>=0.12.1
|
| 12 |
+
typing-extensions>=4.8.0
|
start_c3po_api.py
ADDED
|
@@ -0,0 +1,176 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Startup script for C-3PO TTS API
|
| 4 |
+
Handles model download, initialization, and server startup
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
import sys
|
| 9 |
+
import subprocess
|
| 10 |
+
import logging
|
| 11 |
+
import time
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
|
| 14 |
+
# Configure logging
|
| 15 |
+
# Root logger: INFO and above, each record prefixed with time, logger name and level.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Module-level logger used by every helper in this script.
logger = logging.getLogger(__name__)
|
| 20 |
+
|
| 21 |
+
def check_dependencies():
    """Try importing the core packages the API needs.

    Returns:
        True when torch, TTS, fastapi and huggingface_hub all import,
        False as soon as one of them raises ImportError.
    """
    logger.info("🔍 Checking dependencies...")

    try:
        # Imported only to prove they are installed; the names are unused.
        import torch
        import TTS
        import fastapi
        import huggingface_hub
    except ImportError as missing:
        logger.error(f"❌ Missing dependency: {missing}")
        logger.info("💡 Install with: pip install -r requirements.txt")
        return False
    else:
        logger.info("✅ All core dependencies found")
        return True
|
| 36 |
+
|
| 37 |
+
def check_gpu():
    """Log whether PyTorch can see a CUDA device.

    Returns:
        True if CUDA is available, False otherwise (including when the
        check itself raises).
    """
    try:
        import torch

        if not torch.cuda.is_available():
            logger.info("💻 No GPU available, using CPU")
            return False

        device_label = torch.cuda.get_device_name(0)
        logger.info(f"🎮 GPU available: {device_label}")
        return True
    except Exception as err:
        logger.warning(f"⚠️ GPU check failed: {err}")
        return False
|
| 51 |
+
|
| 52 |
+
def check_disk_space():
    """Check free space in the current directory before downloading the model.

    Logs a warning below 5 GB free and an info line otherwise.

    Returns:
        True when more than 2 GB is free, or when the check itself fails
        (best effort: an unreadable disk does not block startup).
    """
    try:
        import shutil

        gib_free = shutil.disk_usage('.').free / (1024**3)  # GB

        if gib_free >= 5:
            logger.info(f"💾 Disk space: {gib_free:.1f}GB available")
        else:
            logger.warning(f"⚠️ Low disk space: {gib_free:.1f}GB available")
            logger.warning("💽 C-3PO model requires ~2GB space")

        return gib_free > 2
    except Exception as err:
        logger.warning(f"⚠️ Disk space check failed: {err}")
        return True
|
| 68 |
+
|
| 69 |
+
def setup_environment():
    """Export the required environment flags and create ./models."""
    os.environ.update(
        {
            "COQUI_TOS_AGREED": "1",
            "HF_HUB_DISABLE_TELEMETRY": "1",
        }
    )

    # Create models directory (no error if it already exists).
    Path("./models").mkdir(exist_ok=True)

    logger.info("🌍 Environment configured")
|
| 79 |
+
|
| 80 |
+
def install_dependencies():
    """Run pip against requirements.txt using the current interpreter.

    Returns:
        True when pip exits successfully, False when it exits non-zero.
    """
    logger.info("📦 Installing dependencies...")

    pip_command = [sys.executable, "-m", "pip", "install", "-r", "requirements.txt"]
    try:
        subprocess.check_call(pip_command)
    except subprocess.CalledProcessError as err:
        logger.error(f"❌ Failed to install dependencies: {err}")
        return False

    logger.info("✅ Dependencies installed successfully")
    return True
|
| 93 |
+
|
| 94 |
+
def test_model_download():
    """Query the Hugging Face Hub for the C-3PO model repository.

    Returns:
        True when the repo metadata can be fetched, False on any error.
    """
    logger.info("🤖 Testing C-3PO model availability...")

    try:
        from huggingface_hub import repo_info

        # Fetching the metadata proves the repo exists and is reachable.
        metadata = repo_info(repo_id="Borcherding/XTTS-v2_C3PO")
        logger.info(f"✅ C-3PO model accessible: {metadata.id}")
        logger.info(f" Last modified: {metadata.last_modified}")
    except Exception as err:
        logger.error(f"❌ C-3PO model not accessible: {err}")
        return False

    return True
|
| 110 |
+
|
| 111 |
+
def start_api_server():
    """Import the FastAPI app and serve it with uvicorn on port 7860.

    Returns:
        None after uvicorn returns normally; False if importing or
        running the server raised.
    """
    logger.info("🚀 Starting C-3PO TTS API server...")

    try:
        # Imported lazily so a missing package is reported via the log below.
        import uvicorn
        from coqui_api import app

        for banner in (
            "🎭 C-3PO TTS API starting on http://localhost:7860",
            "📖 API documentation available at http://localhost:7860/docs",
        ):
            logger.info(banner)

        uvicorn.run(app, host="0.0.0.0", port=7860, log_level="info")
    except Exception as err:
        logger.error(f"❌ Failed to start API server: {err}")
        return False
|
| 133 |
+
|
| 134 |
+
def main():
    """Run the startup checks in order, then launch the API server.

    Exits with status 1 when dependencies cannot be installed, when disk
    space is insufficient, or when the server fails to start.
    """
    print("🤖 C-3PO TTS API Startup")
    print("=" * 50)

    # Step 1: Check dependencies
    if not check_dependencies():
        logger.info("📦 Attempting to install dependencies...")
        if not install_dependencies():
            logger.error("❌ Failed to install dependencies. Exiting.")
            sys.exit(1)

    # Step 2: Setup environment
    setup_environment()

    # Step 3: Check system resources
    check_gpu()  # informational only: the result does not gate startup

    if not check_disk_space():
        logger.error("❌ Insufficient disk space. Exiting.")
        sys.exit(1)

    # Step 4: Test model availability
    if not test_model_download():
        logger.warning("⚠️ C-3PO model may not be accessible")
        logger.warning(" The API will fall back to standard XTTS v2")

    # Step 5: Start the server
    print("\n" + "=" * 50)
    logger.info("🎬 All checks passed! Starting C-3PO TTS API...")
    print("=" * 50)

    try:
        started = start_api_server()
    except KeyboardInterrupt:
        logger.info("\n🛑 Server stopped by user")
        return
    except Exception as e:
        logger.error(f"❌ Server error: {e}")
        sys.exit(1)

    # start_api_server() logs and swallows its own errors, signalling failure
    # by returning False; propagate that as a non-zero exit status so callers
    # (shells, supervisors) can detect a failed launch.
    if started is False:
        sys.exit(1)
|
| 174 |
+
|
| 175 |
+
if __name__ == "__main__":
    # Run the startup sequence only when executed as a script.
    main()
|
test_c3po_model.py
ADDED
|
@@ -0,0 +1,214 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Test script for C-3PO TTS model integration
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import requests
|
| 8 |
+
import json
|
| 9 |
+
import tempfile
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
|
| 12 |
+
# Test configuration
|
| 13 |
+
# Address of the locally running API server under test.
API_BASE_URL = "http://localhost:7860"

# Sample C-3PO lines available for synthesis tests.
TEST_TEXTS = [
    "I am C-3PO, human-cyborg relations.",
    "The odds of successfully navigating an asteroid field are approximately 3,720 to 1.",
    "R2-D2, you know better than to trust a strange computer!",
    "Oh my! How interesting!",
]
|
| 20 |
+
|
| 21 |
+
def test_health_check():
    """Call GET /health and print the reported model details.

    Returns:
        True on HTTP 200, False on any other status or on a request error
        (including a timeout).
    """
    print("🔍 Testing health check...")
    try:
        # A timeout keeps the script from hanging forever when the server is
        # down or unresponsive; requests has no default timeout.
        response = requests.get(f"{API_BASE_URL}/health", timeout=10)
        if response.status_code == 200:
            data = response.json()
            print("✅ Health check passed")
            print(f" Model: {data.get('model', 'Unknown')}")
            print(f" Device: {data.get('device', 'Unknown')}")
            print(f" C-3PO voice available: {data.get('c3po_voice_available', False)}")
            print(f" Model path: {data.get('model_path', 'Not specified')}")
            return True
        else:
            print(f"❌ Health check failed: {response.status_code}")
            return False
    except Exception as e:
        print(f"❌ Health check error: {e}")
        return False
|
| 40 |
+
|
| 41 |
+
def test_c3po_endpoint():
    """POST a sample line to /tts-c3po and save the returned audio.

    Returns:
        True when the server answers 200 and the WAV is written to the
        temp directory, False otherwise.
    """
    print("\n🎭 Testing C-3PO endpoint...")

    phrase = "I am C-3PO, human-cyborg relations."

    try:
        form = {'text': phrase, 'language': 'en'}
        reply = requests.post(f"{API_BASE_URL}/tts-c3po", data=form)

        if reply.status_code != 200:
            print(f"❌ C-3PO endpoint failed: {reply.status_code}")
            print(f" Response: {reply.text}")
            return False

        # Save the audio file
        wav_path = Path(tempfile.gettempdir()) / "c3po_test_output.wav"
        with open(wav_path, 'wb') as sink:
            sink.write(reply.content)

        print("✅ C-3PO endpoint test passed")
        print(f" Audio saved to: {wav_path}")
        print(f" File size: {os.path.getsize(wav_path)} bytes")
        return True

    except Exception as err:
        print(f"❌ C-3PO endpoint error: {err}")
        return False
|
| 73 |
+
|
| 74 |
+
def test_general_tts_with_c3po():
    """POST to the general /tts endpoint with the C-3PO voice flag set.

    Returns:
        True when the server answers 200 and the WAV is written to the
        temp directory, False otherwise.
    """
    print("\n🎤 Testing general TTS with C-3PO voice...")

    phrase = "The odds of successfully navigating an asteroid field are approximately 3,720 to 1."

    try:
        form = {
            'text': phrase,
            'language': 'en',
            'use_c3po_voice': 'true',
        }
        reply = requests.post(f"{API_BASE_URL}/tts", data=form)

        if reply.status_code != 200:
            print(f"❌ General TTS with C-3PO failed: {reply.status_code}")
            print(f" Response: {reply.text}")
            return False

        # Save the audio file
        wav_path = Path(tempfile.gettempdir()) / "general_c3po_test_output.wav"
        with open(wav_path, 'wb') as sink:
            sink.write(reply.content)

        print("✅ General TTS with C-3PO test passed")
        print(f" Audio saved to: {wav_path}")
        print(f" File size: {os.path.getsize(wav_path)} bytes")
        return True

    except Exception as err:
        print(f"❌ General TTS with C-3PO error: {err}")
        return False
|
| 107 |
+
|
| 108 |
+
def test_json_endpoint():
    """POST a JSON body to /tts-json and save the returned audio.

    Returns:
        True when the server answers 200 and the WAV is written to the
        temp directory, False otherwise.
    """
    print("\n📄 Testing JSON endpoint...")

    phrase = "R2-D2, you know better than to trust a strange computer!"

    try:
        payload = {'text': phrase, 'language': 'en'}
        reply = requests.post(
            f"{API_BASE_URL}/tts-json",
            json=payload,
            headers={'Content-Type': 'application/json'},
        )

        if reply.status_code != 200:
            print(f"❌ JSON endpoint failed: {reply.status_code}")
            print(f" Response: {reply.text}")
            return False

        # Save the audio file
        wav_path = Path(tempfile.gettempdir()) / "json_c3po_test_output.wav"
        with open(wav_path, 'wb') as sink:
            sink.write(reply.content)

        print("✅ JSON endpoint test passed")
        print(f" Audio saved to: {wav_path}")
        print(f" File size: {os.path.getsize(wav_path)} bytes")
        return True

    except Exception as err:
        print(f"❌ JSON endpoint error: {err}")
        return False
|
| 141 |
+
|
| 142 |
+
def test_multilingual_support():
    """Send one greeting per language to /tts-c3po and save each result.

    Returns:
        True only if every language request returned HTTP 200.
    """
    print("\n🌍 Testing multilingual support...")

    test_cases = [
        ("Hello, I am C-3PO", "en"),
        ("Hola, soy C-3PO", "es"),
        ("Bonjour, je suis C-3PO", "fr"),
        ("Guten Tag, ich bin C-3PO", "de"),
    ]

    success_count = 0

    for text, language in test_cases:
        try:
            reply = requests.post(
                f"{API_BASE_URL}/tts-c3po",
                data={'text': text, 'language': language},
            )

            if reply.status_code != 200:
                print(f" ❌ {language}: Failed ({reply.status_code})")
                continue

            # One output file per language in the system temp directory.
            output_path = Path(tempfile.gettempdir()) / f"c3po_test_{language}.wav"
            with open(output_path, 'wb') as sink:
                sink.write(reply.content)

            print(f" ✅ {language}: {text} -> {output_path}")
            success_count += 1

        except Exception as e:
            print(f" ❌ {language}: Error - {e}")

    print(f"\n Multilingual test: {success_count}/{len(test_cases)} languages successful")
    return success_count == len(test_cases)
|
| 179 |
+
|
| 180 |
+
def main():
    """Run every test function in order and print a pass/fail summary."""
    print("🚀 Starting C-3PO TTS Model Tests")
    print("=" * 50)

    tests = [
        test_health_check,
        test_c3po_endpoint,
        test_general_tts_with_c3po,
        test_json_endpoint,
        test_multilingual_support,
    ]

    total = len(tests)
    # Each test runs exactly once, in list order; truthy results are counted.
    passed = sum(1 for run_test in tests if run_test())

    print("\n" + "=" * 50)
    print(f"🎯 Test Results: {passed}/{total} tests passed")

    if passed != total:
        print("⚠️ Some tests failed. Check the API logs for more details.")
    else:
        print("🎉 All tests passed! C-3PO model integration is working correctly.")

    print("\n💡 Tips:")
    for tip in (
        " - Make sure the API server is running on http://localhost:7860",
        " - Check that the C-3PO model downloaded successfully",
        " - Generated audio files are saved in the system temp directory",
    ):
        print(tip)
|
| 212 |
+
|
| 213 |
+
if __name__ == "__main__":
    # Run the whole suite only when executed as a script.
    main()
|
test_coqui_api.py
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
import os
|
| 3 |
+
import time
|
| 4 |
+
|
| 5 |
+
# API base URL (update this to your deployed Hugging Face Space URL)
|
| 6 |
+
BASE_URL = "http://localhost:7860"  # local default; replace with the HF Space URL once deployed
|
| 7 |
+
|
| 8 |
+
def test_health():
    """Call GET /health and print the outcome; nothing is returned."""
    print("🔍 Testing health endpoint...")
    try:
        reply = requests.get(f"{BASE_URL}/health")
        if reply.status_code != 200:
            print(f"❌ Health check failed: {reply.status_code}")
            print(f"Response: {reply.text}")
        else:
            print("✅ Health check passed!")
            print(f"Response: {reply.json()}")
    except Exception as err:
        print(f"❌ Health check error: {err}")
|
| 21 |
+
|
| 22 |
+
def test_list_models():
    """Call GET /models and print the count plus the first five names."""
    print("\n🔍 Testing models endpoint...")
    try:
        reply = requests.get(f"{BASE_URL}/models")
        if reply.status_code != 200:
            print(f"❌ Models endpoint failed: {reply.status_code}")
        else:
            body = reply.json()
            print("✅ Models endpoint working!")
            print(f"Found {len(body.get('models', []))} models")
            # Show first 5 models
            for position, model in enumerate(body.get('models', [])[:5], start=1):
                print(f" {position}. {model}")
    except Exception as err:
        print(f"❌ Models endpoint error: {err}")
|
| 38 |
+
|
| 39 |
+
def test_simple_tts():
    """POST form data to /tts without a speaker file and save the audio."""
    print("\n🔍 Testing simple TTS...")
    try:
        form = {
            "text": "Hello world! This is a test of Coqui TTS.",
            "language": "en",
        }
        reply = requests.post(f"{BASE_URL}/tts", data=form)

        if reply.status_code != 200:
            print(f"❌ Simple TTS failed: {reply.status_code}")
            print(f"Response: {reply.text}")
        else:
            # Save the audio file in the current working directory.
            output_file = "simple_tts_output.wav"
            with open(output_file, "wb") as sink:
                sink.write(reply.content)
            print(f"✅ Simple TTS successful! Audio saved to: {output_file}")
            print(f"File size: {len(reply.content)} bytes")
    except Exception as err:
        print(f"❌ Simple TTS error: {err}")
|
| 62 |
+
|
| 63 |
+
def test_voice_cloning(speaker_file_path=None):
    """Upload a reference recording to /tts and save the cloned-voice audio.

    Args:
        speaker_file_path: Path to the reference audio file. The test is
            skipped when it is missing or does not exist on disk.
    """
    have_speaker = bool(speaker_file_path) and os.path.exists(speaker_file_path)
    if not have_speaker:
        print("\n⚠️ Skipping voice cloning test - no speaker file provided")
        print(" To test voice cloning, provide a .wav file path")
        return

    print(f"\n🔍 Testing voice cloning with: {speaker_file_path}")
    try:
        form = {
            "text": "This is voice cloning using Coqui TTS. The voice should match the reference audio.",
            "language": "en",
        }

        # The upload handle must stay open for the duration of the request.
        with open(speaker_file_path, "rb") as speaker_handle:
            reply = requests.post(
                f"{BASE_URL}/tts",
                data=form,
                files={"speaker_file": speaker_handle},
            )

        if reply.status_code != 200:
            print(f"❌ Voice cloning failed: {reply.status_code}")
            print(f"Response: {reply.text}")
        else:
            # Save the cloned audio
            output_file = "voice_cloned_output.wav"
            with open(output_file, "wb") as sink:
                sink.write(reply.content)
            print(f"✅ Voice cloning successful! Audio saved to: {output_file}")
            print(f"File size: {len(reply.content)} bytes")
    except Exception as err:
        print(f"❌ Voice cloning error: {err}")
|
| 93 |
+
|
| 94 |
+
def test_json_tts():
    """POST a JSON-encoded body to /tts-json and save the returned audio."""
    print("\n🔍 Testing JSON TTS endpoint...")
    try:
        import json

        payload = {
            "text": "This is a JSON request test for Coqui TTS API.",
            "language": "en",
        }
        # Body is serialised by hand, so the content type is set explicitly.
        encoded_body = json.dumps(payload)

        reply = requests.post(
            f"{BASE_URL}/tts-json",
            headers={"Content-Type": "application/json"},
            data=encoded_body,
        )

        if reply.status_code != 200:
            print(f"❌ JSON TTS failed: {reply.status_code}")
            print(f"Response: {reply.text}")
        else:
            output_file = "json_tts_output.wav"
            with open(output_file, "wb") as sink:
                sink.write(reply.content)
            print(f"✅ JSON TTS successful! Audio saved to: {output_file}")
            print(f"File size: {len(reply.content)} bytes")
    except Exception as err:
        print(f"❌ JSON TTS error: {err}")
|
| 122 |
+
|
| 123 |
+
def main():
    """Exercise each API endpoint in turn, then print voice-cloning instructions."""
    print("🐸 Testing Coqui TTS API")
    print("=" * 50)

    # Test all endpoints
    for endpoint_check in (test_health, test_list_models, test_simple_tts, test_json_tts):
        endpoint_check()

    # Test voice cloning if speaker file is available
    # You can specify a speaker file path here
    speaker_file = None  # Change to your speaker file path
    test_voice_cloning(speaker_file)

    print("\n🎉 API testing completed!")
    print("\nTo test voice cloning:")
    for step in (
        "1. Record a short audio sample (5-10 seconds)",
        "2. Save it as a .wav file",
        "3. Update speaker_file variable with the file path",
        "4. Run the test again",
    ):
        print(step)
|
| 144 |
+
|
| 145 |
+
if __name__ == "__main__":
    # Run the endpoint checks only when executed as a script.
    main()
|
test_coqui_tts.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
from TTS.api import TTS
|
| 3 |
+
import os
|
| 4 |
+
|
| 5 |
+
def test_coqui_tts():
    """Smoke-test the Coqui TTS library and report progress on stdout.

    Three stages run in order, each wrapped so that a failure is printed
    instead of raised:

    1. List the available models and print the first 10.
    2. Load the XTTS v2 model on CUDA when available (CPU otherwise) and
       print up to 10 preset speakers.
    3. Synthesize a fixed English sentence to ``test_output.wav`` and
       report the resulting file size.

    A failure in stage 1 or 2 ends the function early; a failure in
    stage 3 is reported and the voice-cloning notes are still printed.

    Returns:
        None.
    """
    # Get device
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")

    try:
        # List available 🐸TTS models
        print("\n=== Available TTS Models ===")
        tts_instance = TTS()
        models = tts_instance.list_models()

        # NOTE(review): some Coqui TTS releases return a ModelManager object
        # here rather than a list of names; slicing it would raise TypeError
        # and abort the whole test. Presumably the manager exposes its own
        # list_models() returning the names - confirm against the pinned
        # TTS version.
        if not isinstance(models, (list, tuple)):
            models = models.list_models()

        # Print first 10 models to avoid overwhelming output
        print("First 10 available models:")
        for i, model in enumerate(models[:10]):
            print(f"{i+1}. {model}")

        if len(models) > 10:
            print(f"... and {len(models) - 10} more models")

    except Exception as e:
        # Best-effort script: report the problem and stop instead of raising.
        print(f"Error listing models: {e}")
        return

    try:
        # Initialize TTS with XTTS v2 model
        print("\n=== Initializing XTTS v2 Model ===")
        tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
        print("XTTS v2 model loaded successfully!")

        # List speakers if available; read the attribute once and reuse it.
        print("\n=== Available Speakers ===")
        speakers = getattr(tts, "speakers", None)
        if speakers:
            print("Available speakers:")
            for speaker in speakers[:10]:  # Show first 10
                print(f"- {speaker}")
            if len(speakers) > 10:
                print(f"... and {len(speakers) - 10} more speakers")
        else:
            print("No preset speakers available or speakers list is empty")

    except Exception as e:
        print(f"Error initializing XTTS v2 model: {e}")
        print("This might be due to model download requirements or missing dependencies")
        return

    try:
        # Test TTS to file with preset speaker (if available)
        print("\n=== Testing TTS to File ===")
        output_file = "test_output.wav"
        sample_text = "Hello world! This is a test of Coqui TTS library."

        # Check if we have speakers available
        speakers = getattr(tts, "speakers", None)
        if speakers:
            # Use first available speaker
            speaker_name = speakers[0]
            print(f"Using speaker: {speaker_name}")

            tts.tts_to_file(
                text=sample_text,
                speaker=speaker_name,
                language="en",
                file_path=output_file
            )
        else:
            # Try without speaker specification
            print("No speakers available, trying without speaker specification...")
            tts.tts_to_file(
                text=sample_text,
                language="en",
                file_path=output_file
            )

        if os.path.exists(output_file):
            print(f"✅ TTS successful! Audio saved to: {output_file}")
            file_size = os.path.getsize(output_file)
            print(f"File size: {file_size} bytes")
        else:
            print("❌ TTS failed - output file not created")

    except Exception as e:
        print(f"Error during TTS generation: {e}")

    # Note about voice cloning
    print("\n=== Voice Cloning Information ===")
    print("To test voice cloning, you would need:")
    print("1. A reference audio file (speaker_wav parameter)")
    print("2. Use tts.tts() method instead of tts_to_file()")
    print("Example:")
    print('wav = tts.tts(text="Hello!", speaker_wav="reference.wav", language="en")')
|
| 95 |
+
|
| 96 |
+
# Script entry point: print a banner and run the library smoke test only
# when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    print("🐸 Testing Coqui TTS Library")
    print("=" * 50)
    test_coqui_tts()
|