Commit
·
53f51ec
1
Parent(s):
89a8916
Add HuggingFace Spaces deployment config
Browse files- Dockerfile +29 -0
- README.md +49 -221
- app.py +19 -0
Dockerfile
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.10-slim
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
# Install system dependencies
|
| 6 |
+
RUN apt-get update && apt-get install -y \
|
| 7 |
+
libsndfile1 \
|
| 8 |
+
ffmpeg \
|
| 9 |
+
curl \
|
| 10 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 11 |
+
|
| 12 |
+
# Copy requirements first for caching
|
| 13 |
+
COPY requirements.txt .
|
| 14 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 15 |
+
|
| 16 |
+
# Copy application code
|
| 17 |
+
COPY src/ ./src/
|
| 18 |
+
COPY models/ ./models/
|
| 19 |
+
COPY app.py .
|
| 20 |
+
COPY download_models.py .
|
| 21 |
+
|
| 22 |
+
# Create output directory
|
| 23 |
+
RUN mkdir -p outputs
|
| 24 |
+
|
| 25 |
+
# Expose port for HuggingFace Spaces
|
| 26 |
+
EXPOSE 7860
|
| 27 |
+
|
| 28 |
+
# Run the app
|
| 29 |
+
CMD ["python", "app.py"]
|
README.md
CHANGED
|
@@ -1,246 +1,74 @@
|
|
| 1 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
-
A
|
| 4 |
|
| 5 |
-
## 🎯
|
| 6 |
|
| 7 |
-
Built for
|
| 8 |
|
| 9 |
## ✨ Features
|
| 10 |
|
| 11 |
-
- **11 Indian Languages**: Hindi, Bengali, Marathi, Telugu, Kannada, Bhojpuri, Chhattisgarhi, Maithili, Magahi, English,
|
| 12 |
- **21 Voice Options**: Male & Female voices for each language
|
| 13 |
-
- **
|
| 14 |
-
- **Pitch & Speed Control**: Fine-tune voice characteristics
|
| 15 |
-
- **Lightweight**: VITS-based models optimized for fast inference
|
| 16 |
-
- **REST API**: FastAPI-powered server with OpenAPI docs
|
| 17 |
-
- **Text Normalization**: Handles numbers, punctuation for Indian scripts
|
| 18 |
-
|
| 19 |
-
## 🚀 Quick Start
|
| 20 |
-
|
| 21 |
-
### 1. Installation
|
| 22 |
-
|
| 23 |
-
```bash
|
| 24 |
-
# Clone and navigate
|
| 25 |
-
git clone https://github.com/harshil748/VoiceAPI
|
| 26 |
-
cd VoiceAPI
|
| 27 |
-
|
| 28 |
-
# Create virtual environment
|
| 29 |
-
python3 -m venv tts
|
| 30 |
-
source tts/bin/activate
|
| 31 |
-
|
| 32 |
-
# Install dependencies
|
| 33 |
-
pip install -r requirements.txt
|
| 34 |
-
```
|
| 35 |
-
|
| 36 |
-
### 2. Download Models
|
| 37 |
-
|
| 38 |
-
```bash
|
| 39 |
-
# Download Hindi models (male + female)
|
| 40 |
-
python -m src.cli download --lang hi
|
| 41 |
-
|
| 42 |
-
# Or download a specific voice
|
| 43 |
-
python -m src.cli download --voice hi_male
|
| 44 |
|
| 45 |
-
|
| 46 |
-
```
|
| 47 |
-
|
| 48 |
-
### 3. Synthesize Speech
|
| 49 |
-
|
| 50 |
-
```bash
|
| 51 |
-
# Basic synthesis
|
| 52 |
-
python -m src.cli synthesize --text "नमस्ते दोस्तों" --voice hi_male --output hello.wav
|
| 53 |
|
| 54 |
-
# Play the audio (macOS)
|
| 55 |
-
afplay hello.wav
|
| 56 |
```
|
| 57 |
-
|
| 58 |
-
### 4. Start API Server
|
| 59 |
-
|
| 60 |
-
```bash
|
| 61 |
-
python -m src.cli serve --port 8000
|
| 62 |
```
|
| 63 |
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
## 🎨 Style Presets
|
| 67 |
|
| 68 |
-
|
|
| 69 |
-
|
| 70 |
-
|
|
| 71 |
-
|
|
| 72 |
-
|
|
| 73 |
-
| `soft` | 0.9 | 0.95 | 0.7 | Calming content |
|
| 74 |
-
| `loud` | 1.0 | 1.05 | 1.3 | Alerts, emphasis |
|
| 75 |
-
| `happy` | 1.1 | 1.1 | 1.2 | Positive messages |
|
| 76 |
-
| `sad` | 0.85 | 0.9 | 0.8 | Empathetic responses |
|
| 77 |
-
| `calm` | 0.9 | 0.95 | 0.85 | **Healthcare guidance** |
|
| 78 |
-
| `excited` | 1.2 | 1.15 | 1.3 | Celebrations |
|
| 79 |
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
### 🏆 Hackathon API - GET /Get_Inference
|
| 83 |
-
|
| 84 |
-
**This is the official hackathon endpoint** that follows the Voice Tech for All specification:
|
| 85 |
|
| 86 |
```python
|
| 87 |
import requests
|
| 88 |
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
params = {
|
| 93 |
-
'text': 'ಮಾದರಿಯು ಸರಿಯಾಗಿ ಕಾರ್ಯನಿರ್ವಹಿಸುತ್ತಿದೆಯೇ ಎಂದು ಖಚಿತಪಡಿಸಿಕೊಳ್ಳಲು ಬಳಸಲಾಗುವ ಪರೀಕ್ಷಾ ವಾಕ್ಯ ಇದು.',
|
| 94 |
-
'lang': 'kannada',
|
| 95 |
-
}
|
| 96 |
-
|
| 97 |
-
with open(WavPath, "rb") as AudioFile:
|
| 98 |
-
response = requests.get(base_url, params=params, files={'speaker_wav': AudioFile})
|
| 99 |
-
|
| 100 |
-
if response.status_code == 200:
|
| 101 |
-
with open('output.wav', 'wb') as f:
|
| 102 |
-
f.write(response.content)
|
| 103 |
-
print("Audio saved as 'output.wav'")
|
| 104 |
-
```
|
| 105 |
-
|
| 106 |
-
**Query Parameters:**
|
| 107 |
-
|
| 108 |
-
| Parameter | Type | Required | Description |
|
| 109 |
-
| ------------- | ------ | --------- | ---------------------------------------------------------------------------------------------------------------- |
|
| 110 |
-
| `text` | string | Mandatory | Input text to convert to speech. For English, text must be lowercase. |
|
| 111 |
-
| `lang` | string | Mandatory | Language: bhojpuri, bengali, english, gujarati, hindi, chhattisgarhi, kannada, magahi, maithili, marathi, telugu |
|
| 112 |
-
| `speaker_wav` | file | Mandatory | Reference WAV file for speaker voice |
|
| 113 |
-
|
| 114 |
-
**Response:** `200 OK` with `Content-Type: audio/wav`
|
| 115 |
-
|
| 116 |
-
---
|
| 117 |
-
|
| 118 |
-
### Synthesize with Style (POST)
|
| 119 |
-
|
| 120 |
-
```bash
|
| 121 |
-
curl -X POST "http://localhost:8000/synthesize" \
|
| 122 |
-
-H "Content-Type: application/json" \
|
| 123 |
-
-d '{
|
| 124 |
-
"text": "आपका दिन शुभ हो",
|
| 125 |
-
"voice": "hi_female",
|
| 126 |
-
"style": "happy",
|
| 127 |
-
"speed": 1.0,
|
| 128 |
-
"pitch": 1.0
|
| 129 |
-
}' \
|
| 130 |
-
--output speech.wav
|
| 131 |
-
```
|
| 132 |
-
|
| 133 |
-
### Gujarati Synthesis
|
| 134 |
-
|
| 135 |
-
```bash
|
| 136 |
-
curl -X POST "http://localhost:8000/synthesize" \
|
| 137 |
-
-H "Content-Type: application/json" \
|
| 138 |
-
-d '{"text": "નમસ્તે, કેમ છો?", "voice": "gu_mms", "style": "calm"}' \
|
| 139 |
-
--output gujarati.wav
|
| 140 |
-
```
|
| 141 |
-
|
| 142 |
-
### List Style Presets
|
| 143 |
-
|
| 144 |
-
```bash
|
| 145 |
-
curl http://localhost:8000/styles
|
| 146 |
-
```
|
| 147 |
-
|
| 148 |
-
## 🎤 Available Voices
|
| 149 |
-
|
| 150 |
-
| Language | Code | Male | Female | Notes |
|
| 151 |
-
| ------------- | ---- | ----------- | ------------- | ------------ |
|
| 152 |
-
| Hindi | hi | ✅ hi_male | ✅ hi_female | SYSPIN |
|
| 153 |
-
| Bengali | bn | ✅ bn_male | ✅ bn_female | SYSPIN |
|
| 154 |
-
| Marathi | mr | ✅ mr_male | ✅ mr_female | SYSPIN |
|
| 155 |
-
| Telugu | te | ✅ te_male | ✅ te_female | SYSPIN |
|
| 156 |
-
| Kannada | kn | ✅ kn_male | ✅ kn_female | SYSPIN |
|
| 157 |
-
| Bhojpuri | bho | ✅ bho_male | ✅ bho_female | SYSPIN |
|
| 158 |
-
| Chhattisgarhi | hne | ✅ hne_male | ✅ hne_female | SYSPIN |
|
| 159 |
-
| Maithili | mai | ✅ mai_male | ✅ mai_female | SYSPIN |
|
| 160 |
-
| Magahi | mag | ✅ mag_male | ✅ mag_female | SYSPIN |
|
| 161 |
-
| English | en | ✅ en_male | ✅ en_female | SYSPIN |
|
| 162 |
-
| **Gujarati** | gu | ✅ gu_mms | - | Facebook MMS |
|
| 163 |
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
```python
|
| 167 |
-
from src.engine import TTSEngine
|
| 168 |
-
|
| 169 |
-
# Initialize engine
|
| 170 |
-
engine = TTSEngine(device="auto")
|
| 171 |
-
|
| 172 |
-
# Basic synthesis
|
| 173 |
-
output = engine.synthesize(
|
| 174 |
-
text="गर्भावस्था में स्वस्थ आहार महत्वपूर्ण है",
|
| 175 |
-
voice="hi_female"
|
| 176 |
-
)
|
| 177 |
-
|
| 178 |
-
# With style control
|
| 179 |
-
output = engine.synthesize(
|
| 180 |
-
text="आपका दिन शुभ हो",
|
| 181 |
-
voice="hi_male",
|
| 182 |
-
style="happy", # Use preset
|
| 183 |
-
pitch=1.1, # Or manual control
|
| 184 |
-
speed=1.0,
|
| 185 |
-
energy=1.2
|
| 186 |
-
)
|
| 187 |
-
|
| 188 |
-
# Gujarati
|
| 189 |
-
output = engine.synthesize(
|
| 190 |
-
text="સ્વસ્થ રહો, ખુશ રહો",
|
| 191 |
-
voice="gu_mms",
|
| 192 |
-
style="calm"
|
| 193 |
-
)
|
| 194 |
-
|
| 195 |
-
# Save to file
|
| 196 |
-
engine.synthesize_to_file(
|
| 197 |
-
text="નમસ્તે",
|
| 198 |
-
output_path="hello.wav",
|
| 199 |
-
voice="gu_mms",
|
| 200 |
-
style="calm"
|
| 201 |
-
)
|
| 202 |
-
```
|
| 203 |
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
```text
|
| 207 |
-
VoiceAPI/
|
| 208 |
-
├── src/
|
| 209 |
-
│ ├── config.py # Language/voice/style configurations
|
| 210 |
-
│ ├── tokenizer.py # Text tokenization & normalization
|
| 211 |
-
│ ├── engine.py # Main TTS engine with style processor
|
| 212 |
-
│ ├── downloader.py # HuggingFace model downloader
|
| 213 |
-
│ ├── api.py # FastAPI REST server
|
| 214 |
-
│ └── cli.py # Command-line interface
|
| 215 |
-
├── models/ # Downloaded models
|
| 216 |
-
├── dataset/ # SPICOR dataset (for fine-tuning)
|
| 217 |
-
├── technical_report.md
|
| 218 |
-
├── requirements.txt
|
| 219 |
-
└── README.md
|
| 220 |
```
|
| 221 |
|
| 222 |
-
## 📊
|
| 223 |
-
|
| 224 |
-
|
|
| 225 |
-
|
| 226 |
-
|
|
| 227 |
-
|
|
| 228 |
-
|
|
| 229 |
-
|
|
| 230 |
-
|
|
| 231 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 232 |
|
| 233 |
## 🙏 Credits
|
| 234 |
|
| 235 |
-
- **
|
| 236 |
-
- **
|
| 237 |
-
- **Architecture**: VITS (Coqui AI)
|
| 238 |
-
- **Dataset**: SPICOR TTS Project, IISc SPIRE Lab
|
| 239 |
-
|
| 240 |
-
## 📜 License
|
| 241 |
-
|
| 242 |
-
CC BY 4.0 (SYSPIN), CC BY-NC 4.0 (MMS)
|
| 243 |
-
|
| 244 |
-
---
|
| 245 |
-
|
| 246 |
-
Built with ❤️ for **Voice Tech for All Hackathon**
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: VoiceAPI - Multi-lingual TTS
|
| 3 |
+
emoji: 🎤
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: purple
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_port: 7860
|
| 8 |
+
pinned: true
|
| 9 |
+
license: mit
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
# VoiceAPI - Multi-lingual Text-to-Speech
|
| 13 |
|
| 14 |
+
A multi-lingual Text-to-Speech API supporting **11 Indian languages**, designed for healthcare applications that help pregnant mothers in low-income communities.
|
| 15 |
|
| 16 |
+
## 🎯 Voice Tech for All Hackathon
|
| 17 |
|
| 18 |
+
Built for LLM-based healthcare assistants to deliver medical guidance in native languages.
|
| 19 |
|
| 20 |
## ✨ Features
|
| 21 |
|
| 22 |
+
- **11 Indian Languages**: Hindi, Bengali, Marathi, Telugu, Kannada, Bhojpuri, Chhattisgarhi, Maithili, Magahi, English, Gujarati
|
| 23 |
- **21 Voice Options**: Male & Female voices for 10 languages, plus a single MMS voice for Gujarati
|
| 24 |
+
- **REST API**: FastAPI with OpenAPI docs at `/docs`
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
+
## 🔌 API Endpoint
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
|
|
|
|
|
|
|
| 28 |
```
|
| 29 |
+
GET /Get_Inference?text=नमस्ते&lang=hindi
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
```
|
| 31 |
|
| 32 |
+
### Parameters
|
|
|
|
|
|
|
| 33 |
|
| 34 |
+
| Parameter | Type | Required | Description |
|
| 35 |
+
|-----------|------|----------|-------------|
|
| 36 |
+
| text | string | ✅ | Text to synthesize |
|
| 37 |
+
| lang | string | ✅ | Language: hindi, bengali, marathi, telugu, kannada, bhojpuri, chhattisgarhi, maithili, magahi, english, gujarati |
|
| 38 |
+
| speaker_wav | file | ✅ | Reference WAV file |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
+
### Example
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
|
| 42 |
```python
|
| 43 |
import requests
|
| 44 |
|
| 45 |
+
url = "https://Harshil748-voiceapi.hf.space/Get_Inference"
|
| 46 |
+
params = {"text": "नमस्ते, आप कैसे हैं?", "lang": "hindi"}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
+
with open("reference.wav", "rb") as f:
|
| 49 |
+
response = requests.get(url, params=params, files={"speaker_wav": f})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
|
| 51 |
+
with open("output.wav", "wb") as f:
|
| 52 |
+
f.write(response.content)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
```
|
| 54 |
|
| 55 |
+
## 📊 Supported Languages
|
| 56 |
+
|
| 57 |
+
| Language | Code | Voices |
|
| 58 |
+
|----------|------|--------|
|
| 59 |
+
| Hindi | hindi | Male, Female |
|
| 60 |
+
| Bengali | bengali | Male, Female |
|
| 61 |
+
| Marathi | marathi | Male, Female |
|
| 62 |
+
| Telugu | telugu | Male, Female |
|
| 63 |
+
| Kannada | kannada | Male, Female |
|
| 64 |
+
| Gujarati | gujarati | MMS |
|
| 65 |
+
| Bhojpuri | bhojpuri | Male, Female |
|
| 66 |
+
| Chhattisgarhi | chhattisgarhi | Male, Female |
|
| 67 |
+
| Maithili | maithili | Male, Female |
|
| 68 |
+
| Magahi | magahi | Male, Female |
|
| 69 |
+
| English | english | Male, Female |
|
| 70 |
|
| 71 |
## 🙏 Credits
|
| 72 |
|
| 73 |
+
- **Models**: SYSPIN (IISc Bangalore), Facebook MMS
|
| 74 |
+
- **Team**: Harshil Patel, Aashvi Maurya, Jaideep, Pratyush
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
HuggingFace Spaces entry point for VoiceAPI
|
| 3 |
+
Multi-lingual Text-to-Speech for Maternal Healthcare
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import sys
|
| 8 |
+
|
| 9 |
+
# Add this file's directory (the project root) to sys.path so the `src` package is importable
|
| 10 |
+
sys.path.insert(0, os.path.dirname(__file__))
|
| 11 |
+
|
| 12 |
+
from src.api import app
|
| 13 |
+
|
| 14 |
+
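# Start the server when run as a script (the Docker image runs `python app.py`); the port defaults to 7860 for HuggingFace Spaces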
|
| 15 |
+
if __name__ == "__main__":
|
| 16 |
+
import uvicorn
|
| 17 |
+
|
| 18 |
+
port = int(os.environ.get("PORT", 7860))
|
| 19 |
+
uvicorn.run(app, host="0.0.0.0", port=port)
|