Spaces:
Running
Running
Upload 7 files
Browse files- .gitignore +53 -0
- README.md +370 -11
- README_HF.md +28 -0
- app.py +753 -0
- examples.py +215 -0
- packages.txt +6 -0
- requirements.txt +20 -0
.gitignore
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
*.so
|
| 6 |
+
.Python
|
| 7 |
+
build/
|
| 8 |
+
develop-eggs/
|
| 9 |
+
dist/
|
| 10 |
+
downloads/
|
| 11 |
+
eggs/
|
| 12 |
+
.eggs/
|
| 13 |
+
lib/
|
| 14 |
+
lib64/
|
| 15 |
+
parts/
|
| 16 |
+
sdist/
|
| 17 |
+
var/
|
| 18 |
+
wheels/
|
| 19 |
+
*.egg-info/
|
| 20 |
+
.installed.cfg
|
| 21 |
+
*.egg
|
| 22 |
+
|
| 23 |
+
# Virtual environments
|
| 24 |
+
venv/
|
| 25 |
+
ENV/
|
| 26 |
+
env/
|
| 27 |
+
.venv/
|
| 28 |
+
|
| 29 |
+
# IDE
|
| 30 |
+
.idea/
|
| 31 |
+
.vscode/
|
| 32 |
+
*.swp
|
| 33 |
+
*.swo
|
| 34 |
+
*~
|
| 35 |
+
|
| 36 |
+
# OS
|
| 37 |
+
.DS_Store
|
| 38 |
+
Thumbs.db
|
| 39 |
+
|
| 40 |
+
# Project specific
|
| 41 |
+
*.wav
|
| 42 |
+
*.mp3
|
| 43 |
+
*.ogg
|
| 44 |
+
output/
|
| 45 |
+
cache/
|
| 46 |
+
.cache/
|
| 47 |
+
|
| 48 |
+
# Logs
|
| 49 |
+
*.log
|
| 50 |
+
logs/
|
| 51 |
+
|
| 52 |
+
# Hugging Face
|
| 53 |
+
.huggingface/
|
README.md
CHANGED
|
@@ -1,14 +1,373 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
---
|
| 13 |
|
| 14 |
-
|
|
|
|
| 1 |
+
# 🎙️ Kokoro TTS - Academic Text-to-Speech Application
|
| 2 |
+
|
| 3 |
+
[](https://huggingface.co/spaces)
|
| 4 |
+
[](https://opensource.org/licenses/Apache-2.0)
|
| 5 |
+
[](https://www.python.org/downloads/)
|
| 6 |
+
|
| 7 |
+
A comprehensive, open-source Text-to-Speech application built for academic learning and demonstration. Powered by the [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M) model — a lightweight yet high-quality TTS system with 82 million parameters.
|
| 8 |
+
|
| 9 |
+

|
| 10 |
+
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
## ✨ Features
|
| 14 |
+
|
| 15 |
+
### 🎭 28 Built-in Voices
|
| 16 |
+
- **20 American English voices** (11 female, 9 male)
|
| 17 |
+
- **8 British English voices** (4 female, 4 male)
|
| 18 |
+
- Quality grades from A (premium) to D (basic)
|
| 19 |
+
- Each voice has unique characteristics and recommended use cases
|
| 20 |
+
|
| 21 |
+
### 🎨 7 Style Presets
|
| 22 |
+
| Style | Description | Best For |
|
| 23 |
+
|-------|-------------|----------|
|
| 24 |
+
| **Neutral Narrator** | Clear, balanced narration | General content, documentation |
|
| 25 |
+
| **Dramatic / Horror** | Slower, deeper, suspenseful | Horror stories, dramatic readings |
|
| 26 |
+
| **Excited / Surprised** | Faster, higher energy | Announcements, exciting content |
|
| 27 |
+
| **Calm / Meditative** | Slow, soothing | Meditation guides, ASMR |
|
| 28 |
+
| **Storyteller** | Engaging narrative pace | Audiobooks, bedtime stories |
|
| 29 |
+
| **Professional** | Clear, authoritative | Business, corporate content |
|
| 30 |
+
| **Cheerful / Friendly** | Warm, upbeat | Tutorials, friendly explanations |
|
| 31 |
+
|
| 32 |
+
### ⚙️ Full Audio Control
|
| 33 |
+
- **Speed**: 0.5x (slow) to 2.0x (fast)
|
| 34 |
+
- **Pitch**: -5 to +5 semitones adjustment
|
| 35 |
+
- **Pauses**: 0-1000ms between sentences
|
| 36 |
+
- Real-time audio preview with download capability
|
| 37 |
+
|
| 38 |
+
---
|
| 39 |
+
|
| 40 |
+
## 🚀 Quick Start
|
| 41 |
+
|
| 42 |
+
### Option 1: Hugging Face Spaces (Recommended)
|
| 43 |
+
|
| 44 |
+
1. Go to [Hugging Face](https://huggingface.co/new-space)
|
| 45 |
+
2. Create a new Space with **Gradio** SDK
|
| 46 |
+
3. Upload all files from this repository
|
| 47 |
+
4. The Space will automatically install dependencies and launch
|
| 48 |
+
|
| 49 |
+
### Option 2: Local Installation
|
| 50 |
+
|
| 51 |
+
```bash
|
| 52 |
+
# Clone or download this repository
|
| 53 |
+
git clone <your-repo-url>
|
| 54 |
+
cd kokoro-tts-app
|
| 55 |
+
|
| 56 |
+
# Create virtual environment (recommended)
|
| 57 |
+
python -m venv venv
|
| 58 |
+
source venv/bin/activate # On Windows: venv\Scripts\activate
|
| 59 |
+
|
| 60 |
+
# Install system dependencies (Linux/Ubuntu)
|
| 61 |
+
sudo apt-get update
|
| 62 |
+
sudo apt-get install -y espeak-ng ffmpeg libsndfile1
|
| 63 |
+
|
| 64 |
+
# Install Python dependencies
|
| 65 |
+
pip install -r requirements.txt
|
| 66 |
+
|
| 67 |
+
# Run the application
|
| 68 |
+
python app.py
|
| 69 |
+
```
|
| 70 |
+
|
| 71 |
+
The app will be available at `http://localhost:7860`
|
| 72 |
+
|
| 73 |
+
### Option 3: Docker
|
| 74 |
+
|
| 75 |
+
```dockerfile
|
| 76 |
+
FROM python:3.10-slim
|
| 77 |
+
|
| 78 |
+
# Install system dependencies
|
| 79 |
+
RUN apt-get update && apt-get install -y \
|
| 80 |
+
espeak-ng \
|
| 81 |
+
ffmpeg \
|
| 82 |
+
libsndfile1 \
|
| 83 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 84 |
+
|
| 85 |
+
# Install Python dependencies
|
| 86 |
+
COPY requirements.txt .
|
| 87 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 88 |
+
|
| 89 |
+
# Copy application
|
| 90 |
+
COPY app.py .
|
| 91 |
+
|
| 92 |
+
# Run
|
| 93 |
+
CMD ["python", "app.py"]
|
| 94 |
+
```
|
| 95 |
+
|
| 96 |
+
---
|
| 97 |
+
|
| 98 |
+
## 📖 Usage Guide
|
| 99 |
+
|
| 100 |
+
### Basic Usage
|
| 101 |
+
|
| 102 |
+
1. **Enter Text**: Type or paste your text (up to 5000 characters)
|
| 103 |
+
2. **Select Voice**: Choose from 28 available voices
|
| 104 |
+
3. **Pick a Style**: Select a style preset that matches your content
|
| 105 |
+
4. **Generate**: Click the "Generate Speech" button
|
| 106 |
+
5. **Download**: Use the download button on the audio player
|
| 107 |
+
|
| 108 |
+
### Using Style Presets
|
| 109 |
+
|
| 110 |
+
Style presets automatically configure speed, pitch, and pause settings:
|
| 111 |
+
|
| 112 |
+
```
|
| 113 |
+
✅ "Use Style Preset Defaults" checked → Style settings applied
|
| 114 |
+
❌ "Use Style Preset Defaults" unchecked → Manual controls active
|
| 115 |
+
```
|
| 116 |
+
|
| 117 |
+
### Advanced Customization
|
| 118 |
+
|
| 119 |
+
Uncheck "Use Style Preset Defaults" to manually control:
|
| 120 |
+
|
| 121 |
+
- **Speed**: Lower values = slower, more deliberate speech
|
| 122 |
+
- **Pitch**: Negative = deeper voice, Positive = higher voice
|
| 123 |
+
- **Pause**: Higher values = longer pauses between sentences
|
| 124 |
+
|
| 125 |
+
### Sample Texts
|
| 126 |
+
|
| 127 |
+
The app includes sample texts for different scenarios:
|
| 128 |
+
- **Welcome**: General introduction text
|
| 129 |
+
- **Horror**: Spooky story excerpt (pair with Dramatic style)
|
| 130 |
+
- **News**: News broadcast style
|
| 131 |
+
- **Story**: Fairy tale opening (pair with Storyteller style)
|
| 132 |
+
- **Technical**: Technical documentation
|
| 133 |
+
|
| 134 |
+
---
|
| 135 |
+
|
| 136 |
+
## 🎭 Voice Reference
|
| 137 |
+
|
| 138 |
+
### American English - Female (11 voices)
|
| 139 |
+
|
| 140 |
+
| Voice ID | Name | Grade | Description |
|
| 141 |
+
|----------|------|-------|-------------|
|
| 142 |
+
| `af_heart` | Heart ❤️ | A | Premium quality, warm and natural |
|
| 143 |
+
| `af_bella` | Bella 🔥 | A- | Clear and expressive |
|
| 144 |
+
| `af_nicole` | Nicole 🎧 | B- | Professional narrator style |
|
| 145 |
+
| `af_aoede` | Aoede | C+ | Melodic and pleasant |
|
| 146 |
+
| `af_kore` | Kore | C+ | Youthful and energetic |
|
| 147 |
+
| `af_sarah` | Sarah | C+ | Friendly and approachable |
|
| 148 |
+
| `af_nova` | Nova | C | Modern and crisp |
|
| 149 |
+
| `af_sky` | Sky | C- | Light and airy |
|
| 150 |
+
| `af_alloy` | Alloy | C | Balanced and versatile |
|
| 151 |
+
| `af_jessica` | Jessica | D | Casual conversational |
|
| 152 |
+
| `af_river` | River | D | Gentle and flowing |
|
| 153 |
+
|
| 154 |
+
### American English - Male (9 voices)
|
| 155 |
+
|
| 156 |
+
| Voice ID | Name | Grade | Description |
|
| 157 |
+
|----------|------|-------|-------------|
|
| 158 |
+
| `am_michael` | Michael | C+ | Authoritative and clear |
|
| 159 |
+
| `am_fenrir` | Fenrir | C+ | Deep and resonant |
|
| 160 |
+
| `am_puck` | Puck | C+ | Playful and dynamic |
|
| 161 |
+
| `am_echo` | Echo | D | Warm and reflective |
|
| 162 |
+
| `am_eric` | Eric | D | Professional and steady |
|
| 163 |
+
| `am_liam` | Liam | D | Young and natural |
|
| 164 |
+
| `am_onyx` | Onyx | D | Rich and smooth |
|
| 165 |
+
| `am_santa` | Santa 🎅 | D- | Jolly and festive |
|
| 166 |
+
| `am_adam` | Adam | F+ | Basic male voice |
|
| 167 |
+
|
| 168 |
+
### British English - Female (4 voices)
|
| 169 |
+
|
| 170 |
+
| Voice ID | Name | Grade | Description |
|
| 171 |
+
|----------|------|-------|-------------|
|
| 172 |
+
| `bf_emma` | Emma | B- | Elegant British accent |
|
| 173 |
+
| `bf_isabella` | Isabella | C | Sophisticated and refined |
|
| 174 |
+
| `bf_alice` | Alice | D | Classic British tone |
|
| 175 |
+
| `bf_lily` | Lily | D | Soft and gentle |
|
| 176 |
+
|
| 177 |
+
### British English - Male (4 voices)
|
| 178 |
+
|
| 179 |
+
| Voice ID | Name | Grade | Description |
|
| 180 |
+
|----------|------|-------|-------------|
|
| 181 |
+
| `bm_george` | George | C | Distinguished gentleman |
|
| 182 |
+
| `bm_fable` | Fable | C | Storyteller quality |
|
| 183 |
+
| `bm_lewis` | Lewis | D+ | Conversational British |
|
| 184 |
+
| `bm_daniel` | Daniel | D | Standard British male |
|
| 185 |
+
|
| 186 |
---
|
| 187 |
+
|
| 188 |
+
## 🏗️ Architecture
|
| 189 |
+
|
| 190 |
+
```
|
| 191 |
+
┌─────────────────────────────────────────────────────────────┐
|
| 192 |
+
│ Gradio Web Interface │
|
| 193 |
+
│ ┌──────────────┬───────────────┬─────────────────────────┐ │
|
| 194 |
+
│ │ Text Input │ Voice/Style │ Advanced Controls │ │
|
| 195 |
+
│ │ │ Selection │ (Speed/Pitch/Pause) │ │
|
| 196 |
+
│ └──────────────┴───────────────┴─────────────────────────┘ │
|
| 197 |
+
└─────────────────────────────┬───────────────────────────────┘
|
| 198 |
+
│
|
| 199 |
+
▼
|
| 200 |
+
┌─────────────────────────────────────────────────────────────┐
|
| 201 |
+
│ Text Preprocessing │
|
| 202 |
+
│ • Normalize abbreviations (Dr. → Doctor) │
|
| 203 |
+
│ • Clean whitespace │
|
| 204 |
+
│ • Character limit enforcement │
|
| 205 |
+
└─────────────────────────────┬───────────────────────────────┘
|
| 206 |
+
│
|
| 207 |
+
▼
|
| 208 |
+
┌─────────────────────────────────────────────────────────────┐
|
| 209 |
+
│ KokoroTTSEngine │
|
| 210 |
+
│ • KPipeline (American 'a' / British 'b') │
|
| 211 |
+
│ • Voice pack loading │
|
| 212 |
+
│ • Phoneme conversion via Misaki G2P │
|
| 213 |
+
│ • Neural audio synthesis │
|
| 214 |
+
└─────────────────────────────┬───────────────────────────────┘
|
| 215 |
+
│
|
| 216 |
+
▼
|
| 217 |
+
┌─────────────────────────────────────────────────────────────┐
|
| 218 |
+
│ Audio Post-Processing │
|
| 219 |
+
│ • Pitch shifting (semitone adjustment) │
|
| 220 |
+
│ • Pause insertion between segments │
|
| 221 |
+
│ • Audio normalization (-3dB peak) │
|
| 222 |
+
└─────────────────────────────┬───────────────────────────────┘
|
| 223 |
+
│
|
| 224 |
+
▼
|
| 225 |
+
┌─────────────────────────────────────────────────────────────┐
|
| 226 |
+
│ Audio Output (24kHz WAV) │
|
| 227 |
+
│ • Playback in browser │
|
| 228 |
+
│ • Download capability │
|
| 229 |
+
└─────────────────────────────────────────────────────────────┘
|
| 230 |
+
```
|
| 231 |
+
|
| 232 |
+
---
|
| 233 |
+
|
| 234 |
+
## 📁 Project Structure
|
| 235 |
+
|
| 236 |
+
```
|
| 237 |
+
kokoro-tts-app/
|
| 238 |
+
├── app.py # Main Gradio application
|
| 239 |
+
├── requirements.txt # Python dependencies
|
| 240 |
+
├── packages.txt # System dependencies (for HF Spaces)
|
| 241 |
+
└── README.md # This documentation
|
| 242 |
+
```
|
| 243 |
+
|
| 244 |
+
### Code Organization (app.py)
|
| 245 |
+
|
| 246 |
+
```python
|
| 247 |
+
# Section 1: Configuration & Constants
|
| 248 |
+
VOICE_CATALOG = {...} # Voice definitions
|
| 249 |
+
STYLE_PRESETS = {...} # Style preset configurations
|
| 250 |
+
|
| 251 |
+
# Section 2: Audio Processing Utilities
|
| 252 |
+
pitch_shift_audio() # Pitch manipulation
|
| 253 |
+
insert_pauses() # Silence injection
|
| 254 |
+
normalize_audio() # Volume normalization
|
| 255 |
+
preprocess_text() # Text cleaning
|
| 256 |
+
|
| 257 |
+
# Section 3: TTS Engine
|
| 258 |
+
class KokoroTTSEngine: # Main TTS wrapper
|
| 259 |
+
generate() # Basic generation
|
| 260 |
+
generate_with_style() # Style-based generation
|
| 261 |
+
|
| 262 |
+
# Section 4: Gradio Interface
|
| 263 |
+
create_voice_choices() # UI helper functions
|
| 264 |
+
generate_speech() # Main generation callback
|
| 265 |
+
demo = gr.Blocks(...) # Interface definition
|
| 266 |
+
```
|
| 267 |
+
|
| 268 |
+
---
|
| 269 |
+
|
| 270 |
+
## 🔧 Technical Details
|
| 271 |
+
|
| 272 |
+
### Model Specifications
|
| 273 |
+
|
| 274 |
+
| Attribute | Value |
|
| 275 |
+
|-----------|-------|
|
| 276 |
+
| Model | Kokoro-82M |
|
| 277 |
+
| Parameters | 82 million |
|
| 278 |
+
| Model Size | ~330 MB |
|
| 279 |
+
| Sample Rate | 24,000 Hz |
|
| 280 |
+
| Audio Format | 32-bit float WAV |
|
| 281 |
+
| Languages | English (US & UK), Japanese, Chinese, Spanish, French, Hindi, Italian, Portuguese |
|
| 282 |
+
|
| 283 |
+
### Resource Requirements
|
| 284 |
+
|
| 285 |
+
| Environment | CPU | RAM | Notes |
|
| 286 |
+
|-------------|-----|-----|-------|
|
| 287 |
+
| HF Spaces (Free) | 2 vCPU | 16 GB | Recommended |
|
| 288 |
+
| Local (Minimum) | 2 cores | 4 GB | Functional |
|
| 289 |
+
| Local (Recommended) | 4 cores | 8 GB | Faster inference |
|
| 290 |
+
|
| 291 |
+
### Performance Benchmarks (CPU)
|
| 292 |
+
|
| 293 |
+
| Text Length | Approx. Generation Time |
|
| 294 |
+
|-------------|------------------------|
|
| 295 |
+
| 100 chars | 2-4 seconds |
|
| 296 |
+
| 500 chars | 8-15 seconds |
|
| 297 |
+
| 1000 chars | 15-30 seconds |
|
| 298 |
+
| 5000 chars | 60-120 seconds |
|
| 299 |
+
|
| 300 |
+
---
|
| 301 |
+
|
| 302 |
+
## 🎓 Academic Use Cases
|
| 303 |
+
|
| 304 |
+
This project is designed for learning and demonstration:
|
| 305 |
+
|
| 306 |
+
1. **Understanding TTS Pipelines**: Explore how text is converted to phonemes and then to audio
|
| 307 |
+
2. **Audio Signal Processing**: Learn about pitch shifting, normalization, and pause insertion
|
| 308 |
+
3. **ML Model Deployment**: Practice deploying models on Hugging Face Spaces
|
| 309 |
+
4. **UI/UX Design**: Build intuitive interfaces with Gradio
|
| 310 |
+
5. **Code Organization**: Study modular, well-documented Python code
|
| 311 |
+
|
| 312 |
+
---
|
| 313 |
+
|
| 314 |
+
## 📚 Learning Resources
|
| 315 |
+
|
| 316 |
+
- [Kokoro Model Card](https://huggingface.co/hexgrad/Kokoro-82M) - Official model documentation
|
| 317 |
+
- [Misaki G2P](https://github.com/hexgrad/misaki) - Grapheme-to-phoneme library
|
| 318 |
+
- [Gradio Documentation](https://gradio.app/docs/) - UI framework
|
| 319 |
+
- [Hugging Face Spaces](https://huggingface.co/docs/hub/spaces) - Deployment platform
|
| 320 |
+
|
| 321 |
+
---
|
| 322 |
+
|
| 323 |
+
## 🤝 Contributing
|
| 324 |
+
|
| 325 |
+
This is an academic project, but contributions are welcome:
|
| 326 |
+
|
| 327 |
+
1. Fork the repository
|
| 328 |
+
2. Create a feature branch
|
| 329 |
+
3. Make your changes
|
| 330 |
+
4. Submit a pull request
|
| 331 |
+
|
| 332 |
+
---
|
| 333 |
+
|
| 334 |
+
## 📄 License
|
| 335 |
+
|
| 336 |
+
This project is licensed under the **Apache License 2.0**, the same license as the Kokoro-82M model.
|
| 337 |
+
|
| 338 |
+
```
|
| 339 |
+
Copyright 2024 Academic Project
|
| 340 |
+
|
| 341 |
+
Licensed under the Apache License, Version 2.0 (the "License");
|
| 342 |
+
you may not use this file except in compliance with the License.
|
| 343 |
+
You may obtain a copy of the License at
|
| 344 |
+
|
| 345 |
+
http://www.apache.org/licenses/LICENSE-2.0
|
| 346 |
+
|
| 347 |
+
Unless required by applicable law or agreed to in writing, software
|
| 348 |
+
distributed under the License is distributed on an "AS IS" BASIS,
|
| 349 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 350 |
+
See the License for the specific language governing permissions and
|
| 351 |
+
limitations under the License.
|
| 352 |
+
```
|
| 353 |
+
|
| 354 |
+
---
|
| 355 |
+
|
| 356 |
+
## 🙏 Acknowledgments
|
| 357 |
+
|
| 358 |
+
- [hexgrad](https://huggingface.co/hexgrad) - Creator of Kokoro-82M
|
| 359 |
+
- [Hugging Face](https://huggingface.co) - Model hosting and Spaces platform
|
| 360 |
+
- [Gradio](https://gradio.app) - Web interface framework
|
| 361 |
+
|
| 362 |
+
---
|
| 363 |
+
|
| 364 |
+
## 📞 Support
|
| 365 |
+
|
| 366 |
+
For questions or issues:
|
| 367 |
+
1. Check the [Kokoro Discussions](https://huggingface.co/hexgrad/Kokoro-82M/discussions)
|
| 368 |
+
2. Review the [Gradio Docs](https://gradio.app/docs/)
|
| 369 |
+
3. Open an issue in this repository
|
| 370 |
+
|
| 371 |
---
|
| 372 |
|
| 373 |
+
*Built with ❤️ for academic learning and open-source AI*
|
README_HF.md
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
This file contains the Hugging Face Spaces configuration.
|
| 2 |
+
When deploying to HF Spaces, copy this YAML frontmatter block to the TOP of your main README.md file.
|
| 3 |
+
|
| 4 |
+
YAML Frontmatter to add to README.md:
|
| 5 |
+
|
| 6 |
+
```yaml
|
| 7 |
+
---
|
| 8 |
+
title: Kokoro TTS - Academic Text-to-Speech
|
| 9 |
+
emoji: 🎙️
|
| 10 |
+
colorFrom: blue
|
| 11 |
+
colorTo: purple
|
| 12 |
+
sdk: gradio
|
| 13 |
+
sdk_version: 4.44.0
|
| 14 |
+
app_file: app.py
|
| 15 |
+
pinned: false
|
| 16 |
+
license: apache-2.0
|
| 17 |
+
suggested_hardware: cpu-basic
|
| 18 |
+
short_description: High-quality open-source TTS with 28 voices and style controls
|
| 19 |
+
tags:
|
| 20 |
+
- text-to-speech
|
| 21 |
+
- tts
|
| 22 |
+
- kokoro
|
| 23 |
+
- audio
|
| 24 |
+
- speech-synthesis
|
| 25 |
+
---
|
| 26 |
+
```
|
| 27 |
+
|
| 28 |
+
Reference: https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
|
@@ -0,0 +1,753 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Kokoro TTS - Academic Text-to-Speech Application
|
| 3 |
+
================================================
|
| 4 |
+
A comprehensive TTS application using Kokoro-82M model with full voice control.
|
| 5 |
+
|
| 6 |
+
Features:
|
| 7 |
+
- 28 built-in voices (American & British English, Male & Female)
|
| 8 |
+
- Speed control (0.5x - 2.0x)
|
| 9 |
+
- Pitch adjustment via audio post-processing
|
| 10 |
+
- Configurable pause insertion
|
| 11 |
+
- Style presets for different tones (Neutral, Dramatic, Calm, Storyteller, etc.)
|
| 12 |
+
|
| 13 |
+
Author: Academic Project
|
| 14 |
+
License: Apache 2.0 (same as Kokoro model)
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
import gradio as gr
|
| 18 |
+
import numpy as np
|
| 19 |
+
import soundfile as sf
|
| 20 |
+
import io
|
| 21 |
+
import re
|
| 22 |
+
from typing import Optional, Tuple, Generator
|
| 23 |
+
from dataclasses import dataclass
|
| 24 |
+
from enum import Enum
|
| 25 |
+
|
| 26 |
+
# Kokoro TTS imports
|
| 27 |
+
from kokoro import KPipeline
|
| 28 |
+
|
| 29 |
+
# ============================================================================
|
| 30 |
+
# CONFIGURATION & CONSTANTS
|
| 31 |
+
# ============================================================================
|
| 32 |
+
|
| 33 |
+
SAMPLE_RATE = 24000 # Kokoro outputs 24kHz audio
|
| 34 |
+
MAX_CHAR_LIMIT = 5000 # Maximum characters per generation
|
| 35 |
+
|
| 36 |
+
# Voice definitions with metadata.
# Format: voice_id -> (display_name, gender, accent, quality_grade, description)
# voice_id prefixes: af/am = American female/male, bf/bm = British female/male.
# quality_grade is a letter grade from A (premium) down to F (basic), as
# described in the project README's voice reference tables.
VOICE_CATALOG = {
    # American English - Female
    "af_heart": ("Heart ❤️", "Female", "American", "A", "Premium quality, warm and natural"),
    "af_bella": ("Bella 🔥", "Female", "American", "A-", "Clear and expressive"),
    "af_nicole": ("Nicole 🎧", "Female", "American", "B-", "Professional narrator style"),
    "af_aoede": ("Aoede", "Female", "American", "C+", "Melodic and pleasant"),
    "af_kore": ("Kore", "Female", "American", "C+", "Youthful and energetic"),
    "af_sarah": ("Sarah", "Female", "American", "C+", "Friendly and approachable"),
    "af_nova": ("Nova", "Female", "American", "C", "Modern and crisp"),
    "af_sky": ("Sky", "Female", "American", "C-", "Light and airy"),
    "af_alloy": ("Alloy", "Female", "American", "C", "Balanced and versatile"),
    "af_jessica": ("Jessica", "Female", "American", "D", "Casual conversational"),
    "af_river": ("River", "Female", "American", "D", "Gentle and flowing"),

    # American English - Male
    "am_michael": ("Michael", "Male", "American", "C+", "Authoritative and clear"),
    "am_fenrir": ("Fenrir", "Male", "American", "C+", "Deep and resonant"),
    "am_puck": ("Puck", "Male", "American", "C+", "Playful and dynamic"),
    "am_echo": ("Echo", "Male", "American", "D", "Warm and reflective"),
    "am_eric": ("Eric", "Male", "American", "D", "Professional and steady"),
    "am_liam": ("Liam", "Male", "American", "D", "Young and natural"),
    "am_onyx": ("Onyx", "Male", "American", "D", "Rich and smooth"),
    "am_santa": ("Santa 🎅", "Male", "American", "D-", "Jolly and festive"),
    "am_adam": ("Adam", "Male", "American", "F+", "Basic male voice"),

    # British English - Female
    "bf_emma": ("Emma", "Female", "British", "B-", "Elegant British accent"),
    "bf_isabella": ("Isabella", "Female", "British", "C", "Sophisticated and refined"),
    "bf_alice": ("Alice", "Female", "British", "D", "Classic British tone"),
    "bf_lily": ("Lily", "Female", "British", "D", "Soft and gentle"),

    # British English - Male
    "bm_george": ("George", "Male", "British", "C", "Distinguished gentleman"),
    "bm_fable": ("Fable", "Male", "British", "C", "Storyteller quality"),
    "bm_lewis": ("Lewis", "Male", "British", "D+", "Conversational British"),
    "bm_daniel": ("Daniel", "Male", "British", "D", "Standard British male"),
}
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
@dataclass
class StylePreset:
    """Defines a style preset with associated audio parameters."""
    name: str                   # Human-readable preset name shown in the UI
    description: str            # Short explanation of when to use this preset
    speed: float                # Playback speed multiplier (1.0 = normal pace)
    pitch_shift: float          # Pitch offset in semitones (+ = higher, - = lower)
    pause_multiplier: float     # Scales the base inter-sentence pause duration
    recommended_voices: list    # VOICE_CATALOG voice IDs that suit this style
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
# Style presets for different tones.
# Each entry maps a short preset key (used internally) to a StylePreset whose
# speed / pitch_shift / pause_multiplier values are applied when the user
# enables "Use Style Preset Defaults" in the UI.
STYLE_PRESETS = {
    "neutral": StylePreset(
        name="Neutral Narrator",
        description="Clear, balanced narration suitable for most content",
        speed=1.0,
        pitch_shift=0,
        pause_multiplier=1.0,
        recommended_voices=["af_heart", "af_bella", "am_michael", "bf_emma"]
    ),
    "dramatic": StylePreset(
        name="Dramatic / Horror",
        description="Slower, deeper voice for suspenseful or dramatic content",
        speed=0.85,
        pitch_shift=-2,
        pause_multiplier=1.5,
        recommended_voices=["am_fenrir", "am_onyx", "bm_george", "af_nicole"]
    ),
    "excited": StylePreset(
        name="Excited / Surprised",
        description="Faster, higher energy delivery",
        speed=1.2,
        pitch_shift=1,
        pause_multiplier=0.7,
        recommended_voices=["af_kore", "am_puck", "af_nova", "af_sky"]
    ),
    "calm": StylePreset(
        name="Calm / Meditative",
        description="Slow, soothing voice for relaxation content",
        speed=0.8,
        pitch_shift=-1,
        pause_multiplier=1.8,
        recommended_voices=["af_heart", "bf_lily", "am_echo", "bf_emma"]
    ),
    "storyteller": StylePreset(
        name="Storyteller",
        description="Engaging pace for audiobooks and stories",
        speed=0.95,
        pitch_shift=0,
        pause_multiplier=1.2,
        recommended_voices=["bm_fable", "af_bella", "am_michael", "bf_isabella"]
    ),
    "professional": StylePreset(
        name="Professional / Corporate",
        description="Clear, authoritative delivery for business content",
        speed=1.05,
        pitch_shift=0,
        pause_multiplier=1.0,
        recommended_voices=["af_nicole", "am_eric", "bf_emma", "bm_george"]
    ),
    "cheerful": StylePreset(
        name="Cheerful / Friendly",
        description="Warm, upbeat tone for friendly content",
        speed=1.1,
        pitch_shift=0.5,
        pause_multiplier=0.9,
        recommended_voices=["af_sarah", "am_puck", "af_kore", "am_liam"]
    ),
}
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
# ============================================================================
|
| 150 |
+
# AUDIO PROCESSING UTILITIES
|
| 151 |
+
# ============================================================================
|
| 152 |
+
|
| 153 |
+
def _ola_time_stretch(samples: np.ndarray, target_len: int) -> np.ndarray:
    """Stretch/compress `samples` to `target_len` samples WITHOUT changing
    pitch, using windowed overlap-add (OLA).

    Helper for pitch_shift_audio; falls back to plain interpolation when
    the clip is too short for windowing.
    """
    src_len = len(samples)
    if src_len == target_len:
        return samples.astype(np.float32, copy=False)

    frame = 1024
    if src_len < 2 * frame or target_len < 2 * frame:
        # Too short to window cleanly; plain resampling is audible but safe.
        positions = np.linspace(0, src_len - 1, target_len)
        return np.interp(positions, np.arange(src_len), samples).astype(np.float32)

    hop = frame // 2
    window = np.hanning(frame).astype(np.float32)

    out = np.zeros(target_len, dtype=np.float32)
    weight = np.zeros(target_len, dtype=np.float32)

    # Map each output frame position to a (rounded) input read position.
    scale = (src_len - frame) / max(1, target_len - frame)
    for out_pos in range(0, target_len - frame + 1, hop):
        in_pos = int(round(out_pos * scale))
        out[out_pos:out_pos + frame] += samples[in_pos:in_pos + frame] * window
        weight[out_pos:out_pos + frame] += window

    covered = weight > 1e-6
    out[covered] /= weight[covered]

    # Samples at the very edges may not be covered by any window; fill them
    # from a plain resample so the output has no silent gaps.
    if not covered.all():
        positions = np.linspace(0, src_len - 1, target_len)
        fallback = np.interp(positions, np.arange(src_len), samples)
        out[~covered] = fallback[~covered].astype(np.float32)

    return out


def pitch_shift_audio(audio: np.ndarray, sample_rate: int, semitones: float) -> np.ndarray:
    """
    Shift the pitch of audio by a given number of semitones while keeping
    its duration, using resampling plus overlap-add time stretching
    (no external dependencies).

    Bug fix: the previous implementation resampled to a new length and then
    immediately resampled back to the original length. Those two linear
    interpolations compose to (approximately) the identity, so the pitch was
    never actually shifted. The duration change introduced by the resample
    must be undone by a time-stretch, not by another resample.

    Args:
        audio: Input mono audio array.
        sample_rate: Sample rate of the audio (kept for interface stability;
            the algorithm itself is rate-independent).
        semitones: Number of semitones to shift (positive = higher,
            negative = lower). 0 returns the input unchanged.

    Returns:
        Pitch-shifted float32 audio array with the same length as `audio`.
    """
    if semitones == 0:
        return audio

    audio = np.asarray(audio, dtype=np.float32)
    n = len(audio)
    if n < 2:
        return audio

    # Each semitone is a factor of 2^(1/12).
    factor = 2.0 ** (semitones / 12.0)

    # Step 1: resample by 1/factor. Played back at the original rate this
    # raises (or lowers) the pitch by `factor`, but also shortens (or
    # lengthens) the clip by the same factor.
    resampled_len = max(2, int(round(n / factor)))
    read_positions = np.linspace(0, n - 1, resampled_len)
    resampled = np.interp(read_positions, np.arange(n), audio).astype(np.float32)

    # Step 2: time-stretch back to the original duration without touching
    # pitch, so only the pitch change from step 1 remains.
    return _ola_time_stretch(resampled, n)
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
def insert_pauses(audio_segments: list, pause_duration_ms: int, sample_rate: int) -> np.ndarray:
    """
    Concatenate audio segments, placing a block of silence between each
    consecutive pair (no silence before the first or after the last).

    Args:
        audio_segments: List of audio arrays.
        pause_duration_ms: Pause duration in milliseconds.
        sample_rate: Sample rate of the audio.

    Returns:
        A single float32-compatible array of all segments joined with gaps;
        empty array when no segments are given.
    """
    if not audio_segments:
        return np.array([], dtype=np.float32)

    gap = np.zeros(int(sample_rate * pause_duration_ms / 1000), dtype=np.float32)

    # Interleave: seg0, gap, seg1, gap, ..., segN (gap only *between* segments).
    pieces = [audio_segments[0]]
    for segment in audio_segments[1:]:
        pieces.append(gap)
        pieces.append(segment)

    return np.concatenate(pieces)
|
| 220 |
+
|
| 221 |
+
|
| 222 |
+
def normalize_audio(audio: np.ndarray, target_db: float = -3.0) -> np.ndarray:
    """
    Scale audio so its peak amplitude sits at `target_db` dBFS.

    Args:
        audio: Input audio array.
        target_db: Target peak level in dB (default -3 dB, leaving headroom).

    Returns:
        Normalized float32 audio. Empty or all-zero input is returned
        unchanged (nothing to scale).
    """
    if len(audio) == 0:
        return audio

    peak = float(np.abs(audio).max())
    if peak == 0:
        # Pure silence: scaling would divide by zero and is pointless anyway.
        return audio

    # dB -> linear amplitude, then scale the whole signal in one pass.
    gain = (10.0 ** (target_db / 20.0)) / peak
    return (audio * gain).astype(np.float32)
|
| 246 |
+
|
| 247 |
+
|
| 248 |
+
def preprocess_text(text: str, add_pauses: bool = True) -> str:
    """
    Normalize raw input text before synthesis.

    Collapses whitespace runs and expands common abbreviations so the
    grapheme-to-phoneme stage pronounces them naturally.

    Args:
        text: Raw input text.
        add_pauses: Reserved flag for pause-hint insertion; currently unused.

    Returns:
        The cleaned-up text.
    """
    # Collapse all whitespace runs (newlines, tabs, doubles) to single spaces.
    text = re.sub(r'\s+', ' ', text.strip())

    # Expand abbreviations the TTS front-end tends to mispronounce.
    expansions = (
        (r'\bDr\.', 'Doctor'),
        (r'\bMr\.', 'Mister'),
        (r'\bMrs\.', 'Missus'),
        (r'\bMs\.', 'Miss'),
        (r'\bProf\.', 'Professor'),
        (r'\betc\.', 'etcetera'),
        (r'\be\.g\.', 'for example'),
        (r'\bi\.e\.', 'that is'),
    )
    for pattern, spoken in expansions:
        text = re.sub(pattern, spoken, text, flags=re.IGNORECASE)

    return text
|
| 278 |
+
|
| 279 |
+
|
| 280 |
+
# ============================================================================
|
| 281 |
+
# TTS ENGINE
|
| 282 |
+
# ============================================================================
|
| 283 |
+
|
| 284 |
+
class KokoroTTSEngine:
    """
    Wrapper class for Kokoro TTS with additional processing capabilities.

    Owns one KPipeline per accent ('a' = American English, 'b' = British
    English) and layers text preprocessing, inter-sentence pauses, pitch
    shifting, normalization, and style presets on top of raw synthesis.
    """

    def __init__(self):
        """Initialize the TTS engine with both American and British English pipelines."""
        print("Initializing Kokoro TTS Engine...")

        # One pipeline per accent; the first letter of a voice id selects it.
        self.pipelines = {
            'a': KPipeline(lang_code='a'),  # American English
            'b': KPipeline(lang_code='b'),  # British English
        }

        # Add custom pronunciation for "Kokoro" to each accent's lexicon.
        self.pipelines['a'].g2p.lexicon.golds['kokoro'] = 'kˈOkəɹO'
        self.pipelines['b'].g2p.lexicon.golds['kokoro'] = 'kˈQkəɹQ'

        # Pre-load every catalog voice so first-request latency stays low.
        print("Pre-loading voice packs...")
        for voice_id in VOICE_CATALOG:
            lang_code = voice_id[0]  # 'a' or 'b'
            try:
                self.pipelines[lang_code].load_voice(voice_id)
            except Exception as e:
                # Non-fatal: the voice can still be loaded lazily on first use.
                print(f"Warning: Could not pre-load voice {voice_id}: {e}")

        print("TTS Engine initialized successfully!")

    def generate(
        self,
        text: str,
        voice: str = "af_heart",
        speed: float = 1.0,
        pitch_shift: float = 0.0,
        pause_between_sentences_ms: int = 300,
    ) -> Tuple[int, np.ndarray]:
        """
        Generate speech from text with full parameter control.

        Args:
            text: Input text to synthesize (truncated to MAX_CHAR_LIMIT).
            voice: Voice ID from VOICE_CATALOG.
            speed: Speed multiplier, clamped to [0.5, 2.0].
            pitch_shift: Pitch adjustment in semitones, clamped to [-5, 5].
            pause_between_sentences_ms: Pause duration between sentences.

        Returns:
            Tuple of (sample_rate, audio_array). On empty input or a
            synthesis failure, returns one sample of silence rather than
            raising, so UI callers always receive playable audio.
        """
        # Validate and clean the input.
        text = preprocess_text(text.strip()[:MAX_CHAR_LIMIT])
        if not text:
            return SAMPLE_RATE, np.zeros(1, dtype=np.float32)

        speed = max(0.5, min(2.0, speed))
        pitch_shift = max(-5, min(5, pitch_shift))

        # Route to the pipeline matching the voice's accent prefix.
        # Robustness fix: guard against an empty voice id (previously
        # voice[0] would raise IndexError); fall back to American English.
        lang_code = voice[0] if voice and voice[0] in self.pipelines else 'a'
        pipeline = self.pipelines[lang_code]

        # Collect one audio array per sentence/segment from the pipeline.
        audio_segments = []
        try:
            for _, _phonemes, audio in pipeline(text, voice=voice, speed=speed):
                if audio is not None:
                    audio_segments.append(audio.numpy() if hasattr(audio, 'numpy') else audio)
        except Exception as e:
            print(f"Generation error: {e}")
            return SAMPLE_RATE, np.zeros(1, dtype=np.float32)

        if not audio_segments:
            return SAMPLE_RATE, np.zeros(1, dtype=np.float32)

        # Stitch the segments together with inter-sentence silence.
        combined_audio = insert_pauses(audio_segments, pause_between_sentences_ms, SAMPLE_RATE)

        # Apply pitch shift if requested.
        if pitch_shift != 0:
            combined_audio = pitch_shift_audio(combined_audio, SAMPLE_RATE, pitch_shift)

        # Normalize the final audio to a consistent peak level.
        combined_audio = normalize_audio(combined_audio)

        return SAMPLE_RATE, combined_audio

    def generate_with_style(
        self,
        text: str,
        voice: str,
        style_preset: str,
        custom_speed: Optional[float] = None,
        custom_pitch: Optional[float] = None,
        custom_pause: Optional[int] = None,
    ) -> Tuple[int, np.ndarray]:
        """
        Generate speech using a style preset with optional custom overrides.

        Args:
            text: Input text to synthesize.
            voice: Voice ID.
            style_preset: Key into STYLE_PRESETS; unknown keys fall back to
                the "neutral" preset.
            custom_speed: Override the preset speed (optional).
            custom_pitch: Override the preset pitch (optional).
            custom_pause: Override the preset pause in ms (optional).

        Returns:
            Tuple of (sample_rate, audio_array).
        """
        preset = STYLE_PRESETS.get(style_preset, STYLE_PRESETS["neutral"])

        speed = custom_speed if custom_speed is not None else preset.speed
        pitch = custom_pitch if custom_pitch is not None else preset.pitch_shift
        # Presets express pauses as a multiplier of the 300 ms default.
        pause = custom_pause if custom_pause is not None else int(300 * preset.pause_multiplier)

        return self.generate(
            text=text,
            voice=voice,
            speed=speed,
            pitch_shift=pitch,
            pause_between_sentences_ms=pause,
        )
|
| 409 |
+
|
| 410 |
+
|
| 411 |
+
# ============================================================================
|
| 412 |
+
# GRADIO INTERFACE
|
| 413 |
+
# ============================================================================
|
| 414 |
+
|
| 415 |
+
def create_voice_choices():
    """Build (label, voice_id) dropdown choices grouped by accent and gender,
    with each group sorted by quality grade."""
    buckets = {
        ("American", "Female"): [],
        ("American", "Male"): [],
        ("British", "Female"): [],
        ("British", "Male"): [],
    }

    # Distribute catalog entries into their accent/gender bucket.
    for voice_id, (name, gender, accent, grade, _desc) in VOICE_CATALOG.items():
        buckets[(accent, gender)].append((voice_id, name, grade))

    choices = []
    for (accent, gender), entries in buckets.items():
        flag = "🇺🇸" if accent == "American" else "🇬🇧"
        gender_icon = "🚺" if gender == "Female" else "🚹"

        # Sort within the group by grade so the best voices list first.
        for voice_id, name, grade in sorted(entries, key=lambda entry: entry[2]):
            choices.append((f"{flag} {gender_icon} {name} [{grade}]", voice_id))

    return choices
|
| 440 |
+
|
| 441 |
+
|
| 442 |
+
def create_style_choices():
    """Return (display name, preset key) pairs for the style dropdown."""
    pairs = []
    for key, preset in STYLE_PRESETS.items():
        pairs.append((preset.name, key))
    return pairs
|
| 445 |
+
|
| 446 |
+
|
| 447 |
+
# Initialize the TTS engine once at import time so all Gradio handlers
# below share a single set of loaded pipelines and voice packs.
print("Loading Kokoro TTS Engine...")
tts_engine = KokoroTTSEngine()
|
| 450 |
+
|
| 451 |
+
|
| 452 |
+
def generate_speech(
    text: str,
    voice: str,
    style: str,
    speed: float,
    pitch: float,
    pause: int,
    use_style_defaults: bool,
) -> Optional[Tuple[int, np.ndarray]]:
    """
    Main generation handler for the Gradio interface.

    When `use_style_defaults` is set, the selected style preset supplies
    speed, pitch, and pause; otherwise the manual slider values are used.

    Returns:
        (sample_rate, audio) for the gr.Audio component, or None when there
        is nothing to synthesize (empty input).

    Raises:
        gr.Error: surfaced as a modal in the UI when synthesis fails.
    """
    if not text.strip():
        gr.Warning("Please enter some text to synthesize.")
        return None

    try:
        if use_style_defaults:
            return tts_engine.generate_with_style(
                text=text,
                voice=voice,
                style_preset=style,
            )
        return tts_engine.generate(
            text=text,
            voice=voice,
            speed=speed,
            pitch_shift=pitch,
            pause_between_sentences_ms=pause,
        )
    except Exception as e:
        # Bug fix: gr.Error must be *raised* to reach the UI — previously it
        # was constructed and discarded, so failures were silent.
        raise gr.Error(f"Generation failed: {str(e)}") from e
|
| 489 |
+
|
| 490 |
+
|
| 491 |
+
def update_style_info(style: str) -> str:
    """Render a markdown summary for the selected style preset.

    Unknown preset keys fall back to the "neutral" preset, matching the
    engine's own fallback behavior.
    """
    preset = STYLE_PRESETS.get(style, STYLE_PRESETS["neutral"])

    # Display names for the preset's recommended voices (skip unknown ids).
    names = [
        VOICE_CATALOG[v][0]
        for v in preset.recommended_voices
        if v in VOICE_CATALOG
    ]
    recommended = ", ".join(names)

    return f"""**{preset.name}**

{preset.description}

- **Speed:** {preset.speed}x
- **Pitch Shift:** {preset.pitch_shift:+.1f} semitones
- **Pause Multiplier:** {preset.pause_multiplier}x

**Recommended Voices:** {recommended}
"""
|
| 509 |
+
|
| 510 |
+
|
| 511 |
+
def update_controls_from_style(style: str, use_defaults: bool):
    """Sync the speed/pitch/pause sliders with the chosen style preset.

    When the user is not following preset defaults, leave every slider
    untouched by returning no-op updates.
    """
    if not use_defaults:
        return gr.update(), gr.update(), gr.update()

    preset = STYLE_PRESETS.get(style, STYLE_PRESETS["neutral"])

    speed_update = gr.update(value=preset.speed)
    pitch_update = gr.update(value=preset.pitch_shift)
    # Presets store pauses as a multiplier of the 300 ms default.
    pause_update = gr.update(value=int(300 * preset.pause_multiplier))
    return speed_update, pitch_update, pause_update
|
| 522 |
+
|
| 523 |
+
|
| 524 |
+
# Sample texts for demonstration — the keys are wired to the sample
# buttons in the Gradio UI below; each button overwrites the textbox
# with one of these entries.
SAMPLE_TEXTS = {
    "welcome": """Welcome to Kokoro Text-to-Speech! This is an open-source model with 82 million parameters,
capable of producing natural-sounding speech. Try different voices and styles to find your perfect combination.""",

    "horror": """The old house creaked as I pushed open the door. Something moved in the shadows.
A whisper echoed through the empty halls... "You shouldn't have come here."
I turned to run, but the door had vanished.""",

    "news": """Breaking news tonight: Scientists have made a groundbreaking discovery that could change
our understanding of the universe. The research team announced their findings at a press conference
held earlier today at the National Science Foundation.""",

    "story": """Once upon a time, in a kingdom far away, there lived a young princess who dreamed of adventure.
One day, she discovered a magical map hidden in the castle library.
Little did she know, this map would lead her to the greatest journey of her life.""",

    "technical": """The system architecture consists of three main components: the frontend user interface,
the backend API server, and the database layer. Each component is designed for scalability and
can be deployed independently using container orchestration.""",
}
|
| 545 |
+
|
| 546 |
+
|
| 547 |
+
def load_sample_text(sample_key: str) -> str:
    """Return the demo text registered under `sample_key`, or '' if unknown."""
    try:
        return SAMPLE_TEXTS[sample_key]
    except KeyError:
        return ""
|
| 550 |
+
|
| 551 |
+
|
| 552 |
+
# Build the Gradio interface. All components are declared inside a single
# gr.Blocks context; event wiring happens at the bottom of the block.
with gr.Blocks(
    title="Kokoro TTS - Academic Text-to-Speech",
    theme=gr.themes.Soft(),
    css="""
    .main-title {
        text-align: center;
        margin-bottom: 1rem;
    }
    .info-box {
        background-color: #f0f7ff;
        border-radius: 8px;
        padding: 1rem;
        margin: 0.5rem 0;
    }
    """
) as demo:

    # Header
    gr.Markdown(
        """
        # 🎙️ Kokoro TTS - Academic Text-to-Speech

        **An open-source, high-quality TTS system powered by [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M)**

        Features: 28 voices • Style presets • Speed/Pitch/Pause control • CPU-friendly
        """,
        elem_classes=["main-title"]
    )

    with gr.Row():
        # Left column - input text, voice, and style selection
        with gr.Column(scale=1):
            # Text input (the engine also truncates to MAX_CHAR_LIMIT)
            text_input = gr.Textbox(
                label="📝 Text to Synthesize",
                placeholder="Enter your text here...",
                lines=6,
                max_lines=15,
                info=f"Maximum {MAX_CHAR_LIMIT} characters"
            )

            # Sample text buttons: each click replaces the textbox contents
            # with the corresponding SAMPLE_TEXTS entry.
            with gr.Accordion("📚 Sample Texts", open=False):
                with gr.Row():
                    gr.Button("Welcome", size="sm").click(
                        lambda: SAMPLE_TEXTS["welcome"], outputs=text_input
                    )
                    gr.Button("Horror 👻", size="sm").click(
                        lambda: SAMPLE_TEXTS["horror"], outputs=text_input
                    )
                    gr.Button("News 📰", size="sm").click(
                        lambda: SAMPLE_TEXTS["news"], outputs=text_input
                    )
                with gr.Row():
                    gr.Button("Story 📖", size="sm").click(
                        lambda: SAMPLE_TEXTS["story"], outputs=text_input
                    )
                    gr.Button("Technical 💻", size="sm").click(
                        lambda: SAMPLE_TEXTS["technical"], outputs=text_input
                    )

            # Voice selection (labels carry flag/gender icons + quality grade)
            voice_dropdown = gr.Dropdown(
                choices=create_voice_choices(),
                value="af_heart",
                label="🎭 Voice",
                info="Select a voice (sorted by quality grade)"
            )

            # Style preset selection
            style_dropdown = gr.Dropdown(
                choices=create_style_choices(),
                value="neutral",
                label="🎨 Style Preset",
                info="Choose a style for different content types"
            )

            # Live markdown summary of the selected preset
            style_info = gr.Markdown(
                value=update_style_info("neutral"),
                elem_classes=["info-box"]
            )

            # When checked, the preset's values override the manual sliders
            use_style_defaults = gr.Checkbox(
                label="Use Style Preset Defaults",
                value=True,
                info="When checked, style preset values override manual controls"
            )

        # Right column - advanced controls and output
        with gr.Column(scale=1):
            # Manual controls (only honored when the defaults checkbox is off)
            with gr.Accordion("⚙️ Advanced Controls", open=True):
                speed_slider = gr.Slider(
                    minimum=0.5,
                    maximum=2.0,
                    value=1.0,
                    step=0.05,
                    label="🏃 Speed",
                    info="Speaking rate (0.5x = slow, 2.0x = fast)"
                )

                pitch_slider = gr.Slider(
                    minimum=-5.0,
                    maximum=5.0,
                    value=0.0,
                    step=0.5,
                    label="🎵 Pitch Shift (semitones)",
                    info="Adjust voice pitch (-5 = deeper, +5 = higher)"
                )

                pause_slider = gr.Slider(
                    minimum=0,
                    maximum=1000,
                    value=300,
                    step=50,
                    label="⏸️ Pause Between Sentences (ms)",
                    info="Silence duration between sentences"
                )

            # Generate button
            generate_btn = gr.Button(
                "🎙️ Generate Speech",
                variant="primary",
                size="lg"
            )

            # Audio output (expects the (sample_rate, ndarray) numpy tuple
            # returned by generate_speech)
            audio_output = gr.Audio(
                label="🔊 Generated Audio",
                type="numpy",
                interactive=False,
                autoplay=True
            )

            # Download info
            gr.Markdown(
                """
                💡 **Tips:**
                - Click the download button (⬇️) on the audio player to save
                - Try different voices with the same text to compare
                - Use style presets as starting points, then customize
                """
            )

    # Footer
    gr.Markdown(
        """
        ---
        **About:** This is an academic demonstration of the Kokoro-82M TTS model.
        The model is licensed under Apache 2.0 and can be used for both personal and commercial projects.

        **Resources:** [Model Card](https://huggingface.co/hexgrad/Kokoro-82M) |
        [Voice List](https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md) |
        [GitHub](https://github.com/hexgrad/kokoro)
        """
    )

    # Event handlers.
    # NOTE: style_dropdown deliberately has two independent change handlers —
    # one refreshes the info panel, the other syncs the sliders.
    style_dropdown.change(
        fn=update_style_info,
        inputs=[style_dropdown],
        outputs=[style_info]
    )

    style_dropdown.change(
        fn=update_controls_from_style,
        inputs=[style_dropdown, use_style_defaults],
        outputs=[speed_slider, pitch_slider, pause_slider]
    )

    # Toggling the checkbox re-syncs (or stops syncing) the sliders.
    use_style_defaults.change(
        fn=update_controls_from_style,
        inputs=[style_dropdown, use_style_defaults],
        outputs=[speed_slider, pitch_slider, pause_slider]
    )

    # Main synthesis action: all inputs are forwarded to generate_speech.
    generate_btn.click(
        fn=generate_speech,
        inputs=[
            text_input,
            voice_dropdown,
            style_dropdown,
            speed_slider,
            pitch_slider,
            pause_slider,
            use_style_defaults,
        ],
        outputs=[audio_output]
    )
|
| 744 |
+
|
| 745 |
+
|
| 746 |
+
# Launch configuration
if __name__ == "__main__":
    # queue() serializes concurrent requests on the shared CPU;
    # 0.0.0.0:7860 is the standard Hugging Face Spaces binding.
    demo.queue().launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_api=True
    )
|
examples.py
ADDED
|
@@ -0,0 +1,215 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Kokoro TTS - Example Usage Script
|
| 3 |
+
=================================
|
| 4 |
+
This script demonstrates how to use the Kokoro TTS engine programmatically.
|
| 5 |
+
Useful for understanding the code flow and for batch processing.
|
| 6 |
+
|
| 7 |
+
Run this script after installing dependencies:
|
| 8 |
+
pip install kokoro soundfile numpy
|
| 9 |
+
apt-get install espeak-ng # Linux
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import numpy as np
|
| 13 |
+
import soundfile as sf
|
| 14 |
+
from kokoro import KPipeline
|
| 15 |
+
|
| 16 |
+
# ============================================================================
|
| 17 |
+
# EXAMPLE 1: Basic Text-to-Speech
|
| 18 |
+
# ============================================================================
|
| 19 |
+
|
| 20 |
+
def example_basic_tts():
    """Generate speech with default settings."""
    print("\n" + "="*50)
    print("Example 1: Basic TTS")
    print("="*50)

    # American-English pipeline
    pipeline = KPipeline(lang_code='a')

    sentence = "Hello! This is a demonstration of the Kokoro text to speech model."

    # The pipeline yields one (graphemes, phonemes, audio) triple per segment.
    for i, (graphemes, phonemes, audio) in enumerate(pipeline(sentence, voice='af_heart')):
        print(f"Segment {i}:")
        print(f"  Text: {graphemes}")
        print(f"  Phonemes: {phonemes}")

        # Torch tensors expose .numpy(); plain arrays pass through unchanged.
        waveform = audio.numpy() if hasattr(audio, 'numpy') else audio
        sf.write(f'example1_segment{i}.wav', waveform, 24000)
        print(f"  Saved: example1_segment{i}.wav")
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
# ============================================================================
|
| 45 |
+
# EXAMPLE 2: Speed Control
|
| 46 |
+
# ============================================================================
|
| 47 |
+
|
| 48 |
+
def example_speed_control():
    """Generate the same sentence at several speaking rates."""
    print("\n" + "="*50)
    print("Example 2: Speed Control")
    print("="*50)

    pipeline = KPipeline(lang_code='a')
    text = "The quick brown fox jumps over the lazy dog."

    speeds = [0.7, 1.0, 1.3]

    for speed in speeds:
        print(f"\nGenerating at speed {speed}x...")

        for _, _, audio in pipeline(text, voice='af_bella', speed=speed):
            audio_np = audio.numpy() if hasattr(audio, 'numpy') else audio
            filename = f'example2_speed_{speed}.wav'
            sf.write(filename, audio_np, 24000)
            # Bug fix: report the actual output file (the old message printed
            # a literal "(unknown)" placeholder instead of `filename`).
            print(f"  Saved: {filename}")
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
# ============================================================================
|
| 70 |
+
# EXAMPLE 3: Different Voices
|
| 71 |
+
# ============================================================================
|
| 72 |
+
|
| 73 |
+
def example_different_voices():
    """Compare different voices with the same text."""
    print("\n" + "="*50)
    print("Example 3: Different Voices")
    print("="*50)

    # American ('a') and British ('b') pipelines, keyed by voice-id prefix.
    pipelines = {
        'a': KPipeline(lang_code='a'),
        'b': KPipeline(lang_code='b'),
    }

    text = "Good morning! How are you doing today?"

    voices = [
        ('af_heart', 'American Female - Heart'),
        ('am_michael', 'American Male - Michael'),
        ('bf_emma', 'British Female - Emma'),
        ('bm_george', 'British Male - George'),
    ]

    for voice_id, voice_name in voices:
        print(f"\nGenerating with {voice_name}...")

        lang_code = voice_id[0]  # 'a' or 'b'
        pipeline = pipelines[lang_code]

        for _, _, audio in pipeline(text, voice=voice_id):
            audio_np = audio.numpy() if hasattr(audio, 'numpy') else audio
            filename = f'example3_{voice_id}.wav'
            sf.write(filename, audio_np, 24000)
            # Bug fix: report the actual output file (the old message printed
            # a literal "(unknown)" placeholder instead of `filename`).
            print(f"  Saved: {filename}")
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
# ============================================================================
|
| 108 |
+
# EXAMPLE 4: Combining Audio Segments with Pauses
|
| 109 |
+
# ============================================================================
|
| 110 |
+
|
| 111 |
+
def example_pause_insertion():
    """Demonstrate inserting pauses between sentences."""
    print("\n" + "="*50)
    print("Example 4: Pause Insertion")
    print("="*50)

    pipeline = KPipeline(lang_code='a')

    # Multiple sentences
    text = """First sentence of the story.
Second sentence with more details.
And finally, the conclusion."""

    # Collect all audio segments
    audio_segments = []
    for _, _, audio in pipeline(text, voice='af_heart'):
        audio_np = audio.numpy() if hasattr(audio, 'numpy') else audio
        audio_segments.append(audio_np)

    # Robustness fix: np.concatenate raises on an empty list.
    if not audio_segments:
        print("  No audio was generated; skipping.")
        return

    # Create pause (500 ms of silence)
    sample_rate = 24000
    pause_duration = 0.5  # seconds
    silence = np.zeros(int(sample_rate * pause_duration), dtype=np.float32)

    # Interleave silence between segments (no pause after the last one)
    combined = []
    for i, segment in enumerate(audio_segments):
        combined.append(segment)
        if i < len(audio_segments) - 1:
            combined.append(silence)

    final_audio = np.concatenate(combined)

    # Peak-normalize to 90% of full scale
    max_val = np.max(np.abs(final_audio))
    if max_val > 0:
        final_audio = final_audio / max_val * 0.9

    filename = 'example4_with_pauses.wav'
    sf.write(filename, final_audio, 24000)
    # Bug fix: report the actual output file (the old message printed a
    # literal "(unknown)" placeholder instead of `filename`).
    print(f"  Saved: {filename}")
    print(f"  Duration: {len(final_audio)/24000:.2f} seconds")
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
# ============================================================================
|
| 157 |
+
# EXAMPLE 5: Custom Pronunciation
|
| 158 |
+
# ============================================================================
|
| 159 |
+
|
| 160 |
+
def example_custom_pronunciation():
    """Use phoneme markup for custom pronunciations."""
    print("\n" + "="*50)
    print("Example 5: Custom Pronunciation")
    print("="*50)

    pipeline = KPipeline(lang_code='a')

    # [word](/phonemes/) syntax overrides the default grapheme-to-phoneme
    # conversion for that word only.
    variants = [
        ("\nNormal pronunciation:", "I love Kokoro text to speech.", 'example5_normal.wav'),
        ("\nCustom pronunciation:", "I love [Kokoro](/kˈOkəɹO/) text to speech.", 'example5_custom.wav'),
    ]

    for header, sentence, outfile in variants:
        print(header)
        for _, phonemes, audio in pipeline(sentence, voice='af_heart'):
            print(f"  Phonemes: {phonemes}")
            waveform = audio.numpy() if hasattr(audio, 'numpy') else audio
            sf.write(outfile, waveform, 24000)
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
# ============================================================================
|
| 187 |
+
# MAIN
|
| 188 |
+
# ============================================================================
|
| 189 |
+
|
| 190 |
+
if __name__ == "__main__":
    print("Kokoro TTS - Example Usage")
    print("==========================")
    print("This script generates several example audio files.")
    print("Make sure you have installed: pip install kokoro soundfile")
    print("And system dependency: apt-get install espeak-ng")

    try:
        # Run every example in sequence; each writes .wav files to the CWD.
        example_basic_tts()
        example_speed_control()
        example_different_voices()
        example_pause_insertion()
        example_custom_pronunciation()

        print("\n" + "="*50)
        print("All examples completed successfully!")
        print("Check the current directory for generated .wav files")
        print("="*50)

    # NOTE(review): module-level imports (kokoro, soundfile) already ran
    # before this block, so a missing package fails there and this handler
    # never fires — it only catches ImportErrors raised lazily at call time.
    except ImportError as e:
        print(f"\nError: {e}")
        print("Please install required packages:")
        print("  pip install kokoro soundfile numpy")
        print("  apt-get install espeak-ng")
    except Exception as e:
        print(f"\nError during generation: {e}")
|
packages.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# System dependencies for Kokoro TTS
|
| 2 |
+
# Required for phoneme processing and audio encoding
|
| 3 |
+
|
| 4 |
+
espeak-ng
|
| 5 |
+
ffmpeg
|
| 6 |
+
libsndfile1
|
requirements.txt
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Kokoro TTS - Academic Project Requirements
|
| 2 |
+
# ==========================================
|
| 3 |
+
# For Hugging Face Spaces free tier (CPU-based)
|
| 4 |
+
|
| 5 |
+
# Core TTS Model
|
| 6 |
+
kokoro>=0.9.4
|
| 7 |
+
|
| 8 |
+
# Audio Processing
|
| 9 |
+
soundfile>=0.12.1
|
| 10 |
+
numpy>=1.24.0
|
| 11 |
+
|
| 12 |
+
# Web Interface
|
| 13 |
+
gradio>=4.0.0
|
| 14 |
+
|
| 15 |
+
# PyTorch (CPU version for free tier compatibility)
|
| 16 |
+
torch>=2.0.0
|
| 17 |
+
torchaudio>=2.0.0
|
| 18 |
+
|
| 19 |
+
# G2P (Grapheme-to-Phoneme) for text processing
|
| 20 |
+
misaki[en]>=0.9.0
|