Spaces:
Build error
Build error
Upload 7 files
Browse files- .gitattributes +1 -0
- .gitignore +48 -0
- COLAB_SETUP.md +254 -0
- Chatterbox_TTS_Colab.ipynb +614 -0
- Run Chatterbox TTS.bat +10 -0
- VC_redist.x64.exe +3 -0
- app.py +505 -0
- requirements.txt +14 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
VC_redist.x64.exe filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.vscode
|
| 2 |
+
|
| 3 |
+
# Pylance
|
| 4 |
+
pyrightconfig.json
|
| 5 |
+
|
| 6 |
+
# Byte-compiled / optimized / DLL files
|
| 7 |
+
__pycache__/
|
| 8 |
+
*.py[cod]
|
| 9 |
+
*$py.class
|
| 10 |
+
|
| 11 |
+
# C extensions
|
| 12 |
+
*.so
|
| 13 |
+
|
| 14 |
+
# Distribution / packaging
|
| 15 |
+
.Python
|
| 16 |
+
build/
|
| 17 |
+
develop-eggs/
|
| 18 |
+
dist/
|
| 19 |
+
downloads/
|
| 20 |
+
eggs/
|
| 21 |
+
.eggs/
|
| 22 |
+
lib/
|
| 23 |
+
lib64/
|
| 24 |
+
parts/
|
| 25 |
+
sdist/
|
| 26 |
+
var/
|
| 27 |
+
wheels/
|
| 28 |
+
*.egg-info/
|
| 29 |
+
.installed.cfg
|
| 30 |
+
*.egg
|
| 31 |
+
MANIFEST
|
| 32 |
+
|
| 33 |
+
# PyInstaller
|
| 34 |
+
# Usually these files are written by a python script from a template
|
| 35 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
| 36 |
+
*.manifest
|
| 37 |
+
*.spec
|
| 38 |
+
|
| 39 |
+
# Installer logs
|
| 40 |
+
pip-log.txt
|
| 41 |
+
pip-delete-this-directory.txt
|
| 42 |
+
|
| 43 |
+
syn_out/
|
| 44 |
+
checkpoints/
|
| 45 |
+
.gradio
|
| 46 |
+
|
| 47 |
+
# Ignore generated sample .wav files
|
| 48 |
+
**/*.wav
|
COLAB_SETUP.md
ADDED
|
@@ -0,0 +1,254 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 🚀 Google Colab Setup Guide for Chatterbox TTS
|
| 2 |
+
|
| 3 |
+
This guide will help you run Chatterbox TTS on Google Colab for free!
|
| 4 |
+
|
| 5 |
+
## 📋 Prerequisites
|
| 6 |
+
|
| 7 |
+
- Google account (for Google Colab access)
|
| 8 |
+
- Your Chatterbox TTS project files
|
| 9 |
+
|
| 10 |
+
## 🎯 Quick Start
|
| 11 |
+
|
| 12 |
+
### Option 1: Using GitHub Repository (Recommended)
|
| 13 |
+
|
| 14 |
+
1. **Push your code to GitHub:**
|
| 15 |
+
```bash
|
| 16 |
+
git init
|
| 17 |
+
git add .
|
| 18 |
+
git commit -m "Initial commit"
git branch -M main
|
| 19 |
+
git remote add origin https://github.com/YOUR_USERNAME/chatterbox-tts.git
|
| 20 |
+
git push -u origin main
|
| 21 |
+
```
|
| 22 |
+
|
| 23 |
+
2. **Upload the notebook to Google Colab:**
|
| 24 |
+
- Go to [Google Colab](https://colab.research.google.com/)
|
| 25 |
+
- Click `File` → `Upload notebook`
|
| 26 |
+
- Upload `Chatterbox_TTS_Colab.ipynb`
|
| 27 |
+
|
| 28 |
+
3. **Update the repository URL:**
|
| 29 |
+
- In the notebook, find Step 2 (Clone Repository)
|
| 30 |
+
- Replace `YOUR_USERNAME` with your actual GitHub username
|
| 31 |
+
|
| 32 |
+
4. **Run all cells:**
|
| 33 |
+
- Click `Runtime` → `Run all`
|
| 34 |
+
- Wait for setup to complete (~5-10 minutes)
|
| 35 |
+
- Click the public URL when it appears
|
| 36 |
+
|
| 37 |
+
### Option 2: Manual Upload (No GitHub Required)
|
| 38 |
+
|
| 39 |
+
1. **Prepare your project:**
|
| 40 |
+
- Create a ZIP file of your entire project folder
|
| 41 |
+
- Include: `app.py`, `modules/`, `src/`, `requirements.txt`
|
| 42 |
+
|
| 43 |
+
2. **Upload to Colab:**
|
| 44 |
+
- Go to [Google Colab](https://colab.research.google.com/)
|
| 45 |
+
- Upload `Chatterbox_TTS_Colab.ipynb`
|
| 46 |
+
- In Step 2, uncomment the "Alternative: Upload Files Manually" cell
|
| 47 |
+
- Run that cell and upload your ZIP file
|
| 48 |
+
|
| 49 |
+
3. **Run the notebook:**
|
| 50 |
+
- Run all remaining cells
|
| 51 |
+
- Wait for the Gradio interface to launch
|
| 52 |
+
|
| 53 |
+
## ⚙️ Configuration
|
| 54 |
+
|
| 55 |
+
### Enable GPU (Highly Recommended)
|
| 56 |
+
|
| 57 |
+
1. Click `Runtime` → `Change runtime type`
|
| 58 |
+
2. Select `T4 GPU` under Hardware accelerator
|
| 59 |
+
3. Click `Save`
|
| 60 |
+
|
| 61 |
+
This will significantly speed up model loading and inference!
|
| 62 |
+
|
| 63 |
+
### Adjust Settings
|
| 64 |
+
|
| 65 |
+
You can modify these in the notebook:
|
| 66 |
+
|
| 67 |
+
- **Model download location**: Models are cached in `/root/.cache/huggingface/`
|
| 68 |
+
- **Gradio share link**: Set `share=True` for public URL (default)
|
| 69 |
+
- **Queue settings**: Adjust `max_size` and `concurrency_limit` as needed
|
| 70 |
+
|
| 71 |
+
## 📦 What Gets Installed
|
| 72 |
+
|
| 73 |
+
The notebook automatically installs:
|
| 74 |
+
|
| 75 |
+
- PyTorch 2.7.1 with CUDA support
|
| 76 |
+
- Gradio 5.44.1
|
| 77 |
+
- All Chatterbox TTS dependencies
|
| 78 |
+
- System packages (ffmpeg)
|
| 79 |
+
|
| 80 |
+
Total installation time: ~3-5 minutes
|
| 81 |
+
|
| 82 |
+
## 🎮 Using the Application
|
| 83 |
+
|
| 84 |
+
Once launched, you'll see a public URL like:
|
| 85 |
+
```
|
| 86 |
+
Running on public URL: https://xxxxx.gradio.live
|
| 87 |
+
```
|
| 88 |
+
|
| 89 |
+
Click this URL to access your TTS application from anywhere!
|
| 90 |
+
|
| 91 |
+
### Features Available:
|
| 92 |
+
|
| 93 |
+
1. **⚡ Turbo TTS** - Fast synthesis with paralinguistic tags
|
| 94 |
+
2. **🎤 TTS Main** - Advanced English TTS
|
| 95 |
+
3. **🌍 Multilingual TTS** - Multiple language support
|
| 96 |
+
4. **🔄 Voice Conversion** - Convert voices
|
| 97 |
+
5. **🧬 Clone Voice** - Clone custom voices
|
| 98 |
+
|
| 99 |
+
## ⚠️ Important Notes
|
| 100 |
+
|
| 101 |
+
### Session Limits
|
| 102 |
+
|
| 103 |
+
- **Free Colab**: ~12 hours max session time
|
| 104 |
+
- **GPU usage**: Limited hours per week
|
| 105 |
+
- **Files are temporary**: Download outputs before session ends!
|
| 106 |
+
|
| 107 |
+
### Saving Your Work
|
| 108 |
+
|
| 109 |
+
To save generated audio:
|
| 110 |
+
1. Right-click on the audio player
|
| 111 |
+
2. Select "Download"
|
| 112 |
+
3. Or use the download button in Gradio
|
| 113 |
+
|
| 114 |
+
### Keeping Session Alive
|
| 115 |
+
|
| 116 |
+
Colab disconnects after inactivity. To prevent this:
|
| 117 |
+
- Install [Colab Keep Alive](https://chrome.google.com/webstore/detail/colab-alive/eookkckfbbgnhdgcbfbicoahejkdoele) extension
|
| 118 |
+
- Or periodically interact with the notebook
|
| 119 |
+
|
| 120 |
+
## 🐛 Troubleshooting
|
| 121 |
+
|
| 122 |
+
### "Out of Memory" Error
|
| 123 |
+
|
| 124 |
+
**Solution:**
|
| 125 |
+
```text
|
| 126 |
+
# Restart runtime
|
| 127 |
+
Runtime → Restart runtime
|
| 128 |
+
|
| 129 |
+
# Or upgrade to Colab Pro for more RAM
|
| 130 |
+
```
|
| 131 |
+
|
| 132 |
+
### "Module Not Found" Error
|
| 133 |
+
|
| 134 |
+
**Solution:**
|
| 135 |
+
```python
|
| 136 |
+
# Re-run Step 1 (Install Dependencies)
|
| 137 |
+
# Make sure all cells complete without errors
|
| 138 |
+
```
|
| 139 |
+
|
| 140 |
+
### Models Won't Download
|
| 141 |
+
|
| 142 |
+
**Solution:**
|
| 143 |
+
1. Check internet connection
|
| 144 |
+
2. Verify HuggingFace Hub is accessible
|
| 145 |
+
3. Try running Step 4 again
|
| 146 |
+
4. Check if `token=False` is set (no authentication required)
|
| 147 |
+
|
| 148 |
+
### Import Errors
|
| 149 |
+
|
| 150 |
+
**Solution:**
|
| 151 |
+
```python
|
| 152 |
+
# Make sure project structure is correct:
|
| 153 |
+
# ├── app.py
|
| 154 |
+
# ├── modules/
|
| 155 |
+
# │ ├── config.py
|
| 156 |
+
# │ ├── generation_functions.py
|
| 157 |
+
# │ ├── model_manager.py
|
| 158 |
+
# │ ├── ui_components.py
|
| 159 |
+
# │ └── voice_manager.py
|
| 160 |
+
# └── src/
|
| 161 |
+
# └── chatterbox/
|
| 162 |
+
# ├── tts.py
|
| 163 |
+
# └── tts_turbo.py
|
| 164 |
+
```
|
| 165 |
+
|
| 166 |
+
### Gradio Won't Launch
|
| 167 |
+
|
| 168 |
+
**Solution:**
|
| 169 |
+
```python
|
| 170 |
+
# Check if port is already in use
|
| 171 |
+
# Restart runtime and try again
|
| 172 |
+
# Make sure all previous cells ran successfully
|
| 173 |
+
```
|
| 174 |
+
|
| 175 |
+
## 🚀 Performance Optimization
|
| 176 |
+
|
| 177 |
+
### For Faster Loading:
|
| 178 |
+
|
| 179 |
+
1. **Pre-download models** (Step 4)
|
| 180 |
+
- Run this cell first
|
| 181 |
+
- Models will be cached for future use
|
| 182 |
+
|
| 183 |
+
2. **Use GPU runtime**
|
| 184 |
+
- T4 GPU is free and fast
|
| 185 |
+
- Significantly faster than CPU
|
| 186 |
+
|
| 187 |
+
3. **Reduce queue size**
|
| 188 |
+
```python
|
| 189 |
+
demo.queue(max_size=10) # Lower = less memory
|
| 190 |
+
```
|
| 191 |
+
|
| 192 |
+
### For Better Quality:
|
| 193 |
+
|
| 194 |
+
1. **Adjust generation parameters**
|
| 195 |
+
- Temperature: 0.7-0.9 for more natural speech
|
| 196 |
+
- CFG weight: 0.3-0.7 for better control
|
| 197 |
+
|
| 198 |
+
2. **Use high-quality voice samples**
|
| 199 |
+
- 5+ seconds of clear audio
|
| 200 |
+
- Minimal background noise
|
| 201 |
+
|
| 202 |
+
## 📊 Resource Usage
|
| 203 |
+
|
| 204 |
+
Typical resource consumption:
|
| 205 |
+
|
| 206 |
+
| Component | RAM | GPU Memory | Time |
|
| 207 |
+
|-----------|-----|------------|------|
|
| 208 |
+
| Setup | 2 GB | 0 GB | 3-5 min |
|
| 209 |
+
| Model Load | 4 GB | 3-4 GB | 1-2 min |
|
| 210 |
+
| Inference | 6 GB | 4-5 GB | 5-15 sec |
|
| 211 |
+
|
| 212 |
+
**Recommended**: Colab Pro for heavy usage
|
| 213 |
+
|
| 214 |
+
## 🔗 Useful Links
|
| 215 |
+
|
| 216 |
+
- [Google Colab](https://colab.research.google.com/)
|
| 217 |
+
- [Colab FAQ](https://research.google.com/colaboratory/faq.html)
|
| 218 |
+
- [HuggingFace Hub](https://huggingface.co/)
|
| 219 |
+
- [Gradio Documentation](https://gradio.app/docs/)
|
| 220 |
+
|
| 221 |
+
## 💡 Pro Tips
|
| 222 |
+
|
| 223 |
+
1. **Save notebook to Google Drive**
|
| 224 |
+
- File → Save a copy in Drive
|
| 225 |
+
- Your changes will persist
|
| 226 |
+
|
| 227 |
+
2. **Mount Google Drive for persistent storage**
|
| 228 |
+
```python
|
| 229 |
+
from google.colab import drive
|
| 230 |
+
drive.mount('/content/drive')
|
| 231 |
+
```
|
| 232 |
+
|
| 233 |
+
3. **Monitor GPU usage**
|
| 234 |
+
```python
|
| 235 |
+
!nvidia-smi
|
| 236 |
+
```
|
| 237 |
+
|
| 238 |
+
4. **Clear outputs to save space**
|
| 239 |
+
- Edit → Clear all outputs
|
| 240 |
+
|
| 241 |
+
5. **Use keyboard shortcuts**
|
| 242 |
+
- `Ctrl+Enter`: Run cell
|
| 243 |
+
- `Shift+Enter`: Run cell and select next
|
| 244 |
+
- `Ctrl+M B`: Insert cell below
|
| 245 |
+
|
| 246 |
+
## 🎉 You're All Set!
|
| 247 |
+
|
| 248 |
+
Enjoy using Chatterbox TTS on Google Colab!
|
| 249 |
+
|
| 250 |
+
For issues or questions, please open an issue on GitHub.
|
| 251 |
+
|
| 252 |
+
---
|
| 253 |
+
|
| 254 |
+
**Happy Synthesizing! 🎙️**
|
Chatterbox_TTS_Colab.ipynb
ADDED
|
@@ -0,0 +1,614 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"metadata": {
|
| 6 |
+
"id": "header"
|
| 7 |
+
},
|
| 8 |
+
"source": [
|
| 9 |
+
"# 🎙️ Chatterbox TTS - Google Colab Edition\n",
|
| 10 |
+
"\n",
|
| 11 |
+
"High-quality voice cloning, text-to-speech & voice conversion powered by Chatterbox.\n",
|
| 12 |
+
"\n",
|
| 13 |
+
"**Features:**\n",
|
| 14 |
+
"- ⚡ **Turbo TTS** - Fast, high-quality speech synthesis with paralinguistic tags\n",
|
| 15 |
+
"- 🎤 **TTS Main** - Advanced English TTS with fine-tuned controls\n",
|
| 16 |
+
"- 🌍 **Multilingual TTS** - Support for multiple languages\n",
|
| 17 |
+
"- 🔄 **Voice Conversion** - Convert any voice to your target voice\n",
|
| 18 |
+
"- 🧬 **Voice Cloning** - Clone voices from audio samples\n",
|
| 19 |
+
"\n",
|
| 20 |
+
"---"
|
| 21 |
+
]
|
| 22 |
+
},
|
| 23 |
+
{
|
| 24 |
+
"cell_type": "markdown",
|
| 25 |
+
"metadata": {
|
| 26 |
+
"id": "setup-header"
|
| 27 |
+
},
|
| 28 |
+
"source": [
|
| 29 |
+
"## 📦 Step 1: Setup Environment\n",
|
| 30 |
+
"\n",
|
| 31 |
+
"This will install all required dependencies. Takes ~3-5 minutes."
|
| 32 |
+
]
|
| 33 |
+
},
|
| 34 |
+
{
|
| 35 |
+
"cell_type": "code",
|
| 36 |
+
"execution_count": null,
|
| 37 |
+
"metadata": {
|
| 38 |
+
"id": "install-dependencies"
|
| 39 |
+
},
|
| 40 |
+
"outputs": [],
|
| 41 |
+
"source": [
|
| 42 |
+
"%%capture\n",
|
| 43 |
+
"# Install system dependencies\n",
|
| 44 |
+
"!apt-get update -qq\n",
|
| 45 |
+
"!apt-get install -y -qq ffmpeg\n",
|
| 46 |
+
"\n",
|
| 47 |
+
"# Install Python packages\n",
|
| 48 |
+
"!pip install -q \"numpy>=1.24.0,<1.26.0\"\n",
|
| 49 |
+
"!pip install -q librosa==0.11.0\n",
|
| 50 |
+
"!pip install -q s3tokenizer\n",
|
| 51 |
+
"!pip install -q torch==2.7.1 torchaudio==2.7.1 --index-url https://download.pytorch.org/whl/cu118\n",
|
| 52 |
+
"!pip install -q transformers==4.46.3\n",
|
| 53 |
+
"!pip install -q diffusers==0.29.0\n",
|
| 54 |
+
"!pip install -q resemble-perth==1.0.1\n",
|
| 55 |
+
"!pip install -q conformer==0.3.2\n",
|
| 56 |
+
"!pip install -q safetensors==0.5.3\n",
|
| 57 |
+
"!pip install -q pykakasi==2.3.0\n",
|
| 58 |
+
"!pip install -q gradio==5.44.1\n",
|
| 59 |
+
"!pip install -q pyloudnorm\n",
|
| 60 |
+
"!pip install -q omegaconf\n",
|
| 61 |
+
"!pip install -q huggingface_hub\n",
|
| 62 |
+
"\n",
|
| 63 |
+
"print(\"✅ All dependencies installed successfully!\")"
|
| 64 |
+
]
|
| 65 |
+
},
|
| 66 |
+
{
|
| 67 |
+
"cell_type": "markdown",
|
| 68 |
+
"metadata": {
|
| 69 |
+
"id": "clone-header"
|
| 70 |
+
},
|
| 71 |
+
"source": [
|
| 72 |
+
"## 📥 Step 2: Clone Repository\n",
|
| 73 |
+
"\n",
|
| 74 |
+
"Download the Chatterbox TTS application code."
|
| 75 |
+
]
|
| 76 |
+
},
|
| 77 |
+
{
|
| 78 |
+
"cell_type": "code",
|
| 79 |
+
"execution_count": null,
|
| 80 |
+
"metadata": {
|
| 81 |
+
"id": "clone-repo"
|
| 82 |
+
},
|
| 83 |
+
"outputs": [],
|
| 84 |
+
"source": [
|
| 85 |
+
"import os\n",
|
| 86 |
+
"\n",
|
| 87 |
+
"# Clone the repository (replace with your actual repo URL)\n",
|
| 88 |
+
"if not os.path.exists('chatterbox-tts'):\n",
|
| 89 |
+
" !git clone https://github.com/YOUR_USERNAME/chatterbox-tts.git\n",
|
| 90 |
+
" print(\"✅ Repository cloned!\")\n",
|
| 91 |
+
"else:\n",
|
| 92 |
+
" print(\"ℹ️ Repository already exists\")\n",
|
| 93 |
+
"\n",
|
| 94 |
+
"# Change to project directory\n",
|
| 95 |
+
"%cd chatterbox-tts"
|
| 96 |
+
]
|
| 97 |
+
},
|
| 98 |
+
{
|
| 99 |
+
"cell_type": "markdown",
|
| 100 |
+
"metadata": {
|
| 101 |
+
"id": "alternative-setup"
|
| 102 |
+
},
|
| 103 |
+
"source": [
|
| 104 |
+
"### Alternative: Upload Files Manually\n",
|
| 105 |
+
"\n",
|
| 106 |
+
"If you don't have a GitHub repo, uncomment the code in the next cell and run it to upload your project as a ZIP file:"
|
| 107 |
+
]
|
| 108 |
+
},
|
| 109 |
+
{
|
| 110 |
+
"cell_type": "code",
|
| 111 |
+
"execution_count": null,
|
| 112 |
+
"metadata": {
|
| 113 |
+
"id": "upload-files"
|
| 114 |
+
},
|
| 115 |
+
"outputs": [],
|
| 116 |
+
"source": [
|
| 117 |
+
"# Uncomment and run this if you want to upload files manually\n",
|
| 118 |
+
"# from google.colab import files\n",
|
| 119 |
+
"# import zipfile\n",
|
| 120 |
+
"\n",
|
| 121 |
+
"# print(\"Please upload your project as a ZIP file:\")\n",
|
| 122 |
+
"# uploaded = files.upload()\n",
|
| 123 |
+
"\n",
|
| 124 |
+
"# for filename in uploaded.keys():\n",
|
| 125 |
+
"# print(f\"Extracting {filename}...\")\n",
|
| 126 |
+
"# with zipfile.ZipFile(filename, 'r') as zip_ref:\n",
|
| 127 |
+
"# zip_ref.extractall('.')\n",
|
| 128 |
+
"# print(\"✅ Files extracted!\")"
|
| 129 |
+
]
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"cell_type": "markdown",
|
| 133 |
+
"metadata": {
|
| 134 |
+
"id": "verify-header"
|
| 135 |
+
},
|
| 136 |
+
"source": [
|
| 137 |
+
"## 🔍 Step 3: Verify Installation\n",
|
| 138 |
+
"\n",
|
| 139 |
+
"Check if everything is set up correctly."
|
| 140 |
+
]
|
| 141 |
+
},
|
| 142 |
+
{
|
| 143 |
+
"cell_type": "code",
|
| 144 |
+
"execution_count": null,
|
| 145 |
+
"metadata": {
|
| 146 |
+
"id": "verify-setup"
|
| 147 |
+
},
|
| 148 |
+
"outputs": [],
|
| 149 |
+
"source": [
|
| 150 |
+
"import sys\n",
|
| 151 |
+
"import torch\n",
|
| 152 |
+
"\n",
|
| 153 |
+
"# Check CUDA availability\n",
|
| 154 |
+
"print(f\"🔧 Python version: {sys.version}\")\n",
|
| 155 |
+
"print(f\"🔥 PyTorch version: {torch.__version__}\")\n",
|
| 156 |
+
"print(f\"🎮 CUDA available: {torch.cuda.is_available()}\")\n",
|
| 157 |
+
"if torch.cuda.is_available():\n",
|
| 158 |
+
" print(f\"🎮 CUDA device: {torch.cuda.get_device_name(0)}\")\n",
|
| 159 |
+
" print(f\"💾 CUDA memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB\")\n",
|
| 160 |
+
"\n",
|
| 161 |
+
"# Verify project structure\n",
|
| 162 |
+
"required_files = ['app.py', 'requirements.txt', 'src', 'modules']\n",
|
| 163 |
+
"for item in required_files:\n",
|
| 164 |
+
" if os.path.exists(item):\n",
|
| 165 |
+
" print(f\"✅ Found: {item}\")\n",
|
| 166 |
+
" else:\n",
|
| 167 |
+
" print(f\"❌ Missing: {item}\")\n",
|
| 168 |
+
"\n",
|
| 169 |
+
"print(\"\\n✅ Setup verification complete!\")"
|
| 170 |
+
]
|
| 171 |
+
},
|
| 172 |
+
{
|
| 173 |
+
"cell_type": "markdown",
|
| 174 |
+
"metadata": {
|
| 175 |
+
"id": "download-models-header"
|
| 176 |
+
},
|
| 177 |
+
"source": [
|
| 178 |
+
"## 🤖 Step 4: Download Models (Optional Pre-download)\n",
|
| 179 |
+
"\n",
|
| 180 |
+
"Pre-download the models to speed up first use. This is optional - models will auto-download when first used."
|
| 181 |
+
]
|
| 182 |
+
},
|
| 183 |
+
{
|
| 184 |
+
"cell_type": "code",
|
| 185 |
+
"execution_count": null,
|
| 186 |
+
"metadata": {
|
| 187 |
+
"id": "download-models"
|
| 188 |
+
},
|
| 189 |
+
"outputs": [],
|
| 190 |
+
"source": [
|
| 191 |
+
"from huggingface_hub import snapshot_download\n",
|
| 192 |
+
"\n",
|
| 193 |
+
"print(\"📥 Downloading Chatterbox Turbo model...\")\n",
|
| 194 |
+
"snapshot_download(\n",
|
| 195 |
+
" repo_id=\"ResembleAI/chatterbox-turbo\",\n",
|
| 196 |
+
" token=False, # Use public access without authentication\n",
|
| 197 |
+
" allow_patterns=[\"*.safetensors\", \"*.json\", \"*.txt\", \"*.pt\", \"*.model\"]\n",
|
| 198 |
+
")\n",
|
| 199 |
+
"print(\"✅ Turbo model downloaded!\")\n",
|
| 200 |
+
"\n",
|
| 201 |
+
"print(\"\\n📥 Downloading Chatterbox main model...\")\n",
|
| 202 |
+
"from huggingface_hub import hf_hub_download\n",
|
| 203 |
+
"for fpath in [\"ve.safetensors\", \"t3_cfg.safetensors\", \"s3gen.safetensors\", \"tokenizer.json\", \"conds.pt\"]:\n",
|
| 204 |
+
" hf_hub_download(repo_id=\"ResembleAI/chatterbox\", filename=fpath)\n",
|
| 205 |
+
"print(\"✅ Main model downloaded!\")\n",
|
| 206 |
+
"\n",
|
| 207 |
+
"print(\"\\n🎉 All models ready!\")"
|
| 208 |
+
]
|
| 209 |
+
},
|
| 210 |
+
{
|
| 211 |
+
"cell_type": "markdown",
|
| 212 |
+
"metadata": {
|
| 213 |
+
"id": "launch-header"
|
| 214 |
+
},
|
| 215 |
+
"source": [
|
| 216 |
+
"## 🚀 Step 5: Launch Application\n",
|
| 217 |
+
"\n",
|
| 218 |
+
"Start the Gradio interface. Click the public URL to access the app!"
|
| 219 |
+
]
|
| 220 |
+
},
|
| 221 |
+
{
|
| 222 |
+
"cell_type": "code",
|
| 223 |
+
"execution_count": null,
|
| 224 |
+
"metadata": {
|
| 225 |
+
"id": "launch-app"
|
| 226 |
+
},
|
| 227 |
+
"outputs": [],
|
| 228 |
+
"source": [
|
| 229 |
+
"# Add project paths\n",
|
| 230 |
+
"import sys\n",
|
| 231 |
+
"import os\n",
|
| 232 |
+
"\n",
|
| 233 |
+
"project_root = os.getcwd()\n",
|
| 234 |
+
"if project_root not in sys.path:\n",
|
| 235 |
+
" sys.path.append(project_root)\n",
|
| 236 |
+
"\n",
|
| 237 |
+
"src_path = os.path.join(project_root, \"src\")\n",
|
| 238 |
+
"if src_path not in sys.path:\n",
|
| 239 |
+
" sys.path.append(src_path)\n",
|
| 240 |
+
"\n",
|
| 241 |
+
"# Import and launch\n",
|
| 242 |
+
"import gradio as gr\n",
|
| 243 |
+
"from modules.config import LANGUAGE_CONFIG, SUPPORTED_LANGUAGES\n",
|
| 244 |
+
"from modules.voice_manager import (\n",
|
| 245 |
+
" load_voices, \n",
|
| 246 |
+
" get_voices_for_language, \n",
|
| 247 |
+
" get_all_voices_with_gender,\n",
|
| 248 |
+
" resolve_voice_path,\n",
|
| 249 |
+
" clone_voice,\n",
|
| 250 |
+
" delete_voice\n",
|
| 251 |
+
")\n",
|
| 252 |
+
"from modules.generation_functions import (\n",
|
| 253 |
+
" generate_speech,\n",
|
| 254 |
+
" generate_multilingual_speech,\n",
|
| 255 |
+
" convert_voice,\n",
|
| 256 |
+
" generate_turbo_speech\n",
|
| 257 |
+
")\n",
|
| 258 |
+
"from modules.ui_components import (\n",
|
| 259 |
+
" create_header,\n",
|
| 260 |
+
" create_tts_tab,\n",
|
| 261 |
+
" create_multilingual_tab,\n",
|
| 262 |
+
" create_voice_conversion_tab,\n",
|
| 263 |
+
" create_clone_voice_tab,\n",
|
| 264 |
+
" create_turbo_tab\n",
|
| 265 |
+
")\n",
|
| 266 |
+
"\n",
|
| 267 |
+
"# Load voices\n",
|
| 268 |
+
"available_voices = load_voices()\n",
|
| 269 |
+
"\n",
|
| 270 |
+
"# Custom CSS\n",
|
| 271 |
+
"CUSTOM_CSS = \"\"\"\n",
|
| 272 |
+
".tag-container {\n",
|
| 273 |
+
" display: flex !important;\n",
|
| 274 |
+
" flex-wrap: wrap !important;\n",
|
| 275 |
+
" gap: 8px !important;\n",
|
| 276 |
+
" margin-top: 5px !important;\n",
|
| 277 |
+
" margin-bottom: 10px !important;\n",
|
| 278 |
+
" border: none !important;\n",
|
| 279 |
+
" background: transparent !important;\n",
|
| 280 |
+
"}\n",
|
| 281 |
+
".tag-btn {\n",
|
| 282 |
+
" min-width: fit-content !important;\n",
|
| 283 |
+
" width: auto !important;\n",
|
| 284 |
+
" height: 32px !important;\n",
|
| 285 |
+
" font-size: 13px !important;\n",
|
| 286 |
+
" background: #eef2ff !important;\n",
|
| 287 |
+
" border: 1px solid #c7d2fe !important;\n",
|
| 288 |
+
" color: #3730a3 !important;\n",
|
| 289 |
+
" border-radius: 6px !important;\n",
|
| 290 |
+
" padding: 0 10px !important;\n",
|
| 291 |
+
" margin: 0 !important;\n",
|
| 292 |
+
" box-shadow: none !important;\n",
|
| 293 |
+
"}\n",
|
| 294 |
+
".tag-btn:hover {\n",
|
| 295 |
+
" background: #c7d2fe !important;\n",
|
| 296 |
+
" transform: translateY(-1px);\n",
|
| 297 |
+
"}\n",
|
| 298 |
+
"\"\"\"\n",
|
| 299 |
+
"\n",
|
| 300 |
+
"# Create Gradio app\n",
|
| 301 |
+
"with gr.Blocks(title=\"Chatterbox TTS Enhanced\", theme=gr.themes.Soft(), css=CUSTOM_CSS) as demo:\n",
|
| 302 |
+
" # State variables\n",
|
| 303 |
+
" tts_model_state = gr.State(None)\n",
|
| 304 |
+
" vc_model_state = gr.State(None)\n",
|
| 305 |
+
" mtl_model_state = gr.State(None)\n",
|
| 306 |
+
" \n",
|
| 307 |
+
" # Header\n",
|
| 308 |
+
" create_header()\n",
|
| 309 |
+
" \n",
|
| 310 |
+
" # Create tabs\n",
|
| 311 |
+
" with gr.Tab(\"⚡ Turbo TTS\"):\n",
|
| 312 |
+
" turbo_components = create_turbo_tab()\n",
|
| 313 |
+
"\n",
|
| 314 |
+
" with gr.Tab(\"🎤 TTS Main (English)\"):\n",
|
| 315 |
+
" tts_components = create_tts_tab()\n",
|
| 316 |
+
" \n",
|
| 317 |
+
" with gr.Tab(\"🌍 Multilingual TTS\"):\n",
|
| 318 |
+
" mtl_components = create_multilingual_tab()\n",
|
| 319 |
+
" \n",
|
| 320 |
+
" with gr.Tab(\"🔄 Voice Conversion\"):\n",
|
| 321 |
+
" vc_components = create_voice_conversion_tab()\n",
|
| 322 |
+
" \n",
|
| 323 |
+
" with gr.Tab(\"🧬 Clone Voice\"):\n",
|
| 324 |
+
" clone_components = create_clone_voice_tab()\n",
|
| 325 |
+
" \n",
|
| 326 |
+
" # Event Handlers - TTS Tab\n",
|
| 327 |
+
" tts_components['generate_btn'].click(\n",
|
| 328 |
+
" fn=generate_speech,\n",
|
| 329 |
+
" inputs=[\n",
|
| 330 |
+
" tts_components['text'],\n",
|
| 331 |
+
" tts_components['voice_select'],\n",
|
| 332 |
+
" tts_components['exaggeration'],\n",
|
| 333 |
+
" tts_components['temp'],\n",
|
| 334 |
+
" tts_components['seed_num'],\n",
|
| 335 |
+
" tts_components['cfg_weight'],\n",
|
| 336 |
+
" tts_components['min_p'],\n",
|
| 337 |
+
" tts_components['top_p'],\n",
|
| 338 |
+
" tts_components['repetition_penalty']\n",
|
| 339 |
+
" ],\n",
|
| 340 |
+
" outputs=[\n",
|
| 341 |
+
" tts_components['progress_bar'],\n",
|
| 342 |
+
" tts_components['audio_output'],\n",
|
| 343 |
+
" tts_components['status_box']\n",
|
| 344 |
+
" ]\n",
|
| 345 |
+
" )\n",
|
| 346 |
+
" \n",
|
| 347 |
+
" def update_tts_preview(voice_name):\n",
|
| 348 |
+
" path = resolve_voice_path(voice_name, \"en\")\n",
|
| 349 |
+
" return path\n",
|
| 350 |
+
"\n",
|
| 351 |
+
" tts_components['voice_select'].change(\n",
|
| 352 |
+
" fn=update_tts_preview,\n",
|
| 353 |
+
" inputs=[tts_components['voice_select']],\n",
|
| 354 |
+
" outputs=[tts_components['preview_audio']]\n",
|
| 355 |
+
" )\n",
|
| 356 |
+
" \n",
|
| 357 |
+
" # Event Handlers - Turbo Tab\n",
|
| 358 |
+
" turbo_components['generate_btn'].click(\n",
|
| 359 |
+
" fn=generate_turbo_speech,\n",
|
| 360 |
+
" inputs=[\n",
|
| 361 |
+
" turbo_components['text'],\n",
|
| 362 |
+
" turbo_components['voice_select']\n",
|
| 363 |
+
" ],\n",
|
| 364 |
+
" outputs=[\n",
|
| 365 |
+
" turbo_components['progress_bar'],\n",
|
| 366 |
+
" turbo_components['audio_output'],\n",
|
| 367 |
+
" turbo_components['status_box']\n",
|
| 368 |
+
" ]\n",
|
| 369 |
+
" )\n",
|
| 370 |
+
" \n",
|
| 371 |
+
" def update_turbo_preview(voice_name):\n",
|
| 372 |
+
" path = resolve_voice_path(voice_name, \"en\")\n",
|
| 373 |
+
" return path\n",
|
| 374 |
+
"\n",
|
| 375 |
+
" turbo_components['voice_select'].change(\n",
|
| 376 |
+
" fn=update_turbo_preview,\n",
|
| 377 |
+
" inputs=[turbo_components['voice_select']],\n",
|
| 378 |
+
" outputs=[turbo_components['preview_audio']]\n",
|
| 379 |
+
" )\n",
|
| 380 |
+
" \n",
|
| 381 |
+
" # Tag insertion buttons (Turbo)\n",
|
| 382 |
+
" INSERT_TAG_JS = \"\"\"\n",
|
| 383 |
+
" (tag_val, current_text) => {\n",
|
| 384 |
+
" const textarea = document.querySelector('#turbo_textbox textarea');\n",
|
| 385 |
+
" if (!textarea) return current_text + \" \" + tag_val;\n",
|
| 386 |
+
" const start = textarea.selectionStart;\n",
|
| 387 |
+
" const end = textarea.selectionEnd;\n",
|
| 388 |
+
" let prefix = \" \";\n",
|
| 389 |
+
" let suffix = \" \";\n",
|
| 390 |
+
" if (start === 0) prefix = \"\";\n",
|
| 391 |
+
" else if (current_text[start - 1] === ' ') prefix = \"\";\n",
|
| 392 |
+
" if (end < current_text.length && current_text[end] === ' ') suffix = \"\";\n",
|
| 393 |
+
" return current_text.slice(0, start) + prefix + tag_val + suffix + current_text.slice(end);\n",
|
| 394 |
+
" }\n",
|
| 395 |
+
" \"\"\"\n",
|
| 396 |
+
" \n",
|
| 397 |
+
" # Tag button handlers\n",
|
| 398 |
+
" for tag_name in ['clear_throat', 'sigh', 'shush', 'cough', 'groan', 'sniff', 'gasp', 'chuckle', 'laugh']:\n",
|
| 399 |
+
" btn_key = f'btn_{tag_name}'\n",
|
| 400 |
+
" if btn_key in turbo_components:\n",
|
| 401 |
+
" turbo_components[btn_key].click(\n",
|
| 402 |
+
" fn=None,\n",
|
| 403 |
+
" inputs=[turbo_components[btn_key], turbo_components['text']],\n",
|
| 404 |
+
" outputs=turbo_components['text'],\n",
|
| 405 |
+
" js=INSERT_TAG_JS\n",
|
| 406 |
+
" )\n",
|
| 407 |
+
" \n",
|
| 408 |
+
" # Event Handlers - Multilingual Tab\n",
|
| 409 |
+
" mtl_components['generate_btn'].click(\n",
|
| 410 |
+
" fn=generate_multilingual_speech,\n",
|
| 411 |
+
" inputs=[\n",
|
| 412 |
+
" mtl_components['text'],\n",
|
| 413 |
+
" mtl_components['voice_select'],\n",
|
| 414 |
+
" mtl_components['language_select'],\n",
|
| 415 |
+
" mtl_components['exaggeration'],\n",
|
| 416 |
+
" mtl_components['temp'],\n",
|
| 417 |
+
" mtl_components['seed_num'],\n",
|
| 418 |
+
" mtl_components['cfg_weight']\n",
|
| 419 |
+
" ],\n",
|
| 420 |
+
" outputs=[\n",
|
| 421 |
+
" mtl_components['progress_bar'],\n",
|
| 422 |
+
" mtl_components['audio_output'],\n",
|
| 423 |
+
" mtl_components['status_box']\n",
|
| 424 |
+
" ]\n",
|
| 425 |
+
" )\n",
|
| 426 |
+
" \n",
|
| 427 |
+
" mtl_components['language_select'].change(\n",
|
| 428 |
+
" fn=lambda lang: (\n",
|
| 429 |
+
" LANGUAGE_CONFIG.get(lang, {}).get(\"text\", \"\"),\n",
|
| 430 |
+
" gr.update(choices=get_voices_for_language(lang), value=f\"Default ({SUPPORTED_LANGUAGES.get(lang, lang)})\")\n",
|
| 431 |
+
" ),\n",
|
| 432 |
+
" inputs=[mtl_components['language_select']],\n",
|
| 433 |
+
" outputs=[mtl_components['text'], mtl_components['voice_select']]\n",
|
| 434 |
+
" )\n",
|
| 435 |
+
" \n",
|
| 436 |
+
" def update_mtl_preview(voice_name, language_code):\n",
|
| 437 |
+
" path = resolve_voice_path(voice_name, language_code)\n",
|
| 438 |
+
" return path\n",
|
| 439 |
+
"\n",
|
| 440 |
+
" mtl_components['voice_select'].change(\n",
|
| 441 |
+
" fn=update_mtl_preview,\n",
|
| 442 |
+
" inputs=[mtl_components['voice_select'], mtl_components['language_select']],\n",
|
| 443 |
+
" outputs=[mtl_components['sample_audio']]\n",
|
| 444 |
+
" )\n",
|
| 445 |
+
" \n",
|
| 446 |
+
" # Event Handlers - Voice Conversion Tab\n",
|
| 447 |
+
" vc_components['convert_btn'].click(\n",
|
| 448 |
+
" fn=convert_voice,\n",
|
| 449 |
+
" inputs=[vc_components['input_audio'], vc_components['target_voice_select']],\n",
|
| 450 |
+
" outputs=[vc_components['progress_bar'], vc_components['audio_output'], vc_components['status_box']]\n",
|
| 451 |
+
" )\n",
|
| 452 |
+
" \n",
|
| 453 |
+
" def update_vc_preview(voice_name):\n",
|
| 454 |
+
" if voice_name == \"None\": \n",
|
| 455 |
+
" return None\n",
|
| 456 |
+
" \n",
|
| 457 |
+
" clean_name = voice_name.replace(\" ♂️\", \"\").replace(\" ♀️\", \"\")\n",
|
| 458 |
+
" \n",
|
| 459 |
+
" if clean_name.startswith(\"Default (\"):\n",
|
| 460 |
+
" lang_name = clean_name.split(\"(\")[1].split(\")\")[0]\n",
|
| 461 |
+
" for code, name in SUPPORTED_LANGUAGES.items():\n",
|
| 462 |
+
" if name == lang_name:\n",
|
| 463 |
+
" return LANGUAGE_CONFIG.get(code, {}).get(\"audio\")\n",
|
| 464 |
+
" \n",
|
| 465 |
+
" from modules.voice_manager import VOICES\n",
|
| 466 |
+
" possible_names = [\n",
|
| 467 |
+
" clean_name,\n",
|
| 468 |
+
" f\"{clean_name}_male\",\n",
|
| 469 |
+
" f\"{clean_name}_female\"\n",
|
| 470 |
+
" ]\n",
|
| 471 |
+
" \n",
|
| 472 |
+
" for name in possible_names:\n",
|
| 473 |
+
" if name in VOICES[\"samples\"]:\n",
|
| 474 |
+
" return VOICES[\"samples\"][name]\n",
|
| 475 |
+
" \n",
|
| 476 |
+
" for code in SUPPORTED_LANGUAGES:\n",
|
| 477 |
+
" for name in possible_names:\n",
|
| 478 |
+
" full_name = f\"{name}_{code}\"\n",
|
| 479 |
+
" if full_name in VOICES[\"samples\"]:\n",
|
| 480 |
+
" return VOICES[\"samples\"][full_name]\n",
|
| 481 |
+
" \n",
|
| 482 |
+
" return None\n",
|
| 483 |
+
"\n",
|
| 484 |
+
" vc_components['target_voice_select'].change(\n",
|
| 485 |
+
" fn=update_vc_preview,\n",
|
| 486 |
+
" inputs=[vc_components['target_voice_select']],\n",
|
| 487 |
+
" outputs=[vc_components['preview_audio']]\n",
|
| 488 |
+
" )\n",
|
| 489 |
+
" \n",
|
| 490 |
+
" # Event Handlers - Clone Voice Tab\n",
|
| 491 |
+
" clone_components['clone_btn'].click(\n",
|
| 492 |
+
" fn=clone_voice,\n",
|
| 493 |
+
" inputs=[\n",
|
| 494 |
+
" clone_components['ref_audio_input'],\n",
|
| 495 |
+
" clone_components['new_voice_name'],\n",
|
| 496 |
+
" clone_components['voice_language'],\n",
|
| 497 |
+
" clone_components['voice_gender']\n",
|
| 498 |
+
" ],\n",
|
| 499 |
+
" outputs=[clone_components['clone_status'], tts_components['voice_select']]\n",
|
| 500 |
+
" ).then(\n",
|
| 501 |
+
" fn=lambda: gr.update(choices=get_voices_for_language(\"en\")),\n",
|
| 502 |
+
" outputs=[tts_components['voice_select']]\n",
|
| 503 |
+
" ).then(\n",
|
| 504 |
+
" fn=lambda: gr.update(choices=get_voices_for_language(\"en\")),\n",
|
| 505 |
+
" outputs=[turbo_components['voice_select']]\n",
|
| 506 |
+
" ).then(\n",
|
| 507 |
+
" fn=lambda lang: gr.update(choices=get_voices_for_language(lang)),\n",
|
| 508 |
+
" inputs=[mtl_components['language_select']],\n",
|
| 509 |
+
" outputs=[mtl_components['voice_select']]\n",
|
| 510 |
+
" ).then(\n",
|
| 511 |
+
" fn=lambda: gr.update(choices=[\"None\"] + get_all_voices_with_gender()),\n",
|
| 512 |
+
" outputs=[vc_components['target_voice_select']]\n",
|
| 513 |
+
" ).then(\n",
|
| 514 |
+
" fn=lambda: \"\\n\".join(load_voices()) if load_voices() else \"No voices cloned yet\",\n",
|
| 515 |
+
" outputs=[clone_components['current_voices_display']]\n",
|
| 516 |
+
" ).then(\n",
|
| 517 |
+
" fn=lambda: gr.update(choices=[\"None\"] + get_all_voices_with_gender(), value=\"None\"),\n",
|
| 518 |
+
" outputs=[clone_components['voice_to_delete']]\n",
|
| 519 |
+
" )\n",
|
| 520 |
+
" \n",
|
| 521 |
+
" clone_components['delete_btn'].click(\n",
|
| 522 |
+
" fn=delete_voice,\n",
|
| 523 |
+
" inputs=[clone_components['voice_to_delete']],\n",
|
| 524 |
+
" outputs=[clone_components['delete_status'], clone_components['voice_to_delete']]\n",
|
| 525 |
+
" ).then(\n",
|
| 526 |
+
" fn=lambda: gr.update(choices=get_voices_for_language(\"en\")),\n",
|
| 527 |
+
" outputs=[tts_components['voice_select']]\n",
|
| 528 |
+
" ).then(\n",
|
| 529 |
+
" fn=lambda: gr.update(choices=get_voices_for_language(\"en\")),\n",
|
| 530 |
+
" outputs=[turbo_components['voice_select']]\n",
|
| 531 |
+
" ).then(\n",
|
| 532 |
+
" fn=lambda lang: gr.update(choices=get_voices_for_language(lang)),\n",
|
| 533 |
+
" inputs=[mtl_components['language_select']],\n",
|
| 534 |
+
" outputs=[mtl_components['voice_select']]\n",
|
| 535 |
+
" ).then(\n",
|
| 536 |
+
" fn=lambda: gr.update(choices=[\"None\"] + get_all_voices_with_gender()),\n",
|
| 537 |
+
" outputs=[vc_components['target_voice_select']]\n",
|
| 538 |
+
" ).then(\n",
|
| 539 |
+
" fn=lambda: \"\\n\".join(load_voices()) if load_voices() else \"No voices cloned yet\",\n",
|
| 540 |
+
" outputs=[clone_components['current_voices_display']]\n",
|
| 541 |
+
" )\n",
|
| 542 |
+
"\n",
|
| 543 |
+
"# Launch with public sharing enabled\n",
|
| 544 |
+
"print(\"\\n🚀 Launching Chatterbox TTS...\\n\")\n",
|
| 545 |
+
"demo.queue(\n",
|
| 546 |
+
" max_size=50,\n",
|
| 547 |
+
" default_concurrency_limit=1,\n",
|
| 548 |
+
").launch(\n",
|
| 549 |
+
" share=True, # Create public URL\n",
|
| 550 |
+
" debug=True,\n",
|
| 551 |
+
" show_error=True\n",
|
| 552 |
+
")"
|
| 553 |
+
]
|
| 554 |
+
},
|
| 555 |
+
{
|
| 556 |
+
"cell_type": "markdown",
|
| 557 |
+
"metadata": {
|
| 558 |
+
"id": "tips-header"
|
| 559 |
+
},
|
| 560 |
+
"source": [
|
| 561 |
+
"## 💡 Tips & Troubleshooting\n",
|
| 562 |
+
"\n",
|
| 563 |
+
"### Common Issues:\n",
|
| 564 |
+
"\n",
|
| 565 |
+
"1. **Out of Memory Error**\n",
|
| 566 |
+
" - Use a smaller batch size\n",
|
| 567 |
+
" - Restart runtime and clear outputs\n",
|
| 568 |
+
" - Use Colab Pro for more RAM/GPU\n",
|
| 569 |
+
"\n",
|
| 570 |
+
"2. **Model Download Fails**\n",
|
| 571 |
+
" - Check internet connection\n",
|
| 572 |
+
" - Verify HuggingFace is accessible\n",
|
| 573 |
+
" - Try running Step 4 again\n",
|
| 574 |
+
"\n",
|
| 575 |
+
"3. **Import Errors**\n",
|
| 576 |
+
" - Restart runtime\n",
|
| 577 |
+
" - Re-run Step 1 (dependencies)\n",
|
| 578 |
+
" - Check if all files are uploaded\n",
|
| 579 |
+
"\n",
|
| 580 |
+
"### Performance Tips:\n",
|
| 581 |
+
"\n",
|
| 582 |
+
"- **Use GPU**: Runtime → Change runtime type → GPU (T4)\n",
|
| 583 |
+
"- **Keep session alive**: Install Colab Keep Alive extension\n",
|
| 584 |
+
"- **Save outputs**: Download generated audio before session expires\n",
|
| 585 |
+
"\n",
|
| 586 |
+
"### Colab Limitations:\n",
|
| 587 |
+
"\n",
|
| 588 |
+
"- Sessions have a maximum lifetime of roughly 12 hours and disconnect sooner when left idle\n",
|
| 589 |
+
"- GPU usage is limited (use wisely)\n",
|
| 590 |
+
"- Files are temporary (download important outputs)\n",
|
| 591 |
+
"\n",
|
| 592 |
+
"---\n",
|
| 593 |
+
"\n",
|
| 594 |
+
"**Enjoy using Chatterbox TTS! 🎉**"
|
| 595 |
+
]
|
| 596 |
+
}
|
| 597 |
+
],
|
| 598 |
+
"metadata": {
|
| 599 |
+
"accelerator": "GPU",
|
| 600 |
+
"colab": {
|
| 601 |
+
"gpuType": "T4",
|
| 602 |
+
"provenance": []
|
| 603 |
+
},
|
| 604 |
+
"kernelspec": {
|
| 605 |
+
"display_name": "Python 3",
|
| 606 |
+
"name": "python3"
|
| 607 |
+
},
|
| 608 |
+
"language_info": {
|
| 609 |
+
"name": "python"
|
| 610 |
+
}
|
| 611 |
+
},
|
| 612 |
+
"nbformat": 4,
|
| 613 |
+
"nbformat_minor": 0
|
| 614 |
+
}
|
Run Chatterbox TTS.bat
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
@echo off
rem Launch app.py with the Python interpreter bundled in the "python" folder
rem that sits next to this script.
rem setlocal keeps the variables below from leaking into the calling shell.
setlocal
echo Starting Chatterbox Turbo TTS Enhanced App...
echo.

rem %~dp0 expands to this script's drive and directory (with trailing backslash).
rem The quoted form of "set" protects against stray trailing spaces.
set "PYTHON_DIR=%~dp0python"
set "PYTHON_EXE=%PYTHON_DIR%\python.exe"
set "PROJECT_DIR=%~dp0"

rem Fail with a readable message when the bundled interpreter is missing.
if exist "%PYTHON_EXE%" goto run
echo ERROR: Python interpreter not found at "%PYTHON_EXE%".
pause
exit /b 1

:run
"%PYTHON_EXE%" "%PROJECT_DIR%app.py"
pause
|
VC_redist.x64.exe
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8995548dfffcde7c49987029c764355612ba6850ee09a7b6f0fddc85bdc5c280
|
| 3 |
+
size 18558944
|
app.py
ADDED
|
@@ -0,0 +1,505 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Chatterbox TTS Enhanced - Monolithic Pro Edition (UI UPDATE)
|
| 3 |
+
Fixes: Progress bar moved below audio output.
|
| 4 |
+
"""
|
| 5 |
+
import sys
|
| 6 |
+
import os
|
| 7 |
+
import glob
|
| 8 |
+
import shutil
|
| 9 |
+
import time
|
| 10 |
+
import random
|
| 11 |
+
import re
|
| 12 |
+
import numpy as np
|
| 13 |
+
import torch
|
| 14 |
+
import gradio as gr
|
| 15 |
+
from pathlib import Path
|
| 16 |
+
import gc
|
| 17 |
+
|
| 18 |
+
# ==============================================================================
|
| 19 |
+
# 0. SETUP DE RUTAS Y LIBRERÍA
|
| 20 |
+
# ==============================================================================
|
| 21 |
+
|
| 22 |
+
# Absolute directory that contains this script; the other paths derive from it.
PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__))
SRC_PATH = os.path.join(PROJECT_ROOT, "src")

# Add 'src' to the import search path when that folder exists; otherwise
# fall back to adding the project root itself.
if os.path.exists(SRC_PATH):
    if SRC_PATH not in sys.path:
        sys.path.append(SRC_PATH)
        print(f"✅ Path 'src' añadido: {SRC_PATH}")
else:
    if PROJECT_ROOT not in sys.path:
        sys.path.append(PROJECT_ROOT)
|
| 33 |
+
|
| 34 |
+
# Try to import the REAL library
try:
    from chatterbox.tts import ChatterboxTTS
    from chatterbox.vc import ChatterboxVC
    from chatterbox.mtl_tts import ChatterboxMultilingualTTS, SUPPORTED_LANGUAGES
    from chatterbox.tts_turbo import ChatterboxTurboTTS
    print("✅ Librería 'chatterbox' importada correctamente.")

except ImportError as e:
    # The import failed: report it and install stand-ins so the rest of the
    # module can still be loaded (no real audio is produced in this mode).
    print(f"❌ ERROR CRÍTICO: No se pudo importar 'chatterbox'. Detalle: {e}")
    print("⚠️ EJECUTANDO EN MODO MOCK (SIN AUDIO REAL)")

    # Minimal language table used in place of the library's own.
    SUPPORTED_LANGUAGES = {"en": "English", "es": "Spanish"}

    class MockModel:
        # Stand-in exposing the members this file uses: `sr`,
        # `from_pretrained` and `generate`.
        def __init__(self, *args, **kwargs): self.sr = 24000
        @classmethod
        def from_pretrained(cls, device): return cls()
        def generate(self, *args, **kwargs):
            # Sleeps for one second, then returns an all-zero (1, 48000) tensor.
            time.sleep(1)
            return torch.zeros(1, 48000)
    # Every model class name resolves to the same mock.
    ChatterboxTTS = ChatterboxVC = ChatterboxMultilingualTTS = ChatterboxTurboTTS = MockModel
|
| 55 |
+
|
| 56 |
+
# ==============================================================================
|
| 57 |
+
# 1. CONFIGURACIÓN Y UTILIDADES UI
|
| 58 |
+
# ==============================================================================
|
| 59 |
+
|
| 60 |
+
# Folder scanned for reference voice recordings; created at import time if missing.
VOICE_WAV_ROOT = os.path.join(PROJECT_ROOT, "modules", "voice_wav")
os.makedirs(VOICE_WAV_ROOT, exist_ok=True)
# "cuda" when PyTorch reports CUDA as available, otherwise "cpu".
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
| 63 |
+
|
| 64 |
+
# --- GENERADOR DE BARRA DE PROGRESO HTML ---
|
| 65 |
+
def make_progress_html(percentage, message="Ready"):
    """Return an HTML snippet showing a progress bar and a status line.

    Args:
        percentage: Progress value; clamped to the 0-100 range so the bar
            width is always valid CSS.
        message: Status text shown under the bar. It is HTML-escaped, so
            exception text containing ``<``, ``>`` or ``&`` is displayed
            literally instead of being parsed as markup.

    Returns:
        The HTML fragment as a string. The bar turns solid green at 100.
    """
    # Local import: only ``escape`` is needed, and the module name ``html``
    # would otherwise clash with the local variable of the same name below.
    from html import escape

    percentage = max(0, min(100, percentage))
    color = "linear-gradient(90deg, #f97316 0%, #fbbf24 100%)"
    if percentage == 100:
        color = "#22c55e"

    # quote=False: the text lands in element content, not in an attribute.
    safe_message = escape(str(message), quote=False)

    html = f"""
<div style="display: flex; flex-direction: column; gap: 5px; width: 100%; margin-top: 10px;">
<div style="width: 100%; background-color: #334155; border-radius: 99px; height: 24px; border: 1px solid #475569; overflow: hidden; position: relative; box-shadow: inset 0 2px 4px rgba(0,0,0,0.3);">
<div style="width: {percentage}%; background: {color}; height: 100%; transition: width 0.3s ease-out, background 0.3s ease;"></div>
<div style="position: absolute; top: 0; left: 0; width: 100%; height: 100%; display: flex; align-items: center; justify-content: center; color: white; font-size: 11px; font-weight: bold; text-shadow: 0 1px 2px rgba(0,0,0,0.5);">
{int(percentage)}%
</div>
</div>
<p style="color: #cbd5e1; font-size: 0.9em; margin: 0; padding: 0; white-space: pre-wrap; font-family: monospace;">{safe_message}</p>
</div>
"""
    return html
|
| 81 |
+
|
| 82 |
+
# ==============================================================================
|
| 83 |
+
# 2. MODEL MANAGER & VOICE MANAGER
|
| 84 |
+
# ==============================================================================
|
| 85 |
+
|
| 86 |
+
class ModelManager:
    """Keep at most one Chatterbox model loaded at a time.

    Requesting a different model kind drops every loaded model first, then
    loads the requested one on ``DEVICE``.
    """

    # Model kinds accepted by get_model(); each maps to a ``<kind>_model`` attribute.
    MODEL_TYPES = ("tts", "mtl", "vc", "turbo")

    def __init__(self):
        self.tts_model = None
        self.mtl_model = None
        self.vc_model = None
        self.turbo_model = None
        self.current_model_type = None

    def unload_all(self):
        """Drop all model references and release their memory."""
        self.tts_model = None
        self.mtl_model = None
        self.vc_model = None
        self.turbo_model = None
        # Collect first so the tensors of the dropped models are actually
        # freed; only then can empty_cache() return that memory.
        gc.collect()
        if DEVICE == "cuda":
            torch.cuda.empty_cache()
        self.current_model_type = None

    def get_model(self, type_key):
        """Return the model for ``type_key``, loading it if necessary.

        Args:
            type_key: One of ``MODEL_TYPES``.

        Returns:
            The loaded model, or ``None`` when the key is unknown or loading
            fails (the error is printed).
        """
        # Reject unknown keys up front: previously they unloaded the current
        # model and then failed with AttributeError on the final getattr.
        if type_key not in self.MODEL_TYPES:
            print(f"❌ Error loading {type_key}: unknown model type")
            return None

        if self.current_model_type != type_key:
            print(f"🔄 Switching to {type_key.upper()} model...")
            self.unload_all()
            try:
                if type_key == "tts":
                    self.tts_model = ChatterboxTTS.from_pretrained(DEVICE)
                elif type_key == "mtl":
                    self.mtl_model = ChatterboxMultilingualTTS.from_pretrained(DEVICE)
                elif type_key == "vc":
                    self.vc_model = ChatterboxVC.from_pretrained(DEVICE)
                elif type_key == "turbo":
                    self.turbo_model = ChatterboxTurboTTS.from_pretrained(device=DEVICE)
                self.current_model_type = type_key
                print(f"✅ {type_key.upper()} loaded.")
            except Exception as e:
                print(f"❌ Error loading {type_key}: {e}")
                return None
        return getattr(self, f"{type_key}_model")
|
| 111 |
+
|
| 112 |
+
model_manager = ModelManager()
|
| 113 |
+
|
| 114 |
+
# --- VOICE DB ---
|
| 115 |
+
VOICE_DB = {}; CATEGORY_LIST = []; FLAT_PATH_MAP = {}; ALL_VOICES_FLAT_LIST = []
|
| 116 |
+
|
| 117 |
+
def scan_voice_wav_structure():
    """Rebuild the global voice indexes from the files under VOICE_WAV_ROOT.

    Expected layout: ``<category>/<file>`` or ``<category>/<lang>/<file>``,
    where ``<lang>`` is a two-letter folder name (default language ``"en"``).
    Audio files directly in the root are ignored.

    Rebinds these module globals:
        VOICE_DB: ``{category: {lang: {display_name: path}}}``
        CATEGORY_LIST: sorted category names
        FLAT_PATH_MAP: ``{display_name: path}`` (later files win on clashes)
        ALL_VOICES_FLAT_LIST: display names in discovery order
    """
    global VOICE_DB, CATEGORY_LIST, FLAT_PATH_MAP, ALL_VOICES_FLAT_LIST
    VOICE_DB = {}
    CATEGORY_LIST = []
    FLAT_PATH_MAP = {}
    ALL_VOICES_FLAT_LIST = []

    if not os.path.exists(VOICE_WAV_ROOT):
        return

    print(f"📂 Escaneando voces en: {VOICE_WAV_ROOT}...")
    for root, dirs, files in os.walk(VOICE_WAV_ROOT):
        # Sorting in place makes os.walk descend in a stable order, so the
        # winner among duplicate display names no longer depends on the OS.
        dirs.sort()
        for f in sorted(files):
            # Case-insensitive match so "Voice.WAV" / "Voice.Mp3" are found too.
            if not f.lower().endswith((".wav", ".mp3")):
                continue

            full_path = os.path.join(root, f)
            rel_path = os.path.relpath(full_path, VOICE_WAV_ROOT)
            parts = rel_path.split(os.sep)
            if len(parts) < 2:
                continue

            raw_cat = parts[0]
            clean_cat = raw_cat.replace("ElevenLabs_", "").replace("_female", "").replace("_male", "").replace("_", " ").strip().title()

            if clean_cat not in VOICE_DB:
                VOICE_DB[clean_cat] = {}
            if clean_cat not in CATEGORY_LIST:
                CATEGORY_LIST.append(clean_cat)

            lang = "en"
            if len(parts) >= 3:
                possible_lang = parts[1].lower()
                # Any two-character sub-folder name is taken as a language code.
                if len(possible_lang) == 2:
                    lang = possible_lang

            # Check "_female" first; the gender is read from the category folder name.
            icon = "♀️" if "_female" in raw_cat.lower() else ("♂️" if "_male" in raw_cat.lower() else "🎙️")
            raw_name = os.path.splitext(f)[0]
            display = f"{icon} {raw_name}"

            if lang not in VOICE_DB[clean_cat]:
                VOICE_DB[clean_cat][lang] = {}
            VOICE_DB[clean_cat][lang][display] = full_path
            FLAT_PATH_MAP[display] = full_path
            if display not in ALL_VOICES_FLAT_LIST:
                ALL_VOICES_FLAT_LIST.append(display)

    CATEGORY_LIST.sort()
    print(f"✅ {len(ALL_VOICES_FLAT_LIST)} voces encontradas.")
|
| 156 |
+
|
| 157 |
+
def resolve_voice_path(voice_name, lang="en"):
    """Map a voice name to the path of its audio file.

    Lookup order:
      1. exact display name (a key of FLAT_PATH_MAP);
      2. exact bare name, i.e. the display name without its icon prefix;
      3. first display name that contains ``voice_name`` as a substring.

    Step 2 stops a short name such as "Joe" from resolving to an unrelated
    longer voice ("Joe Senior") merely because that one was indexed first.

    Args:
        voice_name: Display name or bare voice name; empty or "None" means no voice.
        lang: Accepted for interface compatibility; not used by the lookup.

    Returns:
        The file path, or ``None`` when nothing matches.
    """
    if not voice_name or voice_name == "None":
        return None
    if voice_name in FLAT_PATH_MAP:
        return FLAT_PATH_MAP[voice_name]

    # Display names are built as "<icon> <name>"; compare against the part
    # after the first space.
    for name, path in FLAT_PATH_MAP.items():
        if name.partition(" ")[2] == voice_name:
            return path

    # Last resort: the original substring search.
    for name, path in FLAT_PATH_MAP.items():
        if voice_name in name:
            return path
    return None
|
| 163 |
+
|
| 164 |
+
def get_available_languages(category):
    """Return ``(label, code)`` pairs for the languages indexed under ``category``.

    The label is "<language name> (<code>)"; the code itself stands in for
    the name when SUPPORTED_LANGUAGES has no entry. Pairs are ordered by
    code. An empty or unknown category yields an empty list.
    """
    if category and category in VOICE_DB:
        options = []
        for code in sorted(VOICE_DB[category].keys()):
            label = f"{SUPPORTED_LANGUAGES.get(code, code)} ({code})"
            options.append((label, code))
        return options
    return []
|
| 167 |
+
|
| 168 |
+
def get_voices_for_ui(category, lang):
    """Return the sorted display names stored for ``category`` and ``lang``.

    Yields an empty list when either the category or the language is not
    present in VOICE_DB.
    """
    voices = VOICE_DB.get(category, {}).get(lang)
    if voices is None:
        return []
    return sorted(voices)
|
| 172 |
+
|
| 173 |
+
def get_all_voices_list():
    """Return a new, sorted list of every indexed voice display name."""
    ordered = list(ALL_VOICES_FLAT_LIST)
    ordered.sort()
    return ordered
|
| 175 |
+
|
| 176 |
+
# Inicializar
|
| 177 |
+
scan_voice_wav_structure()
|
| 178 |
+
|
| 179 |
+
# ==============================================================================
|
| 180 |
+
# 3. FUNCIONES DE GENERACIÓN
|
| 181 |
+
# ==============================================================================
|
| 182 |
+
|
| 183 |
+
def set_seed(seed):
    """Seed PyTorch (CPU and, if available, CUDA), ``random`` and NumPy.

    Args:
        seed: Integer-like seed. NumPy's legacy seeding only accepts values
            in ``[0, 2**32 - 1]``, so the NumPy seed is reduced modulo
            ``2**32``; negative or very large seeds used to raise
            ``ValueError`` there.
    """
    seed = int(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    random.seed(seed)
    np.random.seed(seed % (2 ** 32))
|
| 188 |
+
|
| 189 |
+
def format_time(seconds):
    """Format a duration as ``"S.Ss"`` or ``"Mm S.Ss"``.

    The value is rounded to one decimal *before* it is split into minutes
    and seconds, so values just below a minute boundary roll over cleanly
    (119.97 -> "2m 0.0s" instead of "1m 60.0s").

    Args:
        seconds: Non-negative duration in seconds.

    Returns:
        The formatted string.
    """
    rounded = round(seconds, 1)
    if rounded < 60:
        return f"{rounded:.1f}s"
    minutes, remainder = divmod(rounded, 60)
    return f"{int(minutes)}m {remainder:.1f}s"
|
| 192 |
+
|
| 193 |
+
def estimate_generation_time(text_length):
    """Return a rough estimate: one unit of fixed overhead plus two per 50 characters."""
    blocks_of_fifty = text_length / 50
    return blocks_of_fifty * 2 + 1
|
| 195 |
+
|
| 196 |
+
def smart_chunk_text(text, max_words=40):
    """Split ``text`` into sentence-aligned chunks of roughly ``max_words``.

    Sentences are never cut: a sentence longer than ``max_words`` becomes a
    chunk of its own. For text containing CJK characters the size is counted
    in non-space characters and chunks are re-joined without spaces;
    otherwise it is counted in whitespace-separated words and chunks are
    re-joined with single spaces.

    In space-delimited text a sentence break now requires whitespace after
    the terminator. The previous zero-width split broke decimals and
    ellipses apart ("3.14" -> "3. 14", "Wait..." -> "Wait. . .").

    Args:
        text: Input text.
        max_words: Soft size limit per chunk (words, or characters for CJK).

    Returns:
        A non-empty list of chunks; ``[text]`` when no sentence was found.
    """
    is_cjk = bool(re.search(r'[\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff]', text))
    if is_cjk:
        # CJK sentences are not separated by spaces: split right after the terminator.
        sentence_break = r'(?<=[.!?。!?।؟])\s*|\n+'
        joiner = ''
    else:
        sentence_break = r'(?<=[.!?。!?।؟])\s+|\n+'
        joiner = ' '

    chunks = []
    current_chunk = []
    current_count = 0
    for sentence in re.split(sentence_break, text):
        sentence = sentence.strip()
        if not sentence:
            continue
        count = len(re.sub(r'\s+', '', sentence)) if is_cjk else len(sentence.split())
        # Flush the pending chunk when adding this sentence would exceed the limit.
        if current_count + count > max_words and current_chunk:
            chunks.append(joiner.join(current_chunk))
            current_chunk = []
            current_count = 0
        current_chunk.append(sentence)
        current_count += count
    if current_chunk:
        chunks.append(joiner.join(current_chunk))
    return chunks if chunks else [text]
|
| 212 |
+
|
| 213 |
+
def generate_speech(text, voice_name, exaggeration, temperature, seed_num, cfgw, min_p, top_p, repetition_penalty):
    """Generator that synthesises ``text`` with the standard TTS model.

    Yields ``(progress_html, audio)`` pairs. ``audio`` is ``None`` until the
    last step, which carries ``(sample_rate, samples)``. Any failure is
    reported as a final pair holding an error bar and ``None``.
    """
    try:
        started = time.time()

        # Validate the inputs before loading anything.
        if not text.strip():
            yield make_progress_html(0, "❌ Error: Texto vacío"), None
            return
        prompt_path = resolve_voice_path(voice_name, "en")
        if not prompt_path:
            yield make_progress_html(0, "❌ Error: Voz no encontrada"), None
            return

        yield make_progress_html(20, "⚙️ Cargando modelo TTS..."), None
        model = model_manager.get_model("tts")
        if model is None:
            yield make_progress_html(0, "❌ Error modelo"), None
            return

        # A seed of 0 means "do not seed".
        if seed_num != 0:
            set_seed(int(seed_num))

        pieces = smart_chunk_text(text)
        total = len(pieces)
        rendered = []
        for index, piece in enumerate(pieces):
            # Chunk progress is mapped onto the 30-90 % band of the bar.
            pct = 30 + int((index / total) * 60)
            yield make_progress_html(pct, f"🎙️ Generando parte {index+1}/{total}..."), None
            wav = model.generate(
                piece,
                audio_prompt_path=prompt_path,
                exaggeration=exaggeration,
                temperature=temperature,
                cfg_weight=cfgw,
                min_p=min_p,
                top_p=top_p,
                repetition_penalty=repetition_penalty,
            )
            rendered.append(wav)

        yield make_progress_html(95, "🎹 Uniendo audio..."), None
        if len(rendered) > 1:
            full_wav = torch.cat(rendered, dim=-1)
        else:
            full_wav = rendered[0]
        elapsed = format_time(time.time() - started)
        yield make_progress_html(100, f"✅ Listo ({elapsed})"), (model.sr, full_wav.squeeze(0).numpy())
    except Exception as e:
        yield make_progress_html(0, f"❌ Error: {str(e)}"), None
|
| 239 |
+
|
| 240 |
+
def generate_turbo_speech(text, voice_name):
    """Generator that synthesises ``text`` with the Turbo model.

    Yields ``(progress_html, audio)`` pairs. ``audio`` is ``None`` until the
    last step, which carries ``(sample_rate, samples)``. Any failure is
    reported as a final pair holding an error bar and ``None``.
    """
    try:
        started = time.time()

        if not text.strip():
            yield make_progress_html(0, "❌ Error: Texto vacío"), None
            return
        prompt_path = resolve_voice_path(voice_name, "en")
        if not prompt_path:
            yield make_progress_html(0, "❌ Error: Voz no encontrada"), None
            return

        yield make_progress_html(20, "⚡ Cargando Turbo..."), None
        model = model_manager.get_model("turbo")
        if model is None:
            yield make_progress_html(0, "❌ Error Turbo"), None
            return

        pieces = smart_chunk_text(text)
        total = len(pieces)
        rendered = []
        for index, piece in enumerate(pieces):
            # Chunk progress is mapped onto the 30-90 % band of the bar.
            pct = 30 + int((index / total) * 60)
            yield make_progress_html(pct, f"⚡ Turbo chunk {index+1}/{total}..."), None
            rendered.append(model.generate(piece, audio_prompt_path=prompt_path))

        if len(rendered) > 1:
            full_wav = torch.cat(rendered, dim=-1)
        else:
            full_wav = rendered[0]
        elapsed = format_time(time.time() - started)
        yield make_progress_html(100, f"✅ Turbo listo ({elapsed})"), (model.sr, full_wav.squeeze(0).numpy())
    except Exception as e:
        yield make_progress_html(0, f"❌ Error: {str(e)}"), None
|
| 263 |
+
|
| 264 |
+
def generate_multilingual_speech(text, voice_name, lang_code, exaggeration, temperature, seed_num, cfgw):
    """Generator that synthesises ``text`` with the multilingual model.

    Yields ``(progress_html, audio)`` pairs. ``audio`` is ``None`` until the
    last step, which carries ``(sample_rate, samples)``. Any failure is
    reported as a final pair holding an error bar and ``None``.

    Empty text is rejected up front, as in the other generators. An
    unresolved voice is not an error here: the resulting ``None`` path is
    passed straight to the model.
    """
    try:
        if not text or not text.strip():
            yield make_progress_html(0, "❌ Error: Texto vacío"), None
            return

        path = resolve_voice_path(voice_name, lang_code)
        yield make_progress_html(20, "🌍 Cargando Multi-TTS..."), None
        model = model_manager.get_model("mtl")
        if model is None:
            yield make_progress_html(0, "❌ Error modelo"), None
            return

        # A seed of 0 means "do not seed".
        if seed_num != 0:
            set_seed(int(seed_num))

        chunks = smart_chunk_text(text)
        wavs = []
        for i, chunk in enumerate(chunks):
            # Chunk progress is mapped onto the 30-90 % band of the bar.
            pct = 30 + int((i / len(chunks)) * 60)
            yield make_progress_html(pct, f"🌍 Generando ({lang_code}) {i+1}..."), None
            w = model.generate(chunk, language_id=lang_code, audio_prompt_path=path, exaggeration=exaggeration, temperature=temperature, cfg_weight=cfgw)
            wavs.append(w)

        full_wav = torch.cat(wavs, dim=-1) if len(wavs) > 1 else wavs[0]
        yield make_progress_html(100, "✅ Listo"), (model.sr, full_wav.squeeze(0).numpy())
    except Exception as e:
        yield make_progress_html(0, f"❌ Error: {str(e)}"), None
|
| 285 |
+
|
| 286 |
+
def convert_voice(audio, target_voice):
    """Generator that converts ``audio`` to the timbre of ``target_voice``.

    Yields ``(progress_html, audio)`` pairs. ``audio`` is ``None`` until the
    last step, which carries ``(sample_rate, samples)``. Any failure is
    reported as a final pair holding an error bar and ``None``.

    Args:
        audio: Source audio, passed unchanged to the VC model's ``generate``.
        target_voice: Name of the voice whose file is used as the target.
    """
    try:
        start_time = time.time()

        # Report a clear message instead of whatever the model raises on None.
        if audio is None:
            yield make_progress_html(0, "❌ Error: Audio de entrada requerido"), None
            return

        path = resolve_voice_path(target_voice, "en")
        if not path:
            yield make_progress_html(0, "❌ Error: Voz destino no válida"), None
            return

        yield make_progress_html(50, "🔄 Cargando VC..."), None
        model = model_manager.get_model("vc")
        if model is None:
            yield make_progress_html(0, "❌ Error VC"), None
            return

        yield make_progress_html(70, "🔄 Convirtiendo..."), None
        w = model.generate(audio, target_voice_path=path)
        yield make_progress_html(100, f"✅ Listo ({format_time(time.time()-start_time)})"), (model.sr, w.squeeze(0).numpy())
    except Exception as e:
        yield make_progress_html(0, f"❌ Error: {str(e)}"), None
|
| 301 |
+
|
| 302 |
+
def clone_voice_wrapper(ref_audio, name, cat, lang, gender):
    """Store a reference recording as a new voice and refresh the voice index.

    The file is copied to
    ``VOICE_WAV_ROOT/<category>_<gender>/<lang>/<name>.wav``. The name is
    reduced to alphanumerics, spaces, ``-`` and ``_``.

    Args:
        ref_audio: Path of the reference audio file to copy.
        name: Desired voice name.
        cat: Category label; lower-cased, with spaces replaced by underscores.
        lang: Language sub-folder.
        gender: Gender label; lower-cased and appended to the category folder.

    Returns:
        ``(status_message, gr.update(...))``. The update carries the new
        voice list on success and is empty otherwise.
    """
    try:
        if not name:
            return "Nombre requerido", gr.update()
        if not ref_audio:
            return "❌ Audio de referencia requerido", gr.update()

        clean_name = "".join(x for x in name if x.isalnum() or x in " -_").strip()
        # A name made only of stripped characters would create a file called ".wav".
        if not clean_name:
            return "Nombre requerido", gr.update()

        cat_slug = cat.lower().replace(" ", "_")
        gender_slug = gender.lower()
        target_dir = os.path.join(VOICE_WAV_ROOT, f"{cat_slug}_{gender_slug}", lang)
        os.makedirs(target_dir, exist_ok=True)

        dest = os.path.join(target_dir, f"{clean_name}.wav")
        if os.path.exists(dest):
            return "❌ La voz ya existe", gr.update()
        shutil.copy(ref_audio, dest)

        scan_voice_wav_structure()
        return f"✅ Clonada: {clean_name}", gr.update(choices=get_all_voices_list())
    except Exception as e:
        return f"❌ Error: {e}", gr.update()
|
| 320 |
+
|
| 321 |
+
def delete_voice_wrapper(voice_name):
    """Delete the audio file of ``voice_name`` and refresh the voice index.

    Deletion needs an exact display-name match in FLAT_PATH_MAP. The fuzzy
    substring fallback of ``resolve_voice_path`` is deliberately not used,
    because a partial name could otherwise remove a different voice's file.

    Args:
        voice_name: Display name of the voice, as listed by ``get_all_voices_list``.

    Returns:
        ``(status_message, gr.update(...))``. The update carries the
        refreshed voice list on success and is empty otherwise.
    """
    try:
        path = None
        if voice_name and voice_name != "None":
            path = FLAT_PATH_MAP.get(voice_name)

        if path and os.path.exists(path):
            os.remove(path)
            scan_voice_wav_structure()
            return f"✅ Eliminada: {voice_name}", gr.update(choices=get_all_voices_list(), value=None)
        return "❌ Archivo no encontrado", gr.update()
    except Exception as e:
        return f"❌ Error: {e}", gr.update()
|
| 331 |
+
|
| 332 |
+
# ==============================================================================
|
| 333 |
+
# 4. INTERFAZ GRÁFICA (UI)
|
| 334 |
+
# ==============================================================================
|
| 335 |
+
|
| 336 |
+
def create_header():
    """Add the title banner (a static HTML block) to the current Gradio layout."""
    gr.HTML("""
<div style="text-align: center; margin-bottom: 1rem; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 25px; border-radius: 16px; color: white; box-shadow: 0 4px 15px rgba(0,0,0,0.2);">
<h1 style="font-size: 2.8em; margin: 0; font-weight: 800;">⚡ Chatterbox Turbo</h1>
<p style="font-size: 1.1em; opacity: 0.9; margin-top: 10px;">Pro Audio Synthesis Suite</p>
</div>
""")
|
| 343 |
+
|
| 344 |
+
INITIAL_PROGRESS = make_progress_html(0, "Esperando inicio...")
|
| 345 |
+
|
| 346 |
+
def create_turbo_tab():
    """Build the Turbo tab widgets and return them in a dict.

    Returned keys: "text", "voice", "prev" (voice preview), "btn",
    "prog" (progress HTML), "out" (result audio) and "tags" (list of
    tag buttons).
    """
    with gr.Row():
        with gr.Column(scale=1):
            text = gr.Textbox(label="Texto a sintetizar", value="Hello! [laugh] This is Turbo speed!", lines=4, elem_id="turbo_textbox")
            with gr.Row(elem_classes="tag-container"):
                # One small button per tag label.
                tags = [gr.Button(t, size="sm", elem_classes="tag-btn") for t in ["[laugh]", "[sigh]", "[cough]", "[clear throat]", "[gasp]", "[chuckle]"]]
            # The voice list starts empty here.
            voice = gr.Dropdown(label="Seleccionar Voz", choices=[], interactive=True)
            preview = gr.Audio(label="Preview", interactive=False, visible=True, type="filepath")
            btn = gr.Button("⚡ Generar Audio (Turbo)", variant="primary", size="lg")
        with gr.Column(scale=1):
            # MODIFIED: audio first, then progress
            audio_out = gr.Audio(label="Resultado Final", autoplay=True, show_download_button=True)
            progress_html = gr.HTML(value=INITIAL_PROGRESS, label="Estado")
    return {"text": text, "voice": voice, "prev": preview, "btn": btn, "prog": progress_html, "out": audio_out, "tags": tags}
|
| 360 |
+
|
| 361 |
+
def create_tts_tab():
    """Build the TTS panel and hand back its components.

    Left column: text box, voice dropdown, preview player, a collapsed
    accordion of sampling controls and the generate button. Right column:
    result player, then the progress HTML.

    Returns:
        A dict with the keys ``"text"``, ``"voice"``, ``"prev"``, ``"btn"``,
        ``"prog"``, ``"out"`` and ``"opts"``. ``"opts"`` is the list
        ``[exaggeration, temp, seed, cfg, min_p, top_p, repetition]``.
    """
    with gr.Row():
        with gr.Column(scale=1):
            text_box = gr.Textbox(label="Texto", value="Hello world!", lines=4)
            voice_dropdown = gr.Dropdown(
                label="Voz Clonada",
                choices=[],
                interactive=True,
            )
            preview_player = gr.Audio(label="Preview", interactive=False)
            with gr.Accordion("⚙️ Opciones Avanzadas", open=False):
                exaggeration = gr.Slider(0.25, 2, value=.5, step=0.05, label="Exaggeration")
                cfg_weight = gr.Slider(0, 1, value=.5, step=0.05, label="CFG")
                temperature = gr.Slider(0.05, 5, value=.8, step=0.05, label="Temp")
                seed_value = gr.Number(0, label="Seed")
                min_p_value = gr.Slider(0, 1, value=0.05, label="Min P")
                top_p_value = gr.Slider(0, 1, value=1.0, label="Top P")
                repetition = gr.Slider(1, 2, value=1.2, label="Repetition")
            generate_button = gr.Button("🎙️ Generar Audio", variant="primary", size="lg")
        with gr.Column(scale=1):
            # MODIFIED: audio player first, progress indicator second.
            result_player = gr.Audio(label="Resultado", autoplay=True)
            progress_box = gr.HTML(value=INITIAL_PROGRESS)

    # This list order deliberately differs from the on-screen order above.
    # NOTE(review): presumably it mirrors the positional parameters of the
    # click handler it is appended to — confirm before reordering.
    option_inputs = [
        exaggeration,
        temperature,
        seed_value,
        cfg_weight,
        min_p_value,
        top_p_value,
        repetition,
    ]
    return {
        "text": text_box,
        "voice": voice_dropdown,
        "prev": preview_player,
        "btn": generate_button,
        "prog": progress_box,
        "out": result_player,
        "opts": option_inputs,
    }
|
| 381 |
+
|
| 382 |
+
def create_mtl_tab():
    """Build the multilingual panel and hand back its components.

    Left column: text box, language dropdown, voice dropdown, preview player,
    a collapsed options accordion and the generate button. Right column:
    result player, then the progress HTML.

    Returns:
        A dict with the keys ``"text"``, ``"lang"``, ``"voice"``, ``"prev"``,
        ``"btn"``, ``"prog"``, ``"out"`` and ``"opts"``. ``"opts"`` is the
        list ``[exaggeration, temp, seed, cfg]``.
    """
    # Dropdown entries are (label, value) pairs: "Name (code)" -> code.
    language_entries = []
    for code, display_name in SUPPORTED_LANGUAGES.items():
        language_entries.append((f"{display_name} ({code})", code))

    with gr.Row():
        with gr.Column(scale=1):
            text_box = gr.Textbox(label="Texto", value="Hola mundo", lines=4)
            language_dropdown = gr.Dropdown(
                label="Idioma",
                choices=language_entries,
                value="es",
            )

            voice_dropdown = gr.Dropdown(label="Voz", choices=[])
            preview_player = gr.Audio(label="Preview", interactive=False)
            with gr.Accordion("⚙️ Opciones", open=False):
                exaggeration = gr.Slider(0.25, 2, value=.5, label="Exaggeration")
                temperature = gr.Slider(0.05, 5, value=.8, label="Temp")
                seed_value = gr.Number(0, label="Seed")
                cfg_weight = gr.Slider(0, 1, value=.5, label="CFG")
            generate_button = gr.Button("🌍 Generar", variant="primary", size="lg")
        with gr.Column(scale=1):
            # MODIFIED: audio player first, progress indicator second.
            result_player = gr.Audio(label="Resultado", autoplay=True)
            progress_box = gr.HTML(value=INITIAL_PROGRESS)

    option_inputs = [exaggeration, temperature, seed_value, cfg_weight]
    return {
        "text": text_box,
        "lang": language_dropdown,
        "voice": voice_dropdown,
        "prev": preview_player,
        "btn": generate_button,
        "prog": progress_box,
        "out": result_player,
        "opts": option_inputs,
    }
|
| 402 |
+
|
| 403 |
+
def create_vc_tab():
    """Build the voice-conversion panel and hand back its components.

    Left column: input audio (upload or microphone, delivered as a file
    path), target-voice dropdown, preview player and the convert button.
    Right column: result player, then the progress HTML.

    Returns:
        A dict with the keys ``"inp"``, ``"voice"``, ``"prev"``, ``"btn"``,
        ``"prog"`` and ``"out"``.
    """
    with gr.Row():
        with gr.Column(scale=1):
            source_audio = gr.Audio(
                label="Entrada",
                sources=["upload", "microphone"],
                type="filepath",
            )
            voice_dropdown = gr.Dropdown(label="Voz Objetivo", choices=[])
            preview_player = gr.Audio(label="Preview", interactive=False)
            convert_button = gr.Button("🔄 Convertir", variant="primary", size="lg")
        with gr.Column(scale=1):
            # MODIFIED: audio player first, progress indicator second.
            result_player = gr.Audio(label="Resultado", autoplay=True)
            progress_box = gr.HTML(value=INITIAL_PROGRESS)

    return {
        "inp": source_audio,
        "voice": voice_dropdown,
        "prev": preview_player,
        "btn": convert_button,
        "prog": progress_box,
        "out": result_player,
    }
|
| 415 |
+
|
| 416 |
+
def create_clone_tab():
    """Build the cloning/deletion panel and hand back its components.

    First column: the clone form (name, gender, language, category,
    reference audio, clone button, status box). Second column: the delete
    form (voice dropdown, delete button, status box).

    Returns:
        A dict with the keys ``"name"``, ``"gender"``, ``"cat"``, ``"lang"``,
        ``"ref"``, ``"btn"``, ``"stat"``, ``"del_sel"``, ``"del_btn"`` and
        ``"del_stat"``.
    """
    # Dropdown entries are (label, value) pairs: "Name (code)" -> code.
    language_entries = []
    for code, display_name in SUPPORTED_LANGUAGES.items():
        language_entries.append((f"{display_name} ({code})", code))

    with gr.Row():
        with gr.Column():
            gr.Markdown("### 🧬 Clonar Voz")
            name_box = gr.Textbox(label="Nombre")
            # NOTE(review): the extent of this inner row was reconstructed
            # from flattened source — confirm that only gender and language
            # belong inside it.
            with gr.Row():
                gender_radio = gr.Radio(
                    ["Male", "Female"],
                    value="Male",
                    label="Género",
                )
                language_dropdown = gr.Dropdown(
                    label="Idioma",
                    choices=language_entries,
                    value="es",
                )

            category_dropdown = gr.Dropdown(
                label="Categoría",
                choices=CATEGORY_LIST,
                allow_custom_value=False,
            )
            reference_audio = gr.Audio(label="Referencia", type="filepath")
            clone_button = gr.Button("💾 Clonar", variant="primary")
            clone_status = gr.Textbox(label="Estado")
        with gr.Column():
            gr.Markdown("### 🗑️ Borrar")
            delete_dropdown = gr.Dropdown(label="Seleccionar Voz", choices=[])
            delete_button = gr.Button("🗑️ Eliminar", variant="stop")
            delete_status = gr.Textbox(label="Estado")

    return {
        "name": name_box,
        "gender": gender_radio,
        "cat": category_dropdown,
        "lang": language_dropdown,
        "ref": reference_audio,
        "btn": clone_button,
        "stat": clone_status,
        "del_sel": delete_dropdown,
        "del_btn": delete_button,
        "del_stat": delete_status,
    }
|
| 436 |
+
|
| 437 |
+
CSS = """
|
| 438 |
+
body, .gradio-container { background-color: #0f172a; font-family: 'Segoe UI', sans-serif; }
|
| 439 |
+
.sidebar-container { background-color: #1e293b; padding: 20px; border-right: 1px solid #334155; }
|
| 440 |
+
.nav-btn { background: transparent; border: none; color: #94a3b8; text-align: left; padding: 15px; font-weight: 600; width: 100%; border-radius: 8px; margin-bottom: 5px; transition: all 0.2s; }
|
| 441 |
+
.nav-btn:hover { background: #334155; color: white; padding-left: 20px; }
|
| 442 |
+
.active-btn { background: #334155; color: white; border-left: 4px solid #f97316; padding-left: 20px; box-shadow: 0 4px 6px rgba(0,0,0,0.1); }
|
| 443 |
+
.content-panel { background: #1e293b; border: 1px solid #334155; border-radius: 16px; padding: 30px; margin-top: 20px; box-shadow: 0 10px 15px -3px rgba(0, 0, 0, 0.1); }
|
| 444 |
+
input, textarea, select { background-color: #0f172a !important; border: 1px solid #334155 !important; color: white !important; }
|
| 445 |
+
.tag-btn { background: #334155; color: #e2e8f0; border: 1px solid #475569; margin-right: 5px; }
|
| 446 |
+
"""
|
| 447 |
+
|
| 448 |
+
with gr.Blocks(title="Chatterbox Pro", css=CSS, theme=gr.themes.Base()) as demo:
    # ------------------------------------------------------------------
    # Layout: sidebar (filters + navigation) next to the content area.
    # ------------------------------------------------------------------
    with gr.Row(elem_classes="main-layout", equal_height=True):
        with gr.Column(scale=1, min_width=250, elem_classes="sidebar-container"):
            gr.Markdown("### 🎛️ CONTROL PANEL")
            gr.Markdown("---")
            # Preselect the first category when there is one.
            initial_cat = None
            if CATEGORY_LIST:
                initial_cat = CATEGORY_LIST[0]
            cat_filter = gr.Dropdown(
                label="📚 Librería",
                choices=CATEGORY_LIST,
                value=initial_cat,
            )
            lang_filter = gr.Dropdown(label="🌐 Idioma", choices=[])
            gr.Markdown("---")
            btn_turbo = gr.Button("🚀 Turbo Mode", elem_classes=["nav-btn", "active-btn"])
            btn_tts = gr.Button("🎤 TTS Pro", elem_classes=["nav-btn"])
            btn_mtl = gr.Button("🌍 Multilingual", elem_classes=["nav-btn"])
            btn_vc = gr.Button("🔄 Converter", elem_classes=["nav-btn"])
            btn_clone = gr.Button("🧬 Cloning Lab", elem_classes=["nav-btn"])

        with gr.Column(scale=4, elem_classes="content-area"):
            create_header()
            # One panel per mode; only the Turbo panel starts visible.
            with gr.Column(visible=True, elem_classes="content-panel") as v_turbo:
                ui_turbo = create_turbo_tab()
            with gr.Column(visible=False, elem_classes="content-panel") as v_tts:
                ui_tts = create_tts_tab()
            with gr.Column(visible=False, elem_classes="content-panel") as v_mtl:
                ui_mtl = create_mtl_tab()
            with gr.Column(visible=False, elem_classes="content-panel") as v_vc:
                ui_vc = create_vc_tab()
            with gr.Column(visible=False, elem_classes="content-panel") as v_clone:
                ui_clone = create_clone_tab()

    # ------------------------------------------------------------------
    # Navigation: each sidebar button shows its panel and hides the rest.
    # ------------------------------------------------------------------
    views = [v_turbo, v_tts, v_mtl, v_vc, v_clone]
    btns = [btn_turbo, btn_tts, btn_mtl, btn_vc, btn_clone]

    def switch_view(idx):
        """Return updates that activate panel ``idx`` and its nav button.

        The result is one visibility update per entry of ``views`` followed
        by one ``elem_classes`` update per entry of ``btns``.
        """
        panel_updates = [gr.update(visible=(pos == idx)) for pos, _ in enumerate(views)]
        button_updates = []
        for pos, _ in enumerate(btns):
            if pos == idx:
                classes = ["nav-btn", "active-btn"]
            else:
                classes = ["nav-btn"]
            button_updates.append(gr.update(elem_classes=classes))
        return panel_updates + button_updates

    for position, nav_button in enumerate(btns):
        # Bind the index as a default argument so each handler keeps its own.
        nav_button.click(lambda idx=position: switch_view(idx), outputs=views + btns)

    # ------------------------------------------------------------------
    # Filters: refresh language and voice lists when a filter changes.
    # ------------------------------------------------------------------
    def update_ui_lists(cat, lang_code=None):
        """Compute dropdown updates for category ``cat`` and ``lang_code``.

        ``lang_code`` is kept only if it is among the codes available for the
        category; otherwise the first available code (or ``None``) is used.

        Returns a 7-tuple of updates, in order: language filter, four voice
        dropdowns, the clone-tab category dropdown, and the delete dropdown.
        """
        language_entries = get_available_languages(cat)
        known_codes = [entry[1] for entry in language_entries]

        if lang_code in known_codes:
            active_lang = lang_code
        elif known_codes:
            active_lang = known_codes[0]
        else:
            active_lang = None

        voice_names = get_voices_for_ui(cat, active_lang)
        default_voice = voice_names[0] if voice_names else None

        language_update = gr.update(choices=language_entries, value=active_lang)
        voice_updates = [gr.update(choices=voice_names, value=default_voice) for _ in range(4)]
        category_update = gr.update(choices=CATEGORY_LIST, value=cat)
        delete_update = gr.update(choices=get_all_voices_list())
        return (language_update, *voice_updates, category_update, delete_update)

    # Same targets for all three refresh triggers below.
    refresh_targets = [
        lang_filter,
        ui_turbo["voice"],
        ui_tts["voice"],
        ui_mtl["voice"],
        ui_vc["voice"],
        ui_clone["cat"],
        ui_clone["del_sel"],
    ]
    cat_filter.change(update_ui_lists, inputs=[cat_filter], outputs=refresh_targets)
    lang_filter.change(
        lambda category, language: update_ui_lists(category, language),
        inputs=[cat_filter, lang_filter],
        outputs=refresh_targets,
    )
    demo.load(lambda: update_ui_lists(initial_cat), outputs=refresh_targets)

    # ------------------------------------------------------------------
    # Voice preview: resolve the selected voice for each tab's preview.
    # ------------------------------------------------------------------
    ui_turbo["voice"].change(
        lambda voice_name: resolve_voice_path(voice_name, "en"),
        inputs=ui_turbo["voice"],
        outputs=ui_turbo["prev"],
    )
    ui_tts["voice"].change(
        lambda voice_name: resolve_voice_path(voice_name, "en"),
        inputs=ui_tts["voice"],
        outputs=ui_tts["prev"],
    )
    ui_mtl["voice"].change(
        lambda voice_name, language: resolve_voice_path(voice_name, language),
        inputs=[ui_mtl["voice"], ui_mtl["lang"]],
        outputs=ui_mtl["prev"],
    )
    ui_vc["voice"].change(
        lambda voice_name: resolve_voice_path(voice_name, "en"),
        inputs=ui_vc["voice"],
        outputs=ui_vc["prev"],
    )

    # ------------------------------------------------------------------
    # Actions: generation, conversion, cloning and deletion handlers.
    # ------------------------------------------------------------------
    ui_turbo["btn"].click(
        generate_turbo_speech,
        inputs=[ui_turbo["text"], ui_turbo["voice"]],
        outputs=[ui_turbo["prog"], ui_turbo["out"]],
    )
    ui_tts["btn"].click(
        generate_speech,
        inputs=[ui_tts["text"], ui_tts["voice"]] + ui_tts["opts"],
        outputs=[ui_tts["prog"], ui_tts["out"]],
    )
    ui_mtl["btn"].click(
        generate_multilingual_speech,
        inputs=[ui_mtl["text"], ui_mtl["voice"], ui_mtl["lang"]] + ui_mtl["opts"],
        outputs=[ui_mtl["prog"], ui_mtl["out"]],
    )
    ui_vc["btn"].click(
        convert_voice,
        inputs=[ui_vc["inp"], ui_vc["voice"]],
        outputs=[ui_vc["prog"], ui_vc["out"]],
    )
    ui_clone["btn"].click(
        clone_voice_wrapper,
        inputs=[
            ui_clone["ref"],
            ui_clone["name"],
            ui_clone["cat"],
            ui_clone["lang"],
            ui_clone["gender"],
        ],
        outputs=[ui_clone["stat"], ui_clone["del_sel"]],
    )
    ui_clone["del_btn"].click(
        delete_voice_wrapper,
        inputs=[ui_clone["del_sel"]],
        outputs=[ui_clone["del_stat"], ui_clone["del_sel"]],
    )

    # ------------------------------------------------------------------
    # Tag buttons: client-side JS splices the tag into the Turbo text box
    # at the current selection, or appends it if the textarea is not found.
    # ------------------------------------------------------------------
    JS_TAGS = """(tag, text) => { var el = document.querySelector('#turbo_textbox textarea'); if(el) { var start = el.selectionStart; var end = el.selectionEnd; return text.slice(0, start) + " " + tag + " " + text.slice(end); } return text + " " + tag; }"""
    for tag_button in ui_turbo["tags"]:
        tag_button.click(
            None,
            inputs=[tag_button, ui_turbo["text"]],
            outputs=ui_turbo["text"],
            js=JS_TAGS,
        )
|
| 503 |
+
|
| 504 |
+
if __name__ == "__main__":
    # Script entry point: enable queuing on the app, then launch it with
    # inbrowser=True.
    queued_demo = demo.queue()
    queued_demo.launch(inbrowser=True)
|
requirements.txt
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
numpy>=1.24.0,<1.26.0
|
| 2 |
+
librosa==0.11.0
|
| 3 |
+
s3tokenizer
|
| 4 |
+
torch==2.7.1
|
| 5 |
+
torchaudio==2.7.1
|
| 6 |
+
transformers==4.46.3
|
| 7 |
+
diffusers==0.29.0
|
| 8 |
+
resemble-perth==1.0.1
|
| 9 |
+
conformer==0.3.2
|
| 10 |
+
safetensors==0.5.3
|
| 11 |
+
pykakasi==2.3.0
|
| 12 |
+
gradio==5.44.1
|
| 13 |
+
pyloudnorm
|
| 14 |
+
omegaconf
|