Upload 3 files
Browse files- Dockerfile +48 -0
- README.md +61 -6
- requirements.txt +25 -0
Dockerfile
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# syntax=docker/dockerfile:1
FROM pytorch/pytorch:2.1.0-cuda12.1-cudnn8-runtime

# --- Runtime configuration (all overridable at `docker run -e ...`) ---
# PYTHONUNBUFFERED: stream logs immediately (no stdout buffering).
# TRANSFORMERS_CACHE is deprecated in recent transformers releases in favor
# of HF_HOME; kept alongside it for compatibility with older versions.
ENV PYTHONUNBUFFERED=1 \
    GRADIO_SERVER_NAME=0.0.0.0 \
    GRADIO_SERVER_PORT=7860 \
    USE_DEEPSPEED=true \
    USE_FP16=true \
    USE_TORCH_COMPILE=true \
    MAX_CACHE_SIZE=10 \
    HF_HOME=/app/cache \
    TRANSFORMERS_CACHE=/app/cache

# System dependencies: git (model fetching), ffmpeg + libsndfile1 (audio I/O).
# update+install combined in one layer; --no-install-recommends keeps the
# image small; apt lists removed in the same layer so they never persist.
RUN apt-get update && apt-get install -y --no-install-recommends \
        ffmpeg \
        git \
        libsndfile1 \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Copy the manifest alone first so the (slow) pip layer stays cached until
# requirements.txt itself changes.
COPY requirements.txt .

# requirements.txt already pins deepspeed>=0.12.0, so the former separate
# `pip install deepspeed` layer was redundant and has been folded in here.
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r requirements.txt

# Application code (changes most often -> last COPY layers for cache reuse)
COPY app.py .
COPY model/ ./model/

# HF Spaces runs the container as a non-root user with UID 1000; create a
# matching user and give it ownership of /app (including the model cache)
# instead of the previous blanket `chmod -R 777`.
RUN useradd -m -u 1000 appuser && \
    mkdir -p /app/cache && \
    chown -R appuser:appuser /app
USER appuser

# Documentation only (does not publish the port): Gradio listens on 7860,
# matching GRADIO_SERVER_PORT above.
EXPOSE 7860

# Exec form so python is PID 1 and receives SIGTERM from `docker stop`.
CMD ["python", "app.py"]
README.md
CHANGED
|
@@ -1,12 +1,67 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
colorFrom: green
|
| 5 |
-
colorTo:
|
| 6 |
-
sdk:
|
| 7 |
-
sdk_version:
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
---
|
| 11 |
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: XTTSv2 Optimized TTS
|
| 3 |
+
emoji: 🐸
|
| 4 |
colorFrom: green
|
| 5 |
+
colorTo: blue
|
| 6 |
+
sdk: docker
|
| 7 |
+
sdk_version: 4.19.0
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
+
license: other
|
| 11 |
+
tags:
|
| 12 |
+
- tts
|
| 13 |
+
- text-to-speech
|
| 14 |
+
- voice-cloning
|
| 15 |
+
- xtts
|
| 16 |
+
- coqui
|
| 17 |
+
suggested_hardware: t4-small
|
| 18 |
---
|
| 19 |
|
| 20 |
+
# 🐸 XTTSv2 Optimized Text-to-Speech
|
| 21 |
+
|
| 22 |
+
High-quality multilingual voice cloning powered by XTTSv2 with performance optimizations.
|
| 23 |
+
|
| 24 |
+
## Features
|
| 25 |
+
|
| 26 |
+
- **17 Languages**: English, Spanish, French, German, Italian, Portuguese, Polish, Turkish, Russian, Dutch, Czech, Arabic, Chinese, Japanese, Hungarian, Korean, Hindi
|
| 27 |
+
- **Voice Cloning**: Clone any voice from ~6 seconds of reference audio
|
| 28 |
+
- **Streaming Mode**: Low-latency streaming for real-time applications
|
| 29 |
+
- **Optimizations**:
|
| 30 |
+
- DeepSpeed acceleration
|
| 31 |
+
- FP16 inference
|
| 32 |
+
- torch.compile() optimization
|
| 33 |
+
- Speaker embedding caching
|
| 34 |
+
|
| 35 |
+
## Usage
|
| 36 |
+
|
| 37 |
+
1. Upload a reference audio file (WAV/MP3, 6-30 seconds recommended)
|
| 38 |
+
2. Enter your text
|
| 39 |
+
3. Select the language
|
| 40 |
+
4. Click "Generate Speech"
|
| 41 |
+
|
| 42 |
+
## Performance
|
| 43 |
+
|
| 44 |
+
| Hardware | Latency (per sentence) |
|
| 45 |
+
|----------|------------------------|
|
| 46 |
+
| T4 | ~2-3 seconds |
|
| 47 |
+
| A10G | ~1 second |
|
| 48 |
+
| A100 | ~0.5 seconds |
|
| 49 |
+
|
| 50 |
+
## Configuration
|
| 51 |
+
|
| 52 |
+
Environment variables for tuning:
|
| 53 |
+
|
| 54 |
+
- `USE_DEEPSPEED`: Enable DeepSpeed (default: true)
|
| 55 |
+
- `USE_FP16`: Enable FP16 inference (default: true)
|
| 56 |
+
- `USE_TORCH_COMPILE`: Enable torch.compile (default: true)
|
| 57 |
+
- `MAX_CACHE_SIZE`: Number of speakers to cache (default: 10)
|
| 58 |
+
- `STREAMING_CHUNK_SIZE`: Streaming chunk size (default: 20)
|
| 59 |
+
|
| 60 |
+
## License
|
| 61 |
+
|
| 62 |
+
This model uses the [Coqui Public Model License](https://coqui.ai/cpml).
|
| 63 |
+
|
| 64 |
+
## Credits
|
| 65 |
+
|
| 66 |
+
- [Coqui TTS](https://github.com/coqui-ai/TTS)
|
| 67 |
+
- [XTTS Paper](https://arxiv.org/abs/2406.04904)
|
requirements.txt
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Core TTS
|
| 2 |
+
TTS>=0.22.0
|
| 3 |
+
|
| 4 |
+
# PyTorch with CUDA — the pytorch/pytorch base image already ships torch 2.1.0,
# so pip sees these constraints satisfied and skips them; listed for local (non-Docker) dev
|
| 5 |
+
torch>=2.1.0
|
| 6 |
+
torchaudio>=2.1.0
|
| 7 |
+
|
| 8 |
+
# DeepSpeed for acceleration
|
| 9 |
+
deepspeed>=0.12.0
|
| 10 |
+
|
| 11 |
+
# Gradio UI
|
| 12 |
+
gradio>=4.19.0
|
| 13 |
+
|
| 14 |
+
# Audio processing
|
| 15 |
+
numpy>=1.24.0
|
| 16 |
+
scipy>=1.11.0
|
| 17 |
+
librosa>=0.10.0
|
| 18 |
+
soundfile>=0.12.0
|
| 19 |
+
|
| 20 |
+
# Utilities
|
| 21 |
+
transformers>=4.36.0
|
| 22 |
+
huggingface_hub>=0.20.0
|
| 23 |
+
|
| 24 |
+
# Fast Rust-based tokenization (also pulled in transitively by transformers)
|
| 25 |
+
tokenizers>=0.15.0
|