Spaces:

diabolic6045
/

tts-api

Sleeping

App Files Files Community

Avinyaa committed on May 31

Commit

9a88d9c

1 Parent(s): c53fcc3

new

Browse files

Files changed (6) hide show

README.md +109 -1
app.py +169 -0
client_example.py +73 -0
dockerfile +28 -0
requirements.txt +9 -0
test.py +10 -0

README.md CHANGED Viewed

@@ -7,4 +7,112 @@ sdk: docker
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 pinned: false
 ---
+# TTS API
+A FastAPI-based Text-to-Speech API using XTTS-v2 for voice cloning.
+## Features
+- Convert text to speech using voice cloning
+- Upload reference speaker audio files
+- Support for multiple languages
+- RESTful API with automatic documentation
+- Docker support
+## Setup
+### Local Development
+1. Install dependencies:
+```bash
+pip install -r requirements.txt
+```
+2. Run the API:
+```bash
+uvicorn app:app --host 0.0.0.0 --port 8000
+```
+The API will be available at `http://localhost:8000`
+### Using Docker
+1. Build the Docker image:
+```bash
+docker build -t tts-api .
+```
+2. Run the container:
+```bash
+docker run -p 8000:7860 tts-api
+```
+## API Endpoints
+### Health Check
+- **GET** `/health` - Check API status
+### Text-to-Speech
+- **POST** `/tts` - Convert text to speech with uploaded speaker file
+  - **Parameters:**
+    - `text` (form): Text to convert to speech
+    - `language` (form): Language code (default: "en")
+    - `speaker_file` (file): Reference speaker audio file
+### API Documentation
+- **GET** `/docs` - Interactive API documentation (Swagger UI)
+- **GET** `/redoc` - Alternative API documentation
+## Usage Examples
+### Using Python requests
+```python
+import requests
+# Prepare the request
+url = "http://localhost:8000/tts"
+data = {
+    "text": "Hello, this is a test of voice cloning!",
+    "language": "en"
+}
+files = {
+    "speaker_file": open("path/to/speaker.wav", "rb")
+}
+# Make the request
+response = requests.post(url, data=data, files=files)
+# Save the generated audio
+if response.status_code == 200:
+    with open("output.wav", "wb") as f:
+        f.write(response.content)
+    print("Speech generated successfully!")
+```
+### Using curl
+```bash
+curl -X POST "http://localhost:8000/tts" \
+  -F "text=Hello, this is a test!" \
+  -F "language=en" \
+  -F "speaker_file=@path/to/speaker.wav" \
+  --output generated_speech.wav
+```
+### Using the provided client example
+```bash
+python client_example.py
+```
+## Requirements
+- Python 3.8+
+- CUDA-compatible GPU (recommended for faster processing)
+- Audio file in supported format (WAV, MP3, etc.) for speaker reference
+## Model
+This API uses the XTTS-v2_C3PO model for voice cloning, which is automatically downloaded when building the Docker image.

app.py ADDED Viewed

	@@ -0,0 +1,169 @@

+from fastapi import FastAPI, HTTPException, UploadFile, File, Form
+from fastapi.responses import FileResponse
+from pydantic import BaseModel
+from TTS.api import TTS
+import os
+import tempfile
+import uuid
+import torch
+from typing import Optional
+import logging
+# Module-wide logging at INFO level; handlers below log through `logger`
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# ASGI application object (served as "app:app")
+app = FastAPI(
+    title="TTS API",
+    description="Text-to-Speech API using XTTS-v2",
+    version="1.0.0",
+)
+# Request body model used by the /tts-with-url endpoint
+class TTSRequest(BaseModel):
+    text: str  # text to convert to speech (required)
+    language: str = "en"  # language code; defaults to "en"
+class TTSService:
+    """Loads an XTTS model once and synthesizes speech with it.
+
+    The custom model under /app/XTTS-v2_C3PO is preferred; when its config is
+    missing or loading fails, the stock XTTS-v2 model is loaded instead.
+    """
+    # Location of the custom model inside the container image
+    _MODEL_DIR = "/app/XTTS-v2_C3PO"
+    # Model name used whenever the custom model cannot be loaded
+    _DEFAULT_MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"
+    def __init__(self):
+        """Select the device and load the custom model, falling back to the default one."""
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        logger.info(f"Using device: {self.device}")
+        # Use absolute paths for the model
+        model_path = f"{self._MODEL_DIR}/"
+        config_path = f"{self._MODEL_DIR}/config.json"
+        # Check if model files exist
+        if not os.path.exists(config_path):
+            logger.warning(f"Custom model config not found at {config_path}")
+            # List contents of model directory for debugging
+            if os.path.exists(self._MODEL_DIR):
+                logger.info(f"Contents of {self._MODEL_DIR}: {os.listdir(self._MODEL_DIR)}")
+            else:
+                logger.warning(f"Model directory {self._MODEL_DIR} does not exist")
+            self._load_default_model()
+            return
+        try:
+            self.tts = TTS(
+                model_path=model_path,
+                config_path=config_path,
+                progress_bar=False,
+                gpu=torch.cuda.is_available()
+            ).to(self.device)
+            logger.info("Custom TTS model loaded successfully")
+        except Exception as e:
+            logger.error(f"Failed to load custom TTS model: {e}")
+            self._load_default_model()
+    def _load_default_model(self):
+        """Load the stock XTTS-v2 model onto self.device; re-raise if that fails too."""
+        logger.info("Falling back to default XTTS model")
+        try:
+            self.tts = TTS(self._DEFAULT_MODEL_NAME).to(self.device)
+            logger.info("Default TTS model loaded successfully")
+        except Exception as e:
+            logger.error(f"Failed to load default TTS model: {e}")
+            # Bare raise keeps the original traceback intact
+            raise
+    def generate_speech(self, text: str, speaker_wav_path: str, language: str = "en") -> str:
+        """Generate speech and return the path to the output file.
+
+        Args:
+            text: Text to synthesize.
+            speaker_wav_path: Path to the reference speaker audio file.
+            language: Language code handed to the model (default "en").
+
+        Returns:
+            Path of the generated WAV file in the system temp directory.
+            The caller owns the file and is responsible for deleting it.
+
+        Raises:
+            HTTPException: status 500 when synthesis fails.
+        """
+        # Create a unique filename so concurrent requests never share an output file
+        output_filename = f"output_{uuid.uuid4().hex}.wav"
+        output_path = os.path.join(tempfile.gettempdir(), output_filename)
+        try:
+            # Generate speech
+            self.tts.tts_to_file(
+                text=text,
+                file_path=output_path,
+                speaker_wav=speaker_wav_path,
+                language=language
+            )
+            return output_path
+        except Exception as e:
+            # Do not leave a partially written file behind
+            if os.path.exists(output_path):
+                os.remove(output_path)
+            logger.error(f"Error generating speech: {e}")
+            raise HTTPException(status_code=500, detail=f"Failed to generate speech: {str(e)}")
+# Initialize TTS service
+# Created once at import time; the request handlers below all use this instance
+tts_service = TTSService()
+@app.get("/")
+async def root():
+    # Static payload for the index route
+    payload = {"message": "TTS API is running", "status": "healthy"}
+    return payload
+@app.get("/health")
+async def health_check():
+    # Report the device string chosen by the shared TTS service
+    active_device = tts_service.device
+    return {"status": "healthy", "device": active_device}
+@app.post("/tts")
+async def text_to_speech(
+    text: str = Form(...),
+    language: str = Form("en"),
+    speaker_file: UploadFile = File(...)
+):
+    """
+    Convert text to speech using a reference speaker voice
+    - **text**: The text to convert to speech
+    - **language**: Language code (default: "en")
+    - **speaker_file**: Audio file containing the reference speaker voice
+    """
+    # Function-scope import: only this handler needs it and fastapi is already a dependency
+    from fastapi import BackgroundTasks
+    if not text.strip():
+        raise HTTPException(status_code=400, detail="Text cannot be empty")
+    # Validate file type; content_type is None when the client omits it
+    if not (speaker_file.content_type or "").startswith('audio/'):
+        raise HTTPException(status_code=400, detail="Speaker file must be an audio file")
+    # Per-request temp file: a fixed shared path would let concurrent requests
+    # overwrite each other's reference audio
+    speaker_temp_path = os.path.join(tempfile.gettempdir(), f"speaker_{uuid.uuid4().hex}.wav")
+    try:
+        # Save uploaded speaker file temporarily
+        with open(speaker_temp_path, "wb") as buffer:
+            buffer.write(await speaker_file.read())
+        # Generate speech
+        output_path = tts_service.generate_speech(text, speaker_temp_path, language)
+    except HTTPException:
+        # Keep the status code and detail chosen by the service layer
+        raise
+    except Exception as e:
+        logger.error(f"Error in TTS endpoint: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+    finally:
+        # The reference audio is only needed during synthesis
+        if os.path.exists(speaker_temp_path):
+            os.remove(speaker_temp_path)
+    # Delete the generated file once the response has been sent
+    cleanup = BackgroundTasks()
+    cleanup.add_task(os.remove, output_path)
+    # No explicit Content-Disposition header: FileResponse derives
+    # 'attachment; filename="..."' from `filename`, and a manual header would
+    # take precedence and drop the filename
+    return FileResponse(
+        output_path,
+        media_type="audio/wav",
+        filename=f"tts_output_{uuid.uuid4().hex}.wav",
+        background=cleanup
+    )
+@app.post("/tts-with-url")
+async def text_to_speech_with_url(request: TTSRequest, speaker_wav_url: str):
+    """
+    Convert text to speech using a reference speaker voice from URL
+    - **request**: TTSRequest containing text and language
+    - **speaker_wav_url**: URL to the reference speaker audio file
+    """
+    if not request.text.strip():
+        raise HTTPException(status_code=400, detail="Text cannot be empty")
+    # For this endpoint, you would need to download the file from URL
+    # This is a simplified version - you might want to add URL validation and download logic
+    # Raised outside any try/except so the client really receives 501 rather
+    # than a generic 500 produced by a catch-all handler
+    logger.error("Error in TTS URL endpoint: URL-based speaker input not implemented yet")
+    raise HTTPException(status_code=501, detail="URL-based speaker input not implemented yet")

client_example.py ADDED Viewed

	@@ -0,0 +1,73 @@

+import requests
+import os
+def test_tts_api():
+    """Send one sample request to the /tts endpoint and save the returned audio."""
+    # API endpoint
+    url = "http://localhost:8000/tts"
+    # Text to convert to speech
+    text = "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent."
+    # Path to your speaker reference audio file
+    speaker_file_path = "/path/to/target/speaker.wav"  # Update this path
+    # Bail out early when the reference audio is missing
+    if not os.path.exists(speaker_file_path):
+        print(f"Error: Speaker file not found at {speaker_file_path}")
+        print("Please update the speaker_file_path variable with a valid audio file path")
+        return
+    form_fields = {"text": text, "language": "en"}
+    # The context manager closes the upload handle on every exit path
+    with open(speaker_file_path, "rb") as speaker_handle:
+        try:
+            print("Sending request to TTS API...")
+            response = requests.post(url, data=form_fields, files={"speaker_file": speaker_handle})
+            if response.status_code != 200:
+                print(f"Error: {response.status_code}")
+                print(response.text)
+            else:
+                # Save the generated audio
+                output_filename = "generated_speech.wav"
+                with open(output_filename, "wb") as out:
+                    out.write(response.content)
+                print(f"Success! Generated speech saved as {output_filename}")
+        except requests.exceptions.ConnectionError:
+            print("Error: Could not connect to the API. Make sure the server is running on http://localhost:8000")
+        except Exception as e:
+            print(f"Error: {e}")
+def check_api_health():
+    """Check if the API is running and print the result.
+
+    Prints the health payload on HTTP 200, the status code on any other
+    response, and a hint or error message when the request itself fails.
+    """
+    try:
+        # Bounded wait so an unresponsive server cannot hang the script
+        response = requests.get("http://localhost:8000/health", timeout=10)
+    except requests.exceptions.ConnectionError:
+        # app.py has no __main__ entry point, so the server is started via uvicorn
+        print("API is not running. Start it with: uvicorn app:app --host 0.0.0.0 --port 8000")
+        return
+    except requests.exceptions.RequestException as e:
+        # Timeouts and other request failures
+        print("API health check failed:", e)
+        return
+    if response.status_code == 200:
+        print("API is healthy:", response.json())
+    else:
+        print("API health check failed:", response.status_code)
+if __name__ == "__main__":
+    # Banner followed by a 30-character rule
+    print("TTS API Client Example", "=" * 30, sep="\n")
+    # Health probe first, then a blank separator line
+    check_api_health()
+    print()
+    # Finally exercise the TTS endpoint itself
+    test_tts_api()

dockerfile ADDED Viewed

	@@ -0,0 +1,28 @@

+FROM python:3.11
+WORKDIR /app
+# Install git and git-lfs
+RUN apt-get update && apt-get install -y git git-lfs && rm -rf /var/lib/apt/lists/*
+# Initialize git lfs
+RUN git lfs install
+COPY requirements.txt .
+RUN pip install uv
+RUN uv pip install --no-cache-dir -r requirements.txt --system
+# Clone the XTTS-v2_C3PO model and verify it
+# (a bare `echo` is not a Dockerfile instruction, so it runs inside this RUN step;
+#  the parentheses make `-type f` apply to every -name pattern)
+RUN echo "Cloning the XTTS-v2_C3PO model..." && \
+    git clone https://huggingface.co/Borcherding/XTTS-v2_C3PO && \
+    ls -la XTTS-v2_C3PO/ && \
+    echo "Model directory contents:" && \
+    find XTTS-v2_C3PO/ -type f \( -name "*.json" -o -name "*.pth" -o -name "*.pt" \) | head -10
+COPY . .
+# Expose the port uvicorn actually binds to in CMD below
+EXPOSE 7860
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+coqui-tts
+pandas
+scikit-learn
+fastapi
+uvicorn[standard]
+python-multipart
+torch
+torchaudio
+requests

test.py ADDED Viewed

	@@ -0,0 +1,10 @@

+import torch
+from TTS.api import TTS
+# Pick the GPU when one is available; this is a plain script, so there is no `self`
+device = "cuda" if torch.cuda.is_available() else "cpu"
+tts = TTS(model_path="XTTS-v2_C3PO/",
+          config_path="XTTS-v2_C3PO/config.json", progress_bar=False).to(device)
+# generate speech by cloning a voice using default settings
+tts.tts_to_file(text="It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
+                file_path="output.wav",
+                speaker_wav="/path/to/target/speaker.wav",
+                language="en")