ttlm / app.py
Samarth Naik
feat: Switch to minimal Coqui TTS VITS model implementation
1059c3e
"""
Minimal Text-to-Speech API using Coqui TTS VITS model
FastAPI application for Hugging Face Spaces
"""
import logging
import os
import tempfile
from pathlib import Path
from typing import Optional

import uvicorn
from fastapi import BackgroundTasks, FastAPI, Form, HTTPException, Request
from fastapi.responses import FileResponse
from pydantic import BaseModel
# Import the Coqui TTS engine; fail fast with an actionable message when the
# package is missing.
try:
    from TTS.api import TTS
except ImportError as exc:
    # Chain the original error explicitly so the underlying cause (for
    # example a broken transitive dependency) stays visible in the traceback.
    raise ImportError("TTS library not found. Install with: pip install TTS") from exc
# Logging: INFO level via basicConfig, plus a logger named after this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# The FastAPI application object that the route decorators attach to.
app = FastAPI(
    title="Text-to-Speech API",
    description="Minimal TTS API using Coqui TTS VITS model",
    version="1.0.0",
)

# Module-level slot for the TTS model; None until the startup hook fills it.
tts_model = None
# Request model
class TTSRequest(BaseModel):
    """Request payload carrying the text to synthesize."""

    # The text to convert to speech.
    text: str
@app.on_event("startup")
async def startup_event():
    """
    Load the TTS model once at application startup.

    The loaded model is stored in the module-level ``tts_model`` so the
    request handlers can reuse it. A loading failure is logged together with
    its traceback and then re-raised unchanged.
    """
    global tts_model

    # The specific VITS model (LJSpeech dataset) this service uses.
    model_name = "tts_models/en/ljspeech/vits"

    logger.info("Loading TTS model...")
    try:
        tts_model = TTS(model_name=model_name, progress_bar=False)
    except Exception as exc:
        # logger.exception records the traceback; the bare ``raise`` re-raises
        # the active exception without rebinding it.
        logger.exception("Failed to load TTS model: %s", exc)
        raise
    logger.info("TTS model loaded successfully!")
@app.get("/")
async def root():
    """Report that the API is running, plus the model and engine names."""
    info = {"status": "healthy"}
    info["message"] = "Text-to-Speech API is running"
    info["model"] = "tts_models/en/ljspeech/vits"
    info["engine"] = "Coqui TTS"
    return info
@app.get("/tts")
async def tts_get(text: str):
    """
    Synthesize speech for text supplied as a query parameter.

    Example: GET /tts?text=Hello%20world

    Responds with HTTP 400 when ``text`` is empty or whitespace only.
    """
    has_content = bool(text) and len(text.strip()) > 0
    if has_content:
        return await generate_speech(text)
    raise HTTPException(status_code=400, detail="Text parameter is required")
@app.post("/tts")
async def tts_post(
    request: Request,
    text: Optional[str] = Form(None),
):
    """
    POST endpoint for TTS.

    Accepts either a JSON body of the form ``{"text": "..."}`` or form data
    with a ``text`` field.

    FastAPI cannot combine a JSON body model with ``Form`` fields in a single
    path operation: once a ``Form`` parameter is declared the body is read as
    form data, so a JSON payload never reaches a model parameter. The JSON
    variant is therefore decoded from the raw request, selected by the
    ``Content-Type`` header.

    Raises:
        HTTPException: 400 when the JSON body is malformed, when no text was
            supplied, or when the text is blank.
    """
    content_type = request.headers.get("content-type", "").lower()

    if content_type.startswith("application/json"):
        try:
            payload = await request.json()
        except ValueError as exc:
            # json.JSONDecodeError and UnicodeDecodeError both derive from
            # ValueError.
            raise HTTPException(
                status_code=400, detail="Request body must be valid JSON"
            ) from exc
        input_text = payload.get("text") if isinstance(payload, dict) else None
    else:
        # Form submission: an empty field counts as "not provided".
        input_text = text or None

    if not isinstance(input_text, str):
        raise HTTPException(status_code=400, detail="Text is required")
    if not input_text.strip():
        raise HTTPException(status_code=400, detail="Text cannot be empty")

    return await generate_speech(input_text)
def _remove_file(path: str) -> None:
    """Best-effort removal of a temporary file; never raises."""
    try:
        os.unlink(path)
    except FileNotFoundError:
        # Already gone; nothing to clean up.
        pass
    except OSError as exc:
        logger.warning("Could not remove temporary file %s: %s", path, exc)


async def generate_speech(text: str):
    """
    Generate speech from text using the loaded VITS model.

    The audio is written to a temporary ``.wav`` file, which is returned as a
    ``FileResponse``. The file is deleted by a background task once the
    response has been sent, or immediately if synthesis fails.

    Args:
        text: The text to synthesize.

    Returns:
        A ``FileResponse`` streaming the generated WAV as ``speech.wav``.

    Raises:
        HTTPException: 503 when the model has not been loaded; 500 when
            synthesis fails or produces no output.
    """
    if tts_model is None:
        raise HTTPException(status_code=503, detail="TTS model not loaded")

    # Known before the try block so the error path can clean up without
    # inspecting locals().
    output_path = None
    try:
        # Reserve a unique path; delete=False keeps the file after the handle
        # is closed so the model can write to it by name.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            output_path = tmp_file.name

        logger.info(f"Generating speech for text: '{text[:50]}...'")

        # NOTE(review): this is a synchronous call inside an async handler, so
        # the event loop is blocked while audio is synthesized. Consider a
        # worker thread if the model is safe to call concurrently.
        tts_model.tts_to_file(text=text, file_path=output_path)

        # Verify the file was created and has content.
        if not os.path.exists(output_path) or os.path.getsize(output_path) == 0:
            raise RuntimeError("Generated audio file is empty or was not created")

        logger.info(
            f"Speech generated successfully, file size: {os.path.getsize(output_path)} bytes"
        )

        # Delete the temporary file after the response body has been sent;
        # otherwise every successful request leaks one file on disk.
        cleanup = BackgroundTasks()
        cleanup.add_task(_remove_file, output_path)

        return FileResponse(
            path=output_path,
            media_type="audio/wav",
            filename="speech.wav",
            headers={
                "Content-Disposition": "attachment; filename=speech.wav",
                "Cache-Control": "no-cache",
            },
            background=cleanup,
        )
    except Exception as exc:
        logger.exception("Error generating speech: %s", exc)
        # Clean up the output file on error.
        if output_path is not None:
            _remove_file(output_path)
        raise HTTPException(
            status_code=500, detail=f"Failed to generate speech: {str(exc)}"
        ) from exc
@app.get("/health")
async def health_check():
    """Report service status and whether the TTS model has been loaded."""
    model_ready = tts_model is not None
    report = {"status": "healthy"}
    report["model_loaded"] = model_ready
    report["model_name"] = "tts_models/en/ljspeech/vits"
    return report
if __name__ == "__main__":
    # Direct execution (local development and HF Spaces): serve the app with
    # uvicorn on all interfaces, port 7860.
    server_options = {"host": "0.0.0.0", "port": 7860}
    uvicorn.run(app, **server_options)