File size: 4,574 Bytes
3135113
1059c3e
 
3135113
 
 
 
 
 
 
 
1059c3e
3135113
 
 
 
1059c3e
 
 
 
 
 
3135113
 
 
 
 
 
1059c3e
 
3135113
 
 
1059c3e
 
3135113
1059c3e
3135113
 
 
 
 
 
 
1059c3e
 
3135113
1059c3e
3135113
1059c3e
be42ab9
1059c3e
 
 
3135113
1059c3e
3135113
 
1059c3e
3135113
 
 
 
 
 
 
 
1059c3e
 
 
3135113
 
 
 
1059c3e
3135113
 
1059c3e
3135113
 
 
 
1059c3e
3135113
 
 
 
 
1059c3e
3135113
 
be42ab9
 
3135113
 
 
 
 
 
 
 
 
 
 
 
1059c3e
3135113
 
1059c3e
3135113
1059c3e
3135113
1059c3e
 
 
3135113
 
 
 
 
1059c3e
be42ab9
1059c3e
 
 
 
be42ab9
3135113
 
 
 
 
 
 
 
 
 
 
1059c3e
3135113
1059c3e
3135113
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1059c3e
3135113
1059c3e
 
 
3135113
 
 
 
1059c3e
3135113
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
"""
Minimal Text-to-Speech API using Coqui TTS VITS model
FastAPI application for Hugging Face Spaces
"""

import logging
import os
import tempfile
from pathlib import Path
from typing import Optional

import uvicorn
from fastapi import BackgroundTasks, FastAPI, Form, HTTPException
from fastapi.responses import FileResponse
from pydantic import BaseModel

# Import TTS
try:
    from TTS.api import TTS
except ImportError:
    raise ImportError("TTS library not found. Install with: pip install TTS")

# Configure logging: root logger at INFO, plus a module-level logger used by
# every handler in this file.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize FastAPI app; the title/description/version are passed to FastAPI
# as application metadata.
app = FastAPI(
    title="Text-to-Speech API",
    description="Minimal TTS API using Coqui TTS VITS model",
    version="1.0.0"
)

# Global TTS model variable. Starts as None and is assigned by the startup
# handler; request handlers check it before synthesizing.
tts_model = None

# Request model: body schema accepted by the POST /tts handler.
class TTSRequest(BaseModel):
    # The text to be converted to speech.
    text: str


@app.on_event("startup")
async def startup_event():
    """
    Load the TTS model once at startup.

    Instantiates the Coqui ``tts_models/en/ljspeech/vits`` model and stores it
    in the module-level ``tts_model`` so request handlers can reuse it.

    Raises:
        Exception: Whatever ``TTS(...)`` raised. The failure is logged with
            its traceback and the original exception is re-raised unchanged.
    """
    global tts_model

    logger.info("Loading TTS model...")
    try:
        # Use the specific VITS model requested
        model_name = "tts_models/en/ljspeech/vits"
        tts_model = TTS(model_name=model_name, progress_bar=False)
    except Exception as exc:
        # logger.exception attaches the traceback; a bare `raise` re-raises
        # the active exception without adding this frame a second time.
        logger.exception("Failed to load TTS model: %s", exc)
        raise

    logger.info("TTS model loaded successfully!")


@app.get("/")
async def root():
    """Report that the API is running, with the model and engine names."""
    return dict(
        status="healthy",
        message="Text-to-Speech API is running",
        model="tts_models/en/ljspeech/vits",
        engine="Coqui TTS",
    )


@app.get("/tts")
async def tts_get(text: str):
    """
    GET endpoint for TTS.

    Usage: GET /tts?text=Hello%20world

    Responds with 400 when ``text`` is empty or whitespace-only; otherwise
    returns whatever ``generate_speech`` produces for it.
    """
    has_content = bool(text and text.strip())
    if has_content:
        return await generate_speech(text)

    raise HTTPException(status_code=400, detail="Text parameter is required")


@app.post("/tts")
async def tts_post(
    request: TTSRequest = None,
    text: str = Form(None)
):
    """
    POST endpoint for TTS.

    Takes the text from the request model when one is supplied, otherwise
    from the ``text`` form field. Responds with 400 when neither is provided
    or when the chosen text is empty or whitespace-only.
    """
    # NOTE(review): FastAPI's form-data docs state that Form parameters and a
    # JSON body cannot be declared together in one path operation - confirm
    # that JSON requests actually populate `request`.

    # Guard clause: nothing usable was sent at all.
    if not request and not text:
        raise HTTPException(status_code=400, detail="Text is required")

    # The request model takes precedence over the form field.
    input_text = request.text if request else text

    if not (input_text and input_text.strip()):
        raise HTTPException(status_code=400, detail="Text cannot be empty")

    return await generate_speech(input_text)


def _remove_file(path: str) -> None:
    """Delete the file at ``path``; a failure to delete is ignored."""
    try:
        os.unlink(path)
    except OSError:
        # Best-effort cleanup: the file may already be gone or be locked.
        pass


async def generate_speech(text: str):
    """
    Generate speech from text using the VITS model.

    Synthesizes ``text`` into a temporary ``.wav`` file and returns it as a
    file download. The temporary file is deleted by a background task after
    the response has been sent, or immediately if synthesis fails.

    Args:
        text: The text to synthesize.

    Returns:
        A ``FileResponse`` with media type ``audio/wav`` and download name
        ``speech.wav``.

    Raises:
        HTTPException: 503 when the model has not been loaded; 500 when
            synthesis fails or produces an empty or missing file.
    """
    if tts_model is None:
        raise HTTPException(status_code=503, detail="TTS model not loaded")

    # Stays None until the temp file exists, so the error path knows whether
    # there is anything to clean up.
    output_path = None
    try:
        # Reserve a path only; the file is closed again before the model
        # writes to it.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            output_path = tmp_file.name

        logger.info(f"Generating speech for text: '{text[:50]}...'")

        # NOTE(review): this is a synchronous call inside a coroutine, so it
        # presumably blocks the event loop while synthesizing - confirm that
        # is acceptable before moving it to a worker thread.
        tts_model.tts_to_file(
            text=text,
            file_path=output_path
        )

        # Verify the file was created and has content
        audio_size = os.path.getsize(output_path) if os.path.exists(output_path) else 0
        if audio_size == 0:
            raise RuntimeError("Generated audio file is empty or was not created")

        logger.info(f"Speech generated successfully, file size: {audio_size} bytes")

        # Remove the temp file once the response has been sent; without this
        # every successful request leaves a .wav behind on disk.
        cleanup = BackgroundTasks()
        cleanup.add_task(_remove_file, output_path)

        return FileResponse(
            path=output_path,
            media_type="audio/wav",
            filename="speech.wav",
            headers={
                "Content-Disposition": "attachment; filename=speech.wav",
                "Cache-Control": "no-cache"
            },
            background=cleanup
        )

    except Exception as e:
        logger.exception("Error generating speech: %s", e)
        # Clean up output file on error
        if output_path is not None:
            _remove_file(output_path)
        raise HTTPException(
            status_code=500,
            detail=f"Failed to generate speech: {str(e)}"
        ) from e


@app.get("/health")
async def health_check():
    """Report service status and whether the TTS model has been loaded."""
    model_ready = tts_model is not None
    return dict(
        status="healthy",
        model_loaded=model_ready,
        model_name="tts_models/en/ljspeech/vits",
    )


if __name__ == "__main__":
    # For local development and HF Spaces: when this file is run as a script,
    # serve the app with uvicorn on host 0.0.0.0, port 7860.
    uvicorn.run(app, host="0.0.0.0", port=7860)