tts_testing_2_m / main.py
talha77's picture
Upload 3 files
26b8b20 verified
"""
TTS API — FastAPI Service using Piper TTS (standalone binary)
Supports: English (Male/Female) & Arabic (Male)
Designed for: Hugging Face Spaces (Docker SDK)
"""
import io
import os
import time
import logging
import subprocess
import tempfile
from enum import Enum
from pathlib import Path
from contextlib import asynccontextmanager
from fastapi import FastAPI, HTTPException
from fastapi.responses import Response
from pydantic import BaseModel, Field
# ─── Logging ──────────────────────────────────────────────────────────────────
# Root logging configuration: INFO and above, time-of-day timestamps, and the
# level name left-aligned in a 7-character column.
_LOG_FORMAT = "%(asctime)s β”‚ %(levelname)-7s β”‚ %(message)s"
_LOG_DATE_FORMAT = "%H:%M:%S"

logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT, datefmt=_LOG_DATE_FORMAT)

# Logger used by the rest of this module.
log = logging.getLogger("tts-api")
# ─── Constants ────────────────────────────────────────────────────────────────
# Both locations can be overridden through an environment variable of the same
# name; the literals are the fallbacks used when the variable is unset.
MODELS_DIR = Path(os.getenv("MODELS_DIR", "/app/models"))
PIPER_BIN = os.getenv("PIPER_BIN", "/app/piper/piper")
# Voice registry: language → gender → model filename
# Only one Arabic model is registered, so the "female" slot falls back to the
# same file as the "male" slot.
_ARABIC_MODEL = "ar_JO-kareem-low.onnx"

VOICE_MAP = {
    "en": dict(male="en_US-lessac-medium.onnx", female="en_US-amy-medium.onnx"),
    "ar": dict(male=_ARABIC_MODEL, female=_ARABIC_MODEL),
}
# ─── Enums ────────────────────────────────────────────────────────────────────
class Language(str, Enum):
    # Language codes accepted in requests. Members are also plain strings, and
    # their values are the top-level keys of VOICE_MAP.
    english = "en"
    arabic = "ar"
class Gender(str, Enum):
    # Voice genders accepted in requests. Members are also plain strings, and
    # their values are the second-level keys of VOICE_MAP.
    male = "male"
    female = "female"
# ─── Request Model ────────────────────────────────────────────────────────────
class TTSRequest(BaseModel):
    # Request body for speech generation.

    # Required; length is constrained to 1-5000.
    text: str = Field(
        ...,
        min_length=1,
        max_length=5000,
        description="Text to convert to speech",
        examples=["Hello, welcome to our text to speech service."],
    )

    # Optional; English when omitted.
    language: Language = Field(default=Language.english, description="Language of the text")

    # Optional; male when omitted.
    gender: Gender = Field(default=Gender.male, description="Voice gender")

    # Optional multiplier, constrained to the inclusive range 0.25-4.0.
    speed: float = Field(
        default=1.0,
        ge=0.25,
        le=4.0,
        description="Speech speed multiplier (0.25 = very slow, 4.0 = very fast)",
    )
class HealthResponse(BaseModel):
    # Response body returned by the /health endpoint.
    status: str  # "healthy" or "degraded"
    piper_binary: str  # "found" or "missing"
    available_voices: list[str]  # "<language>-<gender>" labels
# ─── Startup Validation ───────────────────────────────────────────────────────
def _validate_setup() -> None:
    """Log whether the Piper binary and every configured voice model exist.

    This is purely diagnostic: findings go to the module logger and nothing is
    raised, so the service still starts when something is missing.

    The binary's mode is changed only when it is not already executable, and a
    failure to change it (for example on a read-only filesystem) is logged
    rather than allowed to abort startup. Voice models are checked even when
    the binary is missing, so a single startup log lists every problem.
    """
    if os.path.isfile(PIPER_BIN):
        if not os.access(PIPER_BIN, os.X_OK):
            try:
                os.chmod(PIPER_BIN, 0o755)
            except OSError as exc:
                log.warning(
                    "Could not make Piper binary executable (%s): %s", PIPER_BIN, exc
                )
        log.info("βœ“ Piper binary: %s", PIPER_BIN)
    else:
        log.error("βœ— Piper binary not found at: %s", PIPER_BIN)

    for lang, genders in VOICE_MAP.items():
        for gender, model_file in genders.items():
            model_path = MODELS_DIR / model_file
            # Each model is expected to ship with "<model file>.json" next to it.
            config_path = MODELS_DIR / f"{model_file}.json"
            if model_path.is_file() and config_path.is_file():
                log.info("βœ“ Voice ready: %s-%s", lang, gender)
            else:
                log.warning("βœ— Missing: %s-%s (%s)", lang, gender, model_file)
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Log a startup banner and validate the setup, then log on shutdown."""
    rule = "=" * 50
    for banner_line in (rule, "Starting TTS API", rule):
        log.info(banner_line)
    _validate_setup()

    # Everything above runs on entering the context; the line below on leaving.
    yield

    log.info("Shutting down TTS API")
# ─── FastAPI App ──────────────────────────────────────────────────────────────
# Description handed to FastAPI; it contains Markdown bold markers.
_API_DESCRIPTION = (
    "Text-to-Speech API.\n\n"
    "Supports **English** and **Arabic** with **Male** and **Female** voices."
)

# The application object; startup and shutdown work is delegated to `lifespan`.
app = FastAPI(
    lifespan=lifespan,
    title="TTS API",
    version="1.0.0",
    description=_API_DESCRIPTION,
)
# ─── Endpoints ────────────────────────────────────────────────────────────────
@app.get("/", tags=["Health"])
def root():
    """Root endpoint."""
    # Static descriptor: service name, version, and the docs path.
    return dict(service="TTS API", version="1.0.0", docs="/docs")
@app.get("/health", response_model=HealthResponse, tags=["Health"])
def health():
    """Health check.

    Reports "healthy" only when the Piper binary and every configured voice
    are present on disk, otherwise "degraded". The available voices list
    contains only the voices whose model files were found.
    """
    binary_present = os.path.isfile(PIPER_BIN)

    # A voice counts as available only when both the model and its
    # "<model file>.json" config exist, matching the startup validation.
    voices = []
    configured_count = 0
    for lang, genders in VOICE_MAP.items():
        for gender, model_file in genders.items():
            configured_count += 1
            model_path = MODELS_DIR / model_file
            config_path = MODELS_DIR / f"{model_file}.json"
            if model_path.is_file() and config_path.is_file():
                voices.append(f"{lang}-{gender}")

    fully_ready = binary_present and len(voices) == configured_count
    return HealthResponse(
        status="healthy" if fully_ready else "degraded",
        piper_binary="found" if binary_present else "missing",
        available_voices=voices,
    )
# Seconds Piper may run for a single request before the run is abandoned.
_SYNTHESIS_TIMEOUT_S = 60
# Output of this many bytes or fewer is treated as "no audio"; 44 bytes is the
# size of a canonical PCM WAV header with no samples after it.
_WAV_HEADER_BYTES = 44


def _remove_quietly(path: str) -> None:
    """Delete *path* best-effort so cleanup can never mask the real error."""
    try:
        os.unlink(path)
    except FileNotFoundError:
        pass  # already gone; nothing to clean up
    except OSError as exc:
        log.warning("Could not remove temporary file %s: %s", path, exc)


def _run_piper(text: str, model_path: str, length_scale: float) -> bytes:
    """Run the Piper binary on *text* and return the WAV bytes it wrote.

    Args:
        text: Text fed to Piper on standard input.
        model_path: Path of the voice model passed via ``--model``.
        length_scale: Value passed via ``--length-scale``.

    Raises:
        subprocess.TimeoutExpired: Piper ran longer than the timeout.
        RuntimeError: Piper exited with a non-zero status or wrote no audio.
        OSError: The binary could not be started or its output not read.
    """
    # Piper writes to a path, so reserve a unique name and close our own handle
    # before the subprocess opens the file.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_path = tmp.name

    try:
        completed = subprocess.run(
            [
                PIPER_BIN,
                "--model", model_path,
                "--output_file", output_path,
                "--length-scale", str(length_scale),
            ],
            input=text,
            capture_output=True,
            text=True,
            encoding="utf-8",
            # Undecodable bytes in Piper's output must not fail a good run.
            errors="replace",
            timeout=_SYNTHESIS_TIMEOUT_S,
        )
        if completed.returncode != 0:
            error_msg = (completed.stderr or "").strip() or "Unknown error"
            log.error("βœ— Piper failed: %s", error_msg)
            raise RuntimeError(f"Piper error: {error_msg}")

        with open(output_path, "rb") as wav_file:
            audio_bytes = wav_file.read()
    finally:
        # Single cleanup point for success, failure and timeout alike.
        _remove_quietly(output_path)

    if len(audio_bytes) <= _WAV_HEADER_BYTES:
        raise RuntimeError("No audio generated β€” empty WAV file")
    return audio_bytes


@app.post(
    "/generate",
    tags=["TTS"],
    responses={
        200: {
            "content": {
                "audio/wav": {
                    "schema": {"type": "string", "format": "binary"}
                }
            },
            "description": "Generated WAV audio file",
        }
    },
)
def generate(request: TTSRequest):
    """Generate speech audio from text. Returns a WAV file.

    Error responses: 400 for an unsupported voice or blank text, 500 when the
    model file, the Piper binary, or the synthesis itself fails, and 504 when
    synthesis exceeds the time limit.
    """
    lang = request.language.value
    gender = request.gender.value

    # Validate the request before touching the filesystem.
    if lang not in VOICE_MAP:
        raise HTTPException(status_code=400, detail=f"Unsupported language: {lang}")
    if gender not in VOICE_MAP[lang]:
        raise HTTPException(status_code=400, detail=f"Unsupported gender: {gender}")
    # The model's min_length accepts whitespace-only text, which has nothing
    # to speak; reject it here rather than fail later with a 500.
    if not request.text.strip():
        raise HTTPException(
            status_code=400, detail="Text must contain non-whitespace characters"
        )

    model_path = str(MODELS_DIR / VOICE_MAP[lang][gender])
    if not os.path.isfile(model_path):
        raise HTTPException(status_code=500, detail="Model file not found")
    if not os.path.isfile(PIPER_BIN):
        raise HTTPException(status_code=500, detail="Piper binary not found")

    log.info(
        "Generating: lang=%s, gender=%s, speed=%s, text_len=%s",
        lang, gender, request.speed, len(request.text),
    )
    start_time = time.perf_counter()

    try:
        # length_scale is the inverse of speed: lower = faster.
        audio_bytes = _run_piper(request.text, model_path, 1.0 / request.speed)
    except subprocess.TimeoutExpired as exc:
        log.error("βœ— Synthesis timed out after %ss", _SYNTHESIS_TIMEOUT_S)
        raise HTTPException(status_code=504, detail="Synthesis timed out") from exc
    except Exception as exc:
        # Endpoint boundary: keep the traceback in the log and answer with 500.
        log.exception("βœ— Synthesis failed: %s", exc)
        raise HTTPException(
            status_code=500, detail=f"Synthesis failed: {exc}"
        ) from exc

    duration_ms = (time.perf_counter() - start_time) * 1000
    log.info("βœ“ Generated %d bytes in %.0fms", len(audio_bytes), duration_ms)
    return Response(
        content=audio_bytes,
        media_type="audio/wav",
        headers={
            "Content-Disposition": "attachment; filename=speech.wav",
            "X-Generation-Time-Ms": f"{duration_ms:.0f}",
        },
    )