"""
TTS API — FastAPI Service using Piper TTS (standalone binary)

Supports: English (Male/Female) & Arabic (Male)
Designed for: Hugging Face Spaces (Docker SDK)
"""

import io
import os
import time
import logging
import subprocess
import tempfile
from enum import Enum
from pathlib import Path
from contextlib import asynccontextmanager

from fastapi import FastAPI, HTTPException
from fastapi.responses import Response
from pydantic import BaseModel, Field

# ─── Logging ────────────────────────────────────────────────────────────────────
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s │ %(levelname)-7s │ %(message)s",
    datefmt="%H:%M:%S",
)
log = logging.getLogger("tts-api")

# ─── Constants ──────────────────────────────────────────────────────────────────
# Both locations can be overridden through environment variables.
MODELS_DIR = Path(os.environ.get("MODELS_DIR", "/app/models"))
PIPER_BIN = os.environ.get("PIPER_BIN", "/app/piper/piper")

# Voice registry: language → gender → model filename
VOICE_MAP = {
    "en": {
        "male": "en_US-lessac-medium.onnx",
        "female": "en_US-amy-medium.onnx",
    },
    "ar": {
        "male": "ar_JO-kareem-low.onnx",
        "female": "ar_JO-kareem-low.onnx",  # fallback: same as male
    },
}


# ─── Enums ──────────────────────────────────────────────────────────────────────
class Language(str, Enum):
    """Language codes accepted by the API; values are the keys of VOICE_MAP."""

    english = "en"
    arabic = "ar"


class Gender(str, Enum):
    """Voice genders accepted by the API; values are the inner keys of VOICE_MAP."""

    male = "male"
    female = "female"


# ─── Request Model ──────────────────────────────────────────────────────────────
class TTSRequest(BaseModel):
    """Request body for POST /generate."""

    text: str = Field(
        ...,
        min_length=1,
        max_length=5000,
        description="Text to convert to speech",
        examples=["Hello, welcome to our text to speech service."],
    )
    language: Language = Field(
        default=Language.english,
        description="Language of the text",
    )
    gender: Gender = Field(
        default=Gender.male,
        description="Voice gender",
    )
    # The lower bound is strictly positive, so 1.0 / speed in generate() is safe.
    speed: float = Field(
        default=1.0,
        ge=0.25,
        le=4.0,
        description="Speech speed multiplier (0.25 = very slow, 4.0 = very fast)",
    )


class HealthResponse(BaseModel):
    """Response body for GET /health."""

    status: str
    piper_binary: str
    available_voices: list[str]


# ─── Startup Validation ────────────────────────────────────────────────────────
def _validate_setup():
    """Log whether the piper binary and every registered voice model exist.

    This never raises for a missing or unusable asset: problems are only
    logged, so the application still starts and /health can report them.
    When the binary is missing the model checks are skipped.
    """
    # Check piper binary
    if not os.path.isfile(PIPER_BIN):
        log.error(f"✗ Piper binary not found at: {PIPER_BIN}")
        return

    try:
        os.chmod(PIPER_BIN, 0o755)
    except OSError as exc:
        # chmod fails when this process does not own the file or the
        # filesystem is read-only. That must not abort startup: the binary
        # may already be executable, and generate() reports real failures.
        log.warning(f"✗ Could not chmod piper binary ({exc}); keeping existing permissions")
    log.info(f"✓ Piper binary: {PIPER_BIN}")

    # Check models: each voice needs the .onnx model and its .onnx.json config
    for lang, genders in VOICE_MAP.items():
        for gender, model_file in genders.items():
            model_path = MODELS_DIR / model_file
            config_path = MODELS_DIR / f"{model_file}.json"
            if model_path.exists() and config_path.exists():
                log.info(f"✓ Voice ready: {lang}-{gender}")
            else:
                log.warning(f"✗ Missing: {lang}-{gender} ({model_file})")


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Validate setup on startup and log on shutdown."""
    log.info("=" * 50)
    log.info("Starting TTS API")
    log.info("=" * 50)
    _validate_setup()
    yield
    log.info("Shutting down TTS API")


# ─── FastAPI App ────────────────────────────────────────────────────────────────
app = FastAPI(
    title="TTS API",
    description=(
        "Text-to-Speech API.\n\n"
        "Supports **English** and **Arabic** with **Male** and **Female** voices."
    ),
    version="1.0.0",
    lifespan=lifespan,
)


# ─── Endpoints ──────────────────────────────────────────────────────────────────
@app.get("/", tags=["Health"])
def root():
    """Root endpoint."""
    return {
        "service": "TTS API",
        "version": "1.0.0",
        "docs": "/docs",
    }


@app.get("/health", response_model=HealthResponse, tags=["Health"])
def health():
    """Health check.

    Reports "healthy" when the piper binary exists and "degraded" otherwise.
    ``available_voices`` lists every voice registered in VOICE_MAP; the model
    files themselves are not checked here.
    """
    voices = [
        f"{lang}-{gender}"
        for lang, genders in VOICE_MAP.items()
        for gender in genders
    ]
    # Evaluate once so ``status`` and ``piper_binary`` can never disagree.
    binary_present = os.path.isfile(PIPER_BIN)
    return HealthResponse(
        status="healthy" if binary_present else "degraded",
        piper_binary="found" if binary_present else "missing",
        available_voices=voices,
    )


@app.post(
    "/generate",
    tags=["TTS"],
    responses={
        200: {
            "content": {
                "audio/wav": {
                    "schema": {"type": "string", "format": "binary"}
                }
            },
            "description": "Generated WAV audio file",
        }
    },
)
def generate(request: TTSRequest):
    """Generate speech audio from text. Returns a WAV file.

    The text is piped to the piper binary on stdin; piper writes the audio to
    a temporary file, which is read back and always deleted afterwards.

    Raises:
        HTTPException 400: the language or gender is not in VOICE_MAP.
        HTTPException 500: the model file or piper binary is missing, piper
            exits with a non-zero status, or it produces no audio.
        HTTPException 504: piper does not finish within 60 seconds.
    """
    lang = request.language.value
    gender = request.gender.value

    # Validate
    if lang not in VOICE_MAP:
        raise HTTPException(status_code=400, detail=f"Unsupported language: {lang}")
    if gender not in VOICE_MAP[lang]:
        raise HTTPException(status_code=400, detail=f"Unsupported gender: {gender}")

    model_file = VOICE_MAP[lang][gender]
    model_path = str(MODELS_DIR / model_file)

    if not os.path.isfile(model_path):
        raise HTTPException(status_code=500, detail="Model file not found")
    if not os.path.isfile(PIPER_BIN):
        raise HTTPException(status_code=500, detail="Piper binary not found")

    log.info(f"Generating: lang={lang}, gender={gender}, "
             f"speed={request.speed}, text_len={len(request.text)}")

    start_time = time.perf_counter()
    # Sentinel so the ``finally`` clause knows whether a temp file was created.
    output_path = None

    try:
        # Reserve a unique path and close the handle right away; piper opens
        # the path itself, so the file must outlive the ``with`` block.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            output_path = tmp.name

        # length_scale is inverse of speed: lower = faster
        length_scale = 1.0 / request.speed

        cmd = [
            PIPER_BIN,
            "--model", model_path,
            "--output_file", output_path,
            "--length-scale", str(length_scale),
        ]

        process = subprocess.run(
            cmd,
            input=request.text,
            capture_output=True,
            text=True,
            encoding="utf-8",
            # A stray non-UTF-8 byte in piper's diagnostics must not turn a
            # successful synthesis into a UnicodeDecodeError.
            errors="replace",
            timeout=60,
        )

        if process.returncode != 0:
            error_msg = process.stderr.strip() if process.stderr else "Unknown error"
            log.error(f"✗ Piper failed: {error_msg}")
            raise RuntimeError(f"Piper error: {error_msg}")

        # Read the generated audio file
        with open(output_path, "rb") as f:
            audio_bytes = f.read()

        # 44 bytes is the size of a canonical WAV header, i.e. no samples.
        if len(audio_bytes) <= 44:
            raise RuntimeError("No audio generated — empty WAV file")

        duration_ms = (time.perf_counter() - start_time) * 1000
        log.info(f"✓ Generated {len(audio_bytes)} bytes in {duration_ms:.0f}ms")

        return Response(
            content=audio_bytes,
            media_type="audio/wav",
            headers={
                "Content-Disposition": "attachment; filename=speech.wav",
                "X-Generation-Time-Ms": f"{duration_ms:.0f}",
            },
        )

    except subprocess.TimeoutExpired as exc:
        log.error("✗ Synthesis timed out after 60s")
        raise HTTPException(status_code=504, detail="Synthesis timed out") from exc
    except HTTPException:
        raise
    except Exception as e:
        log.error(f"✗ Synthesis failed: {e}")
        raise HTTPException(status_code=500, detail=f"Synthesis failed: {str(e)}") from e
    finally:
        # Single cleanup point for both the success and the failure path.
        if output_path is not None and os.path.exists(output_path):
            try:
                os.unlink(output_path)
            except OSError as exc:
                # A leftover temp file must not replace the real response.
                log.warning(f"Could not remove temp file {output_path}: {exc}")