Spaces:
Sleeping
Sleeping
| """ | |
TTS API — FastAPI Service using Piper TTS (standalone binary)
Supports: English (Male/Female) & Arabic (Male)
Designed for: Hugging Face Spaces (Docker SDK)
| """ | |
| import io | |
| import os | |
| import time | |
| import logging | |
| import subprocess | |
| import tempfile | |
| from enum import Enum | |
| from pathlib import Path | |
| from contextlib import asynccontextmanager | |
| from fastapi import FastAPI, HTTPException | |
| from fastapi.responses import Response | |
| from pydantic import BaseModel, Field | |
# ─── Logging ──────────────────────────────────────────────────────────────────
# Root logger configuration: INFO and above, time-of-day timestamps only.
logging.basicConfig(
    format="%(asctime)s β %(levelname)-7s β %(message)s",
    datefmt="%H:%M:%S",
    level=logging.INFO,
)

# Module-wide logger used by every function in this service.
log = logging.getLogger("tts-api")
# ─── Constants ────────────────────────────────────────────────────────────────
# Filesystem locations. Each one can be overridden through the environment
# variable of the same name; the literals below are the fallbacks.
MODELS_DIR = Path(os.getenv("MODELS_DIR", "/app/models"))
PIPER_BIN = os.getenv("PIPER_BIN", "/app/piper/piper")

# Voice registry: language → gender → model filename
# (filenames are resolved relative to MODELS_DIR by the code that uses them).
VOICE_MAP = {
    "en": {
        "male": "en_US-lessac-medium.onnx",
        "female": "en_US-amy-medium.onnx",
    },
    "ar": {
        "male": "ar_JO-kareem-low.onnx",
        # fallback: same as male
        "female": "ar_JO-kareem-low.onnx",
    },
}
# ─── Enums ────────────────────────────────────────────────────────────────────
class Language(str, Enum):
    """Language codes accepted by the API.

    Members are also ``str`` instances, so each one compares equal to its
    code (``"en"`` / ``"ar"``).
    """

    english = "en"
    arabic = "ar"
class Gender(str, Enum):
    """Voice genders accepted by the API.

    Members are also ``str`` instances, so each one compares equal to its
    value (``"male"`` / ``"female"``).
    """

    male = "male"
    female = "female"
# ─── Request Model ────────────────────────────────────────────────────────────
class TTSRequest(BaseModel):
    """Request body for speech synthesis.

    Only ``text`` is required; the other fields fall back to English, a male
    voice and normal speed.
    """

    # Required, between 1 and 5000 characters.
    text: str = Field(
        ...,
        min_length=1,
        max_length=5000,
        description="Text to convert to speech",
        examples=["Hello, welcome to our text to speech service."],
    )

    language: Language = Field(
        default=Language.english,
        description="Language of the text",
    )

    gender: Gender = Field(
        default=Gender.male,
        description="Voice gender",
    )

    # Bounded to [0.25, 4.0]; the lower bound also keeps the value non-zero.
    speed: float = Field(
        default=1.0,
        ge=0.25,
        le=4.0,
        description="Speech speed multiplier (0.25 = very slow, 4.0 = very fast)",
    )
class HealthResponse(BaseModel):
    """Response body returned by the health check."""

    # Overall service state reported by the health check.
    status: str
    # Whether the Piper executable was located.
    piper_binary: str
    # Voice identifiers, one string per entry.
    available_voices: list[str]
# ─── Startup Validation ───────────────────────────────────────────────────────
def _validate_setup():
    """Log whether the Piper binary and every registered voice are present.

    This is a diagnostic pass only: it never raises for a missing file and
    returns nothing. For the binary it also tries to set mode ``0o755``; a
    failure to do so is logged as a warning instead of aborting startup.
    Every voice in ``VOICE_MAP`` is checked even when the binary is missing,
    so one run reports all problems at once.
    """
    # Check piper binary
    if os.path.isfile(PIPER_BIN):
        try:
            os.chmod(PIPER_BIN, 0o755)
        except OSError as exc:
            # chmod fails when the process does not own the file. The binary
            # may already be executable, so warn and carry on.
            log.warning(f"Could not change permissions of {PIPER_BIN}: {exc}")
        log.info(f"β Piper binary: {PIPER_BIN}")
    else:
        log.error(f"β Piper binary not found at: {PIPER_BIN}")

    # Check models: a voice is ready only when both the model file and its
    # "<model>.json" config sit in MODELS_DIR.
    for lang, genders in VOICE_MAP.items():
        for gender, model_file in genders.items():
            model_path = MODELS_DIR / model_file
            config_path = MODELS_DIR / f"{model_file}.json"
            if model_path.exists() and config_path.exists():
                log.info(f"β Voice ready: {lang}-{gender}")
            else:
                log.warning(f"β Missing: {lang}-{gender} ({model_file})")
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Run startup validation before serving and log on shutdown.

    The code before ``yield`` logs a banner and calls ``_validate_setup()``;
    the code after it logs the shutdown message. The function is wrapped with
    ``asynccontextmanager`` so it can be passed as ``FastAPI(lifespan=...)``
    as an async context manager rather than a bare async generator.

    Args:
        app: The application instance; not used by this function.
    """
    banner = "=" * 50
    log.info(banner)
    log.info("Starting TTS API")
    log.info(banner)
    _validate_setup()
    yield
    log.info("Shutting down TTS API")
# ─── FastAPI App ──────────────────────────────────────────────────────────────
# Application instance. `lifespan` is the startup/shutdown hook defined above;
# the description is Markdown.
app = FastAPI(
    lifespan=lifespan,
    title="TTS API",
    version="1.0.0",
    description=(
        "Text-to-Speech API.\n\n"
        "Supports **English** and **Arabic** with **Male** and **Female** voices."
    ),
)
# ─── Endpoints ────────────────────────────────────────────────────────────────
# NOTE(review): no route decorator is visible on this function in this view —
# confirm how (and under which path) it is registered on `app`.
def root():
    """Return static service metadata: name, version and the docs path."""
    return dict(
        service="TTS API",
        version="1.0.0",
        docs="/docs",
    )
# NOTE(review): no route decorator is visible on this function in this view —
# confirm how (and under which path) it is registered on `app`.
def health():
    """Report service health based on the presence of the Piper binary.

    Returns:
        A ``HealthResponse`` with ``status`` "healthy" or "degraded" and
        ``piper_binary`` "found" or "missing", both depending on whether
        ``PIPER_BIN`` is an existing file. ``available_voices`` lists every
        ``"<lang>-<gender>"`` pair registered in ``VOICE_MAP``; model files
        are not checked here.
    """
    binary_present = os.path.isfile(PIPER_BIN)
    voice_ids = [
        f"{lang}-{gender}"
        for lang, genders in VOICE_MAP.items()
        for gender in genders
    ]
    return HealthResponse(
        status="healthy" if binary_present else "degraded",
        piper_binary="found" if binary_present else "missing",
        available_voices=voice_ids,
    )
# NOTE(review): no route decorator is visible on this function in this view —
# confirm how (and under which path) it is registered on `app`.
def generate(request: TTSRequest):
    """Generate speech audio from text. Returns a WAV file.

    The text is piped to the Piper binary on stdin. Piper writes a WAV to a
    temporary file, which is read back into memory and always removed before
    the function exits.

    Args:
        request: Validated synthesis parameters (text, language, gender,
            speed).

    Returns:
        A ``Response`` with media type ``audio/wav``, an attachment
        ``Content-Disposition`` header and an ``X-Generation-Time-Ms`` header
        holding the elapsed time in milliseconds.

    Raises:
        HTTPException: 400 for a language or gender missing from
            ``VOICE_MAP``; 500 when the model file or Piper binary is
            missing, Piper exits non-zero, or the output holds no audio;
            504 when Piper does not finish within 60 seconds.
    """
    lang = request.language.value
    gender = request.gender.value

    # Validate against the registry in case an enum member has no voice entry.
    if lang not in VOICE_MAP:
        raise HTTPException(status_code=400, detail=f"Unsupported language: {lang}")
    if gender not in VOICE_MAP[lang]:
        raise HTTPException(status_code=400, detail=f"Unsupported gender: {gender}")

    model_file = VOICE_MAP[lang][gender]
    model_path = str(MODELS_DIR / model_file)
    if not os.path.isfile(model_path):
        raise HTTPException(status_code=500, detail="Model file not found")
    if not os.path.isfile(PIPER_BIN):
        raise HTTPException(status_code=500, detail="Piper binary not found")

    log.info(f"Generating: lang={lang}, gender={gender}, "
             f"speed={request.speed}, text_len={len(request.text)}")
    start_time = time.perf_counter()

    # Bound before the try block so the cleanup in `finally` can tell whether
    # a temp file was ever created, without inspecting locals().
    output_path = None
    try:
        # Reserve a unique path for Piper's output. delete=False keeps the
        # file after the handle closes; it is removed in `finally`.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            output_path = tmp.name

        # length_scale is inverse of speed: lower = faster
        length_scale = 1.0 / request.speed
        cmd = [
            PIPER_BIN,
            "--model", model_path,
            "--output_file", output_path,
            "--length-scale", str(length_scale),
        ]
        process = subprocess.run(
            cmd,
            input=request.text,
            capture_output=True,
            text=True,
            encoding="utf-8",
            timeout=60,
        )
        if process.returncode != 0:
            error_msg = process.stderr.strip() if process.stderr else "Unknown error"
            log.error(f"β Piper failed: {error_msg}")
            raise RuntimeError(f"Piper error: {error_msg}")

        # Read the generated audio file
        with open(output_path, "rb") as f:
            audio_bytes = f.read()

        # 44 bytes is the size of a canonical PCM WAV header, so a file that
        # is no longer than that contains no audio samples.
        if len(audio_bytes) <= 44:
            raise RuntimeError("No audio generated β empty WAV file")

        duration_ms = (time.perf_counter() - start_time) * 1000
        log.info(f"β Generated {len(audio_bytes)} bytes in {duration_ms:.0f}ms")
        return Response(
            content=audio_bytes,
            media_type="audio/wav",
            headers={
                "Content-Disposition": "attachment; filename=speech.wav",
                "X-Generation-Time-Ms": f"{duration_ms:.0f}",
            },
        )
    except subprocess.TimeoutExpired as exc:
        log.error("β Synthesis timed out after 60s")
        raise HTTPException(status_code=504, detail="Synthesis timed out") from exc
    except HTTPException:
        raise
    except Exception as e:
        log.error(f"β Synthesis failed: {e}")
        raise HTTPException(status_code=500, detail=f"Synthesis failed: {str(e)}") from e
    finally:
        # Single cleanup point for the temp file on every exit path.
        if output_path is not None:
            try:
                os.unlink(output_path)
            except FileNotFoundError:
                pass