Spaces:

talha77
/

tts_testing_2_m

Sleeping

App Files Files Community

tts_testing_2_m / main.py

talha77

Upload 3 files

26b8b20 verified about 2 months ago

raw

history blame contribute delete

8.87 kB

	"""
	TTS API — FastAPI Service using Piper TTS (standalone binary)
	Supports: English (Male/Female) & Arabic (Male)
	Designed for: Hugging Face Spaces (Docker SDK)
	"""

	import io
	import os
	import time
	import logging
	import subprocess
	import tempfile
	from enum import Enum
	from pathlib import Path
	from contextlib import asynccontextmanager

	from fastapi import FastAPI, HTTPException
	from fastapi.responses import Response
	from pydantic import BaseModel, Field

	# ─── Logging ────────────────────────────────────────────────────────────────────
	# Root logging: INFO and above, one line per record with a clock-only timestamp,
	# a left-aligned level name padded to 7 characters, and the message.
	logging.basicConfig(
	    datefmt="%H:%M:%S",
	    format="%(asctime)s │ %(levelname)-7s │ %(message)s",
	    level=logging.INFO,
	)
	# Named logger shared by the rest of this module.
	log = logging.getLogger("tts-api")

	# ─── Constants ──────────────────────────────────────────────────────────────────
	# Directory holding the voice model files; overridable via the MODELS_DIR env var.
	MODELS_DIR = Path(os.getenv("MODELS_DIR", "/app/models"))
	# Path to the Piper executable; overridable via the PIPER_BIN env var.
	PIPER_BIN = os.getenv("PIPER_BIN", "/app/piper/piper")

	# Voice registry, keyed by language code and then by gender; each value is
	# the model filename looked up inside MODELS_DIR.
	VOICE_MAP = {
	    "en": dict(
	        male="en_US-lessac-medium.onnx",
	        female="en_US-amy-medium.onnx",
	    ),
	    "ar": dict(
	        male="ar_JO-kareem-low.onnx",
	        # Fallback: the Arabic "female" entry reuses the male model file.
	        female="ar_JO-kareem-low.onnx",
	    ),
	}


	# ─── Enums ──────────────────────────────────────────────────────────────────────
	class Language(str, Enum):
	    # Languages a request may select. Each member's value is the language
	    # code used as the top-level key of VOICE_MAP.
	    english = "en"  # English voices
	    arabic = "ar"  # Arabic voices


	class Gender(str, Enum):
	    # Voice genders a request may select. Each member's value is the key
	    # used in the per-language entries of VOICE_MAP.
	    male = "male"  # male voice
	    female = "female"  # female voice


	# ─── Request Model ──────────────────────────────────────────────────────────────
	class TTSRequest(BaseModel):
	    # Request body for the /generate endpoint.

	    # Required input text, between 1 and 5000 characters long.
	    text: str = Field(
	        ...,
	        min_length=1,
	        max_length=5000,
	        description="Text to convert to speech",
	        examples=["Hello, welcome to our text to speech service."],
	    )

	    # Language of the text; English when omitted.
	    language: Language = Field(default=Language.english, description="Language of the text")

	    # Requested voice gender; male when omitted.
	    gender: Gender = Field(default=Gender.male, description="Voice gender")

	    # Speed multiplier constrained to the inclusive range [0.25, 4.0]; 1.0 when omitted.
	    speed: float = Field(
	        default=1.0,
	        ge=0.25,
	        le=4.0,
	        description="Speech speed multiplier (0.25 = very slow, 4.0 = very fast)",
	    )


	class HealthResponse(BaseModel):
	    # Response body returned by the /health endpoint.

	    # Overall service state ("healthy" or "degraded" as produced by health()).
	    status: str
	    # Whether the Piper executable was located ("found" or "missing").
	    piper_binary: str
	    # Voice identifiers in "<language>-<gender>" form.
	    available_voices: list[str]


	# ─── Startup Validation ────────────────────────────────────────────────────────
	def _validate_setup() -> None:
	    """Log whether the Piper binary and each configured voice model are present.

	    Intended to run once at startup. This function only reports problems
	    through the module logger; it never raises for a missing binary, a
	    missing model, or a failed permission change, so the service can still
	    start and report its state through the health endpoint.
	    """
	    # Without the binary nothing else is worth checking.
	    if not os.path.isfile(PIPER_BIN):
	        log.error(f"✗ Piper binary not found at: {PIPER_BIN}")
	        return

	    # Setting the executable bit is best-effort: chmod fails with an OSError
	    # (e.g. PermissionError) when the process does not own the file or the
	    # filesystem is read-only, and that must not abort application startup.
	    try:
	        os.chmod(PIPER_BIN, 0o755)
	    except OSError as exc:
	        log.warning(f"Could not change permissions on {PIPER_BIN}: {exc}")

	    if os.access(PIPER_BIN, os.X_OK):
	        log.info(f"✓ Piper binary: {PIPER_BIN}")
	    else:
	        log.error(f"✗ Piper binary is not executable: {PIPER_BIN}")

	    # A voice counts as ready only when both the model file and its
	    # "<model>.json" config file exist in MODELS_DIR.
	    for lang, genders in VOICE_MAP.items():
	        for gender, model_file in genders.items():
	            model_path = MODELS_DIR / model_file
	            config_path = MODELS_DIR / f"{model_file}.json"
	            if model_path.exists() and config_path.exists():
	                log.info(f"✓ Voice ready: {lang}-{gender}")
	            else:
	                log.warning(f"✗ Missing: {lang}-{gender} ({model_file})")


	@asynccontextmanager
	async def lifespan(app: FastAPI):
	    """Lifespan hook: log a startup banner, validate the setup, and log on shutdown."""
	    banner = "=" * 50
	    for line in (banner, "Starting TTS API", banner):
	        log.info(line)

	    # Report on the Piper binary and voice models before serving requests.
	    _validate_setup()

	    yield

	    log.info("Shutting down TTS API")


	# ─── FastAPI App ────────────────────────────────────────────────────────────────
	# The application object; `lifespan` runs the startup validation and
	# shutdown logging around the serving period.
	app = FastAPI(
	    title="TTS API",
	    description="Text-to-Speech API.\n\nSupports English and Arabic with Male and Female voices.",
	    version="1.0.0",
	    lifespan=lifespan,
	)


	# ─── Endpoints ──────────────────────────────────────────────────────────────────
	@app.get("/", tags=["Health"])
	def root():
	    """Root endpoint."""
	    # Static service descriptor: name, version, and where the docs live.
	    service_info = dict(
	        service="TTS API",
	        version="1.0.0",
	        docs="/docs",
	    )
	    return service_info


	@app.get("/health", response_model=HealthResponse, tags=["Health"])
	def health():
	    """Health check."""
	    # Returns a HealthResponse where:
	    #   - piper_binary is "found"/"missing" depending on whether PIPER_BIN is a file;
	    #   - available_voices lists only the "<lang>-<gender>" voices whose model file
	    #     and "<model>.json" config both exist in MODELS_DIR (the same readiness
	    #     rule the startup validation logs), so callers are not told a voice is
	    #     available when its files are missing;
	    #   - status is "healthy" only when the binary is present and no configured
	    #     voice is missing, otherwise "degraded".
	    binary_found = os.path.isfile(PIPER_BIN)

	    voices = []
	    any_voice_missing = False
	    for lang, genders in VOICE_MAP.items():
	        for gender, model_file in genders.items():
	            model_path = MODELS_DIR / model_file
	            config_path = MODELS_DIR / f"{model_file}.json"
	            if model_path.exists() and config_path.exists():
	                voices.append(f"{lang}-{gender}")
	            else:
	                any_voice_missing = True

	    fully_operational = binary_found and not any_voice_missing
	    return HealthResponse(
	        status="healthy" if fully_operational else "degraded",
	        piper_binary="found" if binary_found else "missing",
	        available_voices=voices,
	    )


	# Maximum time, in seconds, the Piper subprocess may run for one request.
	_PIPER_TIMEOUT_SECONDS = 60
	# Output files of this size or smaller are treated as containing no audio
	# (the original check used the same 44-byte threshold).
	_MIN_WAV_BYTES = 44


	def _run_piper(text, model_path, length_scale):
	    """Run the Piper binary on *text* and return the bytes of the WAV it wrote.

	    Args:
	        text: Text to synthesize; sent to Piper on standard input as UTF-8.
	        model_path: Path of the voice model passed via ``--model``.
	        length_scale: Value passed via ``--length-scale``.

	    Returns:
	        The raw contents of the output file written by Piper.

	    Raises:
	        subprocess.TimeoutExpired: Piper ran longer than the timeout.
	        RuntimeError: Piper exited with a non-zero status, or the output
	            file was too small to contain audio.
	        OSError: The binary could not be started or the output not read.
	    """
	    # Piper writes to a path, so reserve a unique filename and close our own
	    # handle before the subprocess opens it. delete=False keeps the file
	    # around for Piper; it is removed in the finally block below.
	    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
	        output_path = tmp.name

	    try:
	        cmd = [
	            PIPER_BIN,
	            "--model", model_path,
	            "--output_file", output_path,
	            "--length-scale", str(length_scale),
	        ]
	        process = subprocess.run(
	            cmd,
	            input=text,
	            capture_output=True,
	            text=True,
	            encoding="utf-8",
	            timeout=_PIPER_TIMEOUT_SECONDS,
	        )

	        if process.returncode != 0:
	            error_msg = process.stderr.strip() if process.stderr else "Unknown error"
	            log.error(f"✗ Piper failed: {error_msg}")
	            raise RuntimeError(f"Piper error: {error_msg}")

	        with open(output_path, "rb") as f:
	            audio_bytes = f.read()
	    finally:
	        # Single cleanup point for every exit path. A failed removal is
	        # logged rather than raised so it cannot mask the real outcome.
	        try:
	            os.unlink(output_path)
	        except FileNotFoundError:
	            pass
	        except OSError as exc:
	            log.warning(f"Could not remove temp file {output_path}: {exc}")

	    if len(audio_bytes) <= _MIN_WAV_BYTES:
	        raise RuntimeError("No audio generated — empty WAV file")

	    return audio_bytes


	@app.post(
	    "/generate",
	    tags=["TTS"],
	    responses={
	        200: {
	            "content": {
	                "audio/wav": {
	                    "schema": {"type": "string", "format": "binary"}
	                }
	            },
	            "description": "Generated WAV audio file",
	        }
	    },
	)
	def generate(request: TTSRequest):
	    """Generate speech audio from text. Returns a WAV file."""
	    # Responses:
	    #   200 - WAV bytes, with the generation time in X-Generation-Time-Ms.
	    #   400 - unsupported language/gender, or text that is only whitespace.
	    #   500 - model file or Piper binary missing, or synthesis failed.
	    #   504 - Piper exceeded the synthesis timeout.
	    lang = request.language.value
	    gender = request.gender.value

	    # Validate
	    if lang not in VOICE_MAP:
	        raise HTTPException(status_code=400, detail=f"Unsupported language: {lang}")
	    if gender not in VOICE_MAP[lang]:
	        raise HTTPException(status_code=400, detail=f"Unsupported gender: {gender}")

	    # Whitespace-only text satisfies min_length=1 but has nothing to speak;
	    # reject it as a client error instead of failing later with a 500.
	    if not request.text.strip():
	        raise HTTPException(
	            status_code=400,
	            detail="Text must contain at least one non-whitespace character",
	        )

	    model_file = VOICE_MAP[lang][gender]
	    model_path = str(MODELS_DIR / model_file)

	    if not os.path.isfile(model_path):
	        raise HTTPException(status_code=500, detail="Model file not found")

	    if not os.path.isfile(PIPER_BIN):
	        raise HTTPException(status_code=500, detail="Piper binary not found")

	    log.info(f"Generating: lang={lang}, gender={gender}, "
	             f"speed={request.speed}, text_len={len(request.text)}")

	    start_time = time.perf_counter()

	    # length_scale is inverse of speed: lower = faster
	    length_scale = 1.0 / request.speed

	    try:
	        audio_bytes = _run_piper(request.text, model_path, length_scale)
	    except subprocess.TimeoutExpired as exc:
	        log.error(f"✗ Synthesis timed out after {_PIPER_TIMEOUT_SECONDS}s")
	        raise HTTPException(status_code=504, detail="Synthesis timed out") from exc
	    except Exception as exc:
	        # Boundary handler: log with traceback and convert to an HTTP error.
	        log.exception(f"✗ Synthesis failed: {exc}")
	        raise HTTPException(status_code=500, detail=f"Synthesis failed: {exc}") from exc

	    duration_ms = (time.perf_counter() - start_time) * 1000
	    log.info(f"✓ Generated {len(audio_bytes)} bytes in {duration_ms:.0f}ms")

	    return Response(
	        content=audio_bytes,
	        media_type="audio/wav",
	        headers={
	            "Content-Disposition": "attachment; filename=speech.wav",
	            "X-Generation-Time-Ms": f"{duration_ms:.0f}",
	        },
	    )