# tts/app.py — Piper TTS FastAPI service (upstream revision 2eeb709, author: triflix)
import asyncio
import io
import os
import struct
import wave

from fastapi import FastAPI, HTTPException, Body
from fastapi.responses import StreamingResponse, Response
from piper import PiperVoice, SynthesisConfig
import onnxruntime as ort
# FastAPI application object; endpoints below register themselves via decorators.
app = FastAPI(title="Piper TTS API")
# Path to the Piper ONNX voice model; override via the PIPER_MODEL env var.
MODEL_ONNX = os.environ.get("PIPER_MODEL", "models/en_US-ljspeech-medium.onnx")
# Module-level engine state, populated by init_engine() at startup.
engine_ready = False  # True once the model has loaded successfully
engine_provider = "CPU"  # "CUDAExecutionProvider" when CUDA is available
voice = None  # PiperVoice instance after init_engine(); None until then
sample_rate = 22050  # output rate in Hz; default until the model reports its own
synth_lock = asyncio.Lock()  # guards against concurrent synthesis (used by /tts)
async def init_engine():
    """Load the Piper voice model and select the ONNX execution provider.

    Populates the module-level ``engine_ready``, ``engine_provider``,
    ``voice`` and ``sample_rate`` globals.  Re-raises any load failure so
    application startup fails loudly when the model is missing or corrupt.
    """
    global engine_ready, engine_provider, voice, sample_rate
    try:
        try:
            providers = ort.get_available_providers()
        except Exception:
            # Provider enumeration can fail on unusual onnxruntime builds;
            # degrade to CPU rather than aborting startup here.
            providers = []
        use_cuda = "CUDAExecutionProvider" in providers
        engine_provider = "CUDAExecutionProvider" if use_cuda else "CPU"
        voice = PiperVoice.load(MODEL_ONNX, use_cuda=use_cuda)
        # Piper keeps the rate on the voice's config object; the previous
        # getattr(voice, "sample_rate", 22050) always fell through to the
        # default because PiperVoice has no top-level sample_rate attribute.
        cfg = getattr(voice, "config", None)
        sample_rate = (
            getattr(cfg, "sample_rate", None)
            or getattr(voice, "sample_rate", None)
            or 22050
        )
        engine_ready = True
    except Exception:
        engine_ready = False
        raise  # bare re-raise preserves the original traceback (was `raise e`)
@app.on_event("startup")
async def on_startup():
    """Eagerly load the voice model so the first request is not slow."""
    await init_engine()
@app.get("/health")
async def health():
    """Readiness probe: 503 until the model has loaded, else engine info."""
    if engine_ready:
        return {"status": "ok", "provider": engine_provider, "sample_rate": sample_rate}
    raise HTTPException(503, "model not ready")
def map_config(volume, length_scale, noise_scale, noise_w_scale, normalize_audio):
    """Coerce raw request parameters into a piper ``SynthesisConfig``."""
    options = {
        "volume": float(volume),
        "length_scale": float(length_scale),
        "noise_scale": float(noise_scale),
        "noise_w_scale": float(noise_w_scale),
        "normalize_audio": bool(normalize_audio),
    }
    return SynthesisConfig(**options)
def _streaming_wav_header(rate: int, channels: int = 1, sampwidth: int = 2) -> bytes:
    """Build a PCM WAV header with 'unknown' chunk sizes for live streaming.

    The ``wave`` module writes the actual data length into the header, which
    is 0 when no frames have been written yet — strict decoders then treat
    the stream as empty and stop.  The conventional streaming workaround is
    to declare 0xFFFFFFFF for both the RIFF and data chunk sizes.
    """
    byte_rate = rate * channels * sampwidth
    block_align = channels * sampwidth
    return b"".join((
        b"RIFF", struct.pack("<I", 0xFFFFFFFF), b"WAVE",
        b"fmt ", struct.pack("<IHHIIHH", 16, 1, channels, rate,
                             byte_rate, block_align, sampwidth * 8),
        b"data", struct.pack("<I", 0xFFFFFFFF),
    ))


@app.post("/tts")
async def tts_stream(
    text: str = Body(..., embed=True),
    volume: float = 0.8,
    length_scale: float = 1.0,
    noise_scale: float = 0.6,
    noise_w_scale: float = 0.6,
    normalize_audio: bool = True,
):
    """Stream synthesized speech as a chunked ``audio/wav`` response.

    Only one synthesis may run at a time: a request arriving while another
    synthesis holds ``synth_lock`` is rejected with 429.

    Raises:
        HTTPException: 503 if the model is not loaded, 400 on empty text,
            429 when a synthesis is already in progress.
    """
    if not engine_ready:
        raise HTTPException(503, "model not ready")
    if not text or not text.strip():
        raise HTTPException(400, "text required")
    cfg = map_config(volume, length_scale, noise_scale, noise_w_scale, normalize_audio)
    # Check-then-acquire is race-free here: acquiring an uncontended
    # asyncio.Lock completes without yielding to the event loop.
    if synth_lock.locked():
        raise HTTPException(429, "synthesis in progress, try later")
    await synth_lock.acquire()

    async def generator():
        try:
            # Header with streaming-safe (max) sizes instead of a 0-byte
            # data chunk from the wave module.
            yield _streaming_wav_header(sample_rate)
            # Piper's synthesize() is a blocking generator; pull each chunk
            # in a worker thread so the event loop stays responsive.
            chunks = voice.synthesize(text, syn_config=cfg)
            sentinel = object()
            while True:
                chunk = await asyncio.to_thread(next, chunks, sentinel)
                if chunk is sentinel:
                    break
                yield chunk.audio_int16_bytes
        finally:
            # Runs when the generator is exhausted or closed (e.g. on client
            # disconnect), so the lock cannot leak once streaming started.
            synth_lock.release()

    headers = {
        "Content-Disposition": 'inline; filename="speech.wav"',
        "Cache-Control": "no-store",
    }
    return StreamingResponse(generator(), media_type="audio/wav", headers=headers)
@app.post("/tts-file")
async def tts_file(
    text: str = Body(..., embed=True),
    volume: float = 0.8,
    length_scale: float = 1.0,
    noise_scale: float = 0.6,
    noise_w_scale: float = 0.6,
    normalize_audio: bool = True,
):
    """Synthesize speech and return the complete WAV file as an attachment.

    Raises:
        HTTPException: 503 if the model is not loaded, 400 on empty text.
    """
    if not engine_ready:
        raise HTTPException(503, "model not ready")
    if not text or not text.strip():
        raise HTTPException(400, "text required")
    cfg = map_config(volume, length_scale, noise_scale, noise_w_scale, normalize_audio)

    def _render() -> bytes:
        # Blocking synthesis; executed in a worker thread below so the
        # event loop is not stalled for the whole utterance.
        buf = io.BytesIO()
        with wave.open(buf, "wb") as wf:
            voice.synthesize_wav(text, wf, syn_config=cfg)
        return buf.getvalue()

    # Serialize with /tts so two syntheses never hit the shared ONNX
    # session concurrently (the streaming endpoint already enforces this);
    # unlike /tts this waits for its turn instead of returning 429.
    async with synth_lock:
        audio = await asyncio.to_thread(_render)
    headers = {"Content-Disposition": 'attachment; filename="speech.wav"'}
    return Response(content=audio, media_type="audio/wav", headers=headers)