Spaces:

aankitdas
/

tts-eval-framework

Sleeping

App Files Files Community

tts-eval-framework / app /engines /piper_engine.py

aankitdas

upgraded for hf space

24a256c about 1 month ago

raw

history blame contribute delete

4.34 kB

	# app/engines/piper_engine.py
	# Piper TTS engine — fast ONNX-based neural TTS, fully offline.
	# Voices are downloaded on demand from rhasspy/piper-voices on HF Hub
	# and cached flat in voices/piper/ for subsequent runs.
	# Designed for low-latency, low-resource deployment (runs on Raspberry Pi).
	# Faster than Kokoro on CPU, lower naturalness ceiling.
	# Good fallback: offline after first download, no API key, CPU-only (no GPU/VRAM needed).

	import wave
	import time
	import shutil
	from pathlib import Path
	from piper import PiperVoice
	from huggingface_hub import hf_hub_download

	from engines.base import TTSEngine

	# Directory holding the Piper voice files, stored flat (no sub-folders):
	# <three directory levels above this file>/voices/piper/.
	# Each voice is a pair: <voice>.onnx (model) and <voice>.onnx.json (config).
	_VOICES_DIR = Path(__file__).parent.parent.parent / "voices" / "piper"

	# Module-level cache of loaded voices, keyed by voice file name.
	# Loading ONNX takes ~0.5s, so a loaded voice is reused across calls.
	_voice_cache: dict[str, PiperVoice] = {}


	def _ensure_model_downloaded(voice_file: str) -> None:
	    """
	    Make sure a voice's model and config exist flat in voices/piper/.

	    The model (``<voice_file>``) and its config (``<voice_file>.json``) are
	    checked independently, so a partial download (only one of the two files
	    present) is recovered by fetching just the missing one. Missing files are
	    downloaded from rhasspy/piper-voices on HF Hub and moved to the flat
	    location.

	    The voice file name is only parsed when something actually has to be
	    downloaded, so a voice that is already present locally works even if its
	    name does not follow the Hub naming scheme.

	    Args:
	        voice_file: Model file name, e.g. "en_US-amy-medium.onnx".

	    Raises:
	        ValueError: If a download is needed but ``voice_file`` does not look
	            like "<lang>_<REGION>-<speaker>-<quality>.onnx" and therefore
	            cannot be mapped to a Hub path.

	    Errors raised by ``hf_hub_download`` or ``shutil.move`` are not caught
	    here and propagate to the caller.
	    """
	    _VOICES_DIR.mkdir(parents=True, exist_ok=True)

	    missing = [
	        file_name
	        for file_name in (voice_file, f"{voice_file}.json")
	        if not (_VOICES_DIR / file_name).exists()
	    ]
	    if not missing:
	        return

	    # parse voice file name into HF Hub repo subfolder structure
	    # e.g. en_US-amy-medium.onnx -> en/en_US/amy/medium/
	    parts = voice_file.removesuffix(".onnx").split("-")
	    if len(parts) < 3 or not all(parts[:3]):
	        raise ValueError(
	            f"Cannot derive HF Hub path for Piper voice {voice_file!r}; "
	            "expected a name like 'en_US-amy-medium.onnx'"
	        )
	    lang_full, speaker, quality = parts[:3]  # "en_US", "amy", "medium"
	    lang_family = lang_full.split("_")[0]  # "en"
	    repo_subfolder = f"{lang_family}/{lang_full}/{speaker}/{quality}"

	    for file_name in missing:
	        target_path = _VOICES_DIR / file_name
	        print(f"[Piper] Downloading {file_name} from HF Hub...")
	        downloaded = hf_hub_download(
	            repo_id="rhasspy/piper-voices",
	            filename=f"{repo_subfolder}/{file_name}",
	            local_dir=str(_VOICES_DIR),
	            local_dir_use_symlinks=False,
	        )
	        # hf_hub_download returns the path of the fetched file; relocate it
	        # to the flat path that the rest of this module looks up.
	        shutil.move(downloaded, target_path)
	        print(f"[Piper] Saved to {target_path}")


	def _get_voice(voice_file: str) -> PiperVoice:
	    """Look up ``voice_file`` in the voice cache, loading it on a miss.

	    On a cache miss the model files are ensured on disk first, then the
	    voice is loaded on CPU and stored in the cache for later calls.
	    """
	    if voice_file in _voice_cache:
	        return _voice_cache[voice_file]

	    _ensure_model_downloaded(voice_file)
	    onnx_path = str(_VOICES_DIR / voice_file)
	    # ONNX CUDA provider requires separate install, so stay on CPU
	    loaded_voice = PiperVoice.load(onnx_path, use_cuda=False)
	    _voice_cache[voice_file] = loaded_voice
	    return _voice_cache[voice_file]


	class PiperEngine(TTSEngine):
	    """Piper (ONNX) text-to-speech engine.

	    Each band key in ``BAND_CONFIG`` maps to a voice model file and a
	    speaking-rate multiplier; ``synthesize`` applies both when rendering.
	    """

	    name = "Piper (ONNX)"
	    engine_type = "neural-local"
	    cost_per_million_chars = 0.0
	    is_production_ready = False  # lower naturalness than Kokoro, no band-tuned voices yet
	    requires_internet = False  # only on first run; fully offline after download

	    # Per-band voice model and speed multiplier (1.0 = the voice's default
	    # rate, below 1.0 = slower, above 1.0 = faster).
	    BAND_CONFIG = {
	        "K-2": {"voice_file": "en_US-amy-medium.onnx", "speed": 0.9},
	        "3-5": {"voice_file": "en_US-amy-medium.onnx", "speed": 1.0},
	        "6-8": {"voice_file": "en_US-amy-medium.onnx", "speed": 1.0},
	        "9-12": {"voice_file": "en_US-lessac-medium.onnx", "speed": 1.1},
	    }

	    def synthesize(self, text: str, band: str, output_path: str) -> dict:
	        """Render ``text`` to a WAV file with the voice and speed of ``band``.

	        Args:
	            text: Text to speak.
	            band: Band key used to look up the voice settings (the keys of
	                ``BAND_CONFIG``, e.g. "K-2").
	            output_path: Destination path without extension; ".wav" is
	                appended.

	        Returns:
	            A dict with:
	                audio_path: path of the written WAV file.
	                latency_seconds: synthesis time rounded to milliseconds.
	                    Voice download/loading happens before the timer starts
	                    and is not included.
	                voice: voice file name without the ".onnx" suffix.
	                speed: speed multiplier that was applied.
	                engine: this engine's ``name``.
	        """
	        # Local import keeps this block self-contained; the file already
	        # depends on the piper package.
	        from piper import SynthesisConfig

	        # get_band_config is not defined in this class; presumably inherited
	        # from TTSEngine.
	        config = self.get_band_config(band)
	        voice_file = config["voice_file"]
	        speed = config["speed"]
	        full_path = f"{output_path}.wav"

	        voice = _get_voice(voice_file)

	        # Piper controls tempo through length_scale, where larger means
	        # slower speech, so a speed multiplier is applied by dividing the
	        # voice's own default scale. Without this the configured speed was
	        # reported in the result but never affected the audio.
	        base_length_scale = getattr(voice.config, "length_scale", 1.0)
	        syn_config = SynthesisConfig(length_scale=base_length_scale / speed)

	        # perf_counter is monotonic, so the measured interval cannot be
	        # skewed by system clock adjustments.
	        start = time.perf_counter()
	        with wave.open(full_path, "wb") as wav_file:
	            voice.synthesize_wav(text, wav_file, syn_config=syn_config)
	        latency = round(time.perf_counter() - start, 3)

	        return {
	            "audio_path": full_path,
	            "latency_seconds": latency,
	            "voice": voice_file.replace(".onnx", ""),
	            "speed": speed,
	            "engine": self.name,
	        }