# app/engines/piper_engine.py
# Piper TTS engine — fast ONNX-based neural TTS, fully offline.
# Voices are downloaded on demand from rhasspy/piper-voices on HF Hub
# and cached flat in voices/piper/ for subsequent runs.
# Designed for low-latency, low-resource deployment (runs on Raspberry Pi).
# Faster than Kokoro on CPU, lower naturalness ceiling.
# Good fallback: offline after first download, no API key, minimal VRAM.

import shutil
import time
import wave
from pathlib import Path

from huggingface_hub import hf_hub_download
from piper import PiperVoice, SynthesisConfig

from engines.base import TTSEngine

# voice files live flat in voices/piper/ relative to project root
_VOICES_DIR = Path(__file__).parent.parent.parent / "voices" / "piper"

# HF Hub repository that hosts the published Piper voices
_HF_REPO_ID = "rhasspy/piper-voices"

# cache loaded voices — loading ONNX takes ~0.5s, reuse across calls
_voice_cache: dict[str, PiperVoice] = {}


def _repo_subfolder(voice_file: str) -> str:
    """Map a voice file name to its folder inside the HF Hub voices repo.

    e.g. ``en_US-amy-medium.onnx`` -> ``en/en_US/amy/medium``

    Raises:
        ValueError: if the name does not have at least the three
            dash-separated parts ``<lang>-<speaker>-<quality>``.
    """
    # strip the extension only at the end of the name, never in the middle
    parts = voice_file.removesuffix(".onnx").split("-")
    if len(parts) < 3 or not all(parts[:3]):
        raise ValueError(
            f"Cannot derive HF Hub path from voice file {voice_file!r}; "
            "expected a name like 'en_US-amy-medium.onnx'"
        )
    lang_full, speaker, quality = parts[:3]  # "en_US", "amy", "medium"
    lang_family = lang_full.split("_")[0]  # "en"
    return f"{lang_family}/{lang_full}/{speaker}/{quality}"


def _prune_empty_dirs(start: Path, stop: Path) -> None:
    """Remove empty directories from *start* upwards, never touching *stop*.

    Best effort: stops silently at the first directory that is not empty or
    cannot be removed, and does nothing if *start* is not inside *stop*.
    """
    stop = stop.resolve()
    current = start.resolve()
    while current != stop and stop in current.parents:
        try:
            current.rmdir()  # only succeeds on an empty directory
        except OSError:
            return
        current = current.parent


def _download_to_flat(repo_subfolder: str, filename: str, dest: Path) -> None:
    """Download one file from the voices repo and move it to *dest*."""
    print(f"[Piper] Downloading {filename} from HF Hub...")
    downloaded = hf_hub_download(
        repo_id=_HF_REPO_ID,
        filename=f"{repo_subfolder}/{filename}",
        local_dir=str(_VOICES_DIR),
        # NOTE(review): believed to be deprecated/ignored by recent
        # huggingface_hub releases; kept so older releases still produce a
        # real file (not a symlink) that can be moved — confirm before removing.
        local_dir_use_symlinks=False,
    )
    shutil.move(downloaded, dest)
    # the download mirrors the repo layout (en/en_US/amy/medium/...) under
    # voices/piper/; drop those now-empty folders so the directory stays flat
    _prune_empty_dirs(Path(downloaded).parent, _VOICES_DIR)
    print(f"[Piper] Saved to {dest}")


def _ensure_model_downloaded(voice_file: str) -> None:
    """Make sure the model and its JSON config exist in voices/piper/.

    Each file is checked independently, so a partial download (only the
    ``.onnx`` or only the ``.json`` present) is completed rather than redone.
    The file name is only parsed when something actually has to be fetched,
    so voices that are already on disk work regardless of how they are named.

    Raises:
        ValueError: if a download is needed and *voice_file* cannot be mapped
            to a repo folder (see ``_repo_subfolder``).
    """
    _VOICES_DIR.mkdir(parents=True, exist_ok=True)

    config_file = f"{voice_file}.json"
    wanted = {
        voice_file: _VOICES_DIR / voice_file,
        config_file: _VOICES_DIR / config_file,
    }
    missing = {name: path for name, path in wanted.items() if not path.exists()}
    if not missing:
        return

    repo_subfolder = _repo_subfolder(voice_file)
    for name, path in missing.items():  # model first, then config
        _download_to_flat(repo_subfolder, name, path)


def _get_voice(voice_file: str) -> PiperVoice:
    """Returns a cached PiperVoice, downloading the model first if needed."""
    if voice_file not in _voice_cache:
        _ensure_model_downloaded(voice_file)
        model_path = _VOICES_DIR / voice_file
        _voice_cache[voice_file] = PiperVoice.load(
            str(model_path),
            use_cuda=False,  # ONNX CUDA provider requires separate install
        )
    return _voice_cache[voice_file]


class PiperEngine(TTSEngine):
    """TTS engine backed by locally stored Piper ONNX voices."""

    name = "Piper (ONNX)"
    engine_type = "neural-local"
    cost_per_million_chars = 0.0
    is_production_ready = False  # lower naturalness than Kokoro, no band-tuned voices yet
    requires_internet = False  # only on first run; fully offline after download

    # per grade band: which voice file to use and the relative speaking rate
    BAND_CONFIG = {
        "K-2": {"voice_file": "en_US-amy-medium.onnx", "speed": 0.9},
        "3-5": {"voice_file": "en_US-amy-medium.onnx", "speed": 1.0},
        "6-8": {"voice_file": "en_US-amy-medium.onnx", "speed": 1.0},
        "9-12": {"voice_file": "en_US-lessac-medium.onnx", "speed": 1.1},
    }

    def synthesize(self, text: str, band: str, output_path: str) -> dict:
        """Synthesize *text* to ``<output_path>.wav`` using the band's voice.

        Args:
            text: text to speak.
            band: grade-band key passed to ``get_band_config`` (defined on the
                base class; presumably resolves against ``BAND_CONFIG``).
            output_path: destination path without the ``.wav`` extension.

        Returns:
            dict with ``audio_path``, ``latency_seconds``, ``voice``,
            ``speed`` and ``engine``.

        Raises:
            ValueError: if the configured speed is not positive.
        """
        config = self.get_band_config(band)
        voice_file = config["voice_file"]
        speed = config["speed"]
        if speed <= 0:
            raise ValueError(f"speed must be positive, got {speed!r}")
        full_path = output_path + ".wav"

        voice = _get_voice(voice_file)

        # Piper controls tempo through length_scale (larger = slower), so a
        # speed multiplier is applied as its inverse on top of the voice's own
        # default. At speed 1.0 the voice defaults are left untouched.
        syn_config = None
        if speed != 1.0:
            syn_config = SynthesisConfig(
                length_scale=voice.config.length_scale / speed,
            )

        # perf_counter is monotonic, so the latency cannot be skewed by
        # system clock adjustments
        start = time.perf_counter()
        with wave.open(full_path, "wb") as wav_file:
            voice.synthesize_wav(text, wav_file, syn_config=syn_config)
        latency = round(time.perf_counter() - start, 3)

        return {
            "audio_path": full_path,
            "latency_seconds": latency,
            "voice": voice_file.removesuffix(".onnx"),
            "speed": speed,
            "engine": self.name,
        }