tts-eval-framework / app /engines /piper_engine.py
aankitdas's picture
upgraded for hf space
24a256c
# app/engines/piper_engine.py
# Piper TTS engine — fast ONNX-based neural TTS, fully offline.
# Voices are downloaded on demand from rhasspy/piper-voices on HF Hub
# and cached flat in voices/piper/ for subsequent runs.
# Designed for low-latency, low-resource deployment (runs on Raspberry Pi).
# Faster than Kokoro on CPU, lower naturalness ceiling.
# Good fallback: offline after first download, no API key, minimal VRAM.
import wave
import time
import shutil
from pathlib import Path
from piper import PiperVoice
from huggingface_hub import hf_hub_download
from engines.base import TTSEngine
# Voice files live flat in voices/piper/ relative to the project root.
# The root is reached by walking three `.parent` hops up from this file.
_VOICES_DIR = Path(__file__).parent.parent.parent / "voices" / "piper"
# Cache of loaded voices, keyed by voice file name. Loading ONNX takes ~0.5s
# (original author's estimate), so loaded voices are reused across calls.
_voice_cache: dict[str, PiperVoice] = {}
def _ensure_model_downloaded(voice_file: str) -> None:
    """
    Make sure a voice's model and config exist in the flat voices/piper/ dir.

    Looks for ``<voice_file>`` and ``<voice_file>.json`` directly under
    ``_VOICES_DIR``. Whichever one is missing is downloaded from the
    rhasspy/piper-voices repo on HF Hub and moved to the flat location.
    The .onnx and .json files are checked separately so a partial download
    can be recovered.

    Args:
        voice_file: Model file name, e.g. "en_US-amy-medium.onnx".

    Raises:
        ValueError: If a download is required but ``voice_file`` does not
            look like "<lang>_<REGION>-<speaker>-<quality>.onnx", which is
            the pattern used to derive its subfolder in the Hub repo.
    """
    _VOICES_DIR.mkdir(parents=True, exist_ok=True)

    missing = [
        name
        for name in (voice_file, f"{voice_file}.json")
        if not (_VOICES_DIR / name).exists()
    ]
    if not missing:
        # Nothing to fetch: skip name parsing entirely so voices that were
        # placed in the directory by hand, with any file name, still work.
        return

    # Parse the voice file name into the HF Hub repo subfolder structure,
    # e.g. en_US-amy-medium.onnx -> en/en_US/amy/medium
    parts = voice_file.split("-")
    if len(parts) < 3:
        raise ValueError(
            f"Cannot derive HF Hub location for Piper voice {voice_file!r}: "
            "expected a name like 'en_US-amy-medium.onnx'"
        )
    lang_full = parts[0]                     # "en_US"
    lang_family = lang_full.split("_")[0]    # "en"
    speaker = parts[1]                       # "amy"
    quality = parts[2].replace(".onnx", "")  # "medium"
    repo_subfolder = f"{lang_family}/{lang_full}/{speaker}/{quality}"

    for name in missing:
        target = _VOICES_DIR / name
        print(f"[Piper] Downloading {name} from HF Hub...")
        downloaded = hf_hub_download(
            repo_id="rhasspy/piper-voices",
            filename=f"{repo_subfolder}/{name}",
            local_dir=str(_VOICES_DIR),
            local_dir_use_symlinks=False,
        )
        # hf_hub_download mirrors the repo's nested layout under local_dir;
        # move the file up so it sits flat next to the other voices.
        shutil.move(downloaded, target)
        print(f"[Piper] Saved to {target}")
def _get_voice(voice_file: str) -> PiperVoice:
    """
    Return the PiperVoice for ``voice_file``, loading it on first use.

    A voice that is already in the module-level cache is returned as-is.
    Otherwise the model files are ensured on disk, the voice is loaded
    from the flat voices directory, stored in the cache, and returned.
    """
    try:
        return _voice_cache[voice_file]
    except KeyError:
        pass  # first request for this voice: fall through and load it

    _ensure_model_downloaded(voice_file)
    onnx_path = str(_VOICES_DIR / voice_file)
    # ONNX CUDA provider requires separate install, so stay on CPU
    loaded_voice = PiperVoice.load(onnx_path, use_cuda=False)
    _voice_cache[voice_file] = loaded_voice
    return loaded_voice
class PiperEngine(TTSEngine):
    """
    Piper (ONNX) text-to-speech engine.

    Maps each band to a Piper voice file and a speaking speed via
    ``BAND_CONFIG`` and renders text to a WAV file on the CPU.
    """

    name = "Piper (ONNX)"
    engine_type = "neural-local"
    cost_per_million_chars = 0.0
    is_production_ready = False  # lower naturalness than Kokoro, no band-tuned voices yet
    requires_internet = False  # only on first run; fully offline after download

    # Per-band voice and speed. "speed" is a tempo multiplier:
    # values below 1.0 are slower than the voice default, above 1.0 faster.
    BAND_CONFIG = {
        "K-2": {"voice_file": "en_US-amy-medium.onnx", "speed": 0.9},
        "3-5": {"voice_file": "en_US-amy-medium.onnx", "speed": 1.0},
        "6-8": {"voice_file": "en_US-amy-medium.onnx", "speed": 1.0},
        "9-12": {"voice_file": "en_US-lessac-medium.onnx", "speed": 1.1},
    }

    def synthesize(self, text: str, band: str, output_path: str) -> dict:
        """
        Synthesize ``text`` to ``<output_path>.wav`` using the band's voice.

        Args:
            text: Text to speak.
            band: Band key; resolved through ``self.get_band_config``
                (presumably provided by TTSEngine -- not visible here).
            output_path: Destination path without extension; ".wav" is
                appended.

        Returns:
            Dict with "audio_path", "latency_seconds" (synthesis plus WAV
            write time, rounded to milliseconds), "voice", "speed" and
            "engine".
        """
        # Local import: SynthesisConfig ships in the same piper package that
        # already provides PiperVoice and synthesize_wav.
        from piper import SynthesisConfig

        config = self.get_band_config(band)
        voice_file = config["voice_file"]
        speed = config["speed"]
        full_path = output_path + ".wav"
        voice = _get_voice(voice_file)

        # Apply the band's speed, which was previously reported but never
        # used. Piper controls tempo through length_scale, where a larger
        # value means slower speech, so divide the voice's own default by
        # the requested speed multiplier.
        syn_config = SynthesisConfig(
            length_scale=voice.config.length_scale / speed,
        )

        # perf_counter is monotonic, so the measured latency cannot be
        # distorted by system clock adjustments.
        start = time.perf_counter()
        with wave.open(full_path, "wb") as wav_file:
            voice.synthesize_wav(text, wav_file, syn_config=syn_config)
        latency = round(time.perf_counter() - start, 3)

        return {
            "audio_path": full_path,
            "latency_seconds": latency,
            "voice": voice_file.replace(".onnx", ""),
            "speed": speed,
            "engine": self.name,
        }