Spaces:

aankitdas
/

tts-eval-framework

Sleeping

File size: 4,344 Bytes

# app/engines/piper_engine.py
# Piper TTS engine — fast ONNX-based neural TTS, fully offline.
# Voices are downloaded on demand from rhasspy/piper-voices on HF Hub
# and cached flat in voices/piper/ for subsequent runs.
# Designed for low-latency, low-resource deployment (runs on Raspberry Pi).
# Faster than Kokoro on CPU, lower naturalness ceiling.
# Good fallback: offline after first download, no API key, minimal VRAM.

import wave
import time
import shutil
from pathlib import Path
from piper import PiperVoice
from huggingface_hub import hf_hub_download

from engines.base import TTSEngine

# Flat on-disk voice store: voices/piper/ under the directory reached by
# three .parent hops from this module file (the project root).
_VOICES_DIR = Path(__file__).parent.parent.parent.joinpath("voices", "piper")

# In-process cache of loaded voices, keyed by voice file name, so each ONNX
# model (~0.5s to load) is loaded once and reused across calls.
_voice_cache: dict[str, PiperVoice] = {}


def _fetch_to_flat_path(repo_filename: str, destination: Path) -> None:
    """
    Downloads one file from rhasspy/piper-voices on HF Hub and moves it to
    destination inside the flat voices directory.
    The download lands in the repo's nested folder layout under
    _VOICES_DIR, so after the move the now-empty nested folders are removed.
    """
    print(f"[Piper] Downloading {destination.name} from HF Hub...")
    downloaded = Path(
        hf_hub_download(
            repo_id="rhasspy/piper-voices",
            filename=repo_filename,
            local_dir=str(_VOICES_DIR),
            # NOTE(review): newer huggingface_hub releases are reported to
            # deprecate this flag — confirm against the pinned version.
            local_dir_use_symlinks=False,
        )
    )
    shutil.move(str(downloaded), str(destination))
    print(f"[Piper] Saved to {destination}")

    # prune the empty <lang>/<locale>/<speaker>/<quality>/ folders left behind;
    # never climbs above _VOICES_DIR and stops at the first non-empty folder
    leftover = downloaded.parent
    while leftover != _VOICES_DIR and _VOICES_DIR in leftover.parents:
        try:
            leftover.rmdir()
        except OSError:
            break
        leftover = leftover.parent


def _ensure_model_downloaded(voice_file: str) -> None:
    """
    Checks for model and config at flat voices/piper/ path.
    If missing, downloads from rhasspy/piper-voices on HF Hub
    and moves to flat location. The .onnx and .json files are checked
    independently, so a partial download can be recovered.

    voice_file is the model file name, e.g. "en_US-amy-medium.onnx".
    Raises ValueError if voice_file does not have at least three
    "-"-separated parts (<locale>-<speaker>-<quality>.onnx); the check
    runs before any directory is created or download is attempted.
    """
    # parse voice file name into HF Hub repo subfolder structure
    # e.g. en_US-amy-medium.onnx -> en/en_US/amy/medium
    parts = voice_file.split("-")
    if len(parts) < 3:
        raise ValueError(
            f"Invalid Piper voice file name {voice_file!r}: expected "
            "'<locale>-<speaker>-<quality>.onnx', e.g. 'en_US-amy-medium.onnx'"
        )
    lang_full = parts[0]                        # "en_US"
    lang_family = lang_full.split("_")[0]       # "en"
    speaker = parts[1]                          # "amy"
    quality = parts[2].removesuffix(".onnx")    # "medium"
    repo_subfolder = f"{lang_family}/{lang_full}/{speaker}/{quality}"

    _VOICES_DIR.mkdir(parents=True, exist_ok=True)

    # model first, then its config; each is fetched only if absent
    for file_name in (voice_file, f"{voice_file}.json"):
        target = _VOICES_DIR / file_name
        if not target.exists():
            _fetch_to_flat_path(f"{repo_subfolder}/{file_name}", target)


def _get_voice(voice_file: str) -> PiperVoice:
    """
    Looks up voice_file in the module-level voice cache and returns the
    cached PiperVoice on a hit. On a miss, makes sure the model files are
    present locally, loads the voice on CPU, stores it in the cache and
    returns it.
    """
    try:
        return _voice_cache[voice_file]
    except KeyError:
        pass  # first request for this voice: load it below

    _ensure_model_downloaded(voice_file)
    onnx_path = str(_VOICES_DIR / voice_file)
    # use_cuda stays off: ONNX CUDA provider requires separate install
    loaded = PiperVoice.load(onnx_path, use_cuda=False)
    _voice_cache[voice_file] = loaded
    return loaded


class PiperEngine(TTSEngine):
    """
    TTS engine backed by Piper ONNX voices.
    Each band maps to a voice file and a speaking speed in BAND_CONFIG;
    synthesize() renders text to a WAV file with that voice and speed.
    """

    name = "Piper (ONNX)"
    engine_type = "neural-local"
    cost_per_million_chars = 0.0
    is_production_ready = False  # lower naturalness than Kokoro, no band-tuned voices yet
    requires_internet = False    # only on first run; fully offline after download

    # band -> voice model file and relative speaking speed (1.0 = voice default)
    BAND_CONFIG = {
        "K-2":  {"voice_file": "en_US-amy-medium.onnx",    "speed": 0.9},
        "3-5":  {"voice_file": "en_US-amy-medium.onnx",    "speed": 1.0},
        "6-8":  {"voice_file": "en_US-amy-medium.onnx",    "speed": 1.0},
        "9-12": {"voice_file": "en_US-lessac-medium.onnx", "speed": 1.1},
    }

    def synthesize(self, text: str, band: str, output_path: str) -> dict:
        """
        Synthesizes text to "<output_path>.wav" using the band's voice and speed.

        Returns a dict with audio_path, latency_seconds (synthesis and WAV
        write only; voice download/load happens before the timer starts),
        voice, speed and engine.
        Raises ValueError if the band's configured speed is not positive.
        """
        # imported here so the block carries its own dependency; piper is
        # already a module-level import of this file
        from piper import SynthesisConfig

        # get_band_config is not defined in this class — presumably provided
        # by TTSEngine; verify against engines/base.py
        config = self.get_band_config(band)
        voice_file = config["voice_file"]
        speed = config["speed"]
        if speed <= 0:
            raise ValueError(f"speed must be positive, got {speed!r}")
        full_path = output_path + ".wav"

        voice = _get_voice(voice_file)

        # Piper controls tempo through length_scale (larger = slower), so the
        # band speed is applied by dividing the voice's default length_scale;
        # speed 1.0 leaves the voice's default rate unchanged
        syn_config = SynthesisConfig(
            length_scale=voice.config.length_scale / speed,
        )

        # perf_counter is monotonic, so the measurement cannot be skewed by
        # system clock adjustments
        start = time.perf_counter()
        with wave.open(full_path, "wb") as wav_file:
            voice.synthesize_wav(text, wav_file, syn_config=syn_config)
        latency = round(time.perf_counter() - start, 3)

        return {
            "audio_path": full_path,
            "latency_seconds": latency,
            "voice": voice_file.replace(".onnx", ""),
            "speed": speed,
            "engine": self.name,
        }