File size: 1,446 Bytes
df6b3ac
26dae50
df6b3ac
26dae50
 
 
 
 
 
df6b3ac
 
 
 
 
26dae50
 
df6b3ac
 
26dae50
 
df6b3ac
 
 
 
 
26dae50
 
df6b3ac
26dae50
 
df6b3ac
26dae50
 
df6b3ac
26dae50
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
"""Text-to-speech (Piper). Returns the path to a .wav. Bilingual PT/EN.

Piper runs on onnxruntime (no torch). Voices come from rhasspy/piper-voices.
"""
import os
import tempfile
import wave

# Hugging Face repository the voice files are downloaded from (see _load).
_REPO = "rhasspy/piper-voices"

# Voice file stem (path inside _REPO, without the ".onnx" / ".onnx.json"
# suffix) per language code. Each entry can be overridden through its
# IRIS_TTS_VOICE_* environment variable, read once at import time.
_VOICES = dict(
    pt=os.getenv("IRIS_TTS_VOICE_PT", "pt/pt_BR/faber/medium/pt_BR-faber-medium"),
    en=os.getenv("IRIS_TTS_VOICE_EN", "en/en_US/amy/medium/en_US-amy-medium"),
)

# Voices already loaded in this process, keyed by language code.
_cache = dict()


def _load(lang: str):
    """Return the Piper voice for ``lang``, loading it on first use.

    Args:
        lang: Language code. Codes without an entry in ``_VOICES`` fall back
            to the Portuguese (``"pt"``) voice.

    Returns:
        The object returned by ``PiperVoice.load`` for the resolved language.
        It is stored in ``_cache`` and reused by later calls.
    """
    # Resolve the fallback *before* consulting the cache. Keying the cache on
    # the raw argument would load and retain a separate copy of the Portuguese
    # model for every distinct unknown language code.
    key = lang if lang in _VOICES else "pt"
    if key not in _cache:
        # Imported here so these packages are only required once a voice is
        # actually loaded, not when the module is imported.
        from huggingface_hub import hf_hub_download
        from piper import PiperVoice

        name = _VOICES[key]
        # A voice consists of the ONNX model plus its JSON config, stored
        # side by side in the repository under the same stem.
        onnx = hf_hub_download(_REPO, f"{name}.onnx")
        conf = hf_hub_download(_REPO, f"{name}.onnx.json")
        _cache[key] = PiperVoice.load(onnx, config_path=conf)
    return _cache[key]


def synthesize(text: str, lang: str = "pt") -> str | None:
    if not text or not text.strip():
        return None
    voice = _load("en" if lang == "en" else "pt")
    chunks = list(voice.synthesize(text))
    if not chunks:
        print(f"[tts] no audio for text: {text!r}", flush=True)
        return None
    path = tempfile.mktemp(suffix=".wav")
    with wave.open(path, "wb") as wf:
        wf.setnchannels(chunks[0].sample_channels)
        wf.setsampwidth(chunks[0].sample_width)
        wf.setframerate(chunks[0].sample_rate)
        for ch in chunks:
            wf.writeframes(ch.audio_int16_bytes)
    return path