Spaces:
Sleeping
Sleeping
File size: 1,317 Bytes
# src/kokoro_client.py
# TTS client for Kokoro — neural TTS, runs locally on GPU/CPU.
# Significantly more natural than pyttsx3 baseline.
import time
import numpy as np
import soundfile as sf
from kokoro import KPipeline
# Build the Kokoro pipeline once, at import time, and reuse it for every call:
# constructing it is expensive, so it must not be re-created per request.
# lang_code "a" = American English
_pipeline = KPipeline(lang_code="a")
def synthesize(text: str, output_path: str, voice: str = "af_heart", speed: float = 1.0) -> dict:
    """
    Synthesize text to a .wav file using Kokoro neural TTS.

    Args:
        text: the string to synthesize
        output_path: where to save the .wav file
        voice: kokoro voice ID (default af_heart — warm American female)
        speed: speaking rate multiplier (default 1.0)

    Returns:
        dict with keys: output_path, latency_seconds, engine, voice.
        latency_seconds covers both generation and the file write,
        rounded to 3 decimal places.

    Raises:
        ValueError: if the pipeline produces no audio for ``text``
            (nothing is written to ``output_path`` in that case).
    """
    # Sample rate stamped into the output file.
    # NOTE(review): presumably Kokoro's native output rate — confirm.
    sample_rate = 24000

    # perf_counter is monotonic, so the measured latency cannot be skewed by
    # wall-clock adjustments the way time.time() can.
    start = time.perf_counter()

    # The pipeline yields 3-tuples; only the third element (the audio chunk)
    # is used here, and chunks that come back as None are skipped.
    chunks = [
        audio
        for _, _, audio in _pipeline(text, voice=voice, speed=speed)
        if audio is not None
    ]

    # np.concatenate([]) fails with an unhelpful message; fail early with a
    # clear one (same exception type) before touching the output file.
    if not chunks:
        raise ValueError(f"Kokoro produced no audio for text: {text!r}")

    audio_out = np.concatenate(chunks)
    sf.write(output_path, audio_out, sample_rate)

    latency = time.perf_counter() - start
    return {
        "output_path": output_path,
        "latency_seconds": round(latency, 3),
        "engine": "kokoro",
        "voice": voice,
    }