tts-eval-framework / src /kokoro_client.py
aankitdas's picture
first commit - working app locally
a3419b6
# src/kokoro_client.py
# TTS client for Kokoro — neural TTS, runs locally on GPU/CPU.
# Significantly more natural than pyttsx3 baseline.
import time
import numpy as np
import soundfile as sf
from kokoro import KPipeline
# Build the Kokoro pipeline once, at import time, and share it across every
# synthesize() call (expensive to reload).
# lang_code "a" = American English
_pipeline = KPipeline(lang_code="a")
def synthesize(text: str, output_path: str, voice: str = "af_heart", speed: float = 1.0) -> dict:
    """
    Synthesize text to a .wav file using Kokoro neural TTS.

    Args:
        text: the string to synthesize
        output_path: where to save the .wav file
        voice: kokoro voice ID (default af_heart — warm American female)
        speed: speaking rate multiplier (default 1.0)

    Returns:
        dict with keys: output_path, latency_seconds, engine, voice

    Raises:
        ValueError: if the pipeline yields no audio for ``text`` (for
            example empty input); nothing is written in that case.
    """
    # Sample rate handed to soundfile for the .wav header.
    # NOTE(review): 24 kHz matches Kokoro's published usage examples — confirm
    # if the model version changes.
    sample_rate = 24000

    # perf_counter() is monotonic, so the measured latency cannot be skewed by
    # a system clock adjustment during synthesis.
    start = time.perf_counter()

    # The pipeline yields 3-tuples; only the third element (the audio chunk)
    # is used here, and chunks reported as None are skipped.
    chunks = [
        audio
        for _, _, audio in _pipeline(text, voice=voice, speed=speed)
        if audio is not None
    ]
    if not chunks:
        # np.concatenate([]) would fail with an opaque "need at least one
        # array" message; raise the same exception type with a clear cause.
        raise ValueError(
            "Kokoro produced no audio for the given text; nothing to write"
        )

    sf.write(output_path, np.concatenate(chunks), sample_rate)
    latency = time.perf_counter() - start

    return {
        "output_path": output_path,
        "latency_seconds": round(latency, 3),
        "engine": "kokoro",
        "voice": voice,
    }