# src/kokoro_client.py
# TTS client for Kokoro — neural TTS, runs locally on GPU/CPU.
# Significantly more natural than pyttsx3 baseline.

import time

import numpy as np
import soundfile as sf
from kokoro import KPipeline

# Initialize the pipeline once at module level (expensive to reload).
# lang_code "a" = American English
_pipeline = KPipeline(lang_code="a")

# Sample rate (Hz) handed to soundfile when writing the synthesized audio.
# NOTE(review): presumably this must match the rate Kokoro generates at — confirm
# against the Kokoro docs before changing it.
_SAMPLE_RATE = 24000


def synthesize(
    text: str,
    output_path: str,
    voice: str = "af_heart",
    speed: float = 1.0,
) -> dict:
    """Synthesize text to a .wav file using Kokoro neural TTS.

    The pipeline may yield the audio in several chunks; they are joined in
    order and written to ``output_path`` as a single file.

    Args:
        text: the string to synthesize
        output_path: where to save the .wav file
        voice: kokoro voice ID (default af_heart — warm American female)
        speed: speaking rate multiplier (default 1.0)

    Returns:
        dict with keys: output_path, latency_seconds, engine, voice.
        ``latency_seconds`` covers both generation and the file write,
        rounded to milliseconds.

    Raises:
        ValueError: if the pipeline produced no audio for ``text``
            (for example, when ``text`` is empty). No file is written
            in that case.
    """
    # perf_counter is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments the way time.time() can.
    start = time.perf_counter()

    # Each pipeline result unpacks to three items; only the audio is needed.
    # Results without audio are skipped.
    chunks = [
        audio
        for _, _, audio in _pipeline(text, voice=voice, speed=speed)
        if audio is not None
    ]

    # np.concatenate([]) fails with an opaque "need at least one array"
    # message; fail early with something actionable instead.
    if not chunks:
        raise ValueError(
            f"Kokoro produced no audio for the given text: {text!r}"
        )

    sf.write(output_path, np.concatenate(chunks), _SAMPLE_RATE)
    latency = time.perf_counter() - start

    return {
        "output_path": output_path,
        "latency_seconds": round(latency, 3),
        "engine": "kokoro",
        "voice": voice,
    }