Spaces:
Sleeping
Sleeping
| # src/kokoro_client.py | |
| # TTS client for Kokoro — neural TTS, runs locally on GPU/CPU. | |
| # Significantly more natural than pyttsx3 baseline. | |
| import time | |
| import numpy as np | |
| import soundfile as sf | |
| from kokoro import KPipeline | |
| # Build the Kokoro pipeline once, at import time, and share it across calls: | |
| # constructing it is expensive, so synthesize() reuses this single instance. | |
| # lang_code "a" = American English | |
| _pipeline = KPipeline(lang_code="a") | |
def synthesize(text: str, output_path: str, voice: str = "af_heart", speed: float = 1.0) -> dict:
    """Synthesize text to a .wav file using Kokoro neural TTS.

    Args:
        text: the string to synthesize
        output_path: where to save the .wav file
        voice: kokoro voice ID (default af_heart — warm American female)
        speed: speaking rate multiplier (default 1.0)

    Returns:
        dict with keys: output_path, latency_seconds, engine, voice

    Raises:
        ValueError: if ``text`` is empty or whitespace-only, or if the
            pipeline produces no audio chunks for it.
    """
    # Fail fast with a readable message; otherwise an input with nothing to
    # speak surfaces later as numpy's "need at least one array to concatenate".
    if isinstance(text, str) and not text.strip():
        raise ValueError("text must contain at least one non-whitespace character")

    # Sample rate handed to soundfile for the output file.
    # NOTE(review): presumably Kokoro's native output rate — confirm before changing.
    sample_rate = 24000

    # perf_counter is monotonic, so the measured latency cannot be skewed by
    # system clock adjustments the way time.time() can.
    start = time.perf_counter()

    # The pipeline yields 3-tuples; only the third element (audio) is used,
    # and entries whose audio is None are skipped.
    chunks = [
        audio
        for _, _, audio in _pipeline(text, voice=voice, speed=speed)
        if audio is not None
    ]
    if not chunks:
        raise ValueError(f"Kokoro produced no audio for the given text: {text!r}")

    sf.write(output_path, np.concatenate(chunks), sample_rate)
    latency = time.perf_counter() - start

    return {
        "output_path": output_path,
        "latency_seconds": round(latency, 3),
        "engine": "kokoro",
        "voice": voice,
    }