Spaces:

aankitdas
/

tts-eval-framework

Sleeping

File size: 1,317 Bytes

a3419b6

# src/kokoro_client.py
# TTS client for Kokoro — neural TTS, runs locally on GPU/CPU.
# Significantly more natural than pyttsx3 baseline.

import time
import numpy as np
import soundfile as sf
from kokoro import KPipeline

# Build the Kokoro pipeline a single time, at import, and share it across
# every synthesize() call — constructing it is expensive, so it must not be
# recreated per request.
# lang_code "a" selects American English.
_pipeline = KPipeline(lang_code="a")


def synthesize(text: str, output_path: str, voice: str = "af_heart", speed: float = 1.0) -> dict:
    """
    Synthesize text to a .wav file using Kokoro neural TTS.

    Args:
        text: the string to synthesize
        output_path: where to save the .wav file
        voice: kokoro voice ID (default af_heart — warm American female)
        speed: speaking rate multiplier (default 1.0)

    Returns:
        dict with keys: output_path, latency_seconds, engine, voice.
        latency_seconds spans generation plus writing the file, rounded to
        three decimal places.

    Raises:
        ValueError: if the pipeline yields no audio for the given text
            (for example, empty text). No file is written in that case.
    """
    # sample rate the .wav is written with (Kokoro's documented output rate)
    sample_rate_hz = 24000

    # perf_counter is monotonic, so the measured latency cannot be skewed by
    # system clock adjustments the way time.time() can
    start = time.perf_counter()

    # the pipeline yields 3-tuples whose last item is the audio for one
    # segment; segments without audio come back as None and are skipped
    chunks = [
        audio
        for _, _, audio in _pipeline(text, voice=voice, speed=speed)
        if audio is not None
    ]

    # np.concatenate([]) fails with an opaque "need at least one array"
    # message; raise the same exception type with an actionable explanation
    if not chunks:
        raise ValueError(
            "Kokoro produced no audio for the given text (is it empty?); "
            "nothing was written to " + repr(output_path)
        )

    audio_out = np.concatenate(chunks)
    sf.write(output_path, audio_out, sample_rate_hz)

    latency = time.perf_counter() - start

    return {
        "output_path": output_path,
        "latency_seconds": round(latency, 3),
        "engine": "kokoro",
        "voice": voice,
    }