from typing import Tuple, Optional import numpy as np from kokoro_onnx import Kokoro from voice_dialogue.tts.configs.kokoro import KokoroTTSConfig from voice_dialogue.tts.manager import tts_tables from voice_dialogue.tts.runtime.interface import TTSInterface from voice_dialogue.utils.logger import logger @tts_tables.register("tts_classes", "kokoro") class KokoroTTS(TTSInterface): def __init__(self, config: KokoroTTSConfig): super().__init__(config) self.tts_model: Optional[Kokoro] = None self.espeak_ng = None def setup(self, **kwargs) -> None: if self.config.is_chinese_voice: self.tts_model = Kokoro( model_path=self.config.model_path, voices_path=self.config.voices_path, vocab_config=self.config.vocab_config_path, ) from misaki import zh self.espeak_ng = zh.ZHG2P(version="1.1") else: self.tts_model = Kokoro( model_path=self.config.model_path, voices_path=self.config.voices_path ) from misaki import en, espeak fallback = espeak.EspeakFallback(british=False) self.espeak_ng = en.G2P(trf=False, british=False, fallback=fallback) def warmup(self, warmup_steps: int = 1) -> None: logger.info('[INFO:] Warming up Kokoro TTS engine...') warmup_texts = ['Warming up TTS engine.', '预热文字转音频引擎。'] for _ in range(warmup_steps): for warmup_text in warmup_texts: self.synthesize(warmup_text) logger.info('[INFO:] Warm up Kokoro TTS engine finished.') def synthesize(self, text: str, **kwargs) -> Tuple[np.ndarray, int]: phonemes, _ = self.espeak_ng(text) samples, sample_rate = self.tts_model.create(phonemes, **self.config.inference_parameters.model_dump()) return samples, sample_rate