liumaolin
refactor(core): Architecturally decouple Audio, ASR, and TTS modules
60f8238
from typing import Tuple, Optional
import numpy as np
from kokoro_onnx import Kokoro
from voice_dialogue.tts.configs.kokoro import KokoroTTSConfig
from voice_dialogue.tts.manager import tts_tables
from voice_dialogue.tts.runtime.interface import TTSInterface
from voice_dialogue.utils.logger import logger
@tts_tables.register("tts_classes", "kokoro")
class KokoroTTS(TTSInterface):
def __init__(self, config: KokoroTTSConfig):
super().__init__(config)
self.tts_model: Optional[Kokoro] = None
self.espeak_ng = None
def setup(self, **kwargs) -> None:
if self.config.is_chinese_voice:
self.tts_model = Kokoro(
model_path=self.config.model_path,
voices_path=self.config.voices_path,
vocab_config=self.config.vocab_config_path,
)
from misaki import zh
self.espeak_ng = zh.ZHG2P(version="1.1")
else:
self.tts_model = Kokoro(
model_path=self.config.model_path,
voices_path=self.config.voices_path
)
from misaki import en, espeak
fallback = espeak.EspeakFallback(british=False)
self.espeak_ng = en.G2P(trf=False, british=False, fallback=fallback)
def warmup(self, warmup_steps: int = 1) -> None:
logger.info('[INFO:] Warming up Kokoro TTS engine...')
warmup_texts = ['Warming up TTS engine.', '预热文字转音频引擎。']
for _ in range(warmup_steps):
for warmup_text in warmup_texts:
self.synthesize(warmup_text)
logger.info('[INFO:] Warm up Kokoro TTS engine finished.')
def synthesize(self, text: str, **kwargs) -> Tuple[np.ndarray, int]:
phonemes, _ = self.espeak_ng(text)
samples, sample_rate = self.tts_model.create(phonemes, **self.config.inference_parameters.model_dump())
return samples, sample_rate