liumaolin committed on
Commit
1cbd55c
·
1 Parent(s): d231de5

Add Kokoro TTS support: integrate new TTS model, configuration, and runtime components for enhanced multilingual voice synthesis.

Browse files
models/tts/kokoro-v1.0.int8.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e742170d309016e5891a994e1ce1559c702a2ccd0075e67ef7157974f6406cb
3
+ size 92361271
models/tts/voices-v1.0.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bca610b8308e8d99f32e6fe4197e7ec01679264efed0cac9140fe9c29f1fbf7d
3
+ size 28214398
src/VoiceDialogue/services/audio/audio_generator/configs/kokoro.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ..models.kokoro import KokoroTTSConfig
2
+
3
# File names of the artifacts used by the English Kokoro voices.
ENGLISH_MODEL_FILES = dict(
    model='kokoro-v1.0.int8.onnx',
    voice='voices-v1.0.bin',
)

# File names of the artifacts used by the Chinese Kokoro voices; these
# additionally name a vocabulary config file.
CHINESE_MODEL_FILES = dict(
    model='kokoro-v1.1-zh.onnx',
    voice='voices-v1.1-zh.bin',
    vocab_config='config.json',
)


def _english_voice_entry(character_name, voice, description):
    """Build the raw config dict for one English Kokoro voice.

    All English voices share the same model files, speed and phoneme
    setting; only the display name, voice id and description differ.
    """
    return {
        'character_name': character_name,
        'cover_image': '',
        'description': description,
        'file_size': '',
        'is_chinese_voice': False,
        'inference_parameters': {
            'voice': voice,
            'speed': 1.0,
            'is_phonemes': True,
        },
        'model_files': ENGLISH_MODEL_FILES,
    }


# Raw (unvalidated) definitions of every bundled Kokoro voice.
KOKORO_TTS_CONFIGS = [
    _english_voice_entry(
        'Heart',
        'af_heart',
        'Heart是一个温暖亲切的英语女性语音,声音富有感情色彩,适合情感表达和温馨内容的语音合成。',
    ),
    _english_voice_entry(
        'Bella',
        'af_bella',
        'Bella是一个优质的英语女性语音,具有清晰自然的发音和良好的表现力,适合各种英语内容的语音合成。',
    ),
    _english_voice_entry(
        'Nicole',
        'af_nicole',
        'Nicole是一个高质量的英语女性语音,发音清晰准确,语调自然流畅。',
    ),
]
55
+
56
+
57
def get_kokoro_configs() -> list[KokoroTTSConfig]:
    """Return one ``KokoroTTSConfig`` per entry in ``KOKORO_TTS_CONFIGS``.

    Each raw dict is expanded into keyword arguments of the config
    constructor; the result preserves the order of the raw list.
    """
    configs = []
    for raw_entry in KOKORO_TTS_CONFIGS:
        configs.append(KokoroTTSConfig(**raw_entry))
    return configs
src/VoiceDialogue/services/audio/audio_generator/models/__init__.py CHANGED
@@ -26,6 +26,16 @@ except ImportError:
26
 
27
  logging.warning("MoYoYo TTS config not available")
28
 
 
 
 
 
 
 
 
 
 
 
29
  # 动态构建导出列表
30
  __all__ = [
31
  'TTSConfigType',
@@ -37,6 +47,8 @@ __all__ = [
37
 
38
  if _moyoyo_available:
39
  __all__.append('MoYoYoTTSConfig')
 
 
40
 
41
 
42
  # 自动注册所有可用的配置
@@ -51,6 +63,15 @@ def _auto_register_configs():
51
  import logging
52
  logging.error(f"Failed to auto-register configs: {e}")
53
 
 
 
 
 
 
 
 
 
 
54
 
55
  # 模块加载时自动注册配置
56
  _auto_register_configs()
 
26
 
27
  logging.warning("MoYoYo TTS config not available")
28
 
29
+ try:
30
+ from .kokoro import KokoroTTSConfig
31
+
32
+ _kokoro_available = True
33
+ except ImportError:
34
+ _kokoro_available = False
35
+ import logging
36
+
37
+ logging.warning("Kokoro TTS config not available")
38
+
39
  # 动态构建导出列表
40
  __all__ = [
41
  'TTSConfigType',
 
47
 
48
  if _moyoyo_available:
49
  __all__.append('MoYoYoTTSConfig')
50
+ if _kokoro_available:
51
+ __all__.append('KokoroTTSConfig')
52
 
53
 
54
  # 自动注册所有可用的配置
 
63
  import logging
64
  logging.error(f"Failed to auto-register configs: {e}")
65
 
66
+ try:
67
+ if _kokoro_available:
68
+ from ..configs.kokoro import get_kokoro_configs
69
+ for config in get_kokoro_configs():
70
+ tts_config_registry.register_config(config)
71
+ except Exception as e:
72
+ import logging
73
+ logging.error(f"Failed to auto-register configs: {e}")
74
+
75
 
76
  # 模块加载时自动注册配置
77
  _auto_register_configs()
src/VoiceDialogue/services/audio/audio_generator/models/kokoro.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import typing
2
+ from pathlib import Path
3
+
4
+ from pydantic import BaseModel, Field
5
+
6
+ from .base import BaseTTSConfig, TTSConfigType
7
+ from config import paths
8
+
9
+
10
class InferenceParameters(BaseModel):
    """Inference parameters for Kokoro TTS."""

    # Name of the voice character to synthesize with (required).
    voice: str = Field(
        description="语音角色名称",
    )
    # Playback speed of the generated speech.
    speed: float = Field(
        description="语音播放速度",
        default=1.0,
    )
    # Whether the input handed to the engine is phonemes.
    is_phonemes: bool = Field(
        description="是否使用音素",
        default=True,
    )
15
+
16
+
17
class ModelFiles(BaseModel):
    """File names of the artifacts that make up one Kokoro model.

    The values are bare file names; they are resolved against the model
    storage directory by the owning config.
    """

    # Model file name.
    model: str = Field(default='', description="模型文件名")
    # Voice pack file name.
    voice: str = Field(default='', description="语音文件名")
    # Phoneme/vocabulary config file name. It defaults to None (not every
    # model ships one), so the annotation must admit None; with a plain
    # ``str`` annotation an explicit ``vocab_config=None`` fails validation.
    vocab_config: typing.Optional[str] = Field(default=None, description="音素配置文件名")
22
+
23
+
24
class KokoroTTSConfig(BaseTTSConfig):
    """Configuration of a single Kokoro TTS voice.

    Combines the inference parameters passed to the engine with the file
    names of the model artifacts, and resolves those names to paths under
    the ``tts`` sub-directory of ``paths.MODELS_PATH``.
    """

    tts_type: TTSConfigType = TTSConfigType.KOKORO
    inference_parameters: InferenceParameters
    model_files: ModelFiles

    def get_model_storage_path(self) -> Path:
        """Return the directory holding the model files, creating it if needed."""
        storage_path = paths.MODELS_PATH / 'tts'
        # exist_ok=True already makes this a no-op for an existing directory,
        # so no separate (race-prone) exists() check is needed.
        storage_path.mkdir(parents=True, exist_ok=True)
        return storage_path

    def is_model_complete(self) -> bool:
        """Return True when every configured model file exists on disk.

        Empty or unset file names are skipped.
        """
        storage_path = self.get_model_storage_path()
        configured_files = (
            file_name
            for file_name in self.model_files.model_dump().values()
            if file_name
        )
        return all((storage_path / file_name).exists() for file_name in configured_files)

    def download_model(self, progress_callback: typing.Optional[typing.Callable] = None):
        """No-op: this config does not implement downloading."""
        pass

    def delete_model(self):
        """No-op: this config does not implement deletion."""
        pass

    @property
    def model_path(self) -> Path:
        """Path of the model file inside the storage directory."""
        return self.get_model_storage_path() / self.model_files.model

    @property
    def voices_path(self) -> Path:
        """Path of the voice pack file inside the storage directory."""
        return self.get_model_storage_path() / self.model_files.voice

    @property
    def vocab_config_path(self) -> typing.Optional[Path]:
        """Path of the vocabulary config file, or None when none is configured.

        ``model_files.vocab_config`` defaults to None; joining a path with
        None raises ``TypeError``, so the unset case is reported as None.
        """
        vocab_config = self.model_files.vocab_config
        if not vocab_config:
            return None
        return self.get_model_storage_path() / vocab_config
src/VoiceDialogue/services/audio/audio_generator/runtime/__init__.py CHANGED
@@ -12,11 +12,13 @@ from .interface import TTSInterface, TTSFactory
12
  # 导入所有TTS实现,确保注册装饰器被执行
13
  try:
14
  from .moyoyo import MoYoYoTTS
 
15
 
16
  __all__ = [
17
  'TTSInterface',
18
  'TTSFactory',
19
- 'MoYoYoTTS'
 
20
  ]
21
  except ImportError as e:
22
  # 如果某些TTS实现无法导入,不影响整体功能
 
12
  # 导入所有TTS实现,确保注册装饰器被执行
13
  try:
14
  from .moyoyo import MoYoYoTTS
15
+ from .kokoro import KokoroTTS
16
 
17
  __all__ = [
18
  'TTSInterface',
19
  'TTSFactory',
20
+ 'MoYoYoTTS',
21
+ 'KokoroTTS'
22
  ]
23
  except ImportError as e:
24
  # 如果某些TTS实现无法导入,不影响整体功能
src/VoiceDialogue/services/audio/audio_generator/runtime/interface.py CHANGED
@@ -34,7 +34,7 @@ class TTSInterface(ABC):
34
  pass
35
 
36
  @abstractmethod
37
- def synthesize(self, text: str, **kwargs) -> Tuple[int, np.ndarray]:
38
  """
39
  将文本转换为语音
40
 
 
34
  pass
35
 
36
  @abstractmethod
37
+ def synthesize(self, text: str, **kwargs) -> Tuple[np.ndarray, int]:
38
  """
39
  将文本转换为语音
40
 
src/VoiceDialogue/services/audio/audio_generator/runtime/kokoro.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Tuple, Optional
2
+
3
+ import numpy as np
4
+ from kokoro_onnx import Kokoro
5
+
6
+ from .interface import TTSInterface
7
+ from ..configs.kokoro import KokoroTTSConfig
8
+ from ..manager import tts_tables
9
+
10
+
11
@tts_tables.register("tts_classes", "kokoro")
class KokoroTTS(TTSInterface):
    """TTS implementation backed by ``kokoro_onnx.Kokoro``.

    Text is first converted to phonemes by a misaki G2P object and the
    phonemes are then synthesized by the Kokoro model. ``setup`` must be
    called before ``warmup`` or ``synthesize``.
    """

    def __init__(self, config: KokoroTTSConfig):
        super().__init__(config)
        # Both are created lazily in setup().
        self.tts_model: Optional[Kokoro] = None
        # Despite the name, this holds the misaki G2P callable.
        self.espeak_ng = None

    def setup(self, **kwargs) -> None:
        """Load the Kokoro model and the G2P front end for the configured voice.

        Raises:
            ValueError: if a Chinese voice has no vocabulary config file.
        """
        # Kokoro's path parameters are documented as strings, so convert the
        # pathlib.Path objects produced by the config.
        # NOTE(review): a Path passed as vocab_config may not be recognised
        # as a file path by Kokoro - confirm against the library source.
        model_path = str(self.config.model_path)
        voices_path = str(self.config.voices_path)

        if self.config.is_chinese_voice:
            vocab_config_path = self.config.vocab_config_path
            if vocab_config_path is None:
                raise ValueError(
                    "Chinese Kokoro voices require model_files.vocab_config to be set"
                )
            self.tts_model = Kokoro(
                model_path=model_path,
                voices_path=voices_path,
                vocab_config=str(vocab_config_path),
            )
            # Imported here so only the needed language front end is loaded.
            from misaki import zh
            self.espeak_ng = zh.ZHG2P(version="1.1")
        else:
            self.tts_model = Kokoro(
                model_path=model_path,
                voices_path=voices_path,
            )

            from misaki import en, espeak
            fallback = espeak.EspeakFallback(british=False)
            self.espeak_ng = en.G2P(trf=False, british=False, fallback=fallback)

    def warmup(self, warmup_steps: int = 1) -> None:
        """Run ``warmup_steps`` rounds of synthesis over fixed sample texts."""
        print('[INFO:] Warming up Kokoro TTS engine...')
        warmup_texts = ['Warming up TTS engine.', '预热文字转音频引擎。']
        for _ in range(warmup_steps):
            for warmup_text in warmup_texts:
                self.synthesize(warmup_text)
        print('[INFO:] Warm up Kokoro TTS engine finished.')

    def synthesize(self, text: str, **kwargs) -> Tuple[np.ndarray, int]:
        """Convert ``text`` to speech.

        Args:
            text: The text to speak.
            **kwargs: Accepted for interface compatibility; not used.

        Returns:
            A ``(samples, sample_rate)`` tuple as produced by ``Kokoro.create``.

        Raises:
            RuntimeError: if ``setup`` has not been called yet.
        """
        if self.tts_model is None or self.espeak_ng is None:
            # Fail with a clear message instead of "'NoneType' is not callable".
            raise RuntimeError("KokoroTTS.setup() must be called before synthesize()")

        # The G2P callable returns a pair; only the phoneme string is needed.
        phonemes, _ = self.espeak_ng(text)
        samples, sample_rate = self.tts_model.create(
            phonemes, **self.config.inference_parameters.model_dump()
        )
        return samples, sample_rate
src/VoiceDialogue/services/audio/audio_generator/runtime/moyoyo.py CHANGED
@@ -34,12 +34,12 @@ class MoYoYoTTS(TTSInterface):
34
 
35
  def warmup(self, warmup_steps: int = 1) -> None:
36
  """预热TTS引擎"""
37
- print('[INFO:] Warming up TTS engine...')
38
  warmup_texts = ['Warming up TTS engine.', '预热文字转音频引擎。']
39
  for _ in range(warmup_steps):
40
  for warmup_text in warmup_texts:
41
  self.tts_module.generate_audio(warmup_text, warmup=True)
42
- print('[INFO:] Warm up TTS engine finished.')
43
 
44
  def synthesize(self, text: str, **kwargs) -> Tuple[np.ndarray, int]:
45
  """合成语音"""
 
34
 
35
  def warmup(self, warmup_steps: int = 1) -> None:
36
  """预热TTS引擎"""
37
+ print('[INFO:] Warming up MoYoYo TTS engine...')
38
  warmup_texts = ['Warming up TTS engine.', '预热文字转音频引擎。']
39
  for _ in range(warmup_steps):
40
  for warmup_text in warmup_texts:
41
  self.tts_module.generate_audio(warmup_text, warmup=True)
42
+ print('[INFO:] Warm up MoYoYo TTS engine finished.')
43
 
44
  def synthesize(self, text: str, **kwargs) -> Tuple[np.ndarray, int]:
45
  """合成语音"""