liumaolin
committed on
Commit
·
1cbd55c
1
Parent(s):
d231de5
Add Kokoro TTS support: integrate new TTS model, configuration, and runtime components for enhanced multilingual voice synthesis.
Browse files
- models/tts/kokoro-v1.0.int8.onnx +3 -0
- models/tts/voices-v1.0.bin +3 -0
- src/VoiceDialogue/services/audio/audio_generator/configs/kokoro.py +58 -0
- src/VoiceDialogue/services/audio/audio_generator/models/__init__.py +21 -0
- src/VoiceDialogue/services/audio/audio_generator/models/kokoro.py +62 -0
- src/VoiceDialogue/services/audio/audio_generator/runtime/__init__.py +3 -1
- src/VoiceDialogue/services/audio/audio_generator/runtime/interface.py +1 -1
- src/VoiceDialogue/services/audio/audio_generator/runtime/kokoro.py +48 -0
- src/VoiceDialogue/services/audio/audio_generator/runtime/moyoyo.py +2 -2
models/tts/kokoro-v1.0.int8.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6e742170d309016e5891a994e1ce1559c702a2ccd0075e67ef7157974f6406cb
|
| 3 |
+
size 92361271
|
models/tts/voices-v1.0.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bca610b8308e8d99f32e6fe4197e7ec01679264efed0cac9140fe9c29f1fbf7d
|
| 3 |
+
size 28214398
|
src/VoiceDialogue/services/audio/audio_generator/configs/kokoro.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from ..models.kokoro import KokoroTTSConfig
|
| 2 |
+
|
| 3 |
+
ENGLISH_MODEL_FILES = {
|
| 4 |
+
'model': 'kokoro-v1.0.int8.onnx',
|
| 5 |
+
'voice': 'voices-v1.0.bin'
|
| 6 |
+
}
|
| 7 |
+
|
| 8 |
+
CHINESE_MODEL_FILES = {
|
| 9 |
+
'model': 'kokoro-v1.1-zh.onnx',
|
| 10 |
+
'voice': 'voices-v1.1-zh.bin',
|
| 11 |
+
'vocab_config': 'config.json'
|
| 12 |
+
}
|
| 13 |
+
|
| 14 |
+
KOKORO_TTS_CONFIGS = [
|
| 15 |
+
{
|
| 16 |
+
'character_name': 'Heart',
|
| 17 |
+
'cover_image': '',
|
| 18 |
+
'description': 'Heart是一个温暖亲切的英语女性语音,声音富有感情色彩,适合情感表达和温馨内容的语音合成。',
|
| 19 |
+
'file_size': '',
|
| 20 |
+
'is_chinese_voice': False,
|
| 21 |
+
'inference_parameters': {
|
| 22 |
+
'voice': 'af_heart',
|
| 23 |
+
'speed': 1.0,
|
| 24 |
+
'is_phonemes': True,
|
| 25 |
+
},
|
| 26 |
+
'model_files': ENGLISH_MODEL_FILES,
|
| 27 |
+
},
|
| 28 |
+
{
|
| 29 |
+
'character_name': 'Bella',
|
| 30 |
+
'cover_image': '',
|
| 31 |
+
'description': 'Bella是一个优质的英语女性语音,具有清晰自然的发音和良好的表现力,适合各种英语内容的语音合成。',
|
| 32 |
+
'file_size': '',
|
| 33 |
+
'is_chinese_voice': False,
|
| 34 |
+
'inference_parameters': {
|
| 35 |
+
'voice': 'af_bella',
|
| 36 |
+
'speed': 1.0,
|
| 37 |
+
'is_phonemes': True,
|
| 38 |
+
},
|
| 39 |
+
'model_files': ENGLISH_MODEL_FILES,
|
| 40 |
+
},
|
| 41 |
+
{
|
| 42 |
+
'character_name': 'Nicole',
|
| 43 |
+
'cover_image': '',
|
| 44 |
+
'description': 'Nicole是一个高质量的英语女性语音,发音清晰准确,语调自然流畅。',
|
| 45 |
+
'file_size': '',
|
| 46 |
+
'is_chinese_voice': False,
|
| 47 |
+
'inference_parameters': {
|
| 48 |
+
'voice': 'af_nicole',
|
| 49 |
+
'speed': 1.0,
|
| 50 |
+
'is_phonemes': True,
|
| 51 |
+
},
|
| 52 |
+
'model_files': ENGLISH_MODEL_FILES,
|
| 53 |
+
},
|
| 54 |
+
]
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def get_kokoro_configs() -> list[KokoroTTSConfig]:
|
| 58 |
+
return [KokoroTTSConfig(**config) for config in KOKORO_TTS_CONFIGS]
|
src/VoiceDialogue/services/audio/audio_generator/models/__init__.py
CHANGED
|
@@ -26,6 +26,16 @@ except ImportError:
|
|
| 26 |
|
| 27 |
logging.warning("MoYoYo TTS config not available")
|
| 28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
# 动态构建导出列表
|
| 30 |
__all__ = [
|
| 31 |
'TTSConfigType',
|
|
@@ -37,6 +47,8 @@ __all__ = [
|
|
| 37 |
|
| 38 |
if _moyoyo_available:
|
| 39 |
__all__.append('MoYoYoTTSConfig')
|
|
|
|
|
|
|
| 40 |
|
| 41 |
|
| 42 |
# 自动注册所有可用的配置
|
|
@@ -51,6 +63,15 @@ def _auto_register_configs():
|
|
| 51 |
import logging
|
| 52 |
logging.error(f"Failed to auto-register configs: {e}")
|
| 53 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
|
| 55 |
# 模块加载时自动注册配置
|
| 56 |
_auto_register_configs()
|
|
|
|
| 26 |
|
| 27 |
logging.warning("MoYoYo TTS config not available")
|
| 28 |
|
| 29 |
+
try:
|
| 30 |
+
from .kokoro import KokoroTTSConfig
|
| 31 |
+
|
| 32 |
+
_kokoro_available = True
|
| 33 |
+
except ImportError:
|
| 34 |
+
_kokoro_available = False
|
| 35 |
+
import logging
|
| 36 |
+
|
| 37 |
+
logging.warning("Kokoro TTS config not available")
|
| 38 |
+
|
| 39 |
# 动态构建导出列表
|
| 40 |
__all__ = [
|
| 41 |
'TTSConfigType',
|
|
|
|
| 47 |
|
| 48 |
if _moyoyo_available:
|
| 49 |
__all__.append('MoYoYoTTSConfig')
|
| 50 |
+
if _kokoro_available:
|
| 51 |
+
__all__.append('KokoroTTSConfig')
|
| 52 |
|
| 53 |
|
| 54 |
# 自动注册所有可用的配置
|
|
|
|
| 63 |
import logging
|
| 64 |
logging.error(f"Failed to auto-register configs: {e}")
|
| 65 |
|
| 66 |
+
try:
|
| 67 |
+
if _kokoro_available:
|
| 68 |
+
from ..configs.kokoro import get_kokoro_configs
|
| 69 |
+
for config in get_kokoro_configs():
|
| 70 |
+
tts_config_registry.register_config(config)
|
| 71 |
+
except Exception as e:
|
| 72 |
+
import logging
|
| 73 |
+
logging.error(f"Failed to auto-register configs: {e}")
|
| 74 |
+
|
| 75 |
|
| 76 |
# 模块加载时自动注册配置
|
| 77 |
_auto_register_configs()
|
src/VoiceDialogue/services/audio/audio_generator/models/kokoro.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import typing
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
|
| 4 |
+
from pydantic import BaseModel, Field
|
| 5 |
+
|
| 6 |
+
from .base import BaseTTSConfig, TTSConfigType
|
| 7 |
+
from config import paths
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class InferenceParameters(BaseModel):
|
| 11 |
+
"""Kokoro TTS 推理参数"""
|
| 12 |
+
voice: str = Field(description="语音角色名称")
|
| 13 |
+
speed: float = Field(default=1.0, description="语音播放速度")
|
| 14 |
+
is_phonemes: bool = Field(default=True, description="是否使用音素")
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class ModelFiles(BaseModel):
|
| 18 |
+
"""模型文件配置"""
|
| 19 |
+
model: str = Field(default='', description="模型文件名")
|
| 20 |
+
voice: str = Field(default='', description="语音文件名")
|
| 21 |
+
vocab_config: str = Field(default=None, description="音素配置文件名")
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
class KokoroTTSConfig(BaseTTSConfig):
|
| 25 |
+
tts_type: TTSConfigType = TTSConfigType.KOKORO
|
| 26 |
+
inference_parameters: InferenceParameters
|
| 27 |
+
model_files: ModelFiles
|
| 28 |
+
|
| 29 |
+
def get_model_storage_path(self) -> Path:
|
| 30 |
+
storage_path = paths.MODELS_PATH / 'tts'
|
| 31 |
+
if not storage_path.exists():
|
| 32 |
+
storage_path.mkdir(parents=True, exist_ok=True)
|
| 33 |
+
return storage_path
|
| 34 |
+
|
| 35 |
+
def is_model_complete(self) -> bool:
|
| 36 |
+
storage_path = self.get_model_storage_path()
|
| 37 |
+
for model_file in self.model_files.model_dump().values():
|
| 38 |
+
if not model_file:
|
| 39 |
+
continue
|
| 40 |
+
|
| 41 |
+
file_path = storage_path / model_file
|
| 42 |
+
if not file_path.exists():
|
| 43 |
+
return False
|
| 44 |
+
return True
|
| 45 |
+
|
| 46 |
+
def download_model(self, progress_callback: typing.Callable = None):
|
| 47 |
+
pass
|
| 48 |
+
|
| 49 |
+
def delete_model(self):
|
| 50 |
+
pass
|
| 51 |
+
|
| 52 |
+
@property
|
| 53 |
+
def model_path(self):
|
| 54 |
+
return self.get_model_storage_path() / self.model_files.model
|
| 55 |
+
|
| 56 |
+
@property
|
| 57 |
+
def voices_path(self):
|
| 58 |
+
return self.get_model_storage_path() / self.model_files.voice
|
| 59 |
+
|
| 60 |
+
@property
|
| 61 |
+
def vocab_config_path(self):
|
| 62 |
+
return self.get_model_storage_path() / self.model_files.vocab_config
|
src/VoiceDialogue/services/audio/audio_generator/runtime/__init__.py
CHANGED
|
@@ -12,11 +12,13 @@ from .interface import TTSInterface, TTSFactory
|
|
| 12 |
# 导入所有TTS实现,确保注册装饰器被执行
|
| 13 |
try:
|
| 14 |
from .moyoyo import MoYoYoTTS
|
|
|
|
| 15 |
|
| 16 |
__all__ = [
|
| 17 |
'TTSInterface',
|
| 18 |
'TTSFactory',
|
| 19 |
-
'MoYoYoTTS'
|
|
|
|
| 20 |
]
|
| 21 |
except ImportError as e:
|
| 22 |
# 如果某些TTS实现无法导入,不影响整体功能
|
|
|
|
| 12 |
# 导入所有TTS实现,确保注册装饰器被执行
|
| 13 |
try:
|
| 14 |
from .moyoyo import MoYoYoTTS
|
| 15 |
+
from .kokoro import KokoroTTS
|
| 16 |
|
| 17 |
__all__ = [
|
| 18 |
'TTSInterface',
|
| 19 |
'TTSFactory',
|
| 20 |
+
'MoYoYoTTS',
|
| 21 |
+
'KokoroTTS'
|
| 22 |
]
|
| 23 |
except ImportError as e:
|
| 24 |
# 如果某些TTS实现无法导入,不影响整体功能
|
src/VoiceDialogue/services/audio/audio_generator/runtime/interface.py
CHANGED
|
@@ -34,7 +34,7 @@ class TTSInterface(ABC):
|
|
| 34 |
pass
|
| 35 |
|
| 36 |
@abstractmethod
|
| 37 |
-
def synthesize(self, text: str, **kwargs) -> Tuple[
|
| 38 |
"""
|
| 39 |
将文本转换为语音
|
| 40 |
|
|
|
|
| 34 |
pass
|
| 35 |
|
| 36 |
@abstractmethod
|
| 37 |
+
def synthesize(self, text: str, **kwargs) -> Tuple[np.ndarray, int]:
|
| 38 |
"""
|
| 39 |
将文本转换为语音
|
| 40 |
|
src/VoiceDialogue/services/audio/audio_generator/runtime/kokoro.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Tuple, Optional
|
| 2 |
+
|
| 3 |
+
import numpy as np
|
| 4 |
+
from kokoro_onnx import Kokoro
|
| 5 |
+
|
| 6 |
+
from .interface import TTSInterface
|
| 7 |
+
from ..configs.kokoro import KokoroTTSConfig
|
| 8 |
+
from ..manager import tts_tables
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
@tts_tables.register("tts_classes", "kokoro")
|
| 12 |
+
class KokoroTTS(TTSInterface):
|
| 13 |
+
def __init__(self, config: KokoroTTSConfig):
|
| 14 |
+
super().__init__(config)
|
| 15 |
+
self.tts_model: Optional[Kokoro] = None
|
| 16 |
+
self.espeak_ng = None
|
| 17 |
+
|
| 18 |
+
def setup(self, **kwargs) -> None:
|
| 19 |
+
if self.config.is_chinese_voice:
|
| 20 |
+
self.tts_model = Kokoro(
|
| 21 |
+
model_path=self.config.model_path,
|
| 22 |
+
voices_path=self.config.voices_path,
|
| 23 |
+
vocab_config=self.config.vocab_config_path,
|
| 24 |
+
)
|
| 25 |
+
from misaki import zh
|
| 26 |
+
self.espeak_ng = zh.ZHG2P(version="1.1")
|
| 27 |
+
else:
|
| 28 |
+
self.tts_model = Kokoro(
|
| 29 |
+
model_path=self.config.model_path,
|
| 30 |
+
voices_path=self.config.voices_path
|
| 31 |
+
)
|
| 32 |
+
|
| 33 |
+
from misaki import en, espeak
|
| 34 |
+
fallback = espeak.EspeakFallback(british=False)
|
| 35 |
+
self.espeak_ng = en.G2P(trf=False, british=False, fallback=fallback)
|
| 36 |
+
|
| 37 |
+
def warmup(self, warmup_steps: int = 1) -> None:
|
| 38 |
+
print('[INFO:] Warming up Kokoro TTS engine...')
|
| 39 |
+
warmup_texts = ['Warming up TTS engine.', '预热文字转音频引擎。']
|
| 40 |
+
for _ in range(warmup_steps):
|
| 41 |
+
for warmup_text in warmup_texts:
|
| 42 |
+
self.synthesize(warmup_text)
|
| 43 |
+
print('[INFO:] Warm up Kokoro TTS engine finished.')
|
| 44 |
+
|
| 45 |
+
def synthesize(self, text: str, **kwargs) -> Tuple[np.ndarray, int]:
|
| 46 |
+
phonemes, _ = self.espeak_ng(text)
|
| 47 |
+
samples, sample_rate = self.tts_model.create(phonemes, **self.config.inference_parameters.model_dump())
|
| 48 |
+
return samples, sample_rate
|
src/VoiceDialogue/services/audio/audio_generator/runtime/moyoyo.py
CHANGED
|
@@ -34,12 +34,12 @@ class MoYoYoTTS(TTSInterface):
|
|
| 34 |
|
| 35 |
def warmup(self, warmup_steps: int = 1) -> None:
|
| 36 |
"""预热TTS引擎"""
|
| 37 |
-
print('[INFO:] Warming up TTS engine...')
|
| 38 |
warmup_texts = ['Warming up TTS engine.', '预热文字转音频引擎。']
|
| 39 |
for _ in range(warmup_steps):
|
| 40 |
for warmup_text in warmup_texts:
|
| 41 |
self.tts_module.generate_audio(warmup_text, warmup=True)
|
| 42 |
-
print('[INFO:] Warm up TTS engine finished.')
|
| 43 |
|
| 44 |
def synthesize(self, text: str, **kwargs) -> Tuple[np.ndarray, int]:
|
| 45 |
"""合成语音"""
|
|
|
|
| 34 |
|
| 35 |
def warmup(self, warmup_steps: int = 1) -> None:
|
| 36 |
"""预热TTS引擎"""
|
| 37 |
+
print('[INFO:] Warming up MoYoYo TTS engine...')
|
| 38 |
warmup_texts = ['Warming up TTS engine.', '预热文字转音频引擎。']
|
| 39 |
for _ in range(warmup_steps):
|
| 40 |
for warmup_text in warmup_texts:
|
| 41 |
self.tts_module.generate_audio(warmup_text, warmup=True)
|
| 42 |
+
print('[INFO:] Warm up MoYoYo TTS engine finished.')
|
| 43 |
|
| 44 |
def synthesize(self, text: str, **kwargs) -> Tuple[np.ndarray, int]:
|
| 45 |
"""合成语音"""
|