liumaolin
committed on
Commit
·
025ca3f
1
Parent(s):
7b86866
Refactor voice model structure: extract MoYoYo-specific configurations and introduce universal TTS registry.
Browse files
- src/VoiceDialogue/main.py +14 -13
- src/VoiceDialogue/models/voice_model/__init__.py +19 -0
- src/VoiceDialogue/models/voice_model/base.py +119 -0
- src/VoiceDialogue/models/{voice_model.py → voice_model/moyoyo_configs.py} +7 -181
- src/VoiceDialogue/models/voice_model/moyoyo_tts.py +159 -0
- src/VoiceDialogue/services/audio/audio_answer.py +11 -22
src/VoiceDialogue/main.py
CHANGED
|
@@ -7,7 +7,7 @@ from config.paths import load_third_party
|
|
| 7 |
|
| 8 |
load_third_party()
|
| 9 |
|
| 10 |
-
from models.voice_model import
|
| 11 |
from services.audio.aec_audio_capture import EchoCancellingAudioCapture
|
| 12 |
from services.audio.audio_answer import TTSAudioGenerator
|
| 13 |
from services.audio.audio_player import AudioStreamPlayer
|
|
@@ -21,7 +21,7 @@ language: typing.Literal['zh', 'en'] = 'en'
|
|
| 21 |
|
| 22 |
def launch_system(
|
| 23 |
user_language: str,
|
| 24 |
-
|
| 25 |
):
|
| 26 |
audio_frames_queue = Queue()
|
| 27 |
user_voice_queue = Queue()
|
|
@@ -58,21 +58,22 @@ def launch_system(
|
|
| 58 |
threads.append(answer_generator_worker)
|
| 59 |
|
| 60 |
speaker_mapping = {
|
| 61 |
-
'罗翔':
|
| 62 |
-
'马保国':
|
| 63 |
-
'沈逸':
|
| 64 |
-
'杨幂':
|
| 65 |
-
'周杰伦':
|
| 66 |
-
'马云':
|
| 67 |
}
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
|
|
|
| 72 |
audio_generator_worker = TTSAudioGenerator(
|
| 73 |
processed_answer_queue=generated_answer_queue,
|
| 74 |
tts_generated_audio_queue=tts_generated_audio_queue,
|
| 75 |
-
voice_role=
|
| 76 |
)
|
| 77 |
audio_generator_worker.start()
|
| 78 |
threads.append(audio_generator_worker)
|
|
|
|
| 7 |
|
| 8 |
load_third_party()
|
| 9 |
|
| 10 |
+
from models.voice_model import tts_config_registry, TTSConfigType
|
| 11 |
from services.audio.aec_audio_capture import EchoCancellingAudioCapture
|
| 12 |
from services.audio.audio_answer import TTSAudioGenerator
|
| 13 |
from services.audio.audio_player import AudioStreamPlayer
|
|
|
|
| 21 |
|
| 22 |
def launch_system(
|
| 23 |
user_language: str,
|
| 24 |
+
speaker: str
|
| 25 |
):
|
| 26 |
audio_frames_queue = Queue()
|
| 27 |
user_voice_queue = Queue()
|
|
|
|
| 58 |
threads.append(answer_generator_worker)
|
| 59 |
|
| 60 |
speaker_mapping = {
|
| 61 |
+
'罗翔': 'Luo Xiang',
|
| 62 |
+
'马保国': 'Ma Baoguo',
|
| 63 |
+
'沈逸': 'Shen Yi',
|
| 64 |
+
'杨幂': 'Yang Mi',
|
| 65 |
+
'周杰伦': 'Jay Zhou',
|
| 66 |
+
'马云': 'Ma Yun',
|
| 67 |
}
|
| 68 |
+
role = speaker_mapping.get(speaker)
|
| 69 |
+
if role is None:
|
| 70 |
+
raise ValueError(f"不支持的TTS配置: {speaker}")
|
| 71 |
+
|
| 72 |
+
tts_speaker_config = tts_config_registry.get_config(TTSConfigType.MOYOYO, role)
|
| 73 |
audio_generator_worker = TTSAudioGenerator(
|
| 74 |
processed_answer_queue=generated_answer_queue,
|
| 75 |
tts_generated_audio_queue=tts_generated_audio_queue,
|
| 76 |
+
voice_role=tts_speaker_config
|
| 77 |
)
|
| 78 |
audio_generator_worker.start()
|
| 79 |
threads.append(audio_generator_worker)
|
src/VoiceDialogue/models/voice_model/__init__.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .base import TTSConfigType, VoiceModelStatus, tts_config_registry
|
| 2 |
+
from .moyoyo_configs import get_moyoyo_configs
|
| 3 |
+
from .moyoyo_tts import MoYoYoTTSConfig, MoYoYoTTSInference
|
| 4 |
+
|
| 5 |
+
# 注册MoYoYo TTS
|
| 6 |
+
moyoyo_inference = MoYoYoTTSInference()
|
| 7 |
+
tts_config_registry.register_inference_engine(TTSConfigType.MOYOYO, moyoyo_inference)
|
| 8 |
+
|
| 9 |
+
# 注册所有MoYoYo配置
|
| 10 |
+
for config in get_moyoyo_configs():
|
| 11 |
+
tts_config_registry.register_config(config)
|
| 12 |
+
|
| 13 |
+
__all__ = [
|
| 14 |
+
'TTSConfigType',
|
| 15 |
+
'VoiceModelStatus',
|
| 16 |
+
'tts_config_registry',
|
| 17 |
+
'MoYoYoTTSConfig',
|
| 18 |
+
'MoYoYoTTSInference',
|
| 19 |
+
]
|
src/VoiceDialogue/models/voice_model/base.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import typing
|
| 2 |
+
from abc import ABC, abstractmethod
|
| 3 |
+
from enum import Enum
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
from pydantic import BaseModel
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class TTSConfigType(Enum):
|
| 10 |
+
"""TTS引擎类型枚举"""
|
| 11 |
+
MOYOYO = 'moyoyo'
|
| 12 |
+
EDGE_TTS = 'edge_tts'
|
| 13 |
+
BARK = 'bark'
|
| 14 |
+
# 可以添加更多TTS引擎
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class VoiceModelStatus(Enum):
|
| 18 |
+
"""声音模型状态枚举"""
|
| 19 |
+
NOT_DOWNLOADED = 'not_downloaded'
|
| 20 |
+
DOWNLOADING = 'downloading'
|
| 21 |
+
DOWNLOADED = 'downloaded'
|
| 22 |
+
FAILED = 'failed'
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class BaseTTSConfig(BaseModel, ABC):
|
| 26 |
+
"""TTS配置基类"""
|
| 27 |
+
tts_type: TTSConfigType
|
| 28 |
+
character_name: str
|
| 29 |
+
cover_image: str
|
| 30 |
+
description: str
|
| 31 |
+
file_size: str
|
| 32 |
+
is_chinese_voice: bool
|
| 33 |
+
|
| 34 |
+
@abstractmethod
|
| 35 |
+
def get_model_storage_path(self) -> Path:
|
| 36 |
+
"""获取模型存储路径"""
|
| 37 |
+
pass
|
| 38 |
+
|
| 39 |
+
@abstractmethod
|
| 40 |
+
def is_model_complete(self) -> bool:
|
| 41 |
+
"""检查模型文件是否完整"""
|
| 42 |
+
pass
|
| 43 |
+
|
| 44 |
+
@abstractmethod
|
| 45 |
+
def download_model(self, progress_callback: typing.Callable = None):
|
| 46 |
+
"""下载模型"""
|
| 47 |
+
pass
|
| 48 |
+
|
| 49 |
+
@abstractmethod
|
| 50 |
+
def delete_model(self):
|
| 51 |
+
"""删除模型"""
|
| 52 |
+
pass
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
class BaseTTSInference(ABC):
|
| 56 |
+
"""TTS推理基类"""
|
| 57 |
+
|
| 58 |
+
@abstractmethod
|
| 59 |
+
def generate_speech(self, text: str, config: BaseTTSConfig, **kwargs) -> bytes:
|
| 60 |
+
"""生成语音"""
|
| 61 |
+
pass
|
| 62 |
+
|
| 63 |
+
@abstractmethod
|
| 64 |
+
def is_supported_config(self, config: BaseTTSConfig) -> bool:
|
| 65 |
+
"""检查是否支持此配置"""
|
| 66 |
+
pass
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
class TTSConfigRegistry:
|
| 70 |
+
"""TTS注册表,管理所有TTS引擎和配置"""
|
| 71 |
+
|
| 72 |
+
def __init__(self):
|
| 73 |
+
self._configs: dict[str, BaseTTSConfig] = {}
|
| 74 |
+
self._inference_engines: dict[TTSConfigType, BaseTTSInference] = {}
|
| 75 |
+
|
| 76 |
+
def register_config(self, config: BaseTTSConfig):
|
| 77 |
+
"""注册TTS配置"""
|
| 78 |
+
key = f"{config.tts_type.value}:{config.character_name}"
|
| 79 |
+
self._configs[key] = config
|
| 80 |
+
|
| 81 |
+
def register_inference_engine(self, tts_type: TTSConfigType, engine: BaseTTSInference):
|
| 82 |
+
"""注册TTS推理引擎"""
|
| 83 |
+
self._inference_engines[tts_type] = engine
|
| 84 |
+
|
| 85 |
+
def get_config(self, tts_type: TTSConfigType, character_name: str) -> BaseTTSConfig:
|
| 86 |
+
"""获取指定配置"""
|
| 87 |
+
key = f"{tts_type.value}:{character_name}"
|
| 88 |
+
return self._configs.get(key)
|
| 89 |
+
|
| 90 |
+
def get_configs_by_type(self, tts_type: TTSConfigType) -> list[BaseTTSConfig]:
|
| 91 |
+
"""获取指定类型的所有配置"""
|
| 92 |
+
return [config for config in self._configs.values()
|
| 93 |
+
if config.tts_type == tts_type]
|
| 94 |
+
|
| 95 |
+
def get_all_configs(self) -> list[BaseTTSConfig]:
|
| 96 |
+
"""获取所有配置"""
|
| 97 |
+
return list(self._configs.values())
|
| 98 |
+
|
| 99 |
+
def get_inference_engine(self, tts_type: TTSConfigType) -> BaseTTSInference:
|
| 100 |
+
"""获取推理引擎"""
|
| 101 |
+
return self._inference_engines.get(tts_type)
|
| 102 |
+
|
| 103 |
+
def generate_speech(self, tts_type: TTSConfigType, character_name: str,
|
| 104 |
+
text: str, **kwargs) -> bytes:
|
| 105 |
+
"""生成语音的统一接口"""
|
| 106 |
+
config = self.get_config(tts_type, character_name)
|
| 107 |
+
engine = self.get_inference_engine(tts_type)
|
| 108 |
+
|
| 109 |
+
if not config or not engine:
|
| 110 |
+
raise ValueError(f"TTS配置或引擎不存在: {tts_type.value}:{character_name}")
|
| 111 |
+
|
| 112 |
+
if not engine.is_supported_config(config):
|
| 113 |
+
raise ValueError(f"推理引擎不支持此配置: {tts_type.value}:{character_name}")
|
| 114 |
+
|
| 115 |
+
return engine.generate_speech(text, config, **kwargs)
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
# 全局TTS注册表实例
|
| 119 |
+
tts_config_registry = TTSConfigRegistry()
|
src/VoiceDialogue/models/{voice_model.py → voice_model/moyoyo_configs.py}
RENAMED
|
@@ -1,12 +1,4 @@
|
|
| 1 |
-
import enum
|
| 2 |
-
import typing
|
| 3 |
-
from concurrent.futures.thread import ThreadPoolExecutor
|
| 4 |
-
from pathlib import Path
|
| 5 |
-
|
| 6 |
-
from pydantic import BaseModel
|
| 7 |
-
|
| 8 |
-
from config.settings import settings
|
| 9 |
-
from utils.download_utils import download_file_from_huggingface
|
| 10 |
|
| 11 |
# 基础预训练模型文件映射
|
| 12 |
BASE_PRETRAINED_FILES = {
|
|
@@ -18,8 +10,8 @@ BASE_PRETRAINED_FILES = {
|
|
| 18 |
'chinese-roberta-wwm-ext-large/tokenizer.json': 'chinese-roberta-wwm-ext-large/tokenizer.json',
|
| 19 |
}
|
| 20 |
|
| 21 |
-
#
|
| 22 |
-
|
| 23 |
{
|
| 24 |
'repository': 'MoYoYoTech/tone-models',
|
| 25 |
'character_name': 'Luo Xiang',
|
|
@@ -184,7 +176,6 @@ VOICE_MODEL_CONFIGS = (
|
|
| 184 |
'inference_parameters': {
|
| 185 |
'text_lang': "zh",
|
| 186 |
'prompt_text': "这是我们最大的希望能招聘的到人。所以今天阿里巴巴公司内部,我自己这么觉得,人才梯队的建设非常之好。",
|
| 187 |
-
# 'prompt_text': "",
|
| 188 |
'prompt_lang': "zh",
|
| 189 |
'top_k': 5,
|
| 190 |
'top_p': 1,
|
|
@@ -198,174 +189,9 @@ VOICE_MODEL_CONFIGS = (
|
|
| 198 |
'seed': 233333,
|
| 199 |
},
|
| 200 |
},
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
class VoiceModelStatus(enum.Enum):
|
| 205 |
-
"""声音模型状态枚举"""
|
| 206 |
-
NOT_DOWNLOADED = 'not_downloaded'
|
| 207 |
-
DOWNLOADING = 'downloading'
|
| 208 |
-
DOWNLOADED = 'downloaded'
|
| 209 |
-
FAILED = 'failed'
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
class ConversationTemplates(BaseModel):
|
| 213 |
-
"""对话模板"""
|
| 214 |
-
opening_remarks: list[str]
|
| 215 |
-
mid_responses: list[str]
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
class VoiceModel(BaseModel):
|
| 219 |
-
"""声音模型配置类"""
|
| 220 |
-
repository: str
|
| 221 |
-
character_name: str
|
| 222 |
-
cover_image: str
|
| 223 |
-
description: str
|
| 224 |
-
file_size: str
|
| 225 |
-
is_chinese_voice: bool
|
| 226 |
-
model_files: dict[str, str]
|
| 227 |
-
inference_parameters: dict[str, typing.Any]
|
| 228 |
-
# conversation_templates: ConversationTemplates
|
| 229 |
-
|
| 230 |
-
_download_status: VoiceModelStatus = VoiceModelStatus.NOT_DOWNLOADED
|
| 231 |
-
|
| 232 |
-
@property
|
| 233 |
-
def download_status(self) -> VoiceModelStatus:
|
| 234 |
-
"""获取下载状态"""
|
| 235 |
-
if self.is_model_complete:
|
| 236 |
-
return VoiceModelStatus.DOWNLOADED
|
| 237 |
-
return self._download_status
|
| 238 |
-
|
| 239 |
-
@download_status.setter
|
| 240 |
-
def download_status(self, status: VoiceModelStatus):
|
| 241 |
-
"""设置下载状态"""
|
| 242 |
-
self._download_status = status
|
| 243 |
-
|
| 244 |
-
@property
|
| 245 |
-
def model_storage_path(self) -> Path:
|
| 246 |
-
"""获取模型存储路径"""
|
| 247 |
-
storage_path = settings.paths.AUDIO_MODELS_DIR / self.repository
|
| 248 |
-
storage_path.mkdir(parents=True, exist_ok=True)
|
| 249 |
-
return storage_path
|
| 250 |
-
|
| 251 |
-
@property
|
| 252 |
-
def is_model_complete(self) -> bool:
|
| 253 |
-
"""检查模型文件是否完整"""
|
| 254 |
-
for model_file in self.model_files.values():
|
| 255 |
-
file_path = self.model_storage_path / model_file
|
| 256 |
-
if not file_path.exists():
|
| 257 |
-
return False
|
| 258 |
-
return True
|
| 259 |
-
|
| 260 |
-
def download_model(self, progress_callback: typing.Callable = None):
|
| 261 |
-
"""下载模型"""
|
| 262 |
-
self.download_status = VoiceModelStatus.DOWNLOADING
|
| 263 |
-
|
| 264 |
-
try:
|
| 265 |
-
self._download_model_files(progress_callback)
|
| 266 |
-
self.download_status = VoiceModelStatus.DOWNLOADED
|
| 267 |
-
except Exception:
|
| 268 |
-
self.download_status = VoiceModelStatus.FAILED
|
| 269 |
-
raise
|
| 270 |
-
|
| 271 |
-
def _download_model_files(self, progress_callback: typing.Callable = None):
|
| 272 |
-
"""从HuggingFace下载模型文件"""
|
| 273 |
-
with ThreadPoolExecutor() as executor:
|
| 274 |
-
for model_file in self.model_files.values():
|
| 275 |
-
executor.submit(
|
| 276 |
-
download_file_from_huggingface,
|
| 277 |
-
self.model_storage_path,
|
| 278 |
-
self.repository,
|
| 279 |
-
model_file
|
| 280 |
-
)
|
| 281 |
-
|
| 282 |
-
if progress_callback:
|
| 283 |
-
progress_callback()
|
| 284 |
-
|
| 285 |
-
def delete_model(self):
|
| 286 |
-
"""删除模型核心文件"""
|
| 287 |
-
core_files = ['gpt-weights', 'sovits-weights']
|
| 288 |
-
for file_key in core_files:
|
| 289 |
-
file_path = self.model_storage_path / self.model_files.get(file_key, '')
|
| 290 |
-
if file_path.is_file():
|
| 291 |
-
file_path.unlink()
|
| 292 |
-
elif file_path.is_dir():
|
| 293 |
-
file_path.rmdir()
|
| 294 |
-
self.download_status = VoiceModelStatus.NOT_DOWNLOADED
|
| 295 |
-
|
| 296 |
-
# 模型文件路径属性
|
| 297 |
-
@property
|
| 298 |
-
def gpt_weights_path(self) -> Path:
|
| 299 |
-
"""GPT权重文件路径"""
|
| 300 |
-
return self.model_storage_path / self.model_files.get('gpt-weights', '')
|
| 301 |
-
|
| 302 |
-
@property
|
| 303 |
-
def sovits_weights_path(self) -> Path:
|
| 304 |
-
"""SoVITS权重文件路径"""
|
| 305 |
-
return self.model_storage_path / self.model_files.get('sovits-weights', '')
|
| 306 |
-
|
| 307 |
-
@property
|
| 308 |
-
def hubert_model_path(self) -> Path:
|
| 309 |
-
"""中文HuBERT模型路径"""
|
| 310 |
-
return self.model_storage_path / 'chinese-hubert-base'
|
| 311 |
-
|
| 312 |
-
@property
|
| 313 |
-
def bert_model_path(self) -> Path:
|
| 314 |
-
"""中文BERT模型路径"""
|
| 315 |
-
return self.model_storage_path / 'chinese-roberta-wwm-ext-large'
|
| 316 |
-
|
| 317 |
-
@property
|
| 318 |
-
def reference_audio_path(self) -> Path:
|
| 319 |
-
"""参考音频文件路径"""
|
| 320 |
-
return self.model_storage_path / self.model_files.get('reference_audio', '')
|
| 321 |
-
|
| 322 |
-
@property
|
| 323 |
-
def prompt_semantic_path(self) -> Path:
|
| 324 |
-
"""提示语义文件路径"""
|
| 325 |
-
return self.model_storage_path / self.model_files.get('prompt_semantic', '')
|
| 326 |
-
|
| 327 |
-
@property
|
| 328 |
-
def reference_spec_path(self) -> Path:
|
| 329 |
-
"""参考频谱文件路径"""
|
| 330 |
-
return self.model_storage_path / self.model_files.get('reference_spec', '')
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
class VoiceModelRegistry:
|
| 334 |
-
"""声音模型注册表"""
|
| 335 |
-
_registered_models: dict[str, VoiceModel] = {}
|
| 336 |
-
|
| 337 |
-
@classmethod
|
| 338 |
-
def register_models(cls, model_configs: list[dict]) -> list[VoiceModel]:
|
| 339 |
-
"""从配置注册模型"""
|
| 340 |
-
registered_models = []
|
| 341 |
-
|
| 342 |
-
for config in model_configs:
|
| 343 |
-
repository = config.get('repository', '')
|
| 344 |
-
character_name = config.get('character_name', '')
|
| 345 |
-
model_key = f'{repository}:{character_name}'
|
| 346 |
-
|
| 347 |
-
voice_model = VoiceModel(**config)
|
| 348 |
-
cls._registered_models[model_key] = voice_model
|
| 349 |
-
registered_models.append(voice_model)
|
| 350 |
-
|
| 351 |
-
return registered_models
|
| 352 |
-
|
| 353 |
-
@classmethod
|
| 354 |
-
def get_model(cls, repository: str, character_name: str) -> VoiceModel:
|
| 355 |
-
"""获取指定模型"""
|
| 356 |
-
model_key = f'{repository}:{character_name}'
|
| 357 |
-
return cls._registered_models.get(model_key)
|
| 358 |
-
|
| 359 |
-
@classmethod
|
| 360 |
-
def get_all_models(cls) -> list[VoiceModel]:
|
| 361 |
-
"""获取所有注册的模型"""
|
| 362 |
-
return list(cls._registered_models.values())
|
| 363 |
-
|
| 364 |
-
@classmethod
|
| 365 |
-
def get_version(cls) -> str:
|
| 366 |
-
"""获取模型版本"""
|
| 367 |
-
return 'v2'
|
| 368 |
|
| 369 |
|
| 370 |
-
|
| 371 |
-
|
|
|
|
|
|
| 1 |
+
from .moyoyo_tts import MoYoYoTTSConfig
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
# 基础预训练模型文件映射
|
| 4 |
BASE_PRETRAINED_FILES = {
|
|
|
|
| 10 |
'chinese-roberta-wwm-ext-large/tokenizer.json': 'chinese-roberta-wwm-ext-large/tokenizer.json',
|
| 11 |
}
|
| 12 |
|
| 13 |
+
# MoYoYo TTS配置列表
|
| 14 |
+
MOYOYO_TTS_CONFIGS = [
|
| 15 |
{
|
| 16 |
'repository': 'MoYoYoTech/tone-models',
|
| 17 |
'character_name': 'Luo Xiang',
|
|
|
|
| 176 |
'inference_parameters': {
|
| 177 |
'text_lang': "zh",
|
| 178 |
'prompt_text': "这是我们最大的希望能招聘的到人。所以今天阿里巴巴公司内部,我自己这么觉得,人才梯队的建设非常之好。",
|
|
|
|
| 179 |
'prompt_lang': "zh",
|
| 180 |
'top_k': 5,
|
| 181 |
'top_p': 1,
|
|
|
|
| 189 |
'seed': 233333,
|
| 190 |
},
|
| 191 |
},
|
| 192 |
+
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 193 |
|
| 194 |
|
| 195 |
+
def get_moyoyo_configs() -> list[MoYoYoTTSConfig]:
|
| 196 |
+
"""获取所有MoYoYo TTS配置"""
|
| 197 |
+
return [MoYoYoTTSConfig(**config) for config in MOYOYO_TTS_CONFIGS]
|
src/VoiceDialogue/models/voice_model/moyoyo_tts.py
ADDED
|
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import typing
|
| 2 |
+
from concurrent.futures.thread import ThreadPoolExecutor
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
from pydantic import BaseModel, Field
|
| 6 |
+
|
| 7 |
+
from config.settings import settings
|
| 8 |
+
from utils.download_utils import download_file_from_huggingface
|
| 9 |
+
from .base import BaseTTSConfig, BaseTTSInference, TTSConfigType, VoiceModelStatus
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class InferenceParameters(BaseModel):
|
| 13 |
+
"""TTS推理参数类"""
|
| 14 |
+
text_lang: str = Field(default="zh", description="文本语言")
|
| 15 |
+
prompt_text: str = Field(default="", description="提示文本")
|
| 16 |
+
prompt_lang: str = Field(default="zh", description="提示语言")
|
| 17 |
+
top_k: int = Field(default=5, ge=1, le=100, description="Top-K采样")
|
| 18 |
+
top_p: float = Field(default=1.0, ge=0.0, le=1.0, description="Top-P采样")
|
| 19 |
+
temperature: float = Field(default=1.0, ge=0.0, description="温度参数")
|
| 20 |
+
text_split_method: str = Field(default="cut3", description="文本分割方法")
|
| 21 |
+
batch_size: int = Field(default=100, ge=1, description="批处理大小")
|
| 22 |
+
speed_factor: float = Field(default=1.1, ge=0.1, le=3.0, description="语速因子")
|
| 23 |
+
split_bucket: bool = Field(default=True, description="是否分桶")
|
| 24 |
+
return_fragment: bool = Field(default=False, description="是否返回片段")
|
| 25 |
+
fragment_interval: float = Field(default=0.07, ge=0.0, description="片段间隔")
|
| 26 |
+
seed: int = Field(default=233333, description="随机种子")
|
| 27 |
+
# parallel_infer: bool = Field(default=False, description="是否并行推理")
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class MoYoYoTTSConfig(BaseTTSConfig):
|
| 31 |
+
"""MoYoYo TTS配置类"""
|
| 32 |
+
tts_type: TTSConfigType = TTSConfigType.MOYOYO
|
| 33 |
+
repository: str
|
| 34 |
+
model_files: dict[str, str]
|
| 35 |
+
inference_parameters: InferenceParameters
|
| 36 |
+
|
| 37 |
+
_download_status: VoiceModelStatus = VoiceModelStatus.NOT_DOWNLOADED
|
| 38 |
+
|
| 39 |
+
@property
|
| 40 |
+
def download_status(self) -> VoiceModelStatus:
|
| 41 |
+
"""获取下载状态"""
|
| 42 |
+
if self.is_model_complete():
|
| 43 |
+
return VoiceModelStatus.DOWNLOADED
|
| 44 |
+
return self._download_status
|
| 45 |
+
|
| 46 |
+
@download_status.setter
|
| 47 |
+
def download_status(self, status: VoiceModelStatus):
|
| 48 |
+
"""设置下载状态"""
|
| 49 |
+
self._download_status = status
|
| 50 |
+
|
| 51 |
+
def get_model_storage_path(self) -> Path:
|
| 52 |
+
"""获取模型存储路径"""
|
| 53 |
+
storage_path = settings.paths.AUDIO_MODELS_DIR / self.repository
|
| 54 |
+
storage_path.mkdir(parents=True, exist_ok=True)
|
| 55 |
+
return storage_path
|
| 56 |
+
|
| 57 |
+
def is_model_complete(self) -> bool:
|
| 58 |
+
"""检查模型文件是否完整"""
|
| 59 |
+
storage_path = self.get_model_storage_path()
|
| 60 |
+
for model_file in self.model_files.values():
|
| 61 |
+
file_path = storage_path / model_file
|
| 62 |
+
if not file_path.exists():
|
| 63 |
+
return False
|
| 64 |
+
return True
|
| 65 |
+
|
| 66 |
+
def download_model(self, progress_callback: typing.Callable = None):
|
| 67 |
+
"""下载模型"""
|
| 68 |
+
self.download_status = VoiceModelStatus.DOWNLOADING
|
| 69 |
+
|
| 70 |
+
try:
|
| 71 |
+
self._download_model_files(progress_callback)
|
| 72 |
+
self.download_status = VoiceModelStatus.DOWNLOADED
|
| 73 |
+
except Exception:
|
| 74 |
+
self.download_status = VoiceModelStatus.FAILED
|
| 75 |
+
raise
|
| 76 |
+
|
| 77 |
+
def _download_model_files(self, progress_callback: typing.Callable = None):
|
| 78 |
+
"""从HuggingFace下载模型文件"""
|
| 79 |
+
storage_path = self.get_model_storage_path()
|
| 80 |
+
with ThreadPoolExecutor() as executor:
|
| 81 |
+
for model_file in self.model_files.values():
|
| 82 |
+
executor.submit(
|
| 83 |
+
download_file_from_huggingface,
|
| 84 |
+
storage_path,
|
| 85 |
+
self.repository,
|
| 86 |
+
model_file
|
| 87 |
+
)
|
| 88 |
+
|
| 89 |
+
if progress_callback:
|
| 90 |
+
progress_callback()
|
| 91 |
+
|
| 92 |
+
def delete_model(self):
|
| 93 |
+
"""删除模型核心文件"""
|
| 94 |
+
storage_path = self.get_model_storage_path()
|
| 95 |
+
core_files = ['gpt-weights', 'sovits-weights']
|
| 96 |
+
for file_key in core_files:
|
| 97 |
+
file_path = storage_path / self.model_files.get(file_key, '')
|
| 98 |
+
if file_path.is_file():
|
| 99 |
+
file_path.unlink()
|
| 100 |
+
elif file_path.is_dir():
|
| 101 |
+
file_path.rmdir()
|
| 102 |
+
self.download_status = VoiceModelStatus.NOT_DOWNLOADED
|
| 103 |
+
|
| 104 |
+
# 模型文件路径属性
|
| 105 |
+
@property
|
| 106 |
+
def gpt_weights_path(self) -> Path:
|
| 107 |
+
"""GPT权重文件路径"""
|
| 108 |
+
return self.get_model_storage_path() / self.model_files.get('gpt-weights', '')
|
| 109 |
+
|
| 110 |
+
@property
|
| 111 |
+
def sovits_weights_path(self) -> Path:
|
| 112 |
+
"""SoVITS权重文件路径"""
|
| 113 |
+
return self.get_model_storage_path() / self.model_files.get('sovits-weights', '')
|
| 114 |
+
|
| 115 |
+
@property
|
| 116 |
+
def hubert_model_path(self) -> Path:
|
| 117 |
+
"""中文HuBERT模型路径"""
|
| 118 |
+
return self.get_model_storage_path() / 'chinese-hubert-base'
|
| 119 |
+
|
| 120 |
+
@property
|
| 121 |
+
def bert_model_path(self) -> Path:
|
| 122 |
+
"""中文BERT模型路径"""
|
| 123 |
+
return self.get_model_storage_path() / 'chinese-roberta-wwm-ext-large'
|
| 124 |
+
|
| 125 |
+
@property
|
| 126 |
+
def reference_audio_path(self) -> Path:
|
| 127 |
+
"""参考音频文件路径"""
|
| 128 |
+
return self.get_model_storage_path() / self.model_files.get('reference_audio', '')
|
| 129 |
+
|
| 130 |
+
def get_runtime_config(self) -> typing.Dict[str, typing.Any]:
|
| 131 |
+
"""获取Moyoyo运行时配置"""
|
| 132 |
+
return {
|
| 133 |
+
'default_v2': {
|
| 134 |
+
'version': 'v2',
|
| 135 |
+
'device': 'cpu',
|
| 136 |
+
'is_half': False,
|
| 137 |
+
't2s_weights_path': self.gpt_weights_path,
|
| 138 |
+
'vits_weights_path': self.sovits_weights_path,
|
| 139 |
+
'cnhuhbert_base_path': self.hubert_model_path,
|
| 140 |
+
'bert_base_path': self.bert_model_path,
|
| 141 |
+
}
|
| 142 |
+
}
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
class MoYoYoTTSInference(BaseTTSInference):
|
| 146 |
+
"""MoYoYo TTS推理引擎"""
|
| 147 |
+
|
| 148 |
+
def generate_speech(self, text: str, config: BaseTTSConfig, **kwargs) -> bytes:
|
| 149 |
+
"""生成语音"""
|
| 150 |
+
if not isinstance(config, MoYoYoTTSConfig):
|
| 151 |
+
raise ValueError("配置类型不匹配,需要MoYoYoTTSConfig")
|
| 152 |
+
|
| 153 |
+
# 这里实现MoYoYo TTS的具体推理逻辑
|
| 154 |
+
# 暂时返回空字节,实际实现需要调用相应的TTS模型
|
| 155 |
+
return b""
|
| 156 |
+
|
| 157 |
+
def is_supported_config(self, config: BaseTTSConfig) -> bool:
|
| 158 |
+
"""检查是否支持此配置"""
|
| 159 |
+
return isinstance(config, MoYoYoTTSConfig)
|
src/VoiceDialogue/services/audio/audio_answer.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
import time
|
|
|
|
| 2 |
from multiprocessing import Queue
|
| 3 |
from queue import Empty
|
| 4 |
|
|
@@ -8,39 +9,27 @@ load_third_party()
|
|
| 8 |
|
| 9 |
from moyoyo_tts import TTSModule, TTS_Config
|
| 10 |
|
| 11 |
-
from models.voice_model import VoiceModel
|
| 12 |
from models.voice_task import VoiceTask
|
| 13 |
from services.core.base import BaseThread
|
| 14 |
from services.core.constants import dropped_audio_cache, user_still_speaking_event, voice_state_manager
|
|
|
|
| 15 |
|
| 16 |
|
| 17 |
class TTSAudioGenerator(BaseThread):
|
| 18 |
"""TTS 音频生成器 - 负责将文本转换为音频"""
|
| 19 |
|
| 20 |
def __init__(self, group=None, target=None, name=None, args=(), kwargs={}, *, daemon=None,
|
| 21 |
-
processed_answer_queue, tts_generated_audio_queue, voice_role:
|
| 22 |
super().__init__(group, target, name, args, kwargs, daemon=daemon)
|
| 23 |
self.processed_answer_queue: Queue = processed_answer_queue
|
| 24 |
self.tts_generated_audio_queue: Queue = tts_generated_audio_queue
|
| 25 |
|
| 26 |
self._device = "cpu" # mps slower 11.66(cpu) vs 39.42(mps)
|
| 27 |
-
self.
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
'version': 'v2',
|
| 33 |
-
'device': f'{device}',
|
| 34 |
-
'is_half': False,
|
| 35 |
-
't2s_weights_path': voice_role.gpt_weights_path,
|
| 36 |
-
'vits_weights_path': voice_role.sovits_weights_path,
|
| 37 |
-
'cnhuhbert_base_path': voice_role.hubert_model_path,
|
| 38 |
-
'bert_base_path': voice_role.bert_model_path,
|
| 39 |
-
# 'prompt_semantic_path': voice_role.prompt_semantic_path,
|
| 40 |
-
# 'refer_spec_path': voice_role.reference_spec_path,
|
| 41 |
-
}
|
| 42 |
-
}
|
| 43 |
-
tts_config = TTS_Config(config)
|
| 44 |
return tts_config
|
| 45 |
|
| 46 |
def warmup(self, warmup_steps=1):
|
|
@@ -53,13 +42,13 @@ class TTSAudioGenerator(BaseThread):
|
|
| 53 |
|
| 54 |
def run(self):
|
| 55 |
|
| 56 |
-
tts_config = self.setup_tts_config(self.
|
| 57 |
|
| 58 |
self.tts_module = TTSModule(tts_config)
|
| 59 |
self.tts_module.setup_inference_params(
|
| 60 |
-
ref_audio=self.
|
| 61 |
parallel_infer=False,
|
| 62 |
-
**self.
|
| 63 |
)
|
| 64 |
self.warmup()
|
| 65 |
|
|
|
|
| 1 |
import time
|
| 2 |
+
import typing
|
| 3 |
from multiprocessing import Queue
|
| 4 |
from queue import Empty
|
| 5 |
|
|
|
|
| 9 |
|
| 10 |
from moyoyo_tts import TTSModule, TTS_Config
|
| 11 |
|
|
|
|
| 12 |
from models.voice_task import VoiceTask
|
| 13 |
from services.core.base import BaseThread
|
| 14 |
from services.core.constants import dropped_audio_cache, user_still_speaking_event, voice_state_manager
|
| 15 |
+
from models.voice_model import MoYoYoTTSConfig
|
| 16 |
|
| 17 |
|
| 18 |
class TTSAudioGenerator(BaseThread):
|
| 19 |
"""TTS 音频生成器 - 负责将文本转换为音频"""
|
| 20 |
|
| 21 |
def __init__(self, group=None, target=None, name=None, args=(), kwargs={}, *, daemon=None,
|
| 22 |
+
processed_answer_queue, tts_generated_audio_queue, voice_role: MoYoYoTTSConfig):
|
| 23 |
super().__init__(group, target, name, args, kwargs, daemon=daemon)
|
| 24 |
self.processed_answer_queue: Queue = processed_answer_queue
|
| 25 |
self.tts_generated_audio_queue: Queue = tts_generated_audio_queue
|
| 26 |
|
| 27 |
self._device = "cpu" # mps slower 11.66(cpu) vs 39.42(mps)
|
| 28 |
+
self._tts_config = voice_role
|
| 29 |
+
self.tts_module: typing.Optional[TTSModule] = None
|
| 30 |
+
|
| 31 |
+
def setup_tts_config(self, voice_role: MoYoYoTTSConfig):
|
| 32 |
+
tts_config = TTS_Config(voice_role.get_runtime_config())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
return tts_config
|
| 34 |
|
| 35 |
def warmup(self, warmup_steps=1):
|
|
|
|
| 42 |
|
| 43 |
def run(self):
|
| 44 |
|
| 45 |
+
tts_config = self.setup_tts_config(self._tts_config)
|
| 46 |
|
| 47 |
self.tts_module = TTSModule(tts_config)
|
| 48 |
self.tts_module.setup_inference_params(
|
| 49 |
+
ref_audio=self._tts_config.reference_audio_path,
|
| 50 |
parallel_infer=False,
|
| 51 |
+
**self._tts_config.inference_parameters.model_dump()
|
| 52 |
)
|
| 53 |
self.warmup()
|
| 54 |
|