liumaolin committed on
Commit
025ca3f
·
1 Parent(s): 7b86866

Refactor voice model structure: extract MoYoYo-specific configurations and introduce universal TTS registry.

Browse files
src/VoiceDialogue/main.py CHANGED
@@ -7,7 +7,7 @@ from config.paths import load_third_party
7
 
8
  load_third_party()
9
 
10
- from models.voice_model import voice_model_registry
11
  from services.audio.aec_audio_capture import EchoCancellingAudioCapture
12
  from services.audio.audio_answer import TTSAudioGenerator
13
  from services.audio.audio_player import AudioStreamPlayer
@@ -21,7 +21,7 @@ language: typing.Literal['zh', 'en'] = 'en'
21
 
22
  def launch_system(
23
  user_language: str,
24
- tts_speaker: str
25
  ):
26
  audio_frames_queue = Queue()
27
  user_voice_queue = Queue()
@@ -58,21 +58,22 @@ def launch_system(
58
  threads.append(answer_generator_worker)
59
 
60
  speaker_mapping = {
61
- '罗翔': 0,
62
- '马保国': 1,
63
- '沈逸': 2,
64
- '杨幂': 3,
65
- '周杰伦': 4,
66
- '马云': 5,
67
  }
68
- speaker = tts_speaker
69
- index = speaker_mapping.get(speaker, 0)
70
- supported_audio_model = voice_model_registry[index]
71
- supported_audio_model.download_model()
 
72
  audio_generator_worker = TTSAudioGenerator(
73
  processed_answer_queue=generated_answer_queue,
74
  tts_generated_audio_queue=tts_generated_audio_queue,
75
- voice_role=supported_audio_model
76
  )
77
  audio_generator_worker.start()
78
  threads.append(audio_generator_worker)
 
7
 
8
  load_third_party()
9
 
10
+ from models.voice_model import tts_config_registry, TTSConfigType
11
  from services.audio.aec_audio_capture import EchoCancellingAudioCapture
12
  from services.audio.audio_answer import TTSAudioGenerator
13
  from services.audio.audio_player import AudioStreamPlayer
 
21
 
22
  def launch_system(
23
  user_language: str,
24
+ speaker: str
25
  ):
26
  audio_frames_queue = Queue()
27
  user_voice_queue = Queue()
 
58
  threads.append(answer_generator_worker)
59
 
60
  speaker_mapping = {
61
+ '罗翔': 'Luo Xiang',
62
+ '马保国': 'Ma Baoguo',
63
+ '沈逸': 'Shen Yi',
64
+ '杨幂': 'Yang Mi',
65
+ '周杰伦': 'Jay Zhou',
66
+ '马云': 'Ma Yun',
67
  }
68
+ role = speaker_mapping.get(speaker)
69
+ if role is None:
70
+ raise ValueError(f"不支持的TTS配置: {speaker}")
71
+
72
+ tts_speaker_config = tts_config_registry.get_config(TTSConfigType.MOYOYO, role)
73
  audio_generator_worker = TTSAudioGenerator(
74
  processed_answer_queue=generated_answer_queue,
75
  tts_generated_audio_queue=tts_generated_audio_queue,
76
+ voice_role=tts_speaker_config
77
  )
78
  audio_generator_worker.start()
79
  threads.append(audio_generator_worker)
src/VoiceDialogue/models/voice_model/__init__.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .base import TTSConfigType, VoiceModelStatus, tts_config_registry
2
+ from .moyoyo_configs import get_moyoyo_configs
3
+ from .moyoyo_tts import MoYoYoTTSConfig, MoYoYoTTSInference
4
+
5
# Register the MoYoYo TTS inference engine with the global registry.
moyoyo_inference = MoYoYoTTSInference()
tts_config_registry.register_inference_engine(TTSConfigType.MOYOYO, moyoyo_inference)

# Register every MoYoYo voice configuration returned by get_moyoyo_configs().
for config in get_moyoyo_configs():
    tts_config_registry.register_config(config)

# Names re-exported as the public API of this package.
__all__ = [
    'TTSConfigType',
    'VoiceModelStatus',
    'tts_config_registry',
    'MoYoYoTTSConfig',
    'MoYoYoTTSInference',
]
src/VoiceDialogue/models/voice_model/base.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import typing
2
+ from abc import ABC, abstractmethod
3
+ from enum import Enum
4
+ from pathlib import Path
5
+
6
+ from pydantic import BaseModel
7
+
8
+
9
class TTSConfigType(Enum):
    """Identifiers of the TTS engines a configuration can belong to."""
    MOYOYO = "moyoyo"
    EDGE_TTS = "edge_tts"
    BARK = "bark"
    # Further TTS engines can be added here.
15
+
16
+
17
class VoiceModelStatus(Enum):
    """Download state of a voice model."""
    NOT_DOWNLOADED = "not_downloaded"
    DOWNLOADING = "downloading"
    DOWNLOADED = "downloaded"
    FAILED = "failed"
23
+
24
+
25
class BaseTTSConfig(BaseModel, ABC):
    """Abstract base for the configuration of a single TTS voice.

    Declares the descriptive fields shared by every voice and the model
    life-cycle operations each concrete configuration has to implement.
    """
    # tts_type.value and character_name together form the key under which
    # TTSConfigRegistry stores a configuration.
    tts_type: TTSConfigType
    character_name: str
    cover_image: str
    description: str
    file_size: str
    is_chinese_voice: bool

    @abstractmethod
    def get_model_storage_path(self) -> Path:
        """Return the path under which the model is stored."""

    @abstractmethod
    def is_model_complete(self) -> bool:
        """Report whether the model files are complete."""

    @abstractmethod
    def download_model(self, progress_callback: typing.Callable = None):
        """Download the model; ``progress_callback`` is an optional callable."""

    @abstractmethod
    def delete_model(self):
        """Delete the model."""
53
+
54
+
55
class BaseTTSInference(ABC):
    """Abstract interface implemented by every TTS inference engine."""

    @abstractmethod
    def is_supported_config(self, config: BaseTTSConfig) -> bool:
        """Report whether this engine can work with ``config``."""

    @abstractmethod
    def generate_speech(self, text: str, config: BaseTTSConfig, **kwargs) -> bytes:
        """Generate speech for ``text`` using ``config`` and return it as bytes."""
67
+
68
+
69
class TTSConfigRegistry:
    """Registry of TTS voice configurations and TTS inference engines.

    Configurations are stored under the string key
    ``"<tts_type.value>:<character_name>"``; inference engines are stored
    per ``TTSConfigType``. Registering the same key twice replaces the
    earlier entry.
    """

    def __init__(self):
        self._configs: dict[str, BaseTTSConfig] = {}
        self._inference_engines: dict[TTSConfigType, BaseTTSInference] = {}

    @staticmethod
    def _config_key(tts_type: TTSConfigType, character_name: str) -> str:
        """Build the lookup key shared by ``register_config`` and ``get_config``."""
        return f"{tts_type.value}:{character_name}"

    def register_config(self, config: BaseTTSConfig):
        """Store ``config`` under the key derived from its type and character name."""
        self._configs[self._config_key(config.tts_type, config.character_name)] = config

    def register_inference_engine(self, tts_type: TTSConfigType, engine: BaseTTSInference):
        """Store ``engine`` as the inference engine for ``tts_type``."""
        self._inference_engines[tts_type] = engine

    def get_config(self, tts_type: TTSConfigType,
                   character_name: str) -> typing.Optional[BaseTTSConfig]:
        """Return the matching configuration, or ``None`` when none is registered."""
        return self._configs.get(self._config_key(tts_type, character_name))

    def get_configs_by_type(self, tts_type: TTSConfigType) -> list[BaseTTSConfig]:
        """Return every registered configuration whose ``tts_type`` equals ``tts_type``."""
        return [config for config in self._configs.values()
                if config.tts_type == tts_type]

    def get_all_configs(self) -> list[BaseTTSConfig]:
        """Return all registered configurations as a new list."""
        return list(self._configs.values())

    def get_inference_engine(self, tts_type: TTSConfigType) -> typing.Optional[BaseTTSInference]:
        """Return the engine registered for ``tts_type``, or ``None`` when there is none."""
        return self._inference_engines.get(tts_type)

    def generate_speech(self, tts_type: TTSConfigType, character_name: str,
                        text: str, **kwargs) -> bytes:
        """Generate speech through the engine registered for ``tts_type``.

        Args:
            tts_type: Engine type used to look up both configuration and engine.
            character_name: Character whose configuration is used.
            text: Text to synthesize.
            **kwargs: Forwarded unchanged to the engine's ``generate_speech``.

        Returns:
            Whatever the engine's ``generate_speech`` returns.

        Raises:
            ValueError: If the configuration or the engine is not registered,
                or the engine reports the configuration as unsupported.
        """
        config = self.get_config(tts_type, character_name)
        engine = self.get_inference_engine(tts_type)

        # Identity checks: a registered object that merely evaluates as falsy
        # must not be reported as missing.
        if config is None or engine is None:
            raise ValueError(f"TTS配置或引擎不存在: {tts_type.value}:{character_name}")

        if not engine.is_supported_config(config):
            raise ValueError(f"推理引擎不支持此配置: {tts_type.value}:{character_name}")

        return engine.generate_speech(text, config, **kwargs)


# Global TTS registry instance.
tts_config_registry = TTSConfigRegistry()
src/VoiceDialogue/models/{voice_model.py → voice_model/moyoyo_configs.py} RENAMED
@@ -1,12 +1,4 @@
1
- import enum
2
- import typing
3
- from concurrent.futures.thread import ThreadPoolExecutor
4
- from pathlib import Path
5
-
6
- from pydantic import BaseModel
7
-
8
- from config.settings import settings
9
- from utils.download_utils import download_file_from_huggingface
10
 
11
  # 基础预训练模型文件映射
12
  BASE_PRETRAINED_FILES = {
@@ -18,8 +10,8 @@ BASE_PRETRAINED_FILES = {
18
  'chinese-roberta-wwm-ext-large/tokenizer.json': 'chinese-roberta-wwm-ext-large/tokenizer.json',
19
  }
20
 
21
- # 声音模型配置
22
- VOICE_MODEL_CONFIGS = (
23
  {
24
  'repository': 'MoYoYoTech/tone-models',
25
  'character_name': 'Luo Xiang',
@@ -184,7 +176,6 @@ VOICE_MODEL_CONFIGS = (
184
  'inference_parameters': {
185
  'text_lang': "zh",
186
  'prompt_text': "这是我们最大的希望能招聘的到人。所以今天阿里巴巴公司内部,我自己这么觉得,人才梯队的建设非常之好。",
187
- # 'prompt_text': "",
188
  'prompt_lang': "zh",
189
  'top_k': 5,
190
  'top_p': 1,
@@ -198,174 +189,9 @@ VOICE_MODEL_CONFIGS = (
198
  'seed': 233333,
199
  },
200
  },
201
- )
202
-
203
-
204
- class VoiceModelStatus(enum.Enum):
205
- """声音模型状态枚举"""
206
- NOT_DOWNLOADED = 'not_downloaded'
207
- DOWNLOADING = 'downloading'
208
- DOWNLOADED = 'downloaded'
209
- FAILED = 'failed'
210
-
211
-
212
- class ConversationTemplates(BaseModel):
213
- """对话模板"""
214
- opening_remarks: list[str]
215
- mid_responses: list[str]
216
-
217
-
218
- class VoiceModel(BaseModel):
219
- """声音模型配置类"""
220
- repository: str
221
- character_name: str
222
- cover_image: str
223
- description: str
224
- file_size: str
225
- is_chinese_voice: bool
226
- model_files: dict[str, str]
227
- inference_parameters: dict[str, typing.Any]
228
- # conversation_templates: ConversationTemplates
229
-
230
- _download_status: VoiceModelStatus = VoiceModelStatus.NOT_DOWNLOADED
231
-
232
- @property
233
- def download_status(self) -> VoiceModelStatus:
234
- """获取下载状态"""
235
- if self.is_model_complete:
236
- return VoiceModelStatus.DOWNLOADED
237
- return self._download_status
238
-
239
- @download_status.setter
240
- def download_status(self, status: VoiceModelStatus):
241
- """设置下载状态"""
242
- self._download_status = status
243
-
244
- @property
245
- def model_storage_path(self) -> Path:
246
- """获取模型存储路径"""
247
- storage_path = settings.paths.AUDIO_MODELS_DIR / self.repository
248
- storage_path.mkdir(parents=True, exist_ok=True)
249
- return storage_path
250
-
251
- @property
252
- def is_model_complete(self) -> bool:
253
- """检查模型文件是否完整"""
254
- for model_file in self.model_files.values():
255
- file_path = self.model_storage_path / model_file
256
- if not file_path.exists():
257
- return False
258
- return True
259
-
260
- def download_model(self, progress_callback: typing.Callable = None):
261
- """下载模型"""
262
- self.download_status = VoiceModelStatus.DOWNLOADING
263
-
264
- try:
265
- self._download_model_files(progress_callback)
266
- self.download_status = VoiceModelStatus.DOWNLOADED
267
- except Exception:
268
- self.download_status = VoiceModelStatus.FAILED
269
- raise
270
-
271
- def _download_model_files(self, progress_callback: typing.Callable = None):
272
- """从HuggingFace下载模型文件"""
273
- with ThreadPoolExecutor() as executor:
274
- for model_file in self.model_files.values():
275
- executor.submit(
276
- download_file_from_huggingface,
277
- self.model_storage_path,
278
- self.repository,
279
- model_file
280
- )
281
-
282
- if progress_callback:
283
- progress_callback()
284
-
285
- def delete_model(self):
286
- """删除模型核心文件"""
287
- core_files = ['gpt-weights', 'sovits-weights']
288
- for file_key in core_files:
289
- file_path = self.model_storage_path / self.model_files.get(file_key, '')
290
- if file_path.is_file():
291
- file_path.unlink()
292
- elif file_path.is_dir():
293
- file_path.rmdir()
294
- self.download_status = VoiceModelStatus.NOT_DOWNLOADED
295
-
296
- # 模型文件路径属性
297
- @property
298
- def gpt_weights_path(self) -> Path:
299
- """GPT权重文件路径"""
300
- return self.model_storage_path / self.model_files.get('gpt-weights', '')
301
-
302
- @property
303
- def sovits_weights_path(self) -> Path:
304
- """SoVITS权重文件路径"""
305
- return self.model_storage_path / self.model_files.get('sovits-weights', '')
306
-
307
- @property
308
- def hubert_model_path(self) -> Path:
309
- """中文HuBERT模型路径"""
310
- return self.model_storage_path / 'chinese-hubert-base'
311
-
312
- @property
313
- def bert_model_path(self) -> Path:
314
- """中文BERT模型路径"""
315
- return self.model_storage_path / 'chinese-roberta-wwm-ext-large'
316
-
317
- @property
318
- def reference_audio_path(self) -> Path:
319
- """参考音频文件路径"""
320
- return self.model_storage_path / self.model_files.get('reference_audio', '')
321
-
322
- @property
323
- def prompt_semantic_path(self) -> Path:
324
- """提示语义文件路径"""
325
- return self.model_storage_path / self.model_files.get('prompt_semantic', '')
326
-
327
- @property
328
- def reference_spec_path(self) -> Path:
329
- """参考频谱文件路径"""
330
- return self.model_storage_path / self.model_files.get('reference_spec', '')
331
-
332
-
333
- class VoiceModelRegistry:
334
- """声音模型注册表"""
335
- _registered_models: dict[str, VoiceModel] = {}
336
-
337
- @classmethod
338
- def register_models(cls, model_configs: list[dict]) -> list[VoiceModel]:
339
- """从配置注册模型"""
340
- registered_models = []
341
-
342
- for config in model_configs:
343
- repository = config.get('repository', '')
344
- character_name = config.get('character_name', '')
345
- model_key = f'{repository}:{character_name}'
346
-
347
- voice_model = VoiceModel(**config)
348
- cls._registered_models[model_key] = voice_model
349
- registered_models.append(voice_model)
350
-
351
- return registered_models
352
-
353
- @classmethod
354
- def get_model(cls, repository: str, character_name: str) -> VoiceModel:
355
- """获取指定模型"""
356
- model_key = f'{repository}:{character_name}'
357
- return cls._registered_models.get(model_key)
358
-
359
- @classmethod
360
- def get_all_models(cls) -> list[VoiceModel]:
361
- """获取所有注册的模型"""
362
- return list(cls._registered_models.values())
363
-
364
- @classmethod
365
- def get_version(cls) -> str:
366
- """获取模型版本"""
367
- return 'v2'
368
 
369
 
370
- # 全局声音模型注册表实例
371
- voice_model_registry = VoiceModelRegistry.register_models(VOICE_MODEL_CONFIGS)
 
 
1
+ from .moyoyo_tts import MoYoYoTTSConfig
 
 
 
 
 
 
 
 
2
 
3
  # 基础预训练模型文件映射
4
  BASE_PRETRAINED_FILES = {
 
10
  'chinese-roberta-wwm-ext-large/tokenizer.json': 'chinese-roberta-wwm-ext-large/tokenizer.json',
11
  }
12
 
13
+ # MoYoYo TTS配置列表
14
+ MOYOYO_TTS_CONFIGS = [
15
  {
16
  'repository': 'MoYoYoTech/tone-models',
17
  'character_name': 'Luo Xiang',
 
176
  'inference_parameters': {
177
  'text_lang': "zh",
178
  'prompt_text': "这是我们最大的希望能招聘的到人。所以今天阿里巴巴公司内部,我自己这么觉得,人才梯队的建设非常之好。",
 
179
  'prompt_lang': "zh",
180
  'top_k': 5,
181
  'top_p': 1,
 
189
  'seed': 233333,
190
  },
191
  },
192
+ ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
 
194
 
195
def get_moyoyo_configs() -> list[MoYoYoTTSConfig]:
    """Build one ``MoYoYoTTSConfig`` from each entry of ``MOYOYO_TTS_CONFIGS``."""
    return [MoYoYoTTSConfig(**raw_config) for raw_config in MOYOYO_TTS_CONFIGS]
src/VoiceDialogue/models/voice_model/moyoyo_tts.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import typing
2
+ from concurrent.futures.thread import ThreadPoolExecutor
3
+ from pathlib import Path
4
+
5
+ from pydantic import BaseModel, Field
6
+
7
+ from config.settings import settings
8
+ from utils.download_utils import download_file_from_huggingface
9
+ from .base import BaseTTSConfig, BaseTTSInference, TTSConfigType, VoiceModelStatus
10
+
11
+
12
class InferenceParameters(BaseModel):
    """Inference parameters for MoYoYo TTS, with defaults and range validation."""
    text_lang: str = Field("zh", description="文本语言")
    prompt_text: str = Field("", description="提示文本")
    prompt_lang: str = Field("zh", description="提示语言")
    top_k: int = Field(5, ge=1, le=100, description="Top-K采样")
    top_p: float = Field(1.0, ge=0.0, le=1.0, description="Top-P采样")
    temperature: float = Field(1.0, ge=0.0, description="温度参数")
    text_split_method: str = Field("cut3", description="文本分割方法")
    batch_size: int = Field(100, ge=1, description="批处理大小")
    speed_factor: float = Field(1.1, ge=0.1, le=3.0, description="语速因子")
    split_bucket: bool = Field(True, description="是否分桶")
    return_fragment: bool = Field(False, description="是否返回片段")
    fragment_interval: float = Field(0.07, ge=0.0, description="片段间隔")
    seed: int = Field(233333, description="随机种子")
    # NOTE(review): parallel_infer is deliberately not a field here.
    # TTSAudioGenerator.run passes parallel_infer=False explicitly next to
    # **inference_parameters.model_dump(), so adding the field would presumably
    # produce a duplicate keyword argument - confirm before enabling.
28
+
29
+
30
class MoYoYoTTSConfig(BaseTTSConfig):
    """Configuration of one MoYoYo TTS voice.

    Extends ``BaseTTSConfig`` with the repository the files are downloaded
    from, the mapping of model files and the inference parameters, and
    implements the download / delete / completeness operations.
    """
    tts_type: TTSConfigType = TTSConfigType.MOYOYO
    repository: str
    # Values are file paths relative to the storage directory. The keys read
    # by this class are 'gpt-weights', 'sovits-weights' and 'reference_audio'.
    model_files: dict[str, str]
    inference_parameters: InferenceParameters

    # Status last recorded by download_model() / delete_model().
    _download_status: VoiceModelStatus = VoiceModelStatus.NOT_DOWNLOADED

    @property
    def download_status(self) -> VoiceModelStatus:
        """Return DOWNLOADED when all model files exist, else the recorded status."""
        if self.is_model_complete():
            return VoiceModelStatus.DOWNLOADED
        return self._download_status

    @download_status.setter
    def download_status(self, status: VoiceModelStatus):
        """Record ``status`` as the current download status."""
        self._download_status = status

    def get_model_storage_path(self) -> Path:
        """Return the storage directory for this repository, creating it if needed."""
        storage_path = settings.paths.AUDIO_MODELS_DIR / self.repository
        storage_path.mkdir(parents=True, exist_ok=True)
        return storage_path

    def is_model_complete(self) -> bool:
        """Return True when every file listed in ``model_files`` exists on disk."""
        storage_path = self.get_model_storage_path()
        return all(
            (storage_path / model_file).exists()
            for model_file in self.model_files.values()
        )

    def download_model(self, progress_callback: typing.Optional[typing.Callable] = None):
        """Download all model files and track the download status.

        Args:
            progress_callback: Optional zero-argument callable invoked once
                after all files were downloaded successfully.

        Raises:
            Exception: Any error raised while downloading is re-raised after
                the status has been set to ``FAILED``.
        """
        self.download_status = VoiceModelStatus.DOWNLOADING

        try:
            self._download_model_files(progress_callback)
            self.download_status = VoiceModelStatus.DOWNLOADED
        except Exception:
            self.download_status = VoiceModelStatus.FAILED
            raise

    def _download_model_files(self, progress_callback: typing.Optional[typing.Callable] = None):
        """Download every entry of ``model_files`` from HuggingFace in parallel.

        Raises:
            Exception: The first error raised by any of the download tasks.
        """
        storage_path = self.get_model_storage_path()
        with ThreadPoolExecutor() as executor:
            futures = [
                executor.submit(
                    download_file_from_huggingface,
                    storage_path,
                    self.repository,
                    model_file,
                )
                for model_file in self.model_files.values()
            ]

        # Leaving the ``with`` block waits for all tasks. result() re-raises a
        # task's exception, which would otherwise be dropped and let the
        # caller mark a failed download as DOWNLOADED.
        for future in futures:
            future.result()

        if progress_callback:
            progress_callback()

    def delete_model(self):
        """Delete the core weight files and reset the status to NOT_DOWNLOADED."""
        storage_path = self.get_model_storage_path()
        for file_key in ('gpt-weights', 'sovits-weights'):
            relative_path = self.model_files.get(file_key)
            if not relative_path:
                # Joining an empty name yields the storage directory itself;
                # skip so the directory is never passed to rmdir().
                continue
            file_path = storage_path / relative_path
            if file_path.is_file():
                file_path.unlink()
            elif file_path.is_dir():
                file_path.rmdir()
        self.download_status = VoiceModelStatus.NOT_DOWNLOADED

    # Model file path properties
    @property
    def gpt_weights_path(self) -> Path:
        """Path of the GPT weights file."""
        return self.get_model_storage_path() / self.model_files.get('gpt-weights', '')

    @property
    def sovits_weights_path(self) -> Path:
        """Path of the SoVITS weights file."""
        return self.get_model_storage_path() / self.model_files.get('sovits-weights', '')

    @property
    def hubert_model_path(self) -> Path:
        """Path of the Chinese HuBERT model directory."""
        return self.get_model_storage_path() / 'chinese-hubert-base'

    @property
    def bert_model_path(self) -> Path:
        """Path of the Chinese BERT model directory."""
        return self.get_model_storage_path() / 'chinese-roberta-wwm-ext-large'

    @property
    def reference_audio_path(self) -> Path:
        """Path of the reference audio file."""
        return self.get_model_storage_path() / self.model_files.get('reference_audio', '')

    def get_runtime_config(self, device: str = 'cpu') -> typing.Dict[str, typing.Any]:
        """Return the MoYoYo runtime configuration dictionary.

        Args:
            device: Value stored under ``'device'``; defaults to ``'cpu'``,
                the value that was previously hard-coded.

        Returns:
            A dict with the single key ``'default_v2'`` holding version,
            device, half-precision flag and the model paths.
        """
        return {
            'default_v2': {
                'version': 'v2',
                'device': device,
                'is_half': False,
                't2s_weights_path': self.gpt_weights_path,
                'vits_weights_path': self.sovits_weights_path,
                'cnhuhbert_base_path': self.hubert_model_path,
                'bert_base_path': self.bert_model_path,
            }
        }
143
+
144
+
145
class MoYoYoTTSInference(BaseTTSInference):
    """Inference engine for MoYoYo TTS configurations."""

    def is_supported_config(self, config: BaseTTSConfig) -> bool:
        """Return True only when ``config`` is a ``MoYoYoTTSConfig``."""
        return isinstance(config, MoYoYoTTSConfig)

    def generate_speech(self, text: str, config: BaseTTSConfig, **kwargs) -> bytes:
        """Generate speech for ``text``.

        Raises:
            ValueError: If ``config`` is not a ``MoYoYoTTSConfig``.
        """
        if isinstance(config, MoYoYoTTSConfig):
            # Placeholder: the actual MoYoYo inference is not implemented
            # here yet, so empty bytes are returned for now.
            return b""
        raise ValueError("配置类型不匹配,需要MoYoYoTTSConfig")
src/VoiceDialogue/services/audio/audio_answer.py CHANGED
@@ -1,4 +1,5 @@
1
  import time
 
2
  from multiprocessing import Queue
3
  from queue import Empty
4
 
@@ -8,39 +9,27 @@ load_third_party()
8
 
9
  from moyoyo_tts import TTSModule, TTS_Config
10
 
11
- from models.voice_model import VoiceModel
12
  from models.voice_task import VoiceTask
13
  from services.core.base import BaseThread
14
  from services.core.constants import dropped_audio_cache, user_still_speaking_event, voice_state_manager
 
15
 
16
 
17
  class TTSAudioGenerator(BaseThread):
18
  """TTS 音频生成器 - 负责将文本转换为音频"""
19
 
20
  def __init__(self, group=None, target=None, name=None, args=(), kwargs={}, *, daemon=None,
21
- processed_answer_queue, tts_generated_audio_queue, voice_role: VoiceModel):
22
  super().__init__(group, target, name, args, kwargs, daemon=daemon)
23
  self.processed_answer_queue: Queue = processed_answer_queue
24
  self.tts_generated_audio_queue: Queue = tts_generated_audio_queue
25
 
26
  self._device = "cpu" # mps slower 11.66(cpu) vs 39.42(mps)
27
- self._voice_role = voice_role
28
-
29
- def setup_tts_config(self, device, voice_role: VoiceModel):
30
- config = {
31
- 'default_v2': {
32
- 'version': 'v2',
33
- 'device': f'{device}',
34
- 'is_half': False,
35
- 't2s_weights_path': voice_role.gpt_weights_path,
36
- 'vits_weights_path': voice_role.sovits_weights_path,
37
- 'cnhuhbert_base_path': voice_role.hubert_model_path,
38
- 'bert_base_path': voice_role.bert_model_path,
39
- # 'prompt_semantic_path': voice_role.prompt_semantic_path,
40
- # 'refer_spec_path': voice_role.reference_spec_path,
41
- }
42
- }
43
- tts_config = TTS_Config(config)
44
  return tts_config
45
 
46
  def warmup(self, warmup_steps=1):
@@ -53,13 +42,13 @@ class TTSAudioGenerator(BaseThread):
53
 
54
  def run(self):
55
 
56
- tts_config = self.setup_tts_config(self._device, self._voice_role)
57
 
58
  self.tts_module = TTSModule(tts_config)
59
  self.tts_module.setup_inference_params(
60
- ref_audio=self._voice_role.reference_audio_path,
61
  parallel_infer=False,
62
- **self._voice_role.inference_parameters
63
  )
64
  self.warmup()
65
 
 
1
  import time
2
+ import typing
3
  from multiprocessing import Queue
4
  from queue import Empty
5
 
 
9
 
10
  from moyoyo_tts import TTSModule, TTS_Config
11
 
 
12
  from models.voice_task import VoiceTask
13
  from services.core.base import BaseThread
14
  from services.core.constants import dropped_audio_cache, user_still_speaking_event, voice_state_manager
15
+ from models.voice_model import MoYoYoTTSConfig
16
 
17
 
18
  class TTSAudioGenerator(BaseThread):
19
  """TTS 音频生成器 - 负责将文本转换为音频"""
20
 
21
  def __init__(self, group=None, target=None, name=None, args=(), kwargs={}, *, daemon=None,
22
+ processed_answer_queue, tts_generated_audio_queue, voice_role: MoYoYoTTSConfig):
23
  super().__init__(group, target, name, args, kwargs, daemon=daemon)
24
  self.processed_answer_queue: Queue = processed_answer_queue
25
  self.tts_generated_audio_queue: Queue = tts_generated_audio_queue
26
 
27
  self._device = "cpu" # mps slower 11.66(cpu) vs 39.42(mps)
28
+ self._tts_config = voice_role
29
+ self.tts_module: typing.Optional[TTSModule] = None
30
+
31
+ def setup_tts_config(self, voice_role: MoYoYoTTSConfig):
32
+ tts_config = TTS_Config(voice_role.get_runtime_config())
 
 
 
 
 
 
 
 
 
 
 
 
33
  return tts_config
34
 
35
  def warmup(self, warmup_steps=1):
 
42
 
43
  def run(self):
44
 
45
+ tts_config = self.setup_tts_config(self._tts_config)
46
 
47
  self.tts_module = TTSModule(tts_config)
48
  self.tts_module.setup_inference_params(
49
+ ref_audio=self._tts_config.reference_audio_path,
50
  parallel_infer=False,
51
+ **self._tts_config.inference_parameters.model_dump()
52
  )
53
  self.warmup()
54