liumaolin
commited on
Commit
·
2ebe57f
1
Parent(s):
d74bcbf
新增Mixin类以增强语音任务处理功能
Browse files- 添加`TaskStatusMixin`、`HistoryMixin`和`PerformanceLogMixin`类,提供语音任务状态检查、聊天历史更新和性能日志记录功能。
- 在`TTSAudioGenerator`和`AudioStreamPlayer`类中集成新混入类,优化任务处理逻辑。
- 更新`utils.py`,新增音频帧归一化和时长计算功能。
- src/voice_dialogue/services/audio/generator.py +35 -59
- src/voice_dialogue/services/audio/player.py +62 -101
- src/voice_dialogue/services/mixins.py +91 -0
- src/voice_dialogue/services/speech/monitor.py +5 -4
- src/voice_dialogue/services/speech/recognizer.py +1 -1
- src/voice_dialogue/services/text/generator.py +1 -1
- src/voice_dialogue/services/utils.py +37 -0
src/voice_dialogue/services/audio/generator.py
CHANGED
|
@@ -1,17 +1,17 @@
|
|
| 1 |
-
import re
|
| 2 |
import time
|
| 3 |
from multiprocessing import Queue
|
| 4 |
from queue import Empty
|
| 5 |
|
| 6 |
from voice_dialogue.core.base import BaseThread
|
| 7 |
-
from voice_dialogue.core.constants import
|
| 8 |
-
session_manager, is_debug_mode
|
| 9 |
from voice_dialogue.models.voice_task import VoiceTask
|
|
|
|
|
|
|
| 10 |
from voice_dialogue.utils.logger import logger
|
| 11 |
from .generators import tts_manager, BaseTTSConfig
|
| 12 |
|
| 13 |
|
| 14 |
-
class TTSAudioGenerator(BaseThread):
|
| 15 |
"""
|
| 16 |
TTS 音频生成器 - 负责将文本转换为音频
|
| 17 |
|
|
@@ -62,69 +62,45 @@ class TTSAudioGenerator(BaseThread):
|
|
| 62 |
|
| 63 |
while not self.is_exited:
|
| 64 |
try:
|
| 65 |
-
voice_task: VoiceTask = self.text_input_queue.get(block=
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
if not voice_task.answer_sentence:
|
| 70 |
-
continue
|
| 71 |
-
|
| 72 |
-
answer_id = voice_task.answer_id
|
| 73 |
-
if user_still_speaking_event.is_set():
|
| 74 |
-
voice_state_manager.drop_audio_task(voice_task.id)
|
| 75 |
-
dropped_audio_cache[answer_id] = answer_id
|
| 76 |
-
user_still_speaking_event.clear()
|
| 77 |
-
continue
|
| 78 |
|
| 79 |
-
|
| 80 |
-
continue
|
| 81 |
-
|
| 82 |
-
if self.is_task_interrupted(voice_task):
|
| 83 |
-
continue
|
| 84 |
-
|
| 85 |
-
if voice_task.session_id != session_manager.current_id:
|
| 86 |
-
continue
|
| 87 |
|
| 88 |
-
|
| 89 |
-
logger.info(f"跳过仅包含标点的文本: '{voice_task.answer_sentence}'")
|
| 90 |
continue
|
|
|
|
|
|
|
|
|
|
| 91 |
|
| 92 |
-
|
| 93 |
-
|
|
|
|
|
|
|
| 94 |
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
tts_generated_sentence_audio = self.tts_instance.synthesize(voice_task.answer_sentence)
|
| 98 |
-
except Exception as e:
|
| 99 |
-
logger.error(f"TTS 音频生成失败: {e}")
|
| 100 |
-
voice_state_manager.reset_task_id()
|
| 101 |
-
continue
|
| 102 |
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
# print(f'生成音频:{voice_task.answer_sentence}')
|
| 106 |
|
| 107 |
-
|
|
|
|
|
|
|
| 108 |
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
检查语音任务是否被中断
|
| 112 |
|
| 113 |
-
|
| 114 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
"""
|
| 119 |
-
return (voice_state_manager.interrupt_task_id and
|
| 120 |
-
voice_task.id != voice_state_manager.interrupt_task_id)
|
| 121 |
|
| 122 |
-
|
| 123 |
-
"""
|
| 124 |
-
检查文本是否不包含任何单词(字母、数字或中文字符)。
|
| 125 |
-
如果文本只包含标点、空格等符号,则返回 True。
|
| 126 |
-
"""
|
| 127 |
-
# 搜索任何字母、数字或中文字符
|
| 128 |
-
if re.search(r'[\u4e00-\u9fa5a-zA-Z0-9]', text):
|
| 129 |
-
return False
|
| 130 |
-
return True
|
|
|
|
|
|
|
| 1 |
import time
|
| 2 |
from multiprocessing import Queue
|
| 3 |
from queue import Empty
|
| 4 |
|
| 5 |
from voice_dialogue.core.base import BaseThread
|
| 6 |
+
from voice_dialogue.core.constants import voice_state_manager, is_debug_mode
|
|
|
|
| 7 |
from voice_dialogue.models.voice_task import VoiceTask
|
| 8 |
+
from voice_dialogue.services.mixins import TaskStatusMixin
|
| 9 |
+
from voice_dialogue.services.utils import has_no_words
|
| 10 |
from voice_dialogue.utils.logger import logger
|
| 11 |
from .generators import tts_manager, BaseTTSConfig
|
| 12 |
|
| 13 |
|
| 14 |
+
class TTSAudioGenerator(BaseThread, TaskStatusMixin):
|
| 15 |
"""
|
| 16 |
TTS 音频生成器 - 负责将文本转换为音频
|
| 17 |
|
|
|
|
| 62 |
|
| 63 |
while not self.is_exited:
|
| 64 |
try:
|
| 65 |
+
voice_task: VoiceTask = self.text_input_queue.get(block=True, timeout=1)
|
| 66 |
+
if not voice_task:
|
| 67 |
+
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
|
| 69 |
+
self._process_task(voice_task)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
|
| 71 |
+
except Empty:
|
|
|
|
| 72 |
continue
|
| 73 |
+
except Exception as e:
|
| 74 |
+
logger.error(f"TTSAudioGenerator 主循环错误: {e}")
|
| 75 |
+
time.sleep(0.1)
|
| 76 |
|
| 77 |
+
def _process_task(self, voice_task: VoiceTask):
|
| 78 |
+
"""处理单个文本到语音任务"""
|
| 79 |
+
if not voice_task.answer_sentence:
|
| 80 |
+
return
|
| 81 |
|
| 82 |
+
if self.handle_user_speaking_interruption(voice_task):
|
| 83 |
+
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
|
| 85 |
+
if not self.is_task_valid(voice_task):
|
| 86 |
+
return
|
|
|
|
| 87 |
|
| 88 |
+
if has_no_words(voice_task.answer_sentence):
|
| 89 |
+
logger.info(f"跳过仅包含标点的文本: '{voice_task.answer_sentence}'")
|
| 90 |
+
return
|
| 91 |
|
| 92 |
+
if is_debug_mode():
|
| 93 |
+
logger.info(f"TTS 音频生成: {voice_task.answer_sentence}")
|
|
|
|
| 94 |
|
| 95 |
+
voice_task.tts_start_time = time.time()
|
| 96 |
+
try:
|
| 97 |
+
tts_generated_sentence_audio = self.tts_instance.synthesize(voice_task.answer_sentence)
|
| 98 |
+
except Exception as e:
|
| 99 |
+
logger.error(f"TTS 音频生成失败: {e}")
|
| 100 |
+
voice_state_manager.reset_task_id()
|
| 101 |
+
return
|
| 102 |
|
| 103 |
+
voice_task.tts_generated_sentence_audio = tts_generated_sentence_audio
|
| 104 |
+
voice_task.tts_end_time = time.time()
|
|
|
|
|
|
|
|
|
|
| 105 |
|
| 106 |
+
self.audio_output_queue.put(voice_task)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/voice_dialogue/services/audio/player.py
CHANGED
|
@@ -1,21 +1,22 @@
|
|
| 1 |
import tempfile
|
| 2 |
-
|
| 3 |
from multiprocessing import Queue
|
| 4 |
from queue import Empty
|
|
|
|
| 5 |
|
| 6 |
import soundfile as sf
|
| 7 |
from playsound import playsound
|
| 8 |
|
| 9 |
from voice_dialogue.core.base import BaseThread
|
| 10 |
from voice_dialogue.core.constants import (
|
| 11 |
-
|
| 12 |
-
silence_over_threshold_event, session_manager, is_debug_mode
|
| 13 |
)
|
| 14 |
from voice_dialogue.models.voice_task import VoiceTask, AnswerDisplayMessage
|
|
|
|
| 15 |
from voice_dialogue.utils.logger import logger
|
| 16 |
|
| 17 |
|
| 18 |
-
class AudioStreamPlayer(BaseThread):
|
| 19 |
"""音频流播放器 - 负责播放生成的音频并管理播放状态"""
|
| 20 |
|
| 21 |
def __init__(
|
|
@@ -27,116 +28,76 @@ class AudioStreamPlayer(BaseThread):
|
|
| 27 |
self.audio_playing_queue: Queue = audio_playing_queue
|
| 28 |
self.websocket_message_queue: Queue = websocket_message_queue
|
| 29 |
|
| 30 |
-
def
|
| 31 |
-
|
|
|
|
|
|
|
| 32 |
|
|
|
|
|
|
|
|
|
|
| 33 |
while not self.is_exited:
|
|
|
|
|
|
|
| 34 |
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
|
|
|
|
|
|
|
|
|
| 38 |
continue
|
| 39 |
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
break
|
| 49 |
-
|
| 50 |
-
if self.is_task_interrupted(voice_task):
|
| 51 |
-
break
|
| 52 |
-
|
| 53 |
-
if voice_task.session_id != session_manager.current_id:
|
| 54 |
-
break
|
| 55 |
-
|
| 56 |
-
if answer_id in dropped_audio_cache:
|
| 57 |
-
# print('Drop answer audio')
|
| 58 |
-
break
|
| 59 |
-
|
| 60 |
-
if not silence_over_threshold_event.is_set():
|
| 61 |
-
continue
|
| 62 |
-
|
| 63 |
-
if self.websocket_message_queue:
|
| 64 |
-
self.websocket_message_queue.put_nowait(
|
| 65 |
-
AnswerDisplayMessage(
|
| 66 |
-
session_id=voice_task.session_id,
|
| 67 |
-
task_id=task_id,
|
| 68 |
-
answer_index=voice_task.answer_index,
|
| 69 |
-
answer=voice_task.answer_sentence,
|
| 70 |
-
)
|
| 71 |
)
|
|
|
|
| 72 |
|
| 73 |
-
|
| 74 |
-
|
| 75 |
|
| 76 |
-
|
| 77 |
|
| 78 |
-
|
| 79 |
-
|
| 80 |
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
|
| 85 |
-
|
| 86 |
-
|
| 87 |
|
| 88 |
-
|
|
|
|
|
|
|
|
|
|
| 89 |
|
| 90 |
-
def
|
|
|
|
|
|
|
|
|
|
| 91 |
"""
|
| 92 |
-
|
|
|
|
| 93 |
|
| 94 |
-
|
| 95 |
-
voice_task: 当前处理的语音任务
|
| 96 |
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
audio_duration = librosa.get_duration(y=audio_data, sr=sample_rate)
|
| 110 |
-
logger.info(
|
| 111 |
-
f"\n"
|
| 112 |
-
f"┌───────────────────────── 任务信息 ───────────────────────┐\n"
|
| 113 |
-
f"│ 任务ID: {voice_task.id}\n"
|
| 114 |
-
f"├───────────────────────── 性能统计 ────────────────────────┤\n"
|
| 115 |
-
f"│ ASR 耗时: {asr_duration:.2f}s\n"
|
| 116 |
-
f"│ LLM 耗时: {llm_duration:.2f}s\n"
|
| 117 |
-
f"│ TTS 耗时: {tts_duration:.2f}s\n"
|
| 118 |
-
f"│ 音频长度: {audio_duration:.2f}s\n"
|
| 119 |
-
f"├───────────────────────── 生成内容 ────────────────────────┤\n"
|
| 120 |
-
f"│-> {voice_task.answer_sentence}\n"
|
| 121 |
-
f"└──────────────────────────────────────────────────────────┘"
|
| 122 |
-
)
|
| 123 |
-
|
| 124 |
-
def update_chat_history(self, voice_task):
|
| 125 |
-
chat_history = chat_history_cache.get(voice_task.session_id, OrderedDict())
|
| 126 |
-
task_answer_id = voice_task.answer_id
|
| 127 |
-
user_question = f'{task_answer_id}:human'
|
| 128 |
-
chat_history[user_question] = voice_task.transcribed_text
|
| 129 |
-
|
| 130 |
-
ai_answer = f'{task_answer_id}:ai'
|
| 131 |
-
cached_ai_answer = chat_history.get(ai_answer, [])
|
| 132 |
-
cached_ai_answer.append(voice_task.answer_sentence)
|
| 133 |
-
chat_history[ai_answer] = cached_ai_answer
|
| 134 |
-
|
| 135 |
-
chat_history_cache[voice_task.session_id] = chat_history
|
| 136 |
-
|
| 137 |
-
def playing_audio(self, audio_data, sample_rate=16000):
|
| 138 |
-
with tempfile.NamedTemporaryFile('w+b', suffix='.wav') as soundfile:
|
| 139 |
-
# print(f'================soundfile : {soundfile.name}')
|
| 140 |
-
sf.write(soundfile, audio_data, samplerate=sample_rate, subtype='PCM_16', closefd=False)
|
| 141 |
-
# print(soundfile.name)
|
| 142 |
-
playsound(soundfile.name, block=True)
|
|
|
|
| 1 |
import tempfile
|
| 2 |
+
import time
|
| 3 |
from multiprocessing import Queue
|
| 4 |
from queue import Empty
|
| 5 |
+
from typing import Optional
|
| 6 |
|
| 7 |
import soundfile as sf
|
| 8 |
from playsound import playsound
|
| 9 |
|
| 10 |
from voice_dialogue.core.base import BaseThread
|
| 11 |
from voice_dialogue.core.constants import (
|
| 12 |
+
voice_state_manager, silence_over_threshold_event, is_debug_mode
|
|
|
|
| 13 |
)
|
| 14 |
from voice_dialogue.models.voice_task import VoiceTask, AnswerDisplayMessage
|
| 15 |
+
from voice_dialogue.services.mixins import TaskStatusMixin, HistoryMixin, PerformanceLogMixin
|
| 16 |
from voice_dialogue.utils.logger import logger
|
| 17 |
|
| 18 |
|
| 19 |
+
class AudioStreamPlayer(BaseThread, TaskStatusMixin, HistoryMixin, PerformanceLogMixin):
|
| 20 |
"""音频流播放器 - 负责播放生成的音频并管理播放状态"""
|
| 21 |
|
| 22 |
def __init__(
|
|
|
|
| 28 |
self.audio_playing_queue: Queue = audio_playing_queue
|
| 29 |
self.websocket_message_queue: Queue = websocket_message_queue
|
| 30 |
|
| 31 |
+
def _get_task_from_queue(self) -> Optional[VoiceTask]:
|
| 32 |
+
"""从音频播放队列中获取任务。"""
|
| 33 |
+
# 使用阻塞式获取,当队列为空时,run循环中的Empty异常会处理它
|
| 34 |
+
return self.audio_playing_queue.get(block=True, timeout=1)
|
| 35 |
|
| 36 |
+
def _process_task(self, voice_task: VoiceTask):
|
| 37 |
+
"""处理单个音频播放任务。"""
|
| 38 |
+
# 这个内部循环用于等待一个外部事件(用户静音),同时检查任务是否被中断
|
| 39 |
while not self.is_exited:
|
| 40 |
+
if self.handle_user_speaking_interruption(voice_task):
|
| 41 |
+
return # 任务被中断,结束处理
|
| 42 |
|
| 43 |
+
if not self.is_task_valid(voice_task):
|
| 44 |
+
return # 任务无效,结束处理
|
| 45 |
+
|
| 46 |
+
# 等待用户彻底静音的信号
|
| 47 |
+
if not silence_over_threshold_event.is_set():
|
| 48 |
+
time.sleep(0.05) # 短暂等待,避免CPU空转
|
| 49 |
continue
|
| 50 |
|
| 51 |
+
# --- 开始播放逻辑 ---
|
| 52 |
+
if self.websocket_message_queue:
|
| 53 |
+
self.websocket_message_queue.put_nowait(
|
| 54 |
+
AnswerDisplayMessage(
|
| 55 |
+
session_id=voice_task.session_id,
|
| 56 |
+
task_id=voice_task.id,
|
| 57 |
+
answer_index=voice_task.answer_index,
|
| 58 |
+
answer=voice_task.answer_sentence,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
)
|
| 60 |
+
)
|
| 61 |
|
| 62 |
+
if is_debug_mode():
|
| 63 |
+
self.log_task_performance(voice_task, "音频播放")
|
| 64 |
|
| 65 |
+
self.update_chat_history(voice_task)
|
| 66 |
|
| 67 |
+
voice_state_manager.set_audio_playing(voice_task.id)
|
| 68 |
+
voice_state_manager.reset_task_id()
|
| 69 |
|
| 70 |
+
if not self.is_stopped:
|
| 71 |
+
audio_data, sample_rate = voice_task.tts_generated_sentence_audio
|
| 72 |
+
self._play_audio(audio_data, sample_rate)
|
| 73 |
|
| 74 |
+
# 任务处理完毕,跳出内部循环
|
| 75 |
+
break
|
| 76 |
|
| 77 |
+
def _play_audio(self, audio_data, sample_rate=16000):
|
| 78 |
+
with tempfile.NamedTemporaryFile('w+b', suffix='.wav') as soundfile:
|
| 79 |
+
sf.write(soundfile, audio_data, samplerate=sample_rate, subtype='PCM_16', closefd=False)
|
| 80 |
+
playsound(soundfile.name, block=True)
|
| 81 |
|
| 82 |
+
def run(self):
|
| 83 |
+
"""
|
| 84 |
+
主运行循环。
|
| 85 |
+
不断从队列获取任务,并调用_process_task进行处理。
|
| 86 |
"""
|
| 87 |
+
if not hasattr(self, 'is_ready'):
|
| 88 |
+
logger.warning(f"{self.__class__.__name__} 中缺少 'is_ready' 属性。")
|
| 89 |
|
| 90 |
+
self.is_ready = True
|
|
|
|
| 91 |
|
| 92 |
+
while not self.is_exited:
|
| 93 |
+
try:
|
| 94 |
+
task = self._get_task_from_queue()
|
| 95 |
+
if task:
|
| 96 |
+
self._process_task(task)
|
| 97 |
+
|
| 98 |
+
except Empty:
|
| 99 |
+
# 队列在1秒内没有新项目,这是正常现象,继续循环
|
| 100 |
+
continue
|
| 101 |
+
except Exception as e:
|
| 102 |
+
logger.error(f"在 AudioStreamPlayer 环节发生错误: {e}")
|
| 103 |
+
time.sleep(0.1) # 发生未知错误时短暂休眠
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/voice_dialogue/services/mixins.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from collections import OrderedDict
|
| 2 |
+
|
| 3 |
+
from voice_dialogue.core.constants import (
|
| 4 |
+
voice_state_manager, session_manager, dropped_audio_cache,
|
| 5 |
+
user_still_speaking_event, chat_history_cache
|
| 6 |
+
)
|
| 7 |
+
from voice_dialogue.models.voice_task import VoiceTask
|
| 8 |
+
from voice_dialogue.utils.logger import logger
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class TaskStatusMixin:
|
| 12 |
+
"""提供语音任务状态检查和中断处理的通用功能"""
|
| 13 |
+
|
| 14 |
+
def is_task_interrupted(self, voice_task: VoiceTask) -> bool:
|
| 15 |
+
"""检查语音任务是否被其他任务中断"""
|
| 16 |
+
return (voice_state_manager.interrupt_task_id and
|
| 17 |
+
voice_task.id != voice_state_manager.interrupt_task_id)
|
| 18 |
+
|
| 19 |
+
def is_task_valid(self, voice_task: VoiceTask) -> bool:
|
| 20 |
+
"""检查语音任务是否有效(会话匹配、未被丢弃等)"""
|
| 21 |
+
if self.is_task_interrupted(voice_task):
|
| 22 |
+
return False
|
| 23 |
+
if voice_task.session_id != session_manager.current_id:
|
| 24 |
+
return False
|
| 25 |
+
if voice_task.answer_id in dropped_audio_cache:
|
| 26 |
+
return False
|
| 27 |
+
return True
|
| 28 |
+
|
| 29 |
+
def handle_user_speaking_interruption(self, voice_task: VoiceTask) -> bool:
|
| 30 |
+
"""处理用户继续说话导致的中断"""
|
| 31 |
+
if user_still_speaking_event.is_set():
|
| 32 |
+
logger.info(f'用户仍在说话,丢弃任务 {voice_task.id}')
|
| 33 |
+
voice_state_manager.drop_audio_task(voice_task.id)
|
| 34 |
+
dropped_audio_cache[voice_task.answer_id] = voice_task.answer_id
|
| 35 |
+
user_still_speaking_event.clear()
|
| 36 |
+
return True
|
| 37 |
+
return False
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
class HistoryMixin:
|
| 41 |
+
"""提供更新聊天历史记录的功能"""
|
| 42 |
+
|
| 43 |
+
def update_chat_history(self, voice_task: VoiceTask) -> None:
|
| 44 |
+
"""更新会话的聊天历史"""
|
| 45 |
+
chat_history = chat_history_cache.get(voice_task.session_id, OrderedDict())
|
| 46 |
+
task_answer_id = voice_task.answer_id
|
| 47 |
+
|
| 48 |
+
user_question_key = f'{task_answer_id}:human'
|
| 49 |
+
if user_question_key not in chat_history:
|
| 50 |
+
chat_history[user_question_key] = voice_task.transcribed_text
|
| 51 |
+
|
| 52 |
+
ai_answer_key = f'{task_answer_id}:ai'
|
| 53 |
+
cached_ai_answer = chat_history.get(ai_answer_key, [])
|
| 54 |
+
cached_ai_answer.append(voice_task.answer_sentence)
|
| 55 |
+
chat_history[ai_answer_key] = cached_ai_answer
|
| 56 |
+
|
| 57 |
+
chat_history_cache[voice_task.session_id] = chat_history
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
class PerformanceLogMixin:
|
| 61 |
+
"""提供记录任务性能日志的功能"""
|
| 62 |
+
|
| 63 |
+
def log_task_performance(self, voice_task: VoiceTask, task_name: str = "任务"):
|
| 64 |
+
"""记录ASR, LLM, TTS各阶段耗时和音频长度"""
|
| 65 |
+
try:
|
| 66 |
+
from voice_dialogue.services.utils import calculate_audio_duration
|
| 67 |
+
|
| 68 |
+
asr_duration = getattr(voice_task, 'whisper_end_time', 0) - getattr(voice_task, 'whisper_start_time', 0)
|
| 69 |
+
llm_duration = getattr(voice_task, 'llm_end_time', 0) - getattr(voice_task, 'llm_start_time', 0)
|
| 70 |
+
tts_duration = getattr(voice_task, 'tts_end_time', 0) - getattr(voice_task, 'tts_start_time', 0)
|
| 71 |
+
|
| 72 |
+
audio_duration = 0
|
| 73 |
+
if hasattr(voice_task, 'tts_generated_sentence_audio') and voice_task.tts_generated_sentence_audio:
|
| 74 |
+
audio_data, sample_rate = voice_task.tts_generated_sentence_audio
|
| 75 |
+
audio_duration = calculate_audio_duration(audio_data, sample_rate)
|
| 76 |
+
|
| 77 |
+
logger.info(
|
| 78 |
+
f"\n"
|
| 79 |
+
f"┌───────────────────────── 任务信息 ───────────────────────┐\n"
|
| 80 |
+
f"│ 任务ID: {voice_task.id}\n"
|
| 81 |
+
f"├───────────────────────── 性能统计 ────────────────────────┤\n"
|
| 82 |
+
f"│ ASR 耗时: {asr_duration:.2f}s\n"
|
| 83 |
+
f"│ LLM 耗时: {llm_duration:.2f}s\n"
|
| 84 |
+
f"│ TTS 耗时: {tts_duration:.2f}s\n"
|
| 85 |
+
f"│ 音频长度: {audio_duration:.2f}s\n"
|
| 86 |
+
f"├───────────────────────── 生成内容 ────────────────────────┤\n"
|
| 87 |
+
f"│-> {voice_task.answer_sentence}\n"
|
| 88 |
+
f"└──────────────────────────────────────────────────────────┘"
|
| 89 |
+
)
|
| 90 |
+
except Exception as e:
|
| 91 |
+
logger.error(f"记录任务性能信息时出错: {e}")
|
src/voice_dialogue/services/speech/monitor.py
CHANGED
|
@@ -20,6 +20,7 @@ from voice_dialogue.core.constants import (
|
|
| 20 |
from voice_dialogue.core.enums import AudioState
|
| 21 |
from voice_dialogue.models.voice_task import VoiceTask
|
| 22 |
from voice_dialogue.services.audio.vad import SileroVAD
|
|
|
|
| 23 |
from voice_dialogue.utils.logger import logger
|
| 24 |
|
| 25 |
|
|
@@ -110,7 +111,7 @@ class SpeechStateMonitor(BaseThread):
|
|
| 110 |
|
| 111 |
def _normalize_audio_frame(self, data: bytes) -> np.ndarray:
|
| 112 |
"""将 int16 格式的音频字节数据转换为 [-1.0, 1.0] 范围的 numpy 浮点数组。"""
|
| 113 |
-
return
|
| 114 |
|
| 115 |
def _detect_speech(self, audio_frame: np.ndarray) -> bool:
|
| 116 |
return self._vad_instance.is_voice_active(audio_frame, self.sample_rate)
|
|
@@ -119,11 +120,11 @@ class SpeechStateMonitor(BaseThread):
|
|
| 119 |
"""从队列获取音频帧"""
|
| 120 |
try:
|
| 121 |
if self._enable_vad:
|
| 122 |
-
data = self.audio_frame_queue.get(block=
|
| 123 |
audio_frame = self._normalize_audio_frame(data)
|
| 124 |
is_voice_active = self._detect_speech(audio_frame)
|
| 125 |
else:
|
| 126 |
-
data, is_voice_active = self.audio_frame_queue.get(block=
|
| 127 |
audio_frame = self._normalize_audio_frame(data)
|
| 128 |
return audio_frame, is_voice_active
|
| 129 |
except Empty:
|
|
@@ -131,7 +132,7 @@ class SpeechStateMonitor(BaseThread):
|
|
| 131 |
|
| 132 |
def _calculate_frame_duration_ms(self, audio_frame):
|
| 133 |
"""计算音频帧时长(毫秒)"""
|
| 134 |
-
return
|
| 135 |
|
| 136 |
def _process_active_voice_frame(self, audio_frame: np.ndarray):
|
| 137 |
"""
|
|
|
|
| 20 |
from voice_dialogue.core.enums import AudioState
|
| 21 |
from voice_dialogue.models.voice_task import VoiceTask
|
| 22 |
from voice_dialogue.services.audio.vad import SileroVAD
|
| 23 |
+
from voice_dialogue.services.utils import normalize_audio_frame, calculate_audio_duration
|
| 24 |
from voice_dialogue.utils.logger import logger
|
| 25 |
|
| 26 |
|
|
|
|
| 111 |
|
| 112 |
def _normalize_audio_frame(self, data: bytes) -> np.ndarray:
|
| 113 |
"""将 int16 格式的音频字节数据转换为 [-1.0, 1.0] 范围的 numpy 浮点数组。"""
|
| 114 |
+
return normalize_audio_frame(data)
|
| 115 |
|
| 116 |
def _detect_speech(self, audio_frame: np.ndarray) -> bool:
|
| 117 |
return self._vad_instance.is_voice_active(audio_frame, self.sample_rate)
|
|
|
|
| 120 |
"""从队列获取音频帧"""
|
| 121 |
try:
|
| 122 |
if self._enable_vad:
|
| 123 |
+
data = self.audio_frame_queue.get(block=True, timeout=self.config.QUEUE_TIMEOUT)
|
| 124 |
audio_frame = self._normalize_audio_frame(data)
|
| 125 |
is_voice_active = self._detect_speech(audio_frame)
|
| 126 |
else:
|
| 127 |
+
data, is_voice_active = self.audio_frame_queue.get(block=True, timeout=self.config.QUEUE_TIMEOUT)
|
| 128 |
audio_frame = self._normalize_audio_frame(data)
|
| 129 |
return audio_frame, is_voice_active
|
| 130 |
except Empty:
|
|
|
|
| 132 |
|
| 133 |
def _calculate_frame_duration_ms(self, audio_frame):
|
| 134 |
"""计算音频帧时长(毫秒)"""
|
| 135 |
+
return calculate_audio_duration(audio_data=audio_frame, sample_rate=self.sample_rate) * 1000
|
| 136 |
|
| 137 |
def _process_active_voice_frame(self, audio_frame: np.ndarray):
|
| 138 |
"""
|
src/voice_dialogue/services/speech/recognizer.py
CHANGED
|
@@ -33,7 +33,7 @@ class ASRWorker(BaseThread):
|
|
| 33 |
|
| 34 |
while not self.is_exited:
|
| 35 |
try:
|
| 36 |
-
voice_task: VoiceTask = self.user_voice_queue.get(block=
|
| 37 |
except Empty:
|
| 38 |
continue
|
| 39 |
|
|
|
|
| 33 |
|
| 34 |
while not self.is_exited:
|
| 35 |
try:
|
| 36 |
+
voice_task: VoiceTask = self.user_voice_queue.get(block=True, timeout=1)
|
| 37 |
except Empty:
|
| 38 |
continue
|
| 39 |
|
src/voice_dialogue/services/text/generator.py
CHANGED
|
@@ -214,7 +214,7 @@ class LLMResponseGenerator(BaseThread):
|
|
| 214 |
"""主运行循环"""
|
| 215 |
while not self.is_exited:
|
| 216 |
try:
|
| 217 |
-
voice_task: VoiceTask = self.user_question_queue.get(block=
|
| 218 |
self._process_voice_task(voice_task)
|
| 219 |
except Empty:
|
| 220 |
continue
|
|
|
|
| 214 |
"""主运行循环"""
|
| 215 |
while not self.is_exited:
|
| 216 |
try:
|
| 217 |
+
voice_task: VoiceTask = self.user_question_queue.get(block=True, timeout=1)
|
| 218 |
self._process_voice_task(voice_task)
|
| 219 |
except Empty:
|
| 220 |
continue
|
src/voice_dialogue/services/utils.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
|
| 3 |
+
import librosa
|
| 4 |
+
import numpy as np
|
| 5 |
+
|
| 6 |
+
from voice_dialogue.utils.logger import logger
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def has_no_words(text: str) -> bool:
|
| 10 |
+
"""
|
| 11 |
+
检查文本是否不包含任何单词(字母、数字或中文字符)。
|
| 12 |
+
如果文本只包含标点、空格等符号,则返回 True。
|
| 13 |
+
"""
|
| 14 |
+
if not text:
|
| 15 |
+
return True
|
| 16 |
+
# 搜索任何字母、数字或中文字符
|
| 17 |
+
if re.search(r'[\u4e00-\u9fa5a-zA-Z0-9]', text):
|
| 18 |
+
return False
|
| 19 |
+
return True
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def normalize_audio_frame(data: bytes) -> 'np.ndarray':
|
| 23 |
+
"""
|
| 24 |
+
将 int16 格式的音频字节数据转换为 [-1.0, 1.0] 范围的 numpy 浮点数组。
|
| 25 |
+
"""
|
| 26 |
+
return np.frombuffer(data, dtype=np.int16).astype(np.float32) / np.iinfo(np.int16).max
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def calculate_audio_duration(audio_data: 'np.ndarray', sample_rate: int = 16000) -> float:
|
| 30 |
+
"""
|
| 31 |
+
计算音频时长(秒)。
|
| 32 |
+
"""
|
| 33 |
+
try:
|
| 34 |
+
return librosa.get_duration(y=audio_data, sr=sample_rate)
|
| 35 |
+
except Exception as e:
|
| 36 |
+
logger.error(f"计算音频时长时发生错误: {e}")
|
| 37 |
+
return 0.0
|