liumaolin
committed on
Commit
·
516d7b8
1
Parent(s):
76e7fcd
Integrate FunASR service.
Browse files
README.md
CHANGED
|
@@ -33,10 +33,13 @@ VoiceDialogue 是一个基于 Python 的完整语音对话系统,实现了端
|
|
| 33 |
- **多格式音频支持** - 支持多种音频格式的输入输出
|
| 34 |
|
| 35 |
### 🗣️ 语音识别
|
| 36 |
-
-
|
| 37 |
-
-
|
| 38 |
-
-
|
| 39 |
-
-
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
### 🧠 语言模型
|
| 42 |
支持多种预训练大语言模型:
|
|
@@ -93,12 +96,12 @@ WHISPER_COREML=1 pip install git+https://github.com/absadiki/pywhispercpp
|
|
| 93 |
CMAKE_ARGS="-DGGML_METAL=on" pip install llama-cpp-python
|
| 94 |
```
|
| 95 |
|
| 96 |
-
5**安装项目依赖**
|
| 97 |
```bash
|
| 98 |
pip install -r requirements.txt
|
| 99 |
```
|
| 100 |
|
| 101 |
-
6**安装音频处理工具**
|
| 102 |
```bash
|
| 103 |
# macOS
|
| 104 |
brew install ffmpeg
|
|
@@ -210,7 +213,7 @@ VoiceDialogue/
|
|
| 210 |
│ │ │ └── audio_player.py # 音频播放
|
| 211 |
│ │ ├── speech/ # 语音识别服务
|
| 212 |
│ │ │ ├── speech_monitor.py # 语音状态监控
|
| 213 |
-
│ │ │ └──
|
| 214 |
│ │ ├── text/ # 文本生成服务
|
| 215 |
│ │ │ └── text_generator.py # LLM 文本生成
|
| 216 |
│ │ └── core/ # 核心服务
|
|
@@ -234,7 +237,7 @@ VoiceDialogue/
|
|
| 234 |
### 数据流程图
|
| 235 |
|
| 236 |
```
|
| 237 |
-
用户语音输入 → 回声消除 → 语音活动检测 →
|
| 238 |
↑ ↓
|
| 239 |
└───────────────────────────────── 实时语音交互循环 ─────────────────────────────────┘
|
| 240 |
```
|
|
@@ -265,7 +268,7 @@ VoiceDialogue/
|
|
| 265 |
|
| 266 |
### 基本使用流程
|
| 267 |
|
| 268 |
-
1. **启动系统**: 运行 `python
|
| 269 |
2. **等待加载**: 首次运行会下载模型,请耐心等待
|
| 270 |
3. **开始对话**: 看到"服务启动成功"后直接开始说话
|
| 271 |
4. **语音交互**: 系统会自动检测语音并进行对话
|
|
|
|
| 33 |
- **多格式音频支持** - 支持多种音频格式的输入输出
|
| 34 |
|
| 35 |
### 🗣️ 语音识别
|
| 36 |
+
- **智能语音识别引擎** - 中文使用FunASR高精度识别,其他语言使用Whisper模型
|
| 37 |
+
- **FunASR中文优化** - 专为中文语音优化的识别引擎,支持方言和口音识别
|
| 38 |
+
- **Whisper多语言支持** - 支持 Medium / Large 模型,覆盖多种国际语言
|
| 39 |
+
- **自动语言检测** - 根据配置自动选择最适合的识别引擎
|
| 40 |
+
- **实时转录处理** - 流式语音转文本处理,降低响应延迟
|
| 41 |
+
- **高精度识别** - 基于最新语音识别技术,提供业界领先的识别准确率
|
| 42 |
+
|
| 43 |
|
| 44 |
### 🧠 语言模型
|
| 45 |
支持多种预训练大语言模型:
|
|
|
|
| 96 |
CMAKE_ARGS="-DGGML_METAL=on" pip install llama-cpp-python
|
| 97 |
```
|
| 98 |
|
| 99 |
+
5. **安装项目依赖**
|
| 100 |
```bash
|
| 101 |
pip install -r requirements.txt
|
| 102 |
```
|
| 103 |
|
| 104 |
+
6. **安装音频处理工具**
|
| 105 |
```bash
|
| 106 |
# macOS
|
| 107 |
brew install ffmpeg
|
|
|
|
| 213 |
│ │ │ └── audio_player.py # 音频播放
|
| 214 |
│ │ ├── speech/ # 语音识别服务
|
| 215 |
│ │ │ ├── speech_monitor.py # 语音状态监控
|
| 216 |
+
│ │ │ └── asr_service.py # ASR 识别服务
|
| 217 |
│ │ ├── text/ # 文本生成服务
|
| 218 |
│ │ │ └── text_generator.py # LLM 文本生成
|
| 219 |
│ │ └── core/ # 核心服务
|
|
|
|
| 237 |
### 数据流程图
|
| 238 |
|
| 239 |
```
|
| 240 |
+
用户语音输入 → 回声消除 → 语音活动检测 → 语音转录 → LLM生成回复 → TTS合成 → 音频输出
|
| 241 |
↑ ↓
|
| 242 |
└───────────────────────────────── 实时语音交互循环 ─────────────────────────────────┘
|
| 243 |
```
|
|
|
|
| 268 |
|
| 269 |
### 基本使用流程
|
| 270 |
|
| 271 |
+
1. **启动系统**: 运行 `python src/VoiceDialogue/main.py`
|
| 272 |
2. **等待加载**: 首次运行会下载模型,请耐心等待
|
| 273 |
3. **开始对话**: 看到"服务启动成功"后直接开始说话
|
| 274 |
4. **语音交互**: 系统会自动检测语音并进行对话
|
src/VoiceDialogue/main.py
CHANGED
|
@@ -1,4 +1,3 @@
|
|
| 1 |
-
import sys
|
| 2 |
import typing
|
| 3 |
from multiprocessing import Queue
|
| 4 |
from pathlib import Path
|
|
@@ -13,7 +12,7 @@ from services.audio.aec_audio_capture import EchoCancellingAudioCapture
|
|
| 13 |
from services.audio.audio_answer import TTSAudioGenerator
|
| 14 |
from services.audio.audio_player import AudioStreamPlayer
|
| 15 |
from services.speech.speech_monitor import SpeechStateMonitor
|
| 16 |
-
from services.speech.
|
| 17 |
from services.text.text_generator import LLMResponseGenerator
|
| 18 |
|
| 19 |
|
|
@@ -48,9 +47,9 @@ def launch_system(
|
|
| 48 |
threads.append(user_voice_checker)
|
| 49 |
|
| 50 |
#
|
| 51 |
-
whisper_worker =
|
| 52 |
user_voice_queue=user_voice_queue, transcribed_text_queue=transcribed_text_queue,
|
| 53 |
-
|
| 54 |
)
|
| 55 |
whisper_worker.start()
|
| 56 |
threads.append(whisper_worker)
|
|
|
|
|
|
|
| 1 |
import typing
|
| 2 |
from multiprocessing import Queue
|
| 3 |
from pathlib import Path
|
|
|
|
| 12 |
from services.audio.audio_answer import TTSAudioGenerator
|
| 13 |
from services.audio.audio_player import AudioStreamPlayer
|
| 14 |
from services.speech.speech_monitor import SpeechStateMonitor
|
| 15 |
+
from services.speech.asr_service import ASRWorker
|
| 16 |
from services.text.text_generator import LLMResponseGenerator
|
| 17 |
|
| 18 |
|
|
|
|
| 47 |
threads.append(user_voice_checker)
|
| 48 |
|
| 49 |
#
|
| 50 |
+
whisper_worker = ASRWorker(
|
| 51 |
user_voice_queue=user_voice_queue, transcribed_text_queue=transcribed_text_queue,
|
| 52 |
+
language=user_language
|
| 53 |
)
|
| 54 |
whisper_worker.start()
|
| 55 |
threads.append(whisper_worker)
|
src/VoiceDialogue/services/speech/{whisper_service.py → asr_service.py}
RENAMED
|
@@ -4,47 +4,82 @@ from queue import Queue
|
|
| 4 |
|
| 5 |
import librosa
|
| 6 |
import numpy as np
|
|
|
|
| 7 |
from pywhispercpp.model import Model
|
| 8 |
|
| 9 |
from config import paths
|
| 10 |
-
from config.paths import RESOURCES_PATH
|
| 11 |
from models.voice_task import VoiceTask
|
| 12 |
from services.core.base import BaseThread
|
| 13 |
from services.core.constants import user_still_speaking_event, voice_state_manager, dropped_audio_cache
|
| 14 |
from utils.cache import LRUCacheDict
|
| 15 |
|
| 16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
class WhisperCppClient:
|
| 18 |
"""Whisper C++ API客户端"""
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
|
|
|
| 22 |
else:
|
| 23 |
-
model =
|
| 24 |
|
| 25 |
-
models_dir = paths.MODELS_PATH /
|
| 26 |
self.whisper = Model(model=model, models_dir=models_dir)
|
| 27 |
|
| 28 |
-
def
|
| 29 |
-
frequency = 440.0
|
| 30 |
-
duration = duration_seconds + 0.1
|
| 31 |
-
t = np.linspace(0, duration, int(sample_rate * duration), endpoint=False, dtype=audio_data.dtype)
|
| 32 |
-
silence = 0.5 * np.sin(2 * np.pi * frequency * t)
|
| 33 |
-
audio_data = np.concatenate([audio_data, silence])
|
| 34 |
-
return audio_data
|
| 35 |
-
|
| 36 |
-
def transcribe(self, audio_array: np.ndarray, language='en'):
|
| 37 |
if language == "zh":
|
| 38 |
-
prompt =
|
| 39 |
else:
|
| 40 |
-
prompt =
|
| 41 |
|
| 42 |
-
|
| 43 |
-
audio_duration = audio_array.shape[-1] / sample_rate
|
| 44 |
-
one_second = 1.0
|
| 45 |
-
if audio_duration < one_second:
|
| 46 |
-
padding_seconds = one_second - audio_duration
|
| 47 |
-
audio_array = self.padding_silence(audio_array, padding_seconds, sample_rate=sample_rate)
|
| 48 |
|
| 49 |
# print('............... language:', language)
|
| 50 |
segments = self.whisper.transcribe(
|
|
@@ -60,37 +95,94 @@ class WhisperCppClient:
|
|
| 60 |
return text
|
| 61 |
|
| 62 |
|
| 63 |
-
class
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
|
| 69 |
-
|
|
|
|
| 70 |
|
| 71 |
-
self.
|
| 72 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
self.user_voice_queue = user_voice_queue
|
| 74 |
self.transcribed_text_queue = transcribed_text_queue
|
| 75 |
|
| 76 |
self.cached_user_questions = LRUCacheDict(maxsize=10)
|
| 77 |
-
print('.........whisper worker initialized.')
|
| 78 |
-
|
| 79 |
-
def warmup(self):
|
| 80 |
-
print('[INFO:]Warming up ASR...')
|
| 81 |
-
warmup_audiofile = RESOURCES_PATH / 'audio' / 'jfk.flac'
|
| 82 |
-
data, sr = librosa.load(warmup_audiofile)
|
| 83 |
-
self.model.transcribe(data)
|
| 84 |
|
| 85 |
def run(self):
|
| 86 |
-
|
| 87 |
-
self.warmup()
|
| 88 |
|
| 89 |
while not self.stopped():
|
| 90 |
voice_task: VoiceTask = self.user_voice_queue.get()
|
| 91 |
voice_task.whisper_start_time = time.time()
|
| 92 |
user_voice: np.array = voice_task.user_voice
|
| 93 |
-
transcribed_text = self.
|
| 94 |
voice_task.whisper_end_time = time.time()
|
| 95 |
|
| 96 |
task_id = voice_task.id
|
|
@@ -114,3 +206,5 @@ class WhisperWorker(BaseThread):
|
|
| 114 |
|
| 115 |
voice_task.user_voice = []
|
| 116 |
self.transcribed_text_queue.put(voice_task)
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
import librosa
|
| 6 |
import numpy as np
|
| 7 |
+
from funasr import AutoModel
|
| 8 |
from pywhispercpp.model import Model
|
| 9 |
|
| 10 |
from config import paths
|
|
|
|
| 11 |
from models.voice_task import VoiceTask
|
| 12 |
from services.core.base import BaseThread
|
| 13 |
from services.core.constants import user_still_speaking_event, voice_state_manager, dropped_audio_cache
|
| 14 |
from utils.cache import LRUCacheDict
|
| 15 |
|
| 16 |
|
| 17 |
+
def ensure_minimum_audio_duration(
    audio_array: np.ndarray, min_duration: float = 1.0, sample_rate: int = 16000
) -> np.ndarray:
    """Pad *audio_array* so it lasts at least *min_duration* seconds.

    Args:
        audio_array: Input audio samples.
        min_duration: Minimum required duration in seconds (default 1.0).
        sample_rate: Sample rate in Hz (default 16000).

    Returns:
        The original array unchanged if already long enough, otherwise a
        new array extended by padding_silence().
    """
    current_duration = audio_array.shape[-1] / sample_rate
    if current_duration >= min_duration:
        return audio_array
    return padding_silence(audio_array, min_duration - current_duration, sample_rate)


def padding_silence(
    audio_data: np.ndarray, duration_seconds: float, sample_rate: int = 16000
) -> np.ndarray:
    """Append filler audio of *duration_seconds* (+0.1 s margin) to *audio_data*.

    NOTE(review): despite the name, the filler is a 440 Hz sine tone at 0.5
    amplitude, not true silence (zeros) — confirm this is intentional for the
    ASR engine before changing it.

    Args:
        audio_data: Original audio samples.
        duration_seconds: Length of filler to append, in seconds.
        sample_rate: Sample rate in Hz.

    Returns:
        A new array with the filler appended (same dtype as the input).
    """
    tone_hz = 440.0
    total_seconds = duration_seconds + 0.1
    timeline = np.linspace(
        0, total_seconds, int(sample_rate * total_seconds),
        endpoint=False, dtype=audio_data.dtype,
    )
    filler = 0.5 * np.sin(2 * np.pi * tone_hz * timeline)
    return np.concatenate([audio_data, filler])
|
| 62 |
+
|
| 63 |
+
|
| 64 |
class WhisperCppClient:
|
| 65 |
"""Whisper C++ API客户端"""
|
| 66 |
+
|
| 67 |
+
def __init__(self, model: typing.Literal["medium", "large"] = "medium"):
|
| 68 |
+
if model == "medium":
|
| 69 |
+
model = "medium-q5_0"
|
| 70 |
else:
|
| 71 |
+
model = "large-v3-turbo-q5_0"
|
| 72 |
|
| 73 |
+
models_dir = paths.MODELS_PATH / "asr"
|
| 74 |
self.whisper = Model(model=model, models_dir=models_dir)
|
| 75 |
|
| 76 |
+
def transcribe(self, audio_array: np.ndarray, language="en"):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
if language == "zh":
|
| 78 |
+
prompt = "以下是简体中文普通话的句子。"
|
| 79 |
else:
|
| 80 |
+
prompt = "The following is an English sentence."
|
| 81 |
|
| 82 |
+
audio_array = ensure_minimum_audio_duration(audio_array)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
|
| 84 |
# print('............... language:', language)
|
| 85 |
segments = self.whisper.transcribe(
|
|
|
|
| 95 |
return text
|
| 96 |
|
| 97 |
|
| 98 |
+
class FunASRClient:
    """FunASR API client (Chinese-optimized speech recognition).

    Loads a SeACo-Paraformer ASR model together with VAD and punctuation
    models from the local model cache directory under MODELS_PATH / "asr".
    """

    def __init__(self):
        # Local model cache directory (shared layout with the Whisper models).
        models_dir = paths.MODELS_PATH / "asr"
        asr_model_path = (
            models_dir
            / "speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
        )
        vad_model_path = models_dir / "speech_fsmn_vad_zh-cn-16k-common-pytorch"
        punc_model_path = (
            models_dir / "punc_ct-transformer_cn-en-common-vocab471067-large"
        )
        self.funasr_model = AutoModel(
            # Consistency fix: pass a POSIX path string for `model` as well —
            # the original passed a Path object here but strings for vad/punc.
            model=asr_model_path.as_posix(),
            vad_model=vad_model_path.as_posix(),
            punc_model=punc_model_path.as_posix(),
            log_level="ERROR",
            disable_update=True,
        )

    def transcribe(self, audio_array: np.ndarray, language="auto"):
        """Transcribe *audio_array* and return the recognized text.

        Args:
            audio_array: Audio samples (presumably 16 kHz mono — TODO confirm
                against the capture pipeline).
            language: Accepted for interface parity with WhisperCppClient;
                FunASR performs its own language handling, so it is unused.

        Returns:
            The transcribed text with segment texts joined by single spaces.
        """
        audio_array = ensure_minimum_audio_duration(audio_array)

        segments = self.funasr_model.generate(input=audio_array, disable_pbar=True)

        # Typo fix: local was previously named `transcibed_texts`.
        transcribed_texts = [segment.get("text", "") for segment in segments]
        return " ".join(transcribed_texts)
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
class UnifiedASRClient:
    """Unified ASR client that picks an engine based on language.

    Chinese ("zh") uses FunASR; every other language setting falls back to
    the Whisper C++ client.
    """

    def __init__(self, language: typing.Literal["auto", "zh", "en"] = "zh"):
        self.language = language
        # FunASR is the Chinese-optimized engine; Whisper covers the rest.
        self.client = FunASRClient() if language == "zh" else WhisperCppClient()

    def warmup(self):
        """Run one throwaway transcription so the first real request is fast."""
        print('[INFO] 预热语音识别模型...')
        try:
            sample_file = paths.RESOURCES_PATH / 'audio' / 'jfk.flac'
            if sample_file.exists():
                samples, _ = librosa.load(sample_file, sr=16000, mono=True)
            else:
                # No bundled sample available: warm up on one second of
                # low-amplitude noise instead.
                samples = np.random.randn(16000).astype(np.float32) * 0.1
            self.client.transcribe(samples, language=self.language)
            print('[INFO] ASR模型预热完成')
        except Exception as exc:
            # Warmup is best-effort; a failure here must not block startup.
            print(f'[WARNING] ASR模型预热失败: {exc}')

    def transcribe(self, audio_array: np.ndarray) -> str:
        """Transcribe *audio_array* with the engine chosen at construction."""
        return self.client.transcribe(audio_array, language=self.language)
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
class ASRWorker(BaseThread):
|
| 165 |
+
def __init__(self, group=None, target=None, name=None, args=(), kwargs=None, *, daemon=None,
             user_voice_queue: Queue,
             transcribed_text_queue: Queue,
             language: typing.Literal["auto", "zh", "en"] = "zh"):
    """Create the ASR worker thread.

    Args:
        user_voice_queue: Queue of incoming VoiceTask items carrying raw audio.
        transcribed_text_queue: Queue that receives tasks after transcription.
        language: Recognition language; "zh" selects FunASR, others Whisper.
        (The remaining parameters are forwarded to the BaseThread constructor.)
    """
    super().__init__(group, target, name, args, kwargs, daemon=daemon)

    # Only configuration is stored here; the ASR client itself is built
    # lazily inside run() on the worker thread.
    self.language = language
    self.user_voice_queue = user_voice_queue
    self.transcribed_text_queue = transcribed_text_queue

    # Small LRU of recent transcriptions — presumably for de-duplicating
    # repeated questions; confirm usage in run().
    self.cached_user_questions = LRUCacheDict(maxsize=10)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
|
| 177 |
def run(self):
|
| 178 |
+
self.client = UnifiedASRClient(self.language)
|
| 179 |
+
self.client.warmup()
|
| 180 |
|
| 181 |
while not self.stopped():
|
| 182 |
voice_task: VoiceTask = self.user_voice_queue.get()
|
| 183 |
voice_task.whisper_start_time = time.time()
|
| 184 |
user_voice: np.array = voice_task.user_voice
|
| 185 |
+
transcribed_text = self.client.transcribe(user_voice)
|
| 186 |
voice_task.whisper_end_time = time.time()
|
| 187 |
|
| 188 |
task_id = voice_task.id
|
|
|
|
| 206 |
|
| 207 |
voice_task.user_voice = []
|
| 208 |
self.transcribed_text_queue.put(voice_task)
|
| 209 |
+
|
| 210 |
+
|