liumaolin committed
Commit 516d7b8 · Parent: 76e7fcd

Integrate FunASR service.

README.md CHANGED
@@ -33,10 +33,13 @@ VoiceDialogue is a complete Python-based voice dialogue system that implements end-
  - **Multi-format audio support** - audio input and output in a variety of formats

  ### 🗣️ Speech Recognition
- - **Whisper model support** - Medium/Large models, selectable by accuracy requirements
- - **Multilingual recognition** - automatically recognizes Chinese and English speech
- - **Real-time transcription** - streaming speech-to-text that lowers response latency
- - **High-accuracy recognition** - built on OpenAI Whisper's leading speech recognition technology
+ - **Intelligent ASR engine** - FunASR for high-accuracy Chinese recognition, the Whisper model for other languages
+ - **FunASR Chinese optimization** - a recognition engine tuned for Chinese speech, including dialects and accents
+ - **Whisper multilingual support** - Medium/Large models covering a wide range of languages
+ - **Automatic language detection** - the best-suited engine is selected automatically from the configuration
+ - **Real-time transcription** - streaming speech-to-text that lowers response latency
+ - **High-accuracy recognition** - state-of-the-art speech recognition with industry-leading accuracy
+

  ### 🧠 Language Models
  Supports a variety of pretrained large language models:
@@ -93,12 +96,12 @@ WHISPER_COREML=1 pip install git+https://github.com/absadiki/pywhispercpp
  CMAKE_ARGS="-DGGML_METAL=on" pip install llama-cpp-python
  ```

- 5**Install project dependencies**
+ 5. **Install project dependencies**
  ```bash
  pip install -r requirements.txt
  ```

- 6**Install audio processing tools**
+ 6. **Install audio processing tools**
  ```bash
  # macOS
  brew install ffmpeg
@@ -210,7 +213,7 @@ VoiceDialogue/
  │   │   │   └── audio_player.py      # Audio playback
  │   │   ├── speech/                  # Speech recognition services
  │   │   │   ├── speech_monitor.py    # Speech state monitoring
- │   │   │   └── whisper_service.py   # Whisper recognition service
+ │   │   │   └── asr_service.py       # ASR recognition service
  │   │   ├── text/                    # Text generation services
  │   │   │   └── text_generator.py    # LLM text generation
  │   │   └── core/                    # Core services
@@ -234,7 +237,7 @@ VoiceDialogue/
  ### Data Flow Diagram

  ```
- User voice input → Echo cancellation → Voice activity detection → Whisper transcription → LLM reply generation → TTS synthesis → Audio output
+ User voice input → Echo cancellation → Voice activity detection → Speech transcription → LLM reply generation → TTS synthesis → Audio output
  ↑                                                                                                                                        ↓
  └───────────────────────────────── Real-time voice interaction loop ─────────────────────────────────┘
  ```
@@ -265,7 +268,7 @@ VoiceDialogue/

  ### Basic Usage Flow

- 1. **Start the system**: run `python -m src.VoiceDialogue.main`
+ 1. **Start the system**: run `python src/VoiceDialogue/main.py`
  2. **Wait for loading**: the first run downloads the models, so please be patient
  3. **Start the conversation**: begin speaking as soon as you see "服务启动成功" (service started successfully)
  4. **Voice interaction**: the system detects speech automatically and carries on the dialogue
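The updated feature list hinges on one dispatch rule: Chinese goes to FunASR, everything else to Whisper. A minimal sketch of that selection logic, reusing the class names this commit introduces in `asr_service.py` (only the names come from the commit; the stub bodies are placeholders):

```python
# Illustrative sketch of the engine dispatch described above: FunASR for
# Chinese, Whisper for other languages. Class names mirror asr_service.py
# in this commit; the bodies here are stubs, not the project's real code.
import typing

import numpy as np


class FunASRClient:
    """Stand-in for the FunASR-backed client added in asr_service.py."""

    def transcribe(self, audio: np.ndarray, language: str = "auto") -> str:
        raise NotImplementedError


class WhisperCppClient:
    """Stand-in for the pywhispercpp-backed client."""

    def transcribe(self, audio: np.ndarray, language: str = "en") -> str:
        raise NotImplementedError


def make_asr_client(language: typing.Literal["auto", "zh", "en"] = "zh"):
    # "zh" selects FunASR; any other value falls back to Whisper.
    return FunASRClient() if language == "zh" else WhisperCppClient()
```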
src/VoiceDialogue/main.py CHANGED
@@ -1,4 +1,3 @@
- import sys
  import typing
  from multiprocessing import Queue
  from pathlib import Path
@@ -13,7 +12,7 @@ from services.audio.aec_audio_capture import EchoCancellingAudioCapture
  from services.audio.audio_answer import TTSAudioGenerator
  from services.audio.audio_player import AudioStreamPlayer
  from services.speech.speech_monitor import SpeechStateMonitor
- from services.speech.whisper_service import WhisperWorker
+ from services.speech.asr_service import ASRWorker
  from services.text.text_generator import LLMResponseGenerator


@@ -48,9 +47,9 @@ def launch_system(
      threads.append(user_voice_checker)

      #
-     whisper_worker = WhisperWorker(
+     whisper_worker = ASRWorker(
          user_voice_queue=user_voice_queue, transcribed_text_queue=transcribed_text_queue,
-         lan=user_language, model=whisper_model
+         language=user_language
      )
      whisper_worker.start()
      threads.append(whisper_worker)
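For orientation, here is a sketch of how the renamed worker is wired by `launch_system` above. The module path, queue names, and keyword arguments come from the diff; the shutdown note is an assumption about `BaseThread`, which this commit does not show:

```python
# Wiring sketch for the new ASRWorker, mirroring launch_system() above.
from multiprocessing import Queue

from services.speech.asr_service import ASRWorker

user_voice_queue = Queue()        # filled by the speech state monitor
transcribed_text_queue = Queue()  # drained by the LLM text generator

worker = ASRWorker(
    user_voice_queue=user_voice_queue,
    transcribed_text_queue=transcribed_text_queue,
    language="zh",  # "zh" selects FunASR; other values select Whisper
)
worker.start()
# On shutdown, BaseThread presumably exposes a stop flag checked by the
# worker's `while not self.stopped()` loop; its exact API is not shown here.
```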
src/VoiceDialogue/services/speech/{whisper_service.py → asr_service.py} RENAMED
@@ -4,47 +4,82 @@ from queue import Queue

  import librosa
  import numpy as np
+ from funasr import AutoModel
  from pywhispercpp.model import Model

  from config import paths
- from config.paths import RESOURCES_PATH
  from models.voice_task import VoiceTask
  from services.core.base import BaseThread
  from services.core.constants import user_still_speaking_event, voice_state_manager, dropped_audio_cache
  from utils.cache import LRUCacheDict


+ def ensure_minimum_audio_duration(
+     audio_array: np.ndarray, min_duration: float = 1.0, sample_rate: int = 16000
+ ) -> np.ndarray:
+     """
+     Ensure the audio array meets a minimum duration; pad it with silence if it falls short
+
+     Args:
+         audio_array: input audio array
+         min_duration: minimum required duration in seconds, 1 second by default
+         sample_rate: sample rate, 16000 Hz by default
+
+     Returns:
+         The processed audio array
+     """
+     audio_duration = audio_array.shape[-1] / sample_rate
+
+     if audio_duration < min_duration:
+         padding_seconds = min_duration - audio_duration
+         audio_array = padding_silence(audio_array, padding_seconds, sample_rate)
+
+     return audio_array
+
+
+ def padding_silence(
+     audio_data: np.ndarray, duration_seconds: float, sample_rate: int = 16000
+ ) -> np.ndarray:
+     """
+     Append silence padding to the audio data
+
+     Args:
+         audio_data: original audio data
+         duration_seconds: duration to pad, in seconds
+         sample_rate: sample rate
+
+     Returns:
+         The padded audio data
+     """
+     frequency = 440.0
+     duration = duration_seconds + 0.1
+     t = np.linspace(
+         0, duration, int(sample_rate * duration), endpoint=False, dtype=audio_data.dtype
+     )
+     silence = 0.5 * np.sin(2 * np.pi * frequency * t)
+     audio_data = np.concatenate([audio_data, silence])
+     return audio_data
+
+
  class WhisperCppClient:
      """Whisper C++ API client"""
-     def __init__(self, model: typing.Literal['medium', 'large'] = 'medium'):
-         if model == 'medium':
-             model = 'medium-q5_0'
+
+     def __init__(self, model: typing.Literal["medium", "large"] = "medium"):
+         if model == "medium":
+             model = "medium-q5_0"
          else:
-             model = 'large-v3-turbo-q5_0'
+             model = "large-v3-turbo-q5_0"

-         models_dir = paths.MODELS_PATH / 'asr'
+         models_dir = paths.MODELS_PATH / "asr"
          self.whisper = Model(model=model, models_dir=models_dir)

-     def padding_silence(self, audio_data, duration_seconds, sample_rate=16000):
-         frequency = 440.0
-         duration = duration_seconds + 0.1
-         t = np.linspace(0, duration, int(sample_rate * duration), endpoint=False, dtype=audio_data.dtype)
-         silence = 0.5 * np.sin(2 * np.pi * frequency * t)
-         audio_data = np.concatenate([audio_data, silence])
-         return audio_data
-
-     def transcribe(self, audio_array: np.ndarray, language='en'):
+     def transcribe(self, audio_array: np.ndarray, language="en"):
          if language == "zh":
-             prompt = '以下是简体中文普通话的句子。'
+             prompt = "以下是简体中文普通话的句子。"
          else:
-             prompt = 'The following is an English sentence.'
+             prompt = "The following is an English sentence."

-         sample_rate = 16000
-         audio_duration = audio_array.shape[-1] / sample_rate
-         one_second = 1.0
-         if audio_duration < one_second:
-             padding_seconds = one_second - audio_duration
-             audio_array = self.padding_silence(audio_array, padding_seconds, sample_rate=sample_rate)
+         audio_array = ensure_minimum_audio_duration(audio_array)

          # print('............... language:', language)
          segments = self.whisper.transcribe(
@@ -60,37 +95,94 @@ class WhisperCppClient:
          return text


- class WhisperWorker(BaseThread):
-     def __init__(self, group=None, target=None, name=None, args=(), kwargs=None, *, daemon=None,
-                  user_voice_queue: Queue, transcribed_text_queue: Queue, lan="en",
-                  model: typing.Literal['medium', 'large'] = 'medium'):
-         super().__init__(group, target, name, args, kwargs, daemon=daemon)
-
-         self.model = WhisperCppClient(model)
-
-         self.language = lan
-
+ class FunASRClient:
+     """FunASR API client"""
+
+     def __init__(self):
+         # Set up the model cache directory
+         models_dir = paths.MODELS_PATH / "asr"
+         asr_model_path = (
+             models_dir
+             / "speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
+         )
+         vad_model_path = models_dir / "speech_fsmn_vad_zh-cn-16k-common-pytorch"
+         punc_model_path = (
+             models_dir / "punc_ct-transformer_cn-en-common-vocab471067-large"
+         )
+         self.funasr_model = AutoModel(
+             model=asr_model_path,
+             vad_model=vad_model_path.as_posix(),
+             punc_model=punc_model_path.as_posix(),
+             log_level="ERROR",
+             disable_update=True,
+         )
+
+     def transcribe(self, audio_array: np.ndarray, language="auto"):
+         audio_array = ensure_minimum_audio_duration(audio_array)
+
+         segments = self.funasr_model.generate(input=audio_array, disable_pbar=True)
+
+         transcibed_texts = []
+         for segment in segments:
+             content = segment.get("text", "")
+             transcibed_texts.append(content)
+         return " ".join(transcibed_texts)
+
+
+ class UnifiedASRClient:
+     """Unified speech recognition client that selects FunASR or Whisper by language"""
+
+     def __init__(self, language: typing.Literal["auto", "zh", "en"] = "zh"):
+         self.language = language
+
+         if language == "zh":
+             self.client = FunASRClient()
+         else:
+             self.client = WhisperCppClient()
+
+     def warmup(self):
+         """Warm up the model"""
+         print('[INFO] Warming up the speech recognition model...')
+         try:
+             warmup_audiofile = paths.RESOURCES_PATH / 'audio' / 'jfk.flac'
+             if warmup_audiofile.exists():
+                 data, sr = librosa.load(warmup_audiofile, sr=16000, mono=True)
+                 self.client.transcribe(data, language=self.language)
+             else:
+                 # Create a test clip instead
+                 test_audio = np.random.randn(16000).astype(np.float32) * 0.1  # 1 second of noise
+                 self.client.transcribe(test_audio, language=self.language)
+             print('[INFO] ASR model warmup complete')
+         except Exception as e:
+             print(f'[WARNING] ASR model warmup failed: {e}')
+
+     def transcribe(self, audio_array: np.ndarray) -> str:
+         return self.client.transcribe(audio_array, language=self.language)
+
+
+ class ASRWorker(BaseThread):
+     def __init__(self, group=None, target=None, name=None, args=(), kwargs=None, *, daemon=None,
+                  user_voice_queue: Queue,
+                  transcribed_text_queue: Queue,
+                  language: typing.Literal["auto", "zh", "en"] = "zh"):
+         super().__init__(group, target, name, args, kwargs, daemon=daemon)
+
+         self.language = language
          self.user_voice_queue = user_voice_queue
          self.transcribed_text_queue = transcribed_text_queue

          self.cached_user_questions = LRUCacheDict(maxsize=10)
-         print('.........whisper worker initialized.')
-
-     def warmup(self):
-         print('[INFO:]Warming up ASR...')
-         warmup_audiofile = RESOURCES_PATH / 'audio' / 'jfk.flac'
-         data, sr = librosa.load(warmup_audiofile)
-         self.model.transcribe(data)

      def run(self):
-
-         self.warmup()
+         self.client = UnifiedASRClient(self.language)
+         self.client.warmup()

          while not self.stopped():
              voice_task: VoiceTask = self.user_voice_queue.get()
              voice_task.whisper_start_time = time.time()
              user_voice: np.array = voice_task.user_voice
-             transcribed_text = self.model.transcribe(user_voice, language=self.language)
+             transcribed_text = self.client.transcribe(user_voice)
              voice_task.whisper_end_time = time.time()

              task_id = voice_task.id
@@ -114,3 +206,5 @@ class WhisperWorker(BaseThread):

              voice_task.user_voice = []
              self.transcribed_text_queue.put(voice_task)
+
+
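One detail worth knowing when reading `padding_silence` above: despite its name and docstring, it appends a 0.5-amplitude 440 Hz sine tone, not zeros. If true silence were wanted instead, a zero-padding variant would look like this (a sketch, not part of the commit):

```python
import numpy as np


def pad_with_zeros(audio: np.ndarray, seconds: float, sample_rate: int = 16000) -> np.ndarray:
    """Append `seconds` of genuine silence (zeros) to a mono audio array."""
    silence = np.zeros(int(sample_rate * seconds), dtype=audio.dtype)
    return np.concatenate([audio, silence])
```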