Merge branch 'vad' of hf.co:MoYoYoTech/Translator into vad
Browse files
* 'vad' of hf.co:MoYoYoTech/Translator:
fix max speech duration bug
remove time delay in loop
add DESIGN_TIME_THREHOLD
- config.py +5 -1
- tests/audio_utils.py +54 -0
- tests/test_vad.ipynb +129 -0
- transcribe/helpers/vadprocessor.py +7 -7
- transcribe/pipelines/pipe_vad.py +3 -28
- transcribe/whisper_llm_serve.py +71 -40
config.py
CHANGED
|
@@ -2,7 +2,7 @@ import pathlib
|
|
| 2 |
import re
|
| 3 |
import logging
|
| 4 |
|
| 5 |
-
DEBUG =
|
| 6 |
LOG_LEVEL = logging.DEBUG if DEBUG else logging.INFO
|
| 7 |
|
| 8 |
logging.getLogger("pywhispercpp").setLevel(logging.WARNING)
|
|
@@ -23,6 +23,10 @@ logging.getLogger().addHandler(console_handler)
|
|
| 23 |
|
| 24 |
# 文字输出长度阈值
|
| 25 |
TEXT_THREHOLD = 6
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
BASE_DIR = pathlib.Path(__file__).parent
|
| 28 |
MODEL_DIR = BASE_DIR / "moyoyo_asr_models"
|
|
|
|
| 2 |
import re
|
| 3 |
import logging
|
| 4 |
|
| 5 |
+
DEBUG = False
|
| 6 |
LOG_LEVEL = logging.DEBUG if DEBUG else logging.INFO
|
| 7 |
|
| 8 |
logging.getLogger("pywhispercpp").setLevel(logging.WARNING)
|
|
|
|
| 23 |
|
| 24 |
# 文字输出长度阈值
|
| 25 |
TEXT_THREHOLD = 6
|
| 26 |
+
# 音频段的决策时间
|
| 27 |
+
DESIGN_TIME_THREHOLD = 3
|
| 28 |
+
# 最长语音时长
|
| 29 |
+
MAX_SPEECH_DURATION_S = 15
|
| 30 |
|
| 31 |
BASE_DIR = pathlib.Path(__file__).parent
|
| 32 |
MODEL_DIR = BASE_DIR / "moyoyo_asr_models"
|
tests/audio_utils.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import soundfile as sf
|
| 3 |
+
import time
|
| 4 |
+
|
| 5 |
+
def audio_stream_generator(audio_file_path, chunk_size=4096, simulate_realtime=True):
    """Stream an audio file as fixed-size mono float32 chunks.

    The whole file is loaded up front, down-mixed to mono if it has more
    than one dimension, and then yielded ``chunk_size`` samples at a time.
    The final chunk is zero-padded so every yielded array has exactly
    ``chunk_size`` samples.

    Args:
        audio_file_path: Path of the audio file to read.
        chunk_size: Number of samples per yielded chunk; must be positive.
        simulate_realtime: When true, sleep for one chunk's duration before
            each yield so the consumer sees data at playback speed.

    Yields:
        numpy.ndarray: A 1-D ``np.float32`` array of length ``chunk_size``.

    Raises:
        ValueError: If ``chunk_size`` is not positive. Because this is a
            generator, the error surfaces on the first iteration.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    # Ask soundfile for float32 directly rather than loading float64 and
    # casting afterwards, which would briefly hold two copies of the audio.
    audio_data, sample_rate = sf.read(audio_file_path, dtype="float32")

    # Down-mix multi-channel audio to mono. Any 2-D result (including a
    # single-channel (N, 1) array) is collapsed so the slicing below always
    # operates on a 1-D signal.
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)

    print(f"已加载音频文件: {audio_file_path}")
    print(f"采样率: {sample_rate} Hz")
    print(f"音频长度: {len(audio_data)/sample_rate:.2f} 秒")

    # Duration of one chunk in seconds (only used when simulating real time).
    chunk_duration = chunk_size / sample_rate if simulate_realtime else 0

    audio_len = len(audio_data)
    for pos in range(0, audio_len, chunk_size):
        chunk = audio_data[pos:min(pos + chunk_size, audio_len)]

        # Zero-pad the trailing chunk up to the full chunk size.
        if len(chunk) < chunk_size:
            padded_chunk = np.zeros(chunk_size, dtype=np.float32)
            padded_chunk[:len(chunk)] = chunk
            chunk = padded_chunk

        # Emulate the latency of a live audio source.
        if simulate_realtime:
            time.sleep(chunk_duration)

        yield chunk

    print("音频流处理完成")
|
tests/test_vad.ipynb
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 2,
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"outputs": [],
|
| 8 |
+
"source": [
|
| 9 |
+
"from audio_utils import audio_stream_generator\n",
|
| 10 |
+
"import IPython.display as ipd\n",
|
| 11 |
+
"import sys\n",
|
| 12 |
+
"sys.path.append(\"..\")\n",
|
| 13 |
+
"from transcribe.helpers.vadprocessor import FixedVADIterator\n"
|
| 14 |
+
]
|
| 15 |
+
},
|
| 16 |
+
{
|
| 17 |
+
"cell_type": "code",
|
| 18 |
+
"execution_count": 3,
|
| 19 |
+
"metadata": {},
|
| 20 |
+
"outputs": [],
|
| 21 |
+
"source": [
|
| 22 |
+
"vac = FixedVADIterator(\n",
|
| 23 |
+
" threshold=0.5,\n",
|
| 24 |
+
" sampling_rate=16000,\n",
|
| 25 |
+
" # speech_pad_ms=10\n",
|
| 26 |
+
" min_silence_duration_ms = 100,\n",
|
| 27 |
+
" # speech_pad_ms = 30,\n",
|
| 28 |
+
" max_speech_duration_s=5.0,\n",
|
| 29 |
+
" )\n"
|
| 30 |
+
]
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"cell_type": "code",
|
| 34 |
+
"execution_count": 10,
|
| 35 |
+
"metadata": {},
|
| 36 |
+
"outputs": [],
|
| 37 |
+
"source": [
|
| 38 |
+
"SAMPLE_FILE_PATH = \"/Users/david/Samples/Audio/zh/liyongle.wav\"\n",
|
| 39 |
+
"SAMPLING_RATE = 16000\n",
|
| 40 |
+
"\n",
|
| 41 |
+
"chunks_generator = audio_stream_generator(SAMPLE_FILE_PATH, chunk_size=4096)\n",
|
| 42 |
+
"vac.reset_states()"
|
| 43 |
+
]
|
| 44 |
+
},
|
| 45 |
+
{
|
| 46 |
+
"cell_type": "code",
|
| 47 |
+
"execution_count": 11,
|
| 48 |
+
"metadata": {},
|
| 49 |
+
"outputs": [
|
| 50 |
+
{
|
| 51 |
+
"name": "stdout",
|
| 52 |
+
"output_type": "stream",
|
| 53 |
+
"text": [
|
| 54 |
+
"已加载音频文件: /Users/david/Samples/Audio/zh/liyongle.wav\n",
|
| 55 |
+
"采样率: 16000 Hz\n",
|
| 56 |
+
"音频长度: 64.00 秒\n",
|
| 57 |
+
"{'start': 3616}\n",
|
| 58 |
+
"{'end': 83968}\n",
|
| 59 |
+
"{'end': 164352}\n",
|
| 60 |
+
"{'end': 244736}\n",
|
| 61 |
+
"{'end': 325120}\n",
|
| 62 |
+
"{'end': 405504}\n",
|
| 63 |
+
"{'end': 485888}\n",
|
| 64 |
+
"{'end': 566272}\n",
|
| 65 |
+
"{'end': 624608}\n",
|
| 66 |
+
"{'start': 631328}\n",
|
| 67 |
+
"{'end': 691168}\n",
|
| 68 |
+
"{'start': 698912}\n",
|
| 69 |
+
"{'end': 779264}\n",
|
| 70 |
+
"{'end': 800736}\n",
|
| 71 |
+
"{'start': 805920}\n",
|
| 72 |
+
"{'end': 846816}\n",
|
| 73 |
+
"{'start': 855072}\n",
|
| 74 |
+
"{'end': 862176}\n",
|
| 75 |
+
"{'start': 864288}\n",
|
| 76 |
+
"{'end': 890336}\n",
|
| 77 |
+
"{'start': 893984}\n",
|
| 78 |
+
"{'end': 912352}\n",
|
| 79 |
+
"{'start': 917536}\n",
|
| 80 |
+
"{'end': 932320}\n",
|
| 81 |
+
"{'start': 939040}\n",
|
| 82 |
+
"{'end': 966112}\n",
|
| 83 |
+
"{'start': 970784}\n",
|
| 84 |
+
"{'end': 1015264}\n",
|
| 85 |
+
"{'start': 1019424}\n",
|
| 86 |
+
"音频流处理完成\n"
|
| 87 |
+
]
|
| 88 |
+
}
|
| 89 |
+
],
|
| 90 |
+
"source": [
|
| 91 |
+
"for chunk in chunks_generator:\n",
|
| 92 |
+
" # vad_iterator.reset_states()\n",
|
| 93 |
+
" # audio_buffer = np.append(audio_buffer, chunk)\n",
|
| 94 |
+
" \n",
|
| 95 |
+
" speech_dict = vac(chunk, return_seconds=False)\n",
|
| 96 |
+
" if speech_dict:\n",
|
| 97 |
+
" print(speech_dict)"
|
| 98 |
+
]
|
| 99 |
+
},
|
| 100 |
+
{
|
| 101 |
+
"cell_type": "code",
|
| 102 |
+
"execution_count": null,
|
| 103 |
+
"metadata": {},
|
| 104 |
+
"outputs": [],
|
| 105 |
+
"source": []
|
| 106 |
+
}
|
| 107 |
+
],
|
| 108 |
+
"metadata": {
|
| 109 |
+
"kernelspec": {
|
| 110 |
+
"display_name": ".venv",
|
| 111 |
+
"language": "python",
|
| 112 |
+
"name": "python3"
|
| 113 |
+
},
|
| 114 |
+
"language_info": {
|
| 115 |
+
"codemirror_mode": {
|
| 116 |
+
"name": "ipython",
|
| 117 |
+
"version": 3
|
| 118 |
+
},
|
| 119 |
+
"file_extension": ".py",
|
| 120 |
+
"mimetype": "text/x-python",
|
| 121 |
+
"name": "python",
|
| 122 |
+
"nbconvert_exporter": "python",
|
| 123 |
+
"pygments_lexer": "ipython3",
|
| 124 |
+
"version": "3.11.11"
|
| 125 |
+
}
|
| 126 |
+
},
|
| 127 |
+
"nbformat": 4,
|
| 128 |
+
"nbformat_minor": 2
|
| 129 |
+
}
|
transcribe/helpers/vadprocessor.py
CHANGED
|
@@ -155,7 +155,7 @@ class VADIteratorOnnx:
|
|
| 155 |
raise ValueError('VADIterator does not support sampling rates other than [8000, 16000]')
|
| 156 |
|
| 157 |
self.min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
|
| 158 |
-
self.max_speech_samples = int(sampling_rate * max_speech_duration_s)
|
| 159 |
self.speech_pad_samples = sampling_rate * speech_pad_ms / 1000
|
| 160 |
self.reset_states()
|
| 161 |
|
|
@@ -184,7 +184,7 @@ class VADIteratorOnnx:
|
|
| 184 |
self.current_sample += window_size_samples
|
| 185 |
|
| 186 |
speech_prob = self.model(x, self.sampling_rate)[0,0]
|
| 187 |
-
|
| 188 |
|
| 189 |
if (speech_prob >= self.threshold) and self.temp_end:
|
| 190 |
self.temp_end = 0
|
|
@@ -196,11 +196,11 @@ class VADIteratorOnnx:
|
|
| 196 |
self.start = speech_start
|
| 197 |
return {'start': int(speech_start) if not return_seconds else round(speech_start / self.sampling_rate, 1)}
|
| 198 |
|
| 199 |
-
if (speech_prob >= self.threshold) and self.current_sample - self.start >= self.max_speech_samples:
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
|
| 205 |
if (speech_prob < self.threshold - 0.15) and self.triggered:
|
| 206 |
if not self.temp_end:
|
|
|
|
| 155 |
raise ValueError('VADIterator does not support sampling rates other than [8000, 16000]')
|
| 156 |
|
| 157 |
self.min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
|
| 158 |
+
# self.max_speech_samples = int(sampling_rate * max_speech_duration_s)
|
| 159 |
self.speech_pad_samples = sampling_rate * speech_pad_ms / 1000
|
| 160 |
self.reset_states()
|
| 161 |
|
|
|
|
| 184 |
self.current_sample += window_size_samples
|
| 185 |
|
| 186 |
speech_prob = self.model(x, self.sampling_rate)[0,0]
|
| 187 |
+
|
| 188 |
|
| 189 |
if (speech_prob >= self.threshold) and self.temp_end:
|
| 190 |
self.temp_end = 0
|
|
|
|
| 196 |
self.start = speech_start
|
| 197 |
return {'start': int(speech_start) if not return_seconds else round(speech_start / self.sampling_rate, 1)}
|
| 198 |
|
| 199 |
+
# if (speech_prob >= self.threshold) and self.current_sample - self.start >= self.max_speech_samples:
|
| 200 |
+
# if self.temp_end:
|
| 201 |
+
# self.temp_end = 0
|
| 202 |
+
# self.start = self.current_sample
|
| 203 |
+
# return {'end': int(self.current_sample) if not return_seconds else round(self.current_sample / self.sampling_rate, 1)}
|
| 204 |
|
| 205 |
if (speech_prob < self.threshold - 0.15) and self.triggered:
|
| 206 |
if not self.temp_end:
|
transcribe/pipelines/pipe_vad.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
|
| 2 |
from .base import MetaItem, BasePipe
|
| 3 |
-
from ..helpers.vadprocessor import FixedVADIterator
|
| 4 |
|
| 5 |
import numpy as np
|
| 6 |
import logging
|
|
@@ -16,15 +16,12 @@ class VadPipe(BasePipe):
|
|
| 16 |
super().__init__(in_queue, out_queue)
|
| 17 |
self._offset = 0 # 处理的frame size offset
|
| 18 |
self._status = 'END'
|
| 19 |
-
self.last_state_change_offset = 0
|
| 20 |
-
self.adaptive_ctrl = AdaptiveSilenceController()
|
| 21 |
|
| 22 |
|
| 23 |
def reset(self):
|
| 24 |
self._offset = 0
|
| 25 |
self._status = 'END'
|
| 26 |
-
|
| 27 |
-
self.adaptive_ctrl = AdaptiveSilenceController()
|
| 28 |
self.vac.reset_states()
|
| 29 |
|
| 30 |
@classmethod
|
|
@@ -36,7 +33,6 @@ class VadPipe(BasePipe):
|
|
| 36 |
# speech_pad_ms=10
|
| 37 |
min_silence_duration_ms = 100,
|
| 38 |
# speech_pad_ms = 30,
|
| 39 |
-
max_speech_duration_s=20.0,
|
| 40 |
)
|
| 41 |
cls.vac.reset_states()
|
| 42 |
|
|
@@ -53,16 +49,9 @@ class VadPipe(BasePipe):
|
|
| 53 |
if start_frame:
|
| 54 |
relative_start_frame =start_frame - self._offset
|
| 55 |
if end_frame:
|
| 56 |
-
relative_end_frame =
|
| 57 |
return relative_start_frame, relative_end_frame
|
| 58 |
|
| 59 |
-
def update_silence_ms(self):
|
| 60 |
-
min_silence = self.adaptive_ctrl.get_adaptive_silence_ms()
|
| 61 |
-
min_silence_samples = self.sample_rate * min_silence / 1000
|
| 62 |
-
old_silence_samples = self.vac.min_silence_samples
|
| 63 |
-
logging.warning(f"🫠 update_silence_ms :{old_silence_samples * 1000 / self.sample_rate :.2f}ms => current: {min_silence}ms ")
|
| 64 |
-
# self.vac.min_silence_samples = min_silence_samples
|
| 65 |
-
|
| 66 |
def process(self, in_data: MetaItem) -> MetaItem:
|
| 67 |
if self._offset == 0:
|
| 68 |
self.vac.reset_states()
|
|
@@ -77,29 +66,15 @@ class VadPipe(BasePipe):
|
|
| 77 |
if rel_start_frame is not None and rel_end_frame is None:
|
| 78 |
self._status = "START" # 语音开始
|
| 79 |
target_audio = source_audio[rel_start_frame:]
|
| 80 |
-
|
| 81 |
-
# 计算上一段静音长度
|
| 82 |
-
silence_len = (self._offset + rel_start_frame - self.last_state_change_offset) / self.sample_rate * 1000
|
| 83 |
-
self.adaptive_ctrl.update_silence(silence_len)
|
| 84 |
-
self.last_state_change_offset = self._offset + rel_start_frame
|
| 85 |
-
|
| 86 |
logging.debug("🫸 Speech start frame: {}".format(rel_start_frame))
|
| 87 |
elif rel_start_frame is None and rel_end_frame is not None:
|
| 88 |
self._status = "END" # 音频结束
|
| 89 |
target_audio = source_audio[:rel_end_frame]
|
| 90 |
-
|
| 91 |
-
speech_len = (rel_end_frame) / self.sample_rate * 1000
|
| 92 |
-
self.adaptive_ctrl.update_speech(speech_len)
|
| 93 |
-
self.last_state_change_offset = self._offset + rel_end_frame
|
| 94 |
logging.debug(" 🫷Speech ended, capturing audio up to frame: {}".format(rel_end_frame))
|
| 95 |
else:
|
| 96 |
self._status = 'END'
|
| 97 |
target_audio = source_audio[rel_start_frame:rel_end_frame]
|
| 98 |
logging.debug(" 🔄 Speech segment captured from frame {} to frame {}".format(rel_start_frame, rel_end_frame))
|
| 99 |
-
|
| 100 |
-
seg_len = (rel_end_frame - rel_start_frame) / self.sample_rate * 1000
|
| 101 |
-
self.adaptive_ctrl.update_speech(seg_len)
|
| 102 |
-
self.last_state_change_offset = self._offset + rel_end_frame
|
| 103 |
# logging.debug("❌ No valid speech segment detected, setting status to END")
|
| 104 |
else:
|
| 105 |
if self._status == 'START':
|
|
|
|
| 1 |
|
| 2 |
from .base import MetaItem, BasePipe
|
| 3 |
+
from ..helpers.vadprocessor import FixedVADIterator
|
| 4 |
|
| 5 |
import numpy as np
|
| 6 |
import logging
|
|
|
|
| 16 |
super().__init__(in_queue, out_queue)
|
| 17 |
self._offset = 0 # 处理的frame size offset
|
| 18 |
self._status = 'END'
|
|
|
|
|
|
|
| 19 |
|
| 20 |
|
| 21 |
def reset(self):
|
| 22 |
self._offset = 0
|
| 23 |
self._status = 'END'
|
| 24 |
+
|
|
|
|
| 25 |
self.vac.reset_states()
|
| 26 |
|
| 27 |
@classmethod
|
|
|
|
| 33 |
# speech_pad_ms=10
|
| 34 |
min_silence_duration_ms = 100,
|
| 35 |
# speech_pad_ms = 30,
|
|
|
|
| 36 |
)
|
| 37 |
cls.vac.reset_states()
|
| 38 |
|
|
|
|
| 49 |
if start_frame:
|
| 50 |
relative_start_frame =start_frame - self._offset
|
| 51 |
if end_frame:
|
| 52 |
+
relative_end_frame = end_frame - self._offset
|
| 53 |
return relative_start_frame, relative_end_frame
|
| 54 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
def process(self, in_data: MetaItem) -> MetaItem:
|
| 56 |
if self._offset == 0:
|
| 57 |
self.vac.reset_states()
|
|
|
|
| 66 |
if rel_start_frame is not None and rel_end_frame is None:
|
| 67 |
self._status = "START" # 语音开始
|
| 68 |
target_audio = source_audio[rel_start_frame:]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
logging.debug("🫸 Speech start frame: {}".format(rel_start_frame))
|
| 70 |
elif rel_start_frame is None and rel_end_frame is not None:
|
| 71 |
self._status = "END" # 音频结束
|
| 72 |
target_audio = source_audio[:rel_end_frame]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
logging.debug(" 🫷Speech ended, capturing audio up to frame: {}".format(rel_end_frame))
|
| 74 |
else:
|
| 75 |
self._status = 'END'
|
| 76 |
target_audio = source_audio[rel_start_frame:rel_end_frame]
|
| 77 |
logging.debug(" 🔄 Speech segment captured from frame {} to frame {}".format(rel_start_frame, rel_end_frame))
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
# logging.debug("❌ No valid speech segment detected, setting status to END")
|
| 79 |
else:
|
| 80 |
if self._status == 'START':
|
transcribe/whisper_llm_serve.py
CHANGED
|
@@ -14,12 +14,39 @@ from .utils import log_block, save_to_wave, TestDataWriter, filter_words
|
|
| 14 |
from .translatepipes import TranslatePipes
|
| 15 |
|
| 16 |
from transcribe.helpers.vadprocessor import VadProcessor
|
| 17 |
-
# from transcribe.helpers.vad_dynamic import VadProcessor
|
| 18 |
-
# from transcribe.helpers.vadprocessor import VadProcessor
|
| 19 |
from transcribe.pipelines import MetaItem
|
|
|
|
|
|
|
| 20 |
|
| 21 |
logger = getLogger("TranscriptionService")
|
| 22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
class WhisperTranscriptionService:
|
| 25 |
"""
|
|
@@ -51,21 +78,15 @@ class WhisperTranscriptionService:
|
|
| 51 |
self._frame_queue = queue.Queue()
|
| 52 |
# 音频队列缓冲区
|
| 53 |
self.frames_np = np.array([], dtype=np.float32)
|
|
|
|
| 54 |
# 完整音频队列
|
| 55 |
-
self.
|
| 56 |
-
self._temp_string = ""
|
| 57 |
-
|
| 58 |
-
self._transcrible_analysis = None
|
| 59 |
# 启动处理线程
|
| 60 |
self._translate_thread_stop = threading.Event()
|
| 61 |
self._frame_processing_thread_stop = threading.Event()
|
| 62 |
|
| 63 |
self.translate_thread = self._start_thread(self._transcription_processing_loop)
|
| 64 |
self.frame_processing_thread = self._start_thread(self._frame_processing_loop)
|
| 65 |
-
# if language == "zh":
|
| 66 |
-
# self._vad = VadProcessor(prob_threshold=0.8, silence_s=0.2, cache_s=0.15)
|
| 67 |
-
# else:
|
| 68 |
-
# self._vad = VadProcessor(prob_threshold=0.7, silence_s=0.2, cache_s=0.15)
|
| 69 |
self.row_number = 0
|
| 70 |
# for test
|
| 71 |
self._transcrible_time_cost = 0.
|
|
@@ -107,38 +128,60 @@ class WhisperTranscriptionService:
|
|
| 107 |
speech_status = processed_audio.speech_status
|
| 108 |
return speech_audio, speech_status
|
| 109 |
|
|
|
|
|
|
|
| 110 |
def _frame_processing_loop(self) -> None:
|
| 111 |
"""从队列获取音频帧并合并到缓冲区"""
|
| 112 |
while not self._frame_processing_thread_stop.is_set():
|
| 113 |
try:
|
| 114 |
frame_np = self._frame_queue.get(timeout=0.1)
|
| 115 |
frame_np, speech_status = self._apply_voice_activity_detection(frame_np)
|
| 116 |
-
|
|
|
|
| 117 |
continue
|
|
|
|
| 118 |
with self.lock:
|
|
|
|
|
|
|
|
|
|
| 119 |
self.frames_np = np.append(self.frames_np, frame_np)
|
| 120 |
-
if
|
| 121 |
-
self.
|
|
|
|
|
|
|
| 122 |
self.frames_np = np.array([], dtype=np.float32)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
except queue.Empty:
|
| 124 |
pass
|
| 125 |
|
| 126 |
def _transcription_processing_loop(self) -> None:
|
| 127 |
"""主转录处理循环"""
|
| 128 |
frame_epoch = 1
|
|
|
|
| 129 |
while not self._translate_thread_stop.is_set():
|
| 130 |
|
| 131 |
if len(self.frames_np) ==0:
|
| 132 |
time.sleep(0.01)
|
| 133 |
continue
|
|
|
|
| 134 |
with self.lock:
|
| 135 |
-
if len(self.
|
| 136 |
-
audio_buffer = self.
|
| 137 |
partial = False
|
| 138 |
else:
|
| 139 |
audio_buffer = self.frames_np[:int(frame_epoch * 1.5 * self.sample_rate)].copy()# 获取 1.5s * epoch 个音频长度
|
| 140 |
partial = True
|
| 141 |
-
|
| 142 |
if len(audio_buffer) < int(self.sample_rate):
|
| 143 |
silence_audio = np.zeros(self.sample_rate, dtype=np.float32)
|
| 144 |
silence_audio[-len(audio_buffer):] = audio_buffer
|
|
@@ -149,37 +192,25 @@ class WhisperTranscriptionService:
|
|
| 149 |
segments = meta_item.segments
|
| 150 |
logger.debug(f"Segments: {segments}")
|
| 151 |
segments = filter_words(segments)
|
|
|
|
| 152 |
if len(segments):
|
| 153 |
seg_text = self.text_separator.join(seg.text for seg in segments)
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
else:
|
| 163 |
-
self._temp_string = ""
|
| 164 |
-
|
| 165 |
-
result = TransResult(
|
| 166 |
-
seg_id=self.row_number,
|
| 167 |
-
context=seg_text,
|
| 168 |
-
from_=self.source_language,
|
| 169 |
-
to=self.target_language,
|
| 170 |
-
tran_content=self._translate_text_large(seg_text),
|
| 171 |
-
partial=partial
|
| 172 |
-
)
|
| 173 |
if partial == False:
|
| 174 |
self.row_number += 1
|
| 175 |
-
|
| 176 |
-
self._send_result_to_client(result)
|
| 177 |
-
|
| 178 |
-
if partial == False:
|
| 179 |
frame_epoch = 1
|
| 180 |
else:
|
| 181 |
frame_epoch += 1
|
| 182 |
-
|
|
|
|
|
|
|
| 183 |
|
| 184 |
def _transcribe_audio(self, audio_buffer: np.ndarray)->MetaItem:
|
| 185 |
"""转录音频并返回转录片段"""
|
|
|
|
| 14 |
from .translatepipes import TranslatePipes
|
| 15 |
|
| 16 |
from transcribe.helpers.vadprocessor import VadProcessor
|
|
|
|
|
|
|
| 17 |
from transcribe.pipelines import MetaItem
|
| 18 |
+
from dataclasses import dataclass, field
|
| 19 |
+
|
| 20 |
|
| 21 |
logger = getLogger("TranscriptionService")
|
| 22 |
|
| 23 |
+
@dataclass
class FullSegment:
    """A complete utterance: its audio samples and the time it was created."""

    # Audio samples of the utterance (presumably mono samples at
    # config.SAMPLE_RATE -- TODO confirm against the producer).
    audio_array: np.ndarray
    # Creation time in seconds, taken from time.time() unless given explicitly.
    created_time: float = field(default_factory=time.time)

    @staticmethod
    def merge(*audio_segments: "FullSegment") -> "FullSegment":
        """Combine segments into one, ordered by creation time.

        Args:
            *audio_segments: One or more segments to combine.

        Returns:
            A new FullSegment whose audio is the concatenation of the inputs
            in ascending ``created_time`` order and whose ``created_time`` is
            that of the earliest input.

        Raises:
            ValueError: If no segments are given.
        """
        if not audio_segments:
            raise ValueError("merge() requires at least one FullSegment")
        ordered = sorted(audio_segments, key=lambda item: item.created_time)
        return FullSegment(
            created_time=ordered[0].created_time,
            audio_array=np.concatenate([seg.audio_array for seg in ordered], axis=0),
        )

    @property
    def time_duration(self) -> float:
        """Length of the audio in seconds, using config.SAMPLE_RATE."""
        return len(self.audio_array) / config.SAMPLE_RATE

    @property
    def start_timestamp(self) -> float:
        """Time at which the segment starts (its creation time)."""
        return self.created_time

    @property
    def end_timestamp(self) -> float:
        """Creation time plus the audio duration."""
        return self.created_time + self.time_duration
|
| 48 |
+
|
| 49 |
+
|
| 50 |
|
| 51 |
class WhisperTranscriptionService:
|
| 52 |
"""
|
|
|
|
| 78 |
self._frame_queue = queue.Queue()
|
| 79 |
# 音频队列缓冲区
|
| 80 |
self.frames_np = np.array([], dtype=np.float32)
|
| 81 |
+
self.frames_np_start_timestamp = None
|
| 82 |
# 完整音频队列
|
| 83 |
+
self.full_segments_queue = collections.deque()
|
|
|
|
|
|
|
|
|
|
| 84 |
# 启动处理线程
|
| 85 |
self._translate_thread_stop = threading.Event()
|
| 86 |
self._frame_processing_thread_stop = threading.Event()
|
| 87 |
|
| 88 |
self.translate_thread = self._start_thread(self._transcription_processing_loop)
|
| 89 |
self.frame_processing_thread = self._start_thread(self._frame_processing_loop)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
self.row_number = 0
|
| 91 |
# for test
|
| 92 |
self._transcrible_time_cost = 0.
|
|
|
|
| 128 |
speech_status = processed_audio.speech_status
|
| 129 |
return speech_audio, speech_status
|
| 130 |
|
| 131 |
+
|
| 132 |
+
|
| 133 |
def _frame_processing_loop(self) -> None:
|
| 134 |
"""从队列获取音频帧并合并到缓冲区"""
|
| 135 |
while not self._frame_processing_thread_stop.is_set():
|
| 136 |
try:
|
| 137 |
frame_np = self._frame_queue.get(timeout=0.1)
|
| 138 |
frame_np, speech_status = self._apply_voice_activity_detection(frame_np)
|
| 139 |
+
|
| 140 |
+
if frame_np is None:
|
| 141 |
continue
|
| 142 |
+
|
| 143 |
with self.lock:
|
| 144 |
+
if speech_status == "START" and self.frames_np_start_timestamp is None:
|
| 145 |
+
self.frames_np_start_timestamp = time.time()
|
| 146 |
+
# 添加音频到音频缓冲区
|
| 147 |
self.frames_np = np.append(self.frames_np, frame_np)
|
| 148 |
+
if len(self.frames_np) >= self.sample_rate * config.MAX_SPEECH_DURATION_S:
|
| 149 |
+
audio_array=self.frames_np.copy()
|
| 150 |
+
self.full_segments_queue.appendleft(audio_array) # 根据时间是否满足三秒长度 来整合音频块
|
| 151 |
+
self.frames_np_start_timestamp = time.time()
|
| 152 |
self.frames_np = np.array([], dtype=np.float32)
|
| 153 |
+
|
| 154 |
+
elif speech_status == "END" and len(self.frames_np) > 0 and self.frames_np_start_timestamp:
|
| 155 |
+
time_diff = time.time() - self.frames_np_start_timestamp
|
| 156 |
+
if time_diff >= config.DESIGN_TIME_THREHOLD:
|
| 157 |
+
audio_array=self.frames_np.copy()
|
| 158 |
+
self.full_segments_queue.appendleft(audio_array) # 根据时间是否满足三秒长度 来整合音频块
|
| 159 |
+
self.frames_np_start_timestamp = None
|
| 160 |
+
self.frames_np = np.array([], dtype=np.float32)
|
| 161 |
+
else:
|
| 162 |
+
logger.debug(f"🥳 当前时间与上一句的时间差: {time_diff:.2f}s,继续增加缓冲区")
|
| 163 |
+
|
| 164 |
except queue.Empty:
|
| 165 |
pass
|
| 166 |
|
| 167 |
def _transcription_processing_loop(self) -> None:
|
| 168 |
"""主转录处理循环"""
|
| 169 |
frame_epoch = 1
|
| 170 |
+
|
| 171 |
while not self._translate_thread_stop.is_set():
|
| 172 |
|
| 173 |
if len(self.frames_np) ==0:
|
| 174 |
time.sleep(0.01)
|
| 175 |
continue
|
| 176 |
+
|
| 177 |
with self.lock:
|
| 178 |
+
if len(self.full_segments_queue) > 0:
|
| 179 |
+
audio_buffer = self.full_segments_queue.pop()
|
| 180 |
partial = False
|
| 181 |
else:
|
| 182 |
audio_buffer = self.frames_np[:int(frame_epoch * 1.5 * self.sample_rate)].copy()# 获取 1.5s * epoch 个音频长度
|
| 183 |
partial = True
|
| 184 |
+
|
| 185 |
if len(audio_buffer) < int(self.sample_rate):
|
| 186 |
silence_audio = np.zeros(self.sample_rate, dtype=np.float32)
|
| 187 |
silence_audio[-len(audio_buffer):] = audio_buffer
|
|
|
|
| 192 |
segments = meta_item.segments
|
| 193 |
logger.debug(f"Segments: {segments}")
|
| 194 |
segments = filter_words(segments)
|
| 195 |
+
|
| 196 |
if len(segments):
|
| 197 |
seg_text = self.text_separator.join(seg.text for seg in segments)
|
| 198 |
+
result = TransResult(
|
| 199 |
+
seg_id=self.row_number,
|
| 200 |
+
context=seg_text,
|
| 201 |
+
from_=self.source_language,
|
| 202 |
+
to=self.target_language,
|
| 203 |
+
tran_content=self._translate_text_large(seg_text),
|
| 204 |
+
partial=partial
|
| 205 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
if partial == False:
|
| 207 |
self.row_number += 1
|
|
|
|
|
|
|
|
|
|
|
|
|
| 208 |
frame_epoch = 1
|
| 209 |
else:
|
| 210 |
frame_epoch += 1
|
| 211 |
+
self._send_result_to_client(result)
|
| 212 |
+
|
| 213 |
+
|
| 214 |
|
| 215 |
def _transcribe_audio(self, audio_buffer: np.ndarray)->MetaItem:
|
| 216 |
"""转录音频并返回转录片段"""
|