Xin Zhang committed on
Commit
89384e7
·
2 Parent(s): e2963fd 93a0cf7

Merge branch 'vad' of hf.co:MoYoYoTech/Translator into vad

Browse files

* 'vad' of hf.co:MoYoYoTech/Translator:
fix max speech duration bug
remove time delay in loop
add DESIGN_TIME_THREHOLD

config.py CHANGED
@@ -2,7 +2,7 @@ import pathlib
2
  import re
3
  import logging
4
 
5
- DEBUG = True
6
  LOG_LEVEL = logging.DEBUG if DEBUG else logging.INFO
7
 
8
  logging.getLogger("pywhispercpp").setLevel(logging.WARNING)
@@ -23,6 +23,10 @@ logging.getLogger().addHandler(console_handler)
23
 
24
  # 文字输出长度阈值
25
  TEXT_THREHOLD = 6
 
 
 
 
26
 
27
  BASE_DIR = pathlib.Path(__file__).parent
28
  MODEL_DIR = BASE_DIR / "moyoyo_asr_models"
 
2
  import re
3
  import logging
4
 
5
+ DEBUG = False
6
  LOG_LEVEL = logging.DEBUG if DEBUG else logging.INFO
7
 
8
  logging.getLogger("pywhispercpp").setLevel(logging.WARNING)
 
23
 
24
  # 文字输出长度阈值
25
  TEXT_THREHOLD = 6
26
+ # 音频段的决策时间
27
+ DESIGN_TIME_THREHOLD = 3
28
+ # 最长语音时长
29
+ MAX_SPEECH_DURATION_S = 15
30
 
31
  BASE_DIR = pathlib.Path(__file__).parent
32
  MODEL_DIR = BASE_DIR / "moyoyo_asr_models"
tests/audio_utils.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import soundfile as sf
3
+ import time
4
+
5
+ def audio_stream_generator(audio_file_path, chunk_size=4096, simulate_realtime=True):
6
+ """
7
+ 音频流生成器,从音频文件中读取数据并以流的方式输出
8
+
9
+ 参数:
10
+ audio_file_path: 音频文件路径
11
+ chunk_size: 每个数据块的大小(采样点数)
12
+ simulate_realtime: 是否模拟实时流处理的速度
13
+
14
+ 生成:
15
+ numpy.ndarray: 每次生成一个chunk_size大小的np.float32数据块
16
+ """
17
+ # 加载音频文件
18
+ audio_data, sample_rate = sf.read(audio_file_path)
19
+
20
+ # 确保音频数据是float32类型
21
+ if audio_data.dtype != np.float32:
22
+ audio_data = audio_data.astype(np.float32)
23
+
24
+ # 如果是立体声,转换为单声道
25
+ if len(audio_data.shape) > 1 and audio_data.shape[1] > 1:
26
+ audio_data = audio_data.mean(axis=1)
27
+
28
+ print(f"已加载音频文件: {audio_file_path}")
29
+ print(f"采样率: {sample_rate} Hz")
30
+ print(f"音频长度: {len(audio_data)/sample_rate:.2f} 秒")
31
+
32
+ # 计算每个块的时长(秒)
33
+ chunk_duration = chunk_size / sample_rate if simulate_realtime else 0
34
+
35
+ # 按块生成数据
36
+ audio_len = len(audio_data)
37
+ for pos in range(0, audio_len, chunk_size):
38
+ # 获取当前块
39
+ end_pos = min(pos + chunk_size, audio_len)
40
+ chunk = audio_data[pos:end_pos]
41
+
42
+ # 如果块大小不足,用0填充
43
+ if len(chunk) < chunk_size:
44
+ padded_chunk = np.zeros(chunk_size, dtype=np.float32)
45
+ padded_chunk[:len(chunk)] = chunk
46
+ chunk = padded_chunk
47
+
48
+ # 模拟实时处理的延迟
49
+ if simulate_realtime:
50
+ time.sleep(chunk_duration)
51
+
52
+ yield chunk
53
+
54
+ print("音频流处理完成")
tests/test_vad.ipynb ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 2,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "from audio_utils import audio_stream_generator\n",
10
+ "import IPython.display as ipd\n",
11
+ "import sys\n",
12
+ "sys.path.append(\"..\")\n",
13
+ "from transcribe.helpers.vadprocessor import FixedVADIterator\n"
14
+ ]
15
+ },
16
+ {
17
+ "cell_type": "code",
18
+ "execution_count": 3,
19
+ "metadata": {},
20
+ "outputs": [],
21
+ "source": [
22
+ "vac = FixedVADIterator(\n",
23
+ " threshold=0.5,\n",
24
+ " sampling_rate=16000,\n",
25
+ " # speech_pad_ms=10\n",
26
+ " min_silence_duration_ms = 100,\n",
27
+ " # speech_pad_ms = 30,\n",
28
+ " max_speech_duration_s=5.0,\n",
29
+ " )\n"
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": 10,
35
+ "metadata": {},
36
+ "outputs": [],
37
+ "source": [
38
+ "SAMPLE_FILE_PATH = \"/Users/david/Samples/Audio/zh/liyongle.wav\"\n",
39
+ "SAMPLING_RATE = 16000\n",
40
+ "\n",
41
+ "chunks_generator = audio_stream_generator(SAMPLE_FILE_PATH, chunk_size=4096)\n",
42
+ "vac.reset_states()"
43
+ ]
44
+ },
45
+ {
46
+ "cell_type": "code",
47
+ "execution_count": 11,
48
+ "metadata": {},
49
+ "outputs": [
50
+ {
51
+ "name": "stdout",
52
+ "output_type": "stream",
53
+ "text": [
54
+ "已加载音频文件: /Users/david/Samples/Audio/zh/liyongle.wav\n",
55
+ "采样率: 16000 Hz\n",
56
+ "音频长度: 64.00 秒\n",
57
+ "{'start': 3616}\n",
58
+ "{'end': 83968}\n",
59
+ "{'end': 164352}\n",
60
+ "{'end': 244736}\n",
61
+ "{'end': 325120}\n",
62
+ "{'end': 405504}\n",
63
+ "{'end': 485888}\n",
64
+ "{'end': 566272}\n",
65
+ "{'end': 624608}\n",
66
+ "{'start': 631328}\n",
67
+ "{'end': 691168}\n",
68
+ "{'start': 698912}\n",
69
+ "{'end': 779264}\n",
70
+ "{'end': 800736}\n",
71
+ "{'start': 805920}\n",
72
+ "{'end': 846816}\n",
73
+ "{'start': 855072}\n",
74
+ "{'end': 862176}\n",
75
+ "{'start': 864288}\n",
76
+ "{'end': 890336}\n",
77
+ "{'start': 893984}\n",
78
+ "{'end': 912352}\n",
79
+ "{'start': 917536}\n",
80
+ "{'end': 932320}\n",
81
+ "{'start': 939040}\n",
82
+ "{'end': 966112}\n",
83
+ "{'start': 970784}\n",
84
+ "{'end': 1015264}\n",
85
+ "{'start': 1019424}\n",
86
+ "音频流处理完成\n"
87
+ ]
88
+ }
89
+ ],
90
+ "source": [
91
+ "for chunk in chunks_generator:\n",
92
+ " # vad_iterator.reset_states()\n",
93
+ " # audio_buffer = np.append(audio_buffer, chunk)\n",
94
+ " \n",
95
+ " speech_dict = vac(chunk, return_seconds=False)\n",
96
+ " if speech_dict:\n",
97
+ " print(speech_dict)"
98
+ ]
99
+ },
100
+ {
101
+ "cell_type": "code",
102
+ "execution_count": null,
103
+ "metadata": {},
104
+ "outputs": [],
105
+ "source": []
106
+ }
107
+ ],
108
+ "metadata": {
109
+ "kernelspec": {
110
+ "display_name": ".venv",
111
+ "language": "python",
112
+ "name": "python3"
113
+ },
114
+ "language_info": {
115
+ "codemirror_mode": {
116
+ "name": "ipython",
117
+ "version": 3
118
+ },
119
+ "file_extension": ".py",
120
+ "mimetype": "text/x-python",
121
+ "name": "python",
122
+ "nbconvert_exporter": "python",
123
+ "pygments_lexer": "ipython3",
124
+ "version": "3.11.11"
125
+ }
126
+ },
127
+ "nbformat": 4,
128
+ "nbformat_minor": 2
129
+ }
transcribe/helpers/vadprocessor.py CHANGED
@@ -155,7 +155,7 @@ class VADIteratorOnnx:
155
  raise ValueError('VADIterator does not support sampling rates other than [8000, 16000]')
156
 
157
  self.min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
158
- self.max_speech_samples = int(sampling_rate * max_speech_duration_s)
159
  self.speech_pad_samples = sampling_rate * speech_pad_ms / 1000
160
  self.reset_states()
161
 
@@ -184,7 +184,7 @@ class VADIteratorOnnx:
184
  self.current_sample += window_size_samples
185
 
186
  speech_prob = self.model(x, self.sampling_rate)[0,0]
187
- # print(f"{self.current_sample/self.sampling_rate:.2f}: {speech_prob}")
188
 
189
  if (speech_prob >= self.threshold) and self.temp_end:
190
  self.temp_end = 0
@@ -196,11 +196,11 @@ class VADIteratorOnnx:
196
  self.start = speech_start
197
  return {'start': int(speech_start) if not return_seconds else round(speech_start / self.sampling_rate, 1)}
198
 
199
- if (speech_prob >= self.threshold) and self.current_sample - self.start >= self.max_speech_samples:
200
- if self.temp_end:
201
- self.temp_end = 0
202
- self.start = self.current_sample
203
- return {'end': int(self.current_sample) if not return_seconds else round(self.current_sample / self.sampling_rate, 1)}
204
 
205
  if (speech_prob < self.threshold - 0.15) and self.triggered:
206
  if not self.temp_end:
 
155
  raise ValueError('VADIterator does not support sampling rates other than [8000, 16000]')
156
 
157
  self.min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
158
+ # self.max_speech_samples = int(sampling_rate * max_speech_duration_s)
159
  self.speech_pad_samples = sampling_rate * speech_pad_ms / 1000
160
  self.reset_states()
161
 
 
184
  self.current_sample += window_size_samples
185
 
186
  speech_prob = self.model(x, self.sampling_rate)[0,0]
187
+
188
 
189
  if (speech_prob >= self.threshold) and self.temp_end:
190
  self.temp_end = 0
 
196
  self.start = speech_start
197
  return {'start': int(speech_start) if not return_seconds else round(speech_start / self.sampling_rate, 1)}
198
 
199
+ # if (speech_prob >= self.threshold) and self.current_sample - self.start >= self.max_speech_samples:
200
+ # if self.temp_end:
201
+ # self.temp_end = 0
202
+ # self.start = self.current_sample
203
+ # return {'end': int(self.current_sample) if not return_seconds else round(self.current_sample / self.sampling_rate, 1)}
204
 
205
  if (speech_prob < self.threshold - 0.15) and self.triggered:
206
  if not self.temp_end:
transcribe/pipelines/pipe_vad.py CHANGED
@@ -1,6 +1,6 @@
1
 
2
  from .base import MetaItem, BasePipe
3
- from ..helpers.vadprocessor import FixedVADIterator, AdaptiveSilenceController
4
 
5
  import numpy as np
6
  import logging
@@ -16,15 +16,12 @@ class VadPipe(BasePipe):
16
  super().__init__(in_queue, out_queue)
17
  self._offset = 0 # 处理的frame size offset
18
  self._status = 'END'
19
- self.last_state_change_offset = 0
20
- self.adaptive_ctrl = AdaptiveSilenceController()
21
 
22
 
23
  def reset(self):
24
  self._offset = 0
25
  self._status = 'END'
26
- self.last_state_change_offset = 0
27
- self.adaptive_ctrl = AdaptiveSilenceController()
28
  self.vac.reset_states()
29
 
30
  @classmethod
@@ -36,7 +33,6 @@ class VadPipe(BasePipe):
36
  # speech_pad_ms=10
37
  min_silence_duration_ms = 100,
38
  # speech_pad_ms = 30,
39
- max_speech_duration_s=20.0,
40
  )
41
  cls.vac.reset_states()
42
 
@@ -53,16 +49,9 @@ class VadPipe(BasePipe):
53
  if start_frame:
54
  relative_start_frame =start_frame - self._offset
55
  if end_frame:
56
- relative_end_frame = max(0, end_frame - self._offset)
57
  return relative_start_frame, relative_end_frame
58
 
59
- def update_silence_ms(self):
60
- min_silence = self.adaptive_ctrl.get_adaptive_silence_ms()
61
- min_silence_samples = self.sample_rate * min_silence / 1000
62
- old_silence_samples = self.vac.min_silence_samples
63
- logging.warning(f"🫠 update_silence_ms :{old_silence_samples * 1000 / self.sample_rate :.2f}ms => current: {min_silence}ms ")
64
- # self.vac.min_silence_samples = min_silence_samples
65
-
66
  def process(self, in_data: MetaItem) -> MetaItem:
67
  if self._offset == 0:
68
  self.vac.reset_states()
@@ -77,29 +66,15 @@ class VadPipe(BasePipe):
77
  if rel_start_frame is not None and rel_end_frame is None:
78
  self._status = "START" # 语音开始
79
  target_audio = source_audio[rel_start_frame:]
80
-
81
- # 计算上一段静音长度
82
- silence_len = (self._offset + rel_start_frame - self.last_state_change_offset) / self.sample_rate * 1000
83
- self.adaptive_ctrl.update_silence(silence_len)
84
- self.last_state_change_offset = self._offset + rel_start_frame
85
-
86
  logging.debug("🫸 Speech start frame: {}".format(rel_start_frame))
87
  elif rel_start_frame is None and rel_end_frame is not None:
88
  self._status = "END" # 音频结束
89
  target_audio = source_audio[:rel_end_frame]
90
-
91
- speech_len = (rel_end_frame) / self.sample_rate * 1000
92
- self.adaptive_ctrl.update_speech(speech_len)
93
- self.last_state_change_offset = self._offset + rel_end_frame
94
  logging.debug(" 🫷Speech ended, capturing audio up to frame: {}".format(rel_end_frame))
95
  else:
96
  self._status = 'END'
97
  target_audio = source_audio[rel_start_frame:rel_end_frame]
98
  logging.debug(" 🔄 Speech segment captured from frame {} to frame {}".format(rel_start_frame, rel_end_frame))
99
-
100
- seg_len = (rel_end_frame - rel_start_frame) / self.sample_rate * 1000
101
- self.adaptive_ctrl.update_speech(seg_len)
102
- self.last_state_change_offset = self._offset + rel_end_frame
103
  # logging.debug("❌ No valid speech segment detected, setting status to END")
104
  else:
105
  if self._status == 'START':
 
1
 
2
  from .base import MetaItem, BasePipe
3
+ from ..helpers.vadprocessor import FixedVADIterator
4
 
5
  import numpy as np
6
  import logging
 
16
  super().__init__(in_queue, out_queue)
17
  self._offset = 0 # 处理的frame size offset
18
  self._status = 'END'
 
 
19
 
20
 
21
  def reset(self):
22
  self._offset = 0
23
  self._status = 'END'
24
+
 
25
  self.vac.reset_states()
26
 
27
  @classmethod
 
33
  # speech_pad_ms=10
34
  min_silence_duration_ms = 100,
35
  # speech_pad_ms = 30,
 
36
  )
37
  cls.vac.reset_states()
38
 
 
49
  if start_frame:
50
  relative_start_frame =start_frame - self._offset
51
  if end_frame:
52
+ relative_end_frame = end_frame - self._offset
53
  return relative_start_frame, relative_end_frame
54
 
 
 
 
 
 
 
 
55
  def process(self, in_data: MetaItem) -> MetaItem:
56
  if self._offset == 0:
57
  self.vac.reset_states()
 
66
  if rel_start_frame is not None and rel_end_frame is None:
67
  self._status = "START" # 语音开始
68
  target_audio = source_audio[rel_start_frame:]
 
 
 
 
 
 
69
  logging.debug("🫸 Speech start frame: {}".format(rel_start_frame))
70
  elif rel_start_frame is None and rel_end_frame is not None:
71
  self._status = "END" # 音频结束
72
  target_audio = source_audio[:rel_end_frame]
 
 
 
 
73
  logging.debug(" 🫷Speech ended, capturing audio up to frame: {}".format(rel_end_frame))
74
  else:
75
  self._status = 'END'
76
  target_audio = source_audio[rel_start_frame:rel_end_frame]
77
  logging.debug(" 🔄 Speech segment captured from frame {} to frame {}".format(rel_start_frame, rel_end_frame))
 
 
 
 
78
  # logging.debug("❌ No valid speech segment detected, setting status to END")
79
  else:
80
  if self._status == 'START':
transcribe/whisper_llm_serve.py CHANGED
@@ -14,12 +14,39 @@ from .utils import log_block, save_to_wave, TestDataWriter, filter_words
14
  from .translatepipes import TranslatePipes
15
 
16
  from transcribe.helpers.vadprocessor import VadProcessor
17
- # from transcribe.helpers.vad_dynamic import VadProcessor
18
- # from transcribe.helpers.vadprocessor import VadProcessor
19
  from transcribe.pipelines import MetaItem
 
 
20
 
21
  logger = getLogger("TranscriptionService")
22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
  class WhisperTranscriptionService:
25
  """
@@ -51,21 +78,15 @@ class WhisperTranscriptionService:
51
  self._frame_queue = queue.Queue()
52
  # 音频队列缓冲区
53
  self.frames_np = np.array([], dtype=np.float32)
 
54
  # 完整音频队列
55
- self.segments_queue = collections.deque()
56
- self._temp_string = ""
57
-
58
- self._transcrible_analysis = None
59
  # 启动处理线程
60
  self._translate_thread_stop = threading.Event()
61
  self._frame_processing_thread_stop = threading.Event()
62
 
63
  self.translate_thread = self._start_thread(self._transcription_processing_loop)
64
  self.frame_processing_thread = self._start_thread(self._frame_processing_loop)
65
- # if language == "zh":
66
- # self._vad = VadProcessor(prob_threshold=0.8, silence_s=0.2, cache_s=0.15)
67
- # else:
68
- # self._vad = VadProcessor(prob_threshold=0.7, silence_s=0.2, cache_s=0.15)
69
  self.row_number = 0
70
  # for test
71
  self._transcrible_time_cost = 0.
@@ -107,38 +128,60 @@ class WhisperTranscriptionService:
107
  speech_status = processed_audio.speech_status
108
  return speech_audio, speech_status
109
 
 
 
110
  def _frame_processing_loop(self) -> None:
111
  """从队列获取音频帧并合并到缓冲区"""
112
  while not self._frame_processing_thread_stop.is_set():
113
  try:
114
  frame_np = self._frame_queue.get(timeout=0.1)
115
  frame_np, speech_status = self._apply_voice_activity_detection(frame_np)
116
- if frame_np is None or len(frame_np) == 0:
 
117
  continue
 
118
  with self.lock:
 
 
 
119
  self.frames_np = np.append(self.frames_np, frame_np)
120
- if speech_status == "END" and len(self.frames_np) > 0:
121
- self.segments_queue.appendleft(self.frames_np.copy())
 
 
122
  self.frames_np = np.array([], dtype=np.float32)
 
 
 
 
 
 
 
 
 
 
 
123
  except queue.Empty:
124
  pass
125
 
126
  def _transcription_processing_loop(self) -> None:
127
  """主转录处理循环"""
128
  frame_epoch = 1
 
129
  while not self._translate_thread_stop.is_set():
130
 
131
  if len(self.frames_np) ==0:
132
  time.sleep(0.01)
133
  continue
 
134
  with self.lock:
135
- if len(self.segments_queue) >0:
136
- audio_buffer = self.segments_queue.pop()
137
  partial = False
138
  else:
139
  audio_buffer = self.frames_np[:int(frame_epoch * 1.5 * self.sample_rate)].copy()# 获取 1.5s * epoch 个音频长度
140
  partial = True
141
-
142
  if len(audio_buffer) < int(self.sample_rate):
143
  silence_audio = np.zeros(self.sample_rate, dtype=np.float32)
144
  silence_audio[-len(audio_buffer):] = audio_buffer
@@ -149,37 +192,25 @@ class WhisperTranscriptionService:
149
  segments = meta_item.segments
150
  logger.debug(f"Segments: {segments}")
151
  segments = filter_words(segments)
 
152
  if len(segments):
153
  seg_text = self.text_separator.join(seg.text for seg in segments)
154
- if self._temp_string:
155
- seg_text = self._temp_string + seg_text
156
-
157
- if partial == False:
158
- # segment_length = len(seg_text.split(self.text_separator)) if self.text_separator else len(seg_text)
159
- if len(seg_text) < config.TEXT_THREHOLD:
160
- partial = True
161
- self._temp_string = seg_text
162
- else:
163
- self._temp_string = ""
164
-
165
- result = TransResult(
166
- seg_id=self.row_number,
167
- context=seg_text,
168
- from_=self.source_language,
169
- to=self.target_language,
170
- tran_content=self._translate_text_large(seg_text),
171
- partial=partial
172
- )
173
  if partial == False:
174
  self.row_number += 1
175
-
176
- self._send_result_to_client(result)
177
-
178
- if partial == False:
179
  frame_epoch = 1
180
  else:
181
  frame_epoch += 1
182
-
 
 
183
 
184
  def _transcribe_audio(self, audio_buffer: np.ndarray)->MetaItem:
185
  """转录音频并返回转录片段"""
 
14
  from .translatepipes import TranslatePipes
15
 
16
  from transcribe.helpers.vadprocessor import VadProcessor
 
 
17
  from transcribe.pipelines import MetaItem
18
+ from dataclasses import dataclass, field
19
+
20
 
21
  logger = getLogger("TranscriptionService")
22
 
23
+ @dataclass
24
+ class FullSegment:
25
+ """整句"""
26
+ audio_array: np.ndarray
27
+ created_time: float = field(default_factory=time.time)
28
+
29
+ @staticmethod
30
+ def merge(*audio_segments: list["FullSegment"]):
31
+ audio_segments_sorted = sorted([*audio_segments], key=lambda item: item.created_time)
32
+ return FullSegment(
33
+ created_time=audio_segments_sorted[0].created_time,
34
+ audio_array=np.concatenate([i.audio_array for i in audio_segments_sorted], axis=0)
35
+ )
36
+
37
+ @property
38
+ def time_duration(self) -> float:
39
+ return len(self.audio_array) / config.SAMPLE_RATE
40
+
41
+ @property
42
+ def start_timestamp(self):
43
+ return self.created_time
44
+
45
+ @property
46
+ def end_timestamp(self):
47
+ return self.created_time + self.time_duration
48
+
49
+
50
 
51
  class WhisperTranscriptionService:
52
  """
 
78
  self._frame_queue = queue.Queue()
79
  # 音频队列缓冲区
80
  self.frames_np = np.array([], dtype=np.float32)
81
+ self.frames_np_start_timestamp = None
82
  # 完整音频队列
83
+ self.full_segments_queue = collections.deque()
 
 
 
84
  # 启动处理线程
85
  self._translate_thread_stop = threading.Event()
86
  self._frame_processing_thread_stop = threading.Event()
87
 
88
  self.translate_thread = self._start_thread(self._transcription_processing_loop)
89
  self.frame_processing_thread = self._start_thread(self._frame_processing_loop)
 
 
 
 
90
  self.row_number = 0
91
  # for test
92
  self._transcrible_time_cost = 0.
 
128
  speech_status = processed_audio.speech_status
129
  return speech_audio, speech_status
130
 
131
+
132
+
133
  def _frame_processing_loop(self) -> None:
134
  """从队列获取音频帧并合并到缓冲区"""
135
  while not self._frame_processing_thread_stop.is_set():
136
  try:
137
  frame_np = self._frame_queue.get(timeout=0.1)
138
  frame_np, speech_status = self._apply_voice_activity_detection(frame_np)
139
+
140
+ if frame_np is None:
141
  continue
142
+
143
  with self.lock:
144
+ if speech_status == "START" and self.frames_np_start_timestamp is None:
145
+ self.frames_np_start_timestamp = time.time()
146
+ # 添加音频到音频缓冲区
147
  self.frames_np = np.append(self.frames_np, frame_np)
148
+ if len(self.frames_np) >= self.sample_rate * config.MAX_SPEECH_DURATION_S:
149
+ audio_array=self.frames_np.copy()
150
+ self.full_segments_queue.appendleft(audio_array) # 根据时间是否满足三秒长度 来整合音频块
151
+ self.frames_np_start_timestamp = time.time()
152
  self.frames_np = np.array([], dtype=np.float32)
153
+
154
+ elif speech_status == "END" and len(self.frames_np) > 0 and self.frames_np_start_timestamp:
155
+ time_diff = time.time() - self.frames_np_start_timestamp
156
+ if time_diff >= config.DESIGN_TIME_THREHOLD:
157
+ audio_array=self.frames_np.copy()
158
+ self.full_segments_queue.appendleft(audio_array) # 根据时间是否满足三秒长度 来整合音频块
159
+ self.frames_np_start_timestamp = None
160
+ self.frames_np = np.array([], dtype=np.float32)
161
+ else:
162
+ logger.debug(f"🥳 当前时间与上一句的时间差: {time_diff:.2f}s,继续增加缓冲区")
163
+
164
  except queue.Empty:
165
  pass
166
 
167
  def _transcription_processing_loop(self) -> None:
168
  """主转录处理循环"""
169
  frame_epoch = 1
170
+
171
  while not self._translate_thread_stop.is_set():
172
 
173
  if len(self.frames_np) ==0:
174
  time.sleep(0.01)
175
  continue
176
+
177
  with self.lock:
178
+ if len(self.full_segments_queue) > 0:
179
+ audio_buffer = self.full_segments_queue.pop()
180
  partial = False
181
  else:
182
  audio_buffer = self.frames_np[:int(frame_epoch * 1.5 * self.sample_rate)].copy()# 获取 1.5s * epoch 个音频长度
183
  partial = True
184
+
185
  if len(audio_buffer) < int(self.sample_rate):
186
  silence_audio = np.zeros(self.sample_rate, dtype=np.float32)
187
  silence_audio[-len(audio_buffer):] = audio_buffer
 
192
  segments = meta_item.segments
193
  logger.debug(f"Segments: {segments}")
194
  segments = filter_words(segments)
195
+
196
  if len(segments):
197
  seg_text = self.text_separator.join(seg.text for seg in segments)
198
+ result = TransResult(
199
+ seg_id=self.row_number,
200
+ context=seg_text,
201
+ from_=self.source_language,
202
+ to=self.target_language,
203
+ tran_content=self._translate_text_large(seg_text),
204
+ partial=partial
205
+ )
 
 
 
 
 
 
 
 
 
 
 
206
  if partial == False:
207
  self.row_number += 1
 
 
 
 
208
  frame_epoch = 1
209
  else:
210
  frame_epoch += 1
211
+ self._send_result_to_client(result)
212
+
213
+
214
 
215
  def _transcribe_audio(self, audio_buffer: np.ndarray)->MetaItem:
216
  """转录音频并返回转录片段"""