daihui.zhang committed on
Commit
e19aebc
·
1 Parent(s): 5518c26
transcribe/pipelines/pipe_vad.py CHANGED
@@ -28,12 +28,12 @@ class VadPipe(BasePipe):
28
  def init(cls):
29
  if cls.vac is None:
30
  cls.vac = FixedVADIterator(
31
- threshold=0.3,
32
  sampling_rate=cls.sample_rate,
33
  # speech_pad_ms=10
34
- min_silence_duration_ms = 100,
35
  # speech_pad_ms = 30,
36
- max_speech_duration_s=15
37
  )
38
  cls.vac.reset_states()
39
 
@@ -50,7 +50,7 @@ class VadPipe(BasePipe):
50
  if start_frame:
51
  relative_start_frame = start_frame - self._offset
52
  if end_frame:
53
- relative_end_frame = end_frame - self._offset
54
  return relative_start_frame, relative_end_frame
55
 
56
  def process(self, in_data: MetaItem) -> MetaItem:
@@ -70,13 +70,10 @@ class VadPipe(BasePipe):
70
  self._status = "END" # 音频结束
71
  target_audio = source_audio[:rel_end_frame]
72
  logging.debug(" 🫷Speech ended, capturing audio up to frame: {}".format(rel_end_frame))
73
- elif rel_start_frame is not None and rel_end_frame is not None:
74
  self._status = 'END'
75
  target_audio = source_audio[rel_start_frame:rel_end_frame]
76
  logging.debug(" 🔄 Speech segment captured from frame {} to frame {}".format(rel_start_frame, rel_end_frame))
77
- else:
78
- self._status = 'END'
79
- target_audio = np.array([],dtype=np.float32)
80
  # logging.debug("❌ No valid speech segment detected, setting status to END")
81
  else:
82
  if self._status == 'START':
 
28
  def init(cls):
29
  if cls.vac is None:
30
  cls.vac = FixedVADIterator(
31
+ threshold=0.5,
32
  sampling_rate=cls.sample_rate,
33
  # speech_pad_ms=10
34
+ min_silence_duration_ms = 50,
35
  # speech_pad_ms = 30,
36
+ max_speech_duration_s=20
37
  )
38
  cls.vac.reset_states()
39
 
 
50
  if start_frame:
51
  relative_start_frame = start_frame - self._offset
52
  if end_frame:
53
+ relative_end_frame = max(0, end_frame - self._offset)
54
  return relative_start_frame, relative_end_frame
55
 
56
  def process(self, in_data: MetaItem) -> MetaItem:
 
70
  self._status = "END" # 音频结束
71
  target_audio = source_audio[:rel_end_frame]
72
  logging.debug(" 🫷Speech ended, capturing audio up to frame: {}".format(rel_end_frame))
73
+ else:
74
  self._status = 'END'
75
  target_audio = source_audio[rel_start_frame:rel_end_frame]
76
  logging.debug(" 🔄 Speech segment captured from frame {} to frame {}".format(rel_start_frame, rel_end_frame))
 
 
 
77
  # logging.debug("❌ No valid speech segment detected, setting status to END")
78
  else:
79
  if self._status == 'START':
transcribe/whisper_llm_serve.py CHANGED
@@ -132,7 +132,7 @@ class WhisperTranscriptionService:
132
  try:
133
  frame_np = self._frame_queue.get(timeout=0.1)
134
  frame_np, speech_status = self._apply_voice_activity_detection(frame_np)
135
- if frame_np is None:
136
  continue
137
  with self.lock:
138
  if self.frames_np is None:
@@ -165,19 +165,20 @@ class WhisperTranscriptionService:
165
  while not self._translate_thread_stop.is_set():
166
 
167
  if self.frames_np is None:
168
- time.sleep(0.2)
169
  continue
170
 
171
- with self.lock:
172
- if len(self.segments_queue) >0:
173
- audio_buffer = self.segments_queue.pop()
174
- partial = False
175
- else:
 
176
  audio_buffer = self.frames_np[:int(frame_epoch * 1.5 * self.sample_rate)]# 获取 1.5s * epoch 个音频长度
177
- partial = True
178
 
179
  if len(audio_buffer) ==0:
180
- time.sleep(0.2)
181
  continue
182
 
183
  if len(audio_buffer) < int(self.sample_rate):
 
132
  try:
133
  frame_np = self._frame_queue.get(timeout=0.1)
134
  frame_np, speech_status = self._apply_voice_activity_detection(frame_np)
135
+ if frame_np is None or len(frame_np) == 0:
136
  continue
137
  with self.lock:
138
  if self.frames_np is None:
 
165
  while not self._translate_thread_stop.is_set():
166
 
167
  if self.frames_np is None:
168
+ time.sleep(0.01)
169
  continue
170
 
171
+
172
+ if len(self.segments_queue) >0:
173
+ audio_buffer = self.segments_queue.pop()
174
+ partial = False
175
+ else:
176
+ with self.lock:
177
  audio_buffer = self.frames_np[:int(frame_epoch * 1.5 * self.sample_rate)]# 获取 1.5s * epoch 个音频长度
178
+ partial = True
179
 
180
  if len(audio_buffer) ==0:
181
+ time.sleep(0.01)
182
  continue
183
 
184
  if len(audio_buffer) < int(self.sample_rate):