daihui.zhang committed on
Commit
f14a125
·
1 Parent(s): 89384e7

add speech start padding 100ms

Browse files
config.py CHANGED
@@ -24,7 +24,7 @@ logging.getLogger().addHandler(console_handler)
24
  # 文字输出长度阈值
25
  TEXT_THREHOLD = 6
26
  # 音频段的决策时间
27
- DESIGN_TIME_THREHOLD = 3
28
  # 最长语音时长
29
  MAX_SPEECH_DURATION_S = 15
30
 
 
24
  # 文字输出长度阈值
25
  TEXT_THREHOLD = 6
26
  # 音频段的决策时间
27
+ FRAME_SCOPE_TIME_THREHOLD = 3
28
  # 最长语音时长
29
  MAX_SPEECH_DURATION_S = 15
30
 
transcribe/pipelines/pipe_vad.py CHANGED
@@ -61,11 +61,10 @@ class VadPipe(BasePipe):
61
  speech_data = self._process_speech_chunk(source_audio)
62
 
63
  if speech_data: # 表示有音频的变化点出现
64
-
65
  rel_start_frame, rel_end_frame = speech_data
66
  if rel_start_frame is not None and rel_end_frame is None:
67
  self._status = "START" # 语音开始
68
- target_audio = source_audio[rel_start_frame:]
69
  logging.debug("🫸 Speech start frame: {}".format(rel_start_frame))
70
  elif rel_start_frame is None and rel_end_frame is not None:
71
  self._status = "END" # 音频结束
 
61
  speech_data = self._process_speech_chunk(source_audio)
62
 
63
  if speech_data: # 表示有音频的变化点出现
 
64
  rel_start_frame, rel_end_frame = speech_data
65
  if rel_start_frame is not None and rel_end_frame is None:
66
  self._status = "START" # 语音开始
67
+ target_audio = source_audio[max(rel_start_frame-100, 0):]
68
  logging.debug("🫸 Speech start frame: {}".format(rel_start_frame))
69
  elif rel_start_frame is None and rel_end_frame is not None:
70
  self._status = "END" # 音频结束
transcribe/whisper_llm_serve.py CHANGED
@@ -153,7 +153,7 @@ class WhisperTranscriptionService:
153
 
154
  elif speech_status == "END" and len(self.frames_np) > 0 and self.frames_np_start_timestamp:
155
  time_diff = time.time() - self.frames_np_start_timestamp
156
- if time_diff >= config.DESIGN_TIME_THREHOLD:
157
  audio_array=self.frames_np.copy()
158
  self.full_segments_queue.appendleft(audio_array) # 根据时间是否满足三秒长度 来整合音频块
159
  self.frames_np_start_timestamp = None
 
153
 
154
  elif speech_status == "END" and len(self.frames_np) > 0 and self.frames_np_start_timestamp:
155
  time_diff = time.time() - self.frames_np_start_timestamp
156
+ if time_diff >= config.FRAME_SCOPE_TIME_THREHOLD:
157
  audio_array=self.frames_np.copy()
158
  self.full_segments_queue.appendleft(audio_array) # 根据时间是否满足三秒长度 来整合音频块
159
  self.frames_np_start_timestamp = None