daihui.zhang committed on
Commit
f5bdb50
·
1 Parent(s): d84bca3

fix vad buf

Browse files
main.py CHANGED
@@ -57,6 +57,8 @@ async def root():
57
  async def translate(websocket: WebSocket):
58
  query_parameters_dict = websocket.query_params
59
  from_lang, to_lang = query_parameters_dict.get('from'), query_parameters_dict.get('to')
 
 
60
  client = WhisperTranscriptionService(
61
  websocket,
62
  pipe,
@@ -64,6 +66,7 @@ async def translate(websocket: WebSocket):
64
  client_uid=f"{uuid1()}",
65
  )
66
 
 
67
  if from_lang and to_lang:
68
  client.set_language(from_lang, to_lang)
69
  logger.info(f"Source lange: {from_lang} -> Dst lange: {to_lang}")
 
57
  async def translate(websocket: WebSocket):
58
  query_parameters_dict = websocket.query_params
59
  from_lang, to_lang = query_parameters_dict.get('from'), query_parameters_dict.get('to')
60
+
61
+ pipe.reset()
62
  client = WhisperTranscriptionService(
63
  websocket,
64
  pipe,
 
66
  client_uid=f"{uuid1()}",
67
  )
68
 
69
+
70
  if from_lang and to_lang:
71
  client.set_language(from_lang, to_lang)
72
  logger.info(f"Source lange: {from_lang} -> Dst lange: {to_lang}")
transcribe/pipelines/pipe_vad.py CHANGED
@@ -56,8 +56,18 @@ class VadPipe(BasePipe):
56
  model = None
57
  sample_rate = 16000
58
  window_size_samples = 512
 
59
 
 
 
 
 
 
60
 
 
 
 
 
61
  @classmethod
62
  def init(cls):
63
  if cls.model is None:
@@ -81,9 +91,52 @@ class VadPipe(BasePipe):
81
 
82
  # def reduce_noise(self, data):
83
  # return nr.reduce_noise(y=data, sr=self.sample_rate)
84
-
85
 
 
 
 
 
 
 
 
 
 
 
86
  def process(self, in_data: MetaItem) -> MetaItem:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  source_audio = in_data.source_audio
88
  source_audio = np.frombuffer(source_audio, dtype=np.float32)
89
  # source_audio = self.reduce_noise(source_audio)
 
56
  model = None
57
  sample_rate = 16000
58
  window_size_samples = 512
59
+ chunk_size = 512
60
 
61
def __init__(self, in_queue=None, out_queue=None) -> None:
    """Create the pipe and put its streaming VAD bookkeeping in the idle state.

    Args:
        in_queue: Forwarded unchanged to the base-class constructor.
        out_queue: Forwarded unchanged to the base-class constructor.
    """
    super().__init__(in_queue, out_queue)
    # _offset: running count of samples already consumed (frame-size offset).
    # _status: 'END' means no speech segment is currently open.
    self._offset, self._status = 0, 'END'
65
+
66
 
67
def reset(self):
    """Return the streaming bookkeeping to its initial state.

    The sample offset goes back to zero and the speech status back to
    'END', matching the values assigned in ``__init__``.
    """
    self._status = 'END'
    self._offset = 0
70
+
71
  @classmethod
72
  def init(cls):
73
  if cls.model is None:
 
91
 
92
  # def reduce_noise(self, data):
93
  # return nr.reduce_noise(y=data, sr=self.sample_rate)
 
94
 
95
+ def _process_speech_chunk(self, source_audio:np.ndarray):
96
+ speech_dict = self.vac(source_audio, return_seconds=False)
97
+ if speech_dict:
98
+ start_frame, end_frame = speech_dict.get("start"), speech_dict.get("end")
99
+ if start_frame:
100
+ relative_start_frame = max(0, (start_frame - self._offset))
101
+ if end_frame:
102
+ relative_end_frame = min((end_frame+1 - self._offset),len(source_audio))
103
+ return relative_start_frame, relative_end_frame
104
+
105
  def process(self, in_data: MetaItem) -> MetaItem:
106
+ if self._offset == 0:
107
+ self.vac.reset_states()
108
+
109
+ source_audio = np.frombuffer(in_data.source_audio, dtype=np.float32)
110
+ speech_data = self._process_iter_chunk(source_audio)
111
+ self._offset += len(source_audio)
112
+ if speech_data: # 表示有音频的变化点出现
113
+ rel_start_frame, rel_end_frame = speech_data
114
+ if rel_start_frame and not rel_end_frame:
115
+ self._status = "START" # 语音开始
116
+ target_audio = source_audio[rel_start_frame:]
117
+ elif not rel_start_frame and rel_end_frame:
118
+ self._status = "END" # 音频结束
119
+ target_audio = source_audio[:rel_end_frame]
120
+ elif rel_start_frame and rel_end_frame:
121
+ self._status = 'END'
122
+ target_audio = source_audio[rel_start_frame:rel_end_frame]
123
+ else:
124
+ self._status = 'END'
125
+ target_audio = np.array([],dtype=np.float32)
126
+ else:
127
+ if self._status == 'START':
128
+ target_audio = source_audio
129
+ else: # end
130
+ target_audio = np.array([],dtype=np.float32)
131
+
132
+
133
+ in_data.audio = target_audio.tobytes()
134
+ in_data.source_audio = b''
135
+ return in_data
136
+
137
+
138
+
139
+ def process_all(self, in_data: MetaItem) -> MetaItem:
140
  source_audio = in_data.source_audio
141
  source_audio = np.frombuffer(source_audio, dtype=np.float32)
142
  # source_audio = self.reduce_noise(source_audio)
transcribe/translatepipes.py CHANGED
@@ -19,6 +19,9 @@ class TranslatePipes:
19
  self._translate_7b_pipe = self._launch_process(Translate7BPipe())
20
  # vad
21
  self._vad_pipe = self._launch_process(VadPipe())
 
 
 
22
 
23
  def _launch_process(self, process_obj):
24
  process_obj.daemon = True
 
19
  self._translate_7b_pipe = self._launch_process(Translate7BPipe())
20
  # vad
21
  self._vad_pipe = self._launch_process(VadPipe())
22
+
23
def reset(self):
    """Reset the VAD pipe's streaming state by delegating to its ``reset``.

    NOTE(review): ``_vad_pipe`` comes from ``_launch_process``; if that
    starts the pipe in a separate process, this call may only touch the
    parent-side object -- confirm the reset reaches the running worker.
    """
    vad_pipe = self._vad_pipe
    vad_pipe.reset()
25
 
26
  def _launch_process(self, process_obj):
27
  process_obj.daemon = True
transcribe/whisper_llm_serve.py CHANGED
@@ -54,6 +54,9 @@ class WhisperTranscriptionService(ServeClientBase):
54
  self.translate_thread = self._start_thread(self._transcription_processing_loop)
55
  self.frame_processing_thread = self._start_thread(self._frame_processing_loop)
56
 
 
 
 
57
  # for test
58
  self._transcrible_time_cost = 0.
59
  self._translate_time_cost = 0.
@@ -106,8 +109,11 @@ class WhisperTranscriptionService(ServeClientBase):
106
  while not self._frame_processing_thread_stop.is_set():
107
  try:
108
  frame_np = self._frame_queue.get(timeout=0.1)
 
109
  if frame_np is None:
110
  logger.error("Received None frame, stopping thread")
 
 
111
  with self.lock:
112
  if self.frames_np is None:
113
  self.frames_np = frame_np.copy()
@@ -116,18 +122,16 @@ class WhisperTranscriptionService(ServeClientBase):
116
  except queue.Empty:
117
  pass
118
 
119
- def _apply_voice_activity_detection(self) -> None:
120
  """应用语音活动检测来优化音频缓冲区"""
121
- with self.lock:
122
- if self.frames_np is not None:
123
- # self._c+= 1
124
- frame = self.frames_np.copy()
125
- processed_audio = self._translate_pipe.voice_detect(frame.tobytes())
126
- self.frames_np = np.frombuffer(processed_audio.audio, dtype=np.float32).copy()
127
- return self.frames_np.copy()
128
- # if len(frame) > self.sample_rate:
129
- # save_to_wave(f"{self._c}-org.wav", frame)
130
- # save_to_wave(f"{self._c}-vad.wav", self.frames_np)
131
 
132
  def _update_audio_buffer(self, offset: int) -> None:
133
  """从音频缓冲区中移除已处理的部分"""
@@ -145,8 +149,8 @@ class WhisperTranscriptionService(ServeClientBase):
145
  def _get_audio_for_processing(self) -> Optional[np.ndarray]:
146
  """准备用于处理的音频块"""
147
  # 应用VAD处理
148
- frame_np = self._apply_voice_activity_detection()
149
- # frame_np = self.frames_np.copy()
150
  # 没有音频帧
151
  if frame_np is None:
152
  return None
 
54
  self.translate_thread = self._start_thread(self._transcription_processing_loop)
55
  self.frame_processing_thread = self._start_thread(self._frame_processing_loop)
56
 
57
+ #
58
+ self._vad_processed_offset = 0
59
+
60
  # for test
61
  self._transcrible_time_cost = 0.
62
  self._translate_time_cost = 0.
 
109
  while not self._frame_processing_thread_stop.is_set():
110
  try:
111
  frame_np = self._frame_queue.get(timeout=0.1)
112
+ frame_np = self._apply_voice_activity_detection(frame_np)
113
  if frame_np is None:
114
  logger.error("Received None frame, stopping thread")
115
+ # apply vad speech check:
116
+
117
  with self.lock:
118
  if self.frames_np is None:
119
  self.frames_np = frame_np.copy()
 
122
  except queue.Empty:
123
  pass
124
 
125
+ def _apply_voice_activity_detection(self, frame_np:np.array) -> None:
126
  """应用语音活动检测来优化音频缓冲区"""
127
+ # self._c+= 1
128
+ processed_audio = self._translate_pipe.voice_detect(frame_np.tobytes())
129
+ speech_audio = np.frombuffer(processed_audio.audio, dtype=np.float32)
130
+ # if speech_audio:
131
+ # if len(frame) > self.sample_rate:
132
+ # save_to_wave(f"{self._c}-org.wav", frame)
133
+ # save_to_wave(f"{self._c}-vad.wav", self.frames_np)
134
+ return speech_audio
 
 
135
 
136
  def _update_audio_buffer(self, offset: int) -> None:
137
  """从音频缓冲区中移除已处理的部分"""
 
149
  def _get_audio_for_processing(self) -> Optional[np.ndarray]:
150
  """准备用于处理的音频块"""
151
  # 应用VAD处理
152
+ # frame_np = self._apply_voice_activity_detection()
153
+ frame_np = self.frames_np.copy()
154
  # 没有音频帧
155
  if frame_np is None:
156
  return None