Xin Zhang committed on
Commit
2d75b7c
·
2 Parent(s): 750e8d5 e19aebc

Merge branch 'vad' of hf.co:MoYoYoTech/Translator into vad

Browse files

* 'vad' of hf.co:MoYoYoTech/Translator:
update
filter [] words
Disable FunASR pbar.

# Conflicts:
# transcribe/pipelines/pipe_vad.py

transcribe/helpers/funasr.py CHANGED
@@ -30,7 +30,7 @@ class FunASR:
30
  audio_frames = np.frombuffer(audio_buffer, dtype=np.float32)
31
  # sf.write(f'{config.ASSERT_DIR}/{time.time()}.wav', audio_frames, samplerate=16000)
32
  try:
33
- output = self.model.generate(input=audio_frames)
34
  return output
35
  except Exception as e:
36
  logger.error(e)
 
30
  audio_frames = np.frombuffer(audio_buffer, dtype=np.float32)
31
  # sf.write(f'{config.ASSERT_DIR}/{time.time()}.wav', audio_frames, samplerate=16000)
32
  try:
33
+ output = self.model.generate(input=audio_frames, disable_pbar=True)
34
  return output
35
  except Exception as e:
36
  logger.error(e)
transcribe/helpers/vadprocessor.py CHANGED
@@ -113,6 +113,7 @@ class VADIteratorOnnx:
113
  sampling_rate: int = 16000,
114
  min_silence_duration_ms: int = 100,
115
  max_speech_duration_s: float = float('inf'),
 
116
  ):
117
  self.model = OnnxWrapper(VAD_MODEL_PATH, True)
118
  self.threshold = threshold
@@ -123,7 +124,7 @@ class VADIteratorOnnx:
123
 
124
  self.min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
125
  self.max_speech_samples = int(sampling_rate * max_speech_duration_s)
126
- # self.speech_pad_samples = sampling_rate * speech_pad_ms / 1000
127
  self.reset_states()
128
 
129
  def reset_states(self):
@@ -158,7 +159,8 @@ class VADIteratorOnnx:
158
 
159
  if (speech_prob >= self.threshold) and not self.triggered:
160
  self.triggered = True
161
- speech_start = max(0, self.current_sample - window_size_samples)
 
162
  self.start = speech_start
163
  return {'start': int(speech_start) if not return_seconds else round(speech_start / self.sampling_rate, 1)}
164
 
@@ -174,7 +176,8 @@ class VADIteratorOnnx:
174
  if self.current_sample - self.temp_end < self.min_silence_samples:
175
  return None
176
  else:
177
- speech_end = self.temp_end - window_size_samples
 
178
  self.temp_end = 0
179
  self.triggered = False
180
  return {'end': int(speech_end) if not return_seconds else round(speech_end / self.sampling_rate, 1)}
 
113
  sampling_rate: int = 16000,
114
  min_silence_duration_ms: int = 100,
115
  max_speech_duration_s: float = float('inf'),
116
+ speech_pad_ms: int = 30
117
  ):
118
  self.model = OnnxWrapper(VAD_MODEL_PATH, True)
119
  self.threshold = threshold
 
124
 
125
  self.min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
126
  self.max_speech_samples = int(sampling_rate * max_speech_duration_s)
127
+ self.speech_pad_samples = sampling_rate * speech_pad_ms / 1000
128
  self.reset_states()
129
 
130
  def reset_states(self):
 
159
 
160
  if (speech_prob >= self.threshold) and not self.triggered:
161
  self.triggered = True
162
+ # speech_start = max(0, self.current_sample - window_size_samples)
163
+ speech_start = max(0, self.current_sample - self.speech_pad_samples - window_size_samples)
164
  self.start = speech_start
165
  return {'start': int(speech_start) if not return_seconds else round(speech_start / self.sampling_rate, 1)}
166
 
 
176
  if self.current_sample - self.temp_end < self.min_silence_samples:
177
  return None
178
  else:
179
+ # speech_end = self.temp_end - window_size_samples
180
+ speech_end = self.temp_end + self.speech_pad_samples - window_size_samples
181
  self.temp_end = 0
182
  self.triggered = False
183
  return {'end': int(speech_end) if not return_seconds else round(speech_end / self.sampling_rate, 1)}
transcribe/helpers/whisper.py CHANGED
@@ -52,7 +52,7 @@ class WhisperCPP:
52
  initial_prompt=prompt,
53
  language=language,
54
  # token_timestamps=True,
55
- # split_on_word=True,
56
  # max_len=max_len
57
  )
58
  return output
 
52
  initial_prompt=prompt,
53
  language=language,
54
  # token_timestamps=True,
55
+ split_on_word=True,
56
  # max_len=max_len
57
  )
58
  return output
transcribe/pipelines/pipe_vad.py CHANGED
@@ -50,7 +50,7 @@ class VadPipe(BasePipe):
50
  if start_frame:
51
  relative_start_frame = start_frame - self._offset
52
  if end_frame:
53
- relative_end_frame = end_frame - self._offset
54
  return relative_start_frame, relative_end_frame
55
 
56
  def process(self, in_data: MetaItem) -> MetaItem:
@@ -70,13 +70,10 @@ class VadPipe(BasePipe):
70
  self._status = "END" # 音频结束
71
  target_audio = source_audio[:rel_end_frame]
72
  logging.debug(" 🫷Speech ended, capturing audio up to frame: {}".format(rel_end_frame))
73
- elif rel_start_frame is not None and rel_end_frame is not None:
74
  self._status = 'END'
75
  target_audio = source_audio[rel_start_frame:rel_end_frame]
76
  logging.debug(" 🔄 Speech segment captured from frame {} to frame {}".format(rel_start_frame, rel_end_frame))
77
- else:
78
- self._status = 'END'
79
- target_audio = np.array([],dtype=np.float32)
80
  # logging.debug("❌ No valid speech segment detected, setting status to END")
81
  else:
82
  if self._status == 'START':
 
50
  if start_frame:
51
  relative_start_frame = start_frame - self._offset
52
  if end_frame:
53
+ relative_end_frame = max(0, end_frame - self._offset)
54
  return relative_start_frame, relative_end_frame
55
 
56
  def process(self, in_data: MetaItem) -> MetaItem:
 
70
  self._status = "END" # 音频结束
71
  target_audio = source_audio[:rel_end_frame]
72
  logging.debug(" 🫷Speech ended, capturing audio up to frame: {}".format(rel_end_frame))
73
+ else:
74
  self._status = 'END'
75
  target_audio = source_audio[rel_start_frame:rel_end_frame]
76
  logging.debug(" 🔄 Speech segment captured from frame {} to frame {}".format(rel_start_frame, rel_end_frame))
 
 
 
77
  # logging.debug("❌ No valid speech segment detected, setting status to END")
78
  else:
79
  if self._status == 'START':
transcribe/utils.py CHANGED
@@ -7,6 +7,51 @@ from scipy.io.wavfile import write
7
  import config
8
  import csv
9
  import av
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  def log_block(key: str, value, unit=''):
11
  if config.DEBUG:
12
  return
 
7
  import config
8
  import csv
9
  import av
10
import re

# Bracket-filter patterns, compiled once at module load so the per-word
# loop below never pays recompilation cost.
p_pattern = re.compile(r"(\s*\[.*?\])")
p_start_pattern = re.compile(r"(\s*\[.*)")
p_end_pattern = re.compile(r"(\s*.*\])")


def filter_words(res_word):
    """Drop bracket-delimited tokens (e.g. "[noise]") from an ASR word stream.

    Three cases are handled, in order, for each word's ``text``:
    1. a self-contained ``[...]`` token is dropped outright;
    2. a token opening a bracket (``[...``) starts a skip region;
    3. while skipping, the token closing the bracket (``...]``) ends the
       region — everything in between, endpoints included, is discarded.

    Args:
        res_word: Iterable of word objects exposing a ``text`` attribute.

    Returns:
        List of the word objects that survived the bracket filter.
    """
    kept = []
    in_bracket = False

    for token in res_word:
        text = token.text

        # Case 1: whole bracketed token on its own — drop it.
        if p_pattern.match(text):
            continue

        # Case 2: an opening bracket begins a region to discard.
        if p_start_pattern.match(text):
            in_bracket = True
            continue

        # Case 3: while inside a region, a closing bracket ends it.
        if in_bracket:
            if p_end_pattern.match(text):
                in_bracket = False
            continue

        kept.append(token)

    return kept
+
55
  def log_block(key: str, value, unit=''):
56
  if config.DEBUG:
57
  return
transcribe/whisper_llm_serve.py CHANGED
@@ -11,7 +11,7 @@ import config
11
  import collections
12
  from api_model import TransResult, Message, DebugResult
13
 
14
- from .utils import log_block, save_to_wave, TestDataWriter
15
  from .translatepipes import TranslatePipes
16
  from .strategy import (
17
  TranscriptStabilityAnalyzer, TranscriptToken)
@@ -132,7 +132,7 @@ class WhisperTranscriptionService:
132
  try:
133
  frame_np = self._frame_queue.get(timeout=0.1)
134
  frame_np, speech_status = self._apply_voice_activity_detection(frame_np)
135
- if frame_np is None:
136
  continue
137
  with self.lock:
138
  if self.frames_np is None:
@@ -165,19 +165,20 @@ class WhisperTranscriptionService:
165
  while not self._translate_thread_stop.is_set():
166
 
167
  if self.frames_np is None:
168
- time.sleep(0.2)
169
  continue
170
 
171
- with self.lock:
172
- if len(self.segments_queue) >0:
173
- audio_buffer = self.segments_queue.pop()
174
- partial = False
175
- else:
 
176
  audio_buffer = self.frames_np[:int(frame_epoch * 1.5 * self.sample_rate)]# 获取 1.5s * epoch 个音频长度
177
- partial = True
178
 
179
  if len(audio_buffer) ==0:
180
- time.sleep(0.2)
181
  continue
182
 
183
  if len(audio_buffer) < int(self.sample_rate):
@@ -191,6 +192,7 @@ class WhisperTranscriptionService:
191
  meta_item = self._transcribe_audio(audio_buffer)
192
  segments = meta_item.segments
193
  logger.debug(f"Segments: {segments}")
 
194
  if len(segments):
195
  seg_text = self.text_separator.join(seg.text for seg in segments)
196
  if self._temp_string:
 
11
  import collections
12
  from api_model import TransResult, Message, DebugResult
13
 
14
+ from .utils import log_block, save_to_wave, TestDataWriter, filter_words
15
  from .translatepipes import TranslatePipes
16
  from .strategy import (
17
  TranscriptStabilityAnalyzer, TranscriptToken)
 
132
  try:
133
  frame_np = self._frame_queue.get(timeout=0.1)
134
  frame_np, speech_status = self._apply_voice_activity_detection(frame_np)
135
+ if frame_np is None or len(frame_np) == 0:
136
  continue
137
  with self.lock:
138
  if self.frames_np is None:
 
165
  while not self._translate_thread_stop.is_set():
166
 
167
  if self.frames_np is None:
168
+ time.sleep(0.01)
169
  continue
170
 
171
+
172
+ if len(self.segments_queue) >0:
173
+ audio_buffer = self.segments_queue.pop()
174
+ partial = False
175
+ else:
176
+ with self.lock:
177
  audio_buffer = self.frames_np[:int(frame_epoch * 1.5 * self.sample_rate)]# 获取 1.5s * epoch 个音频长度
178
+ partial = True
179
 
180
  if len(audio_buffer) ==0:
181
+ time.sleep(0.01)
182
  continue
183
 
184
  if len(audio_buffer) < int(self.sample_rate):
 
192
  meta_item = self._transcribe_audio(audio_buffer)
193
  segments = meta_item.segments
194
  logger.debug(f"Segments: {segments}")
195
+ segments = filter_words(segments)
196
  if len(segments):
197
  seg_text = self.text_separator.join(seg.text for seg in segments)
198
  if self._temp_string: