Xin Zhang committed
Commit c399ddb · 2 Parent(s): fdeedee d8ef700

Merge branch 'main' of hf.co:MoYoYoTech/Translator


* 'main' of hf.co:MoYoYoTech/Translator:
add custom vad silence
add custom vad silence

Files changed (1)
  1. transcribe/pipelines/pipe_vad.py +47 -2
transcribe/pipelines/pipe_vad.py CHANGED

@@ -2,10 +2,55 @@
 from .base import MetaItem, BasePipe
 from ..helpers.vadprocessor import SileroVADProcessor, FixedVADIterator
 import numpy as np
-from silero_vad import get_speech_timestamps,collect_chunks
+from silero_vad import get_speech_timestamps
 import torch
+from typing import List
 # import noisereduce as nr
 
+def collect_chunks(tss: List[dict], wav: torch.Tensor, sample_rate: int = 16000):
+    chunks = []
+    silent_samples = int(0.3 * sample_rate)  # number of samples in 300 ms of silence
+    silence = torch.zeros(silent_samples)  # create 300 ms of silence
+
+    for i in range(len(tss)):
+        # append the current speech segment first
+        chunks.append(wav[tss[i]['start']: tss[i]['end']])
+
+        # if this is not the last segment and the gap to the next one
+        # is larger than 100 ms, insert silence
+        if i < len(tss) - 1:
+            gap = tss[i+1]['start'] - tss[i]['end']
+            if gap > 0.1 * sample_rate:  # is the gap larger than 100 ms?
+                chunks.append(silence)  # insert 300 ms of silence
+
+    return torch.cat(chunks)
+
+def collect_chunks_improved(tss: List[dict], wav: torch.Tensor, sample_rate: int = 16000):
+    chunks = []
+    silent_samples = int(0.3 * sample_rate)  # number of samples in 300 ms of silence
+    silence = torch.zeros(silent_samples)  # create 300 ms of silence
+    min_gap_samples = int(0.1 * sample_rate)  # minimum gap threshold (100 ms)
+
+    # lightly smooth the timestamps
+    smoothed_tss = []
+    for i, ts in enumerate(tss):
+        if i > 0 and ts['start'] - tss[i-1]['end'] < 0.02 * sample_rate:  # gaps under 20 ms count as continuous speech
+            smoothed_tss[-1]['end'] = ts['end']  # merge into the previous segment
+        else:
+            smoothed_tss.append(ts)
+
+    for i in range(len(smoothed_tss)):
+        # append the current speech segment
+        chunks.append(wav[smoothed_tss[i]['start']: smoothed_tss[i]['end']])
+
+        # if this is not the last segment and the gap to the next one
+        # exceeds the threshold, insert silence
+        if i < len(smoothed_tss) - 1:
+            gap = smoothed_tss[i+1]['start'] - smoothed_tss[i]['end']
+            if gap > min_gap_samples:
+                # scale the silence length with the gap, capped at 300 ms
+                silence_length = min(gap // 2, silent_samples)
+                chunks.append(torch.zeros(silence_length))
+
+    return torch.cat(chunks)
 
 class VadPipe(BasePipe):
     model = None
@@ -46,7 +91,7 @@ class VadPipe(BasePipe):
         speech_timestamps = get_speech_timestamps(torch.Tensor(source_audio), self.model.silero_vad, sampling_rate=16000)
 
         if speech_timestamps:
-            send_audio = collect_chunks(speech_timestamps, torch.Tensor(source_audio))
+            send_audio = collect_chunks_improved(speech_timestamps, torch.Tensor(source_audio))
             send_audio = send_audio.numpy()
             in_data.audio = send_audio
             # send_audio = self.reduce_noise(send_audio).tobytes()
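
For context, a minimal sketch of what the new collect_chunks_improved does on synthetic input (the sample values and timestamps below are made up for illustration; it assumes the function from the diff above is in scope). Unlike the silero_vad collect_chunks it replaces, which concatenates speech segments back to back, the custom version re-inserts min(gap // 2, 300 ms) of silence between segments:

import torch

sample_rate = 16000
wav = torch.randn(3 * sample_rate)  # 3 s of synthetic audio

# Two speech segments (in samples): 0.0-1.0 s and 1.5-3.0 s, i.e. a 0.5 s gap.
tss = [
    {'start': 0, 'end': sample_rate},
    {'start': int(1.5 * sample_rate), 'end': 3 * sample_rate},
]

out = collect_chunks_improved(tss, wav, sample_rate)

# 2.5 s of speech plus inserted silence of min(gap // 2, 300 ms):
# gap = 8000 samples, so gap // 2 = 4000 samples (250 ms) is inserted.
assert out.shape[0] == 40000 + 4000

For gaps of 600 ms or more, gap // 2 exceeds the 300 ms cap, so long pauses are compressed to 300 ms of silence rather than preserved at full length.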