inoryQwQ committed on
Commit
0caa3bc
·
1 Parent(s): b796b3b

Fully functional StreamVAD

Browse files
Files changed (7) hide show
  1. .gitignore +2 -0
  2. README.md +43 -0
  3. SileroOrt.py +71 -0
  4. StreamVAD.py +78 -0
  5. main.py +35 -0
  6. requirements.txt +6 -0
  7. silero_vad.onnx +3 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ output
2
+ __pycache__
README.md CHANGED
@@ -1,3 +1,46 @@
1
  ---
2
  license: mit
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  license: mit
3
  ---
4
+ # SileroVAD
5
+
6
+ 流式语音端点识别
7
+
8
+
9
+ ## Demo
10
+
11
+ ```
12
+ python main.py --input demo.wav --output_dir output --model silero_vad.onnx
13
+ ```
14
+
15
+ 分段后的语音保存在 output 目录中
16
+
17
+
18
+ ## 在项目中使用
19
+
20
+ 1. 复制silero_vad.onnx SileroOrt.py StreamVAD.py 三个文件到项目中
21
+ 2. from StreamVAD import StreamVAD
22
+ 3.
23
+ 初始化
24
+ ```
25
+ vad = StreamVAD(args.model,
26
+ sensitivity=0.5,
27
+ silence_ms=200)
28
+ ```
29
+
30
+ 运行
31
+ ```
32
+ for result in vad.run(audio, vad.model.sr):
33
+ if result:
34
+ print(result)
35
+ ```
36
+
37
+ result的格式为:
38
+ ```
39
+ {
40
+ 'start_ts': 语音开始的时间
41
+ 'end_ts': 语音结束的时间
42
+ 'audio': 语音数据
43
+ }
44
+ ```
45
+
46
+ 时间戳的格式可通过StreamVAD.datetime_format设置
SileroOrt.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import onnxruntime as ort
3
+ import librosa
4
+
5
+
6
class SileroOrt:
    """Stateful onnxruntime wrapper around the Silero VAD ONNX model.

    Audio is scored frame by frame.  The recurrent ``state`` tensor returned
    by the model and the tail of the previous frame (``context``) are kept on
    the instance between calls, so consecutive frames are treated as one
    continuous stream until ``reset_states`` is called.
    """

    def __init__(self, model_path: str):
        """Load the ONNX model on the CPU provider and zero the stream state.

        Args:
            model_path: Path to the ``silero_vad.onnx`` file.
        """
        super().__init__()

        self.batch_size = 1
        self.sr = 16000
        self.hidden_size = 128
        # Frame geometry depends on the sample rate the model runs at.
        self.context_size = 64 if self.sr == 16000 else 32
        self.num_samples = 512 if self.sr == 16000 else 256

        self.model = ort.InferenceSession(model_path, providers=["CPUExecutionProvider"])
        # Single place where the context/state buffers are allocated.
        self.reset_states()

    def reset_states(self):
        """Zero the carried-over context samples and the recurrent state."""
        self.context = np.zeros((self.batch_size, self.context_size), dtype=np.float32)
        self.state = np.zeros((2, self.batch_size, self.hidden_size), dtype=np.float32)

    def __call__(self, x):
        """Score one frame and update the streaming state.

        Args:
            x: One frame of samples, either 1-D or shaped ``(batch, samples)``.

        Returns:
            The first output of the model for this frame, always at least 1-D.
        """
        if x.ndim == 1:
            x = x[None, ...]

        # Prepend the tail of the previous frame, then extend the right edge.
        # NOTE(review): the extra 64 reflected samples presumably match the
        # input length this particular ONNX export expects -- confirm.
        data = np.concatenate([self.context, x], axis=1)
        data = np.pad(data, ((0, 0), (0, 64)), 'reflect')

        input_feed = {"data": data, "state": self.state}
        output, self.state = self.model.run(None, input_feed=input_feed)
        self.context = x[..., -self.context_size:]

        if output.ndim == 0:
            output = np.array([output], dtype=np.float32)
        return output

    def audio_forward(self, x: np.ndarray, sr: int):
        """Score a whole audio buffer frame by frame.

        The buffer is reduced to its first row if it is multi-dimensional,
        int16 samples are scaled by 1/32768, the signal is resampled to
        ``self.sr`` when needed, and the tail is zero-padded up to a multiple
        of ``self.num_samples``.  The streaming state is NOT reset here, so
        successive calls continue the same stream.

        Args:
            x: Audio samples.
            sr: Sample rate of ``x``.

        Returns:
            The per-frame model outputs concatenated along the last axis.  For
            an empty buffer an array of shape ``(batch_size, 0)`` is returned.
        """
        if x.ndim > 1:
            # mono: keep the first row only
            x = x[0]

        if x.dtype == np.int16:
            x = x.astype(np.float32) / 32768

        if sr != self.sr:
            x = librosa.resample(x, orig_sr=sr, target_sr=self.sr)

        # The context/state buffers are float32; keep the frames in the same
        # dtype so the concatenated model input does not silently become
        # float64.
        if np.issubdtype(x.dtype, np.floating) and x.dtype != np.float32:
            x = x.astype(np.float32)

        frame_len = self.num_samples
        remainder = x.shape[0] % frame_len
        if remainder:
            # np.pad's keyword for the fill value is ``constant_values``.
            x = np.pad(x, (0, frame_len - remainder), 'constant', constant_values=0.0)

        outs = [self(x[start:start + frame_len])
                for start in range(0, x.shape[0], frame_len)]

        if not outs:
            # Nothing to score: np.concatenate would raise on an empty list.
            return np.zeros((self.batch_size, 0), dtype=np.float32)

        return np.concatenate(outs, axis=-1)
StreamVAD.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from SileroOrt import SileroOrt
2
+ import numpy as np
3
+ from datetime import datetime, timedelta
4
+
5
+
6
class StreamVAD:
    """Streaming speech segmenter driven by per-frame speech probabilities.

    Audio is pushed through ``run`` chunk by chunk.  Segment state (frame
    counters and buffered samples) lives on the instance, so a speech segment
    may start in one chunk and be emitted while processing a later one.
    """

    def __init__(self, model_path,
                 sensitivity=0.5,
                 silence_ms=200,
                 datetime_format='%Y-%m-%d %H:%M:%S.%f'):
        """Create the segmenter.

        Args:
            model_path: Path of ``silero_vad.onnx``.
            sensitivity: Voice-activation sensitivity.  Higher means more
                sensitive: a frame counts as speech when its probability
                exceeds ``1 - sensitivity``.
            silence_ms: A segment is emitted once the silence following
                speech lasts longer than this many milliseconds.
            datetime_format: ``strftime`` format of the timestamps in the
                returned data.
        """
        self.model = SileroOrt(model_path)
        self.sensitivity = sensitivity
        self.silence_ms = silence_ms
        self.datetime_format = datetime_format

        self.reset()

    def reset(self):
        """Drop the segment in progress and reset the model's stream state."""
        self.silence_count = 0
        self.speech_count = 0
        self.return_data = {
            "start_ts": '',
            "end_ts": '',
            "audio": None
        }
        self.vad_data_list = []
        self.model.reset_states()

    def run(self, audio: np.ndarray, sr: int = 16000):
        """Process one chunk of audio and yield every finished speech segment.

        Args:
            audio: Audio samples.  If multi-dimensional, only the first row
                is used, matching what the model scores.
            sr: Sample rate of ``audio``.

        Yields:
            A dict with ``start_ts`` / ``end_ts`` (wall-clock timestamps
            formatted with ``datetime_format``) and ``audio`` (the buffered
            samples of the segment, taken from ``audio`` at its own sample
            rate ``sr``).
        """
        # record datetime
        cur_ts = datetime.now()

        # inference
        speech_probs = self.model.audio_forward(audio, sr)[0]

        # Slice the same channel the model scored.
        if audio.ndim > 1:
            audio = audio[0]

        frame_len = self.model.num_samples
        model_sr = self.model.sr
        frame_delta = timedelta(seconds=frame_len / model_sr)
        speech_thresh = 1 - self.sensitivity

        for i, prob in enumerate(speech_probs):
            # Frame i covers model-rate samples [i * frame_len, (i + 1) * frame_len);
            # map that span back to the input sample rate so the returned
            # audio lines up with the scored frame even when sr != model_sr.
            lo = int(i * frame_len * sr // model_sr)
            hi = int((i + 1) * frame_len * sr // model_sr)
            audio_slice = audio[lo:hi]
            ts = cur_ts.strftime(self.datetime_format)

            if prob > speech_thresh:
                # speech frame
                self.silence_count = 0
                if self.speech_count == 0:
                    # first frame of a new segment
                    self.return_data['start_ts'] = ts

                self.speech_count += 1
                self.vad_data_list.append(audio_slice)
            elif self.speech_count > 0:
                # silence after speech
                self.silence_count += 1

                if 1000 * self.silence_count * self.model.num_samples / self.model.sr > self.silence_ms:
                    # silence limit exceeded: close the segment
                    segment = self.return_data
                    segment['end_ts'] = ts
                    segment['audio'] = np.concatenate(self.vad_data_list, axis=-1)

                    # Reset before yielding so the instance is clean even if
                    # the consumer stops iterating at this segment.
                    self.reset()
                    yield segment
                else:
                    # short pause: keep it inside the segment
                    self.vad_data_list.append(audio_slice)

            # timestamp of the next frame
            cur_ts += frame_delta
main.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ from StreamVAD import StreamVAD
3
+ import os
4
+ import librosa
5
+ import soundfile as sf
6
+
7
+
8
def get_args():
    """Parse the demo's command-line options (input file, model path, output dir)."""
    cli = argparse.ArgumentParser()
    option_specs = (
        ('--input', {'type': str, 'required': True, 'help': 'Input audio file'}),
        ('--model', {'type': str, 'default': './silero_vad.onnx'}),
        ('--output_dir', {'type': str, 'default': 'output', 'help': 'Output audio dir'}),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)
    return cli.parse_args()
14
+
15
+
16
def main():
    """Segment the input audio file and save each speech segment as ``<index>.wav``."""
    opts = get_args()
    os.makedirs(opts.output_dir, exist_ok=True)

    vad = StreamVAD(opts.model, sensitivity=0.5, silence_ms=200)

    # Load the file directly at the model's sample rate, as mono.
    samples, _ = librosa.load(opts.input, sr=vad.model.sr, mono=True)

    # Number only the non-empty results, starting from 0.
    detected = (seg for seg in vad.run(samples, vad.model.sr) if seg)
    for index, seg in enumerate(detected):
        print(seg)
        out_path = os.path.join(opts.output_dir, f"{index}.wav")
        sf.write(out_path, seg['audio'], samplerate=vad.model.sr)
32
+
33
+
34
# Run the demo only when this file is executed as a script.
if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ onnxruntime==1.17.0
2
+ librosa
3
+ numpy<2.0
4
+ samplerate
5
+ resampy
6
+ soundfile
silero_vad.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:144f7a8e8db2bbe7e90407f966ec811cbcdc7258fffbc867798597a33c957118
3
+ size 1247953