niobures commited on
Commit
67bfd54
·
verified ·
1 Parent(s): 92c5a64

Silero-VAD (ONNX)

Browse files
.gitattributes CHANGED
@@ -45,3 +45,4 @@ models/stt/en/en_sample.wav filter=lfs diff=lfs merge=lfs -text
45
  models/stt/de/de_sample.wav filter=lfs diff=lfs merge=lfs -text
46
  models/stt/es/es_sample.wav filter=lfs diff=lfs merge=lfs -text
47
  models/ailia-models/code/en_example.wav filter=lfs diff=lfs merge=lfs -text
 
 
45
  models/stt/de/de_sample.wav filter=lfs diff=lfs merge=lfs -text
46
  models/stt/es/es_sample.wav filter=lfs diff=lfs merge=lfs -text
47
  models/ailia-models/code/en_example.wav filter=lfs diff=lfs merge=lfs -text
48
+ models/vad/onnx/SileroVAD[[:space:]](AXERA-TECH)/demo.wav filter=lfs diff=lfs merge=lfs -text
models/vad/onnx/SileroVAD (AXERA-TECH)/.gitattributes ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.onnx filter=lfs diff=lfs merge=lfs -text
37
+ *.wav filter=lfs diff=lfs merge=lfs -text
models/vad/onnx/SileroVAD (AXERA-TECH)/README.md ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ ---
4
+ # SileroVAD
5
+
6
+ 流式语音端点识别
7
+
8
+
9
+ ## Demo
10
+
11
+ ### CLI
12
+
13
+ ```
14
+ python main.py --input demo.wav --output_dir output --model silero_vad.onnx
15
+ ```
16
+ 被分段的语音保存在 output 目录中
17
+
18
+
19
+ ### Gradio
20
+ ```
21
+ pip install gradio
22
+
23
+ python gradio_app.py
24
+ ```
25
+ ![](/gradio.png)
26
+
27
+
28
+
29
+ ## 在项目中使用
30
+
31
+ 1. 复制 silero_vad.onnx、SileroOrt.py、StreamVAD.py 三个文件到项目中
32
+ 2. from StreamVAD import StreamVAD
33
+ 3.
34
+ 初始化
35
+ ```
36
+ vad = StreamVAD(args.model,
37
+ sensitivity=0.5,
38
+ silence_ms=200)
39
+ ```
40
+
41
+ 运行
42
+ ```
43
+ for result in vad.run(audio, vad.model.sr):
44
+ if result:
45
+ print(result)
46
+ ```
47
+
48
+ result的格式为:
49
+ ```
50
+ {
51
+ 'start_ts': 语音开始的时间
52
+ 'end_ts': 语音结束的时间
53
+ 'audio': 语音数据
54
+ }
55
+ ```
56
+
57
+ 时间戳的格式可通过StreamVAD.datetime_format设置
models/vad/onnx/SileroVAD (AXERA-TECH)/SileroOrt.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import onnxruntime as ort
3
+ import librosa
4
+
5
+
6
class SileroOrt:
    """Thin ONNX Runtime wrapper around the Silero-VAD model.

    Keeps the model's recurrent state (LSTM state) and the trailing audio
    context between calls so that a stream of fixed-size chunks can be fed
    through ``__call__`` one at a time, or a whole waveform through
    ``audio_forward``.
    """

    def __init__(self, model_path: str, sr: int = 16000):
        """
        model_path: path to silero_vad.onnx
        sr: model sample rate, 16000 or 8000. Default 16000 preserves the
            previously hard-coded behavior; 8000 enables the model's
            half-rate chunk/context sizes.

        Raises:
            ValueError: if sr is not one of the rates Silero-VAD supports.
        """
        if sr not in (16000, 8000):
            raise ValueError(f"unsupported sample rate: {sr}")

        self.batch_size = 1
        self.sr = sr
        self.hidden_size = 128
        # Silero keeps 64 context samples at 16 kHz, 32 at 8 kHz.
        self.context_size = 64 if self.sr == 16000 else 32
        # Chunk length expected by the model per inference step.
        self.num_samples = 512 if self.sr == 16000 else 256

        self.model = ort.InferenceSession(model_path, providers=["CPUExecutionProvider"])
        self.reset_states()

    def reset_states(self):
        """Clear the audio context and LSTM state (call between utterances)."""
        self.context = np.zeros((self.batch_size, self.context_size), dtype=np.float32)
        self.state = np.zeros((2, self.batch_size, self.hidden_size), dtype=np.float32)

    def __call__(self, x):
        """Run one chunk of ``num_samples`` mono float32 samples.

        Returns the speech-probability output as a 1-D float32 array.
        """
        if len(x.shape) == 1:
            x = x[None, ...]

        data = np.concatenate([self.context, x], axis=1)
        # Pad width was hard-coded to 64, which equals context_size at the
        # default 16 kHz rate; tying it to context_size keeps the 16 kHz
        # behavior identical and scales for 8 kHz.
        # NOTE(review): presumably this pad matches the exported model's fixed
        # input length — confirm against the 8 kHz export before relying on it.
        data = np.pad(data, ((0, 0), (0, self.context_size)), 'reflect')
        input_feed = {
            "data": data,
            "state": self.state
        }

        output, self.state = self.model.run(None, input_feed=input_feed)
        # Keep the tail of this chunk as context for the next call.
        self.context = x[..., -self.context_size:]

        # Normalize a scalar output to a 1-element array so callers can
        # always concatenate chunk outputs.
        if len(output.shape) == 0:
            output = np.array([output], dtype=np.float32)
        return output

    def audio_forward(self, x: np.ndarray, sr: int):
        """Run a whole waveform through the model chunk by chunk.

        x: waveform; int16 input is scaled to float32 in [-1, 1); for
           multi-channel input only the first channel is used.
        sr: sample rate of x; resampled to ``self.sr`` when different.

        Returns an array of speech probabilities, one per chunk,
        concatenated along the last axis.
        """
        if len(x.shape) > 1:
            # mono: take the first channel only (no downmix/averaging)
            x = x[0]

        if x.dtype == np.int16:
            x = x.astype(np.float32) / 32768

        if sr != self.sr:
            x = librosa.resample(x, orig_sr=sr, target_sr=self.sr)

        outs = []
        num_samples = self.num_samples

        # Zero-pad the tail so the waveform splits into whole chunks.
        if x.shape[0] % num_samples:
            pad_num = num_samples - (x.shape[0] % num_samples)
            x = np.pad(x, ((0, pad_num)), 'constant')

        for i in range(0, x.shape[0], num_samples):
            wavs_batch = x[i:i + num_samples]
            out_chunk = self.__call__(wavs_batch)
            outs.append(out_chunk)

        stacked = np.concatenate(outs, axis=-1)
        return stacked
models/vad/onnx/SileroVAD (AXERA-TECH)/StreamVAD.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from SileroOrt import SileroOrt
2
+ import numpy as np
3
+ from datetime import datetime, timedelta
4
+
5
+
6
class StreamVAD:
    """Streaming voice-activity detector built on SileroOrt.

    Feed audio through run(); it is a generator that yields one dict per
    detected speech segment once the trailing silence exceeds silence_ms.
    State (counters, buffered audio, model recurrent state) persists across
    run() calls, so successive streamed chunks continue the same detection.
    """

    def __init__(self, model_path,
                 sensitivity=0.5,
                 silence_ms=200,
                 datetime_format='%Y-%m-%d %H:%M:%S.%f'):
        '''
        model_path: path of silero_vad.onnx
        sensitivity: thresh of voice activation,
                     higher means more sensitive,
                     hence, low speech prob thresh
                     (a chunk counts as speech when prob > 1 - sensitivity)
        silence_ms: pop audio after silence for silence_ms milliseconds
        datetime_format: format of datetime in return data
        '''

        self.model = SileroOrt(model_path)
        self.sensitivity = sensitivity
        self.silence_ms = silence_ms
        self.datetime_format = datetime_format

        self.reset()


    def reset(self):
        # Clear per-segment state: consecutive-silence and speech chunk
        # counters, the result template, buffered audio slices, and the
        # model's recurrent state.
        self.silence_count = 0
        self.speech_count = 0
        # Rebinding a fresh dict here is what keeps a previously-yielded
        # result intact in the consumer's hands.
        self.return_data = {
            "start_ts": '',
            "end_ts": '',
            "audio": None
        }
        self.vad_data_list = []
        self.model.reset_states()


    def run(self, audio: np.ndarray, sr: int = 16000):
        """Generator: run VAD over `audio` and yield completed speech segments.

        Each yielded dict has 'start_ts'/'end_ts' (strftime-formatted wall-clock
        timestamps) and 'audio' (concatenated numpy samples at the input rate).
        Must be iterated for any processing to happen (it is a generator).
        """
        # record datetime — segment timestamps are derived from "now" at call
        # time, advanced by chunk duration below, not from the stream itself
        cur_ts = datetime.now()

        # freq scale
        # NOTE(review): assumes sr is an integer multiple of the model rate
        # (e.g. 16k/32k/48k). For sr < model.sr this truncates to 0 and the
        # slices below become empty; for non-multiples slicing drifts — confirm
        # supported input rates with callers.
        freq_scale = int(sr / self.model.sr)

        # inference — one speech probability per model chunk
        speech_probs = self.model.audio_forward(audio, sr)[0]

        for i, prob in enumerate(speech_probs):
            # slice of the ORIGINAL (un-resampled) audio covered by chunk i
            audio_slice = audio[i * self.model.num_samples * freq_scale : (i + 1) * self.model.num_samples * freq_scale]
            ts = cur_ts.strftime(self.datetime_format)

            # is speech
            if prob > 1 - self.sensitivity:
                self.silence_count = 0
                # new speech segment
                if self.speech_count == 0:
                    self.return_data['start_ts'] = ts

                self.speech_count += 1
                self.vad_data_list.append(audio_slice)
            # silence
            else:
                # silence before any speech is discarded entirely
                if self.speech_count > 0:
                    self.silence_count += 1

                    # exceed silence limit
                    if 1000 * self.silence_count * self.model.num_samples / self.model.sr > self.silence_ms:
                        # return audio segment
                        self.return_data['end_ts'] = ts
                        self.return_data['audio'] = np.concatenate(self.vad_data_list, axis=-1)

                        yield self.return_data

                        self.reset()
                    else:
                        # tolerated in-segment silence is kept in the segment
                        self.vad_data_list.append(audio_slice)

            # timestamp: advance wall clock by one chunk's duration
            cur_ts += timedelta(seconds=self.model.num_samples / self.model.sr)
models/vad/onnx/SileroVAD (AXERA-TECH)/config.json ADDED
File without changes
models/vad/onnx/SileroVAD (AXERA-TECH)/demo.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89f17d9c94c4b31eb320f424628bcbc920abaddbee6e2760fd868bfb1d9a2e47
3
+ size 1920044
models/vad/onnx/SileroVAD (AXERA-TECH)/gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ output
2
+ __pycache__
3
+ .vscode
models/vad/onnx/SileroVAD (AXERA-TECH)/gradio.png ADDED
models/vad/onnx/SileroVAD (AXERA-TECH)/gradio_app.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
from StreamVAD import StreamVAD
from dataclasses import dataclass, field


# Module-level VAD instance shared by the app.
# NOTE(review): this state is shared across all Gradio sessions — concurrent
# users would interleave audio into one detector; confirm single-user usage.
vad = StreamVAD(
    'silero_vad.onnx',
    sensitivity=0.5,
    silence_ms=200
)

@dataclass
class AppState:
    # Chat history rendered in the gr.Chatbot (list of gr.ChatMessage).
    history: list = field(default_factory=list)
15
+
16
+
17
def process_audio(audio, state):
    """Feed one streamed microphone chunk into the VAD; return updated history.

    audio: tuple of (sample_rate, int16 numpy array) from gr.Audio streaming
    state: AppState carrying the chat history across stream events
    """
    sample_rate, samples = audio
    for segment in vad.run(samples, sample_rate):
        if not segment:
            continue
        # Render each detected speech segment as a playable audio bubble
        # labeled with its start/end timestamps.
        clip = gr.Audio(
            label=f"{segment['start_ts']} - {segment['end_ts']}",
            value=(sample_rate, segment['audio']),
            waveform_options=gr.WaveformOptions(show_recording_waveform=False),
            editable=False,
        )
        state.history.append(gr.ChatMessage(role='user', content=clip))

    return state.history
34
+
35
+
36
with gr.Blocks() as demo:
    # per-session state holding the chat history
    state = gr.State(value=AppState())

    with gr.Row():
        chatbot = gr.Chatbot(type='messages')

    with gr.Row():
        input_audio = gr.Audio(sources=['microphone'], type='numpy', streaming=True)

    # streaming process: every microphone chunk is pushed through the VAD and
    # the chatbot re-renders the accumulated segments
    input_audio.stream(fn=process_audio, inputs=[input_audio, state], outputs=[chatbot])

demo.launch(debug=True)
models/vad/onnx/SileroVAD (AXERA-TECH)/main.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ from StreamVAD import StreamVAD
3
+ import os
4
+ import librosa
5
+ import soundfile as sf
6
+
7
+
8
def get_args():
    """Build and parse the command-line arguments for the VAD CLI."""
    cli = argparse.ArgumentParser()
    cli.add_argument('--input', type=str, required=True, help='Input audio file')
    cli.add_argument('--model', type=str, default='./silero_vad.onnx')
    cli.add_argument('--output_dir', type=str, default='output', help='Output audio dir')
    return cli.parse_args()
14
+
15
+
16
def main():
    """Segment an audio file with StreamVAD, writing each segment to its own wav."""
    args = get_args()
    os.makedirs(args.output_dir, exist_ok=True)

    detector = StreamVAD(args.model,
                         sensitivity=0.5,
                         silence_ms=200)

    # Load mono audio resampled to the model's native rate.
    waveform, _ = librosa.load(args.input, sr=detector.model.sr, mono=True)

    # Number output files 0.wav, 1.wav, ... only for non-empty results.
    for index, segment in enumerate(s for s in detector.run(waveform, detector.model.sr) if s):
        print(segment)
        out_path = os.path.join(args.output_dir, f"{index}.wav")
        sf.write(out_path, segment['audio'], samplerate=detector.model.sr)


if __name__ == '__main__':
    main()
models/vad/onnx/SileroVAD (AXERA-TECH)/requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ onnxruntime==1.17.0
2
+ librosa
3
+ numpy<2.0
4
+ samplerate
5
+ resampy
6
+ soundfile
models/vad/onnx/SileroVAD (AXERA-TECH)/silero_vad.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:144f7a8e8db2bbe7e90407f966ec811cbcdc7258fffbc867798597a33c957118
3
+ size 1247953
models/vad/onnx/SileroVAD (AXERA-TECH)/source.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://huggingface.co/AXERA-TECH/SileroVAD
models/vad/onnx/silero-vad (luvox-ai)/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
models/vad/onnx/silero-vad (luvox-ai)/silero_vad.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a153a22f4509e292a94e67d6f9b85e8deb25b4988682b7e174c65279d8788e3
3
+ size 2327524
models/vad/onnx/silero-vad (luvox-ai)/silero_vad_16k_op15.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ed98ddbad84ccac4cd0aeb3099049280713df825c610a8ed34543318f1b2c49
3
+ size 1289603
models/vad/onnx/silero-vad (luvox-ai)/source.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://huggingface.co/luvox-ai/silero-vad