niobures commited on
Commit
78d75bc
·
verified ·
1 Parent(s): 6886c05

SenseVoice (SenseVoiceSmall-RKNN2)

Browse files
.gitattributes CHANGED
@@ -47,3 +47,8 @@ SenseVoice/sensevoice.cpp-binaries-for-jetson-nano-jetpack-4/lib/libggml-base.so
47
  SenseVoice/sensevoice.cpp-binaries-for-jetson-nano-jetpack-4/lib/libggml-cpu.so filter=lfs diff=lfs merge=lfs -text
48
  SenseVoice/sensevoice.cpp-binaries-for-jetson-nano-jetpack-4/lib/libggml-cuda.so filter=lfs diff=lfs merge=lfs -text
49
  SenseVoice/sensevoice.cpp-binaries-for-jetson-nano-jetpack-4/lib/libsense-voice-core.a filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
47
  SenseVoice/sensevoice.cpp-binaries-for-jetson-nano-jetpack-4/lib/libggml-cpu.so filter=lfs diff=lfs merge=lfs -text
48
  SenseVoice/sensevoice.cpp-binaries-for-jetson-nano-jetpack-4/lib/libggml-cuda.so filter=lfs diff=lfs merge=lfs -text
49
  SenseVoice/sensevoice.cpp-binaries-for-jetson-nano-jetpack-4/lib/libsense-voice-core.a filter=lfs diff=lfs merge=lfs -text
50
+ SenseVoice/SenseVoiceSmall-RKNN2/chinese.wav filter=lfs diff=lfs merge=lfs -text
51
+ SenseVoice/SenseVoiceSmall-RKNN2/english.wav filter=lfs diff=lfs merge=lfs -text
52
+ SenseVoice/SenseVoiceSmall-RKNN2/librknnrt.so filter=lfs diff=lfs merge=lfs -text
53
+ SenseVoice/SenseVoiceSmall-RKNN2/rknn_toolkit_lite2-2.3.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl filter=lfs diff=lfs merge=lfs -text
54
+ SenseVoice/SenseVoiceSmall-RKNN2/sense-voice-encoder.rknn filter=lfs diff=lfs merge=lfs -text
SenseVoice/SenseVoiceSmall-RKNN2/.gitattributes ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ output.wav filter=lfs diff=lfs merge=lfs -text
37
+ sense-voice-encoder.rknn filter=lfs diff=lfs merge=lfs -text
38
+ chinese.wav filter=lfs diff=lfs merge=lfs -text
39
+ english.wav filter=lfs diff=lfs merge=lfs -text
40
+ librknnrt.so filter=lfs diff=lfs merge=lfs -text
41
+ rknn_toolkit_lite2-2.3.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl filter=lfs diff=lfs merge=lfs -text
SenseVoice/SenseVoiceSmall-RKNN2/.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ myenv/
SenseVoice/SenseVoiceSmall-RKNN2/.huggingfaceignore ADDED
@@ -0,0 +1 @@
 
 
1
+ myenv/
SenseVoice/SenseVoiceSmall-RKNN2/README.md ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: agpl-3.0
3
+ language:
4
+ - en
5
+ - zh
6
+ - ja
7
+ - ko
8
+ base_model: lovemefan/SenseVoice-onnx
9
+ tags:
10
+ - rknn
11
+ ---
12
+
13
+ # SenseVoiceSmall-RKNN2
14
+
15
+ SenseVoice is an audio foundation model with audio understanding capabilities, including Automatic Speech Recognition (ASR), Language Identification (LID), Speech Emotion Recognition (SER), and Acoustic Event Classification (AEC) or Acoustic Event Detection (AED).
16
+
17
+ Currently, SenseVoice-small supports multilingual speech recognition, emotion recognition, and event detection for Chinese, Cantonese, English, Japanese, and Korean, with extremely low inference latency.
18
+
19
+ - Inference speed (RKNN2): About 20x real-time on a single NPU core of RK3588 (processing 20 seconds of audio per second), approximately 6 times faster than the official whisper model provided in the rknn-model-zoo.
20
+ - Memory usage (RKNN2): About 1.1GB
21
+
22
+ ## Usage
23
+
24
+ 1. Clone the project to your local machine
25
+
26
+ 2. Install dependencies
27
+
28
+ ```bash
29
+ pip install kaldi_native_fbank onnxruntime sentencepiece soundfile pyyaml numpy<2
30
+
31
+ pip install rknn_toolkit_lite2-2.3.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
32
+ ```
33
+ [Source](https://github.com/airockchip/rknn-toolkit2/blob/master/rknn-toolkit-lite2/packages/rknn_toolkit_lite2-2.3.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl) of the .whl file:
34
+
35
+ 3. Copy librknnt.so to /usr/lib/
36
+
37
+ Source of librknnt.so: https://github.com/airockchip/rknn-toolkit2/blob/master/rknpu2/runtime/Linux/librknn_api/aarch64/librknnrt.so
38
+
39
+ 4. Run
40
+
41
+ ```bash
42
+ python ./sensevoice_rknn.py --audio_file english.wav
43
+ ```
44
+
45
+ If you find that recognition is not working correctly when testing with your own audio files, you may need to convert them to 16kHz, 16-bit, mono WAV format in advance.
46
+
47
+ ```bash
48
+ ffmpeg -i input.mp3 -f wav -acodec pcm_s16le -ac 1 -ar 16000 output.wav
49
+ ```
50
+
51
+ ## RKNN Model Conversion
52
+
53
+ You need to install rknn-toolkit2 v2.1.0 or higher in advance.
54
+
55
+ 1. Download or convert the ONNX model
56
+
57
+ You can download the ONNX model from https://huggingface.co/lovemefan/SenseVoice-onnx.
58
+ It should also be possible to convert from a PyTorch model to an ONNX model according to the documentation at https://github.com/FunAudioLLM/SenseVoice.
59
+
60
+ The model file should be named 'sense-voice-encoder.onnx' and placed in the same directory as the conversion script.
61
+
62
+ 2. Convert to RKNN model
63
+ ```bash
64
+ python convert_rknn.py
65
+ ```
66
+
67
+ ## Known Issues
68
+
69
+ - When using fp16 inference with RKNN2, overflow may occur, resulting in inf values. You can try modifying the scaling ratio of the input data to resolve this.
70
+ Set `SPEECH_SCALE` to a smaller value in `sensevoice_rknn.py`.
71
+
72
+ ## References
73
+ - [FunAudioLLM/SenseVoiceSmall](https://huggingface.co/FunAudioLLM/SenseVoiceSmall)
74
+ - [lovemefan/SenseVoice-python](https://github.com/lovemefan/SenseVoice-python)
75
+
76
+
77
+ ## FastAPI Transcription Server
78
+
79
+ This project includes a FastAPI server (`server.py`) that provides an HTTP endpoint for speech-to-text transcription.
80
+
81
+ ### Running the Server
82
+
83
+ 1. Ensure all dependencies for `sensevoice_rknn.py` and the server are installed. This includes `fastapi` and `uvicorn`:
84
+ ```bash
85
+ pip install fastapi uvicorn
86
+ ```
87
+ 2. Place the required model files (`*.rknn`, `*.onnx`, `spm.model`) in the same directory as `server.py`.
88
+ 3. Run the server:
89
+ ```bash
90
+ python server.py
91
+ ```
92
+ The server will start on `http://0.0.0.0:8000` by default.
93
+
94
+ ### API Endpoint: `/transcribe`
95
+
96
+ * **Method:** `POST`
97
+ * **Description:** Transcribes the audio file specified in the request.
98
+ * **Request Body:** JSON object with the following fields:
99
+ * `audio_file_path` (string, required): The absolute path to the WAV audio file on the server's filesystem.
100
+ * `language` (string, optional, default: `"en"`): The language code for transcription. Supported codes depend on the model (e.g., "en", "zh", "ja", "ko").
101
+ * `use_itn` (boolean, optional, default: `false`): Whether to apply Inverse Text Normalization to the transcription output.
102
+
103
+ * **Example Request (`curl`):**
104
+ ```bash
105
+ curl -X POST -H "Content-Type: application/json" \
106
+ -d '{"audio_file_path": "/path/to/your/audio.wav", "language": "en", "use_itn": false}' \
107
+ http://0.0.0.0:8000/transcribe
108
+ ```
109
+
110
+ * **Response Body:** JSON object with the following fields:
111
+ * `full_transcription` (string): The complete transcribed text, including any special tokens from the model.
112
+ * `segments` (list of objects): A list where each object represents a transcribed audio segment and contains:
113
+ * `start_time_s` (float): Start time of the segment in seconds.
114
+ * `end_time_s` (float): End time of the segment in seconds.
115
+ * `text` (string): Transcribed text for the segment.
116
+
117
+ * **Example Response:**
118
+ ```json
119
+ {
120
+ "full_transcription": "<|en|><|HAPPY|><|Speech|><|woitn|>the stale smell of old beer lingers <|en|><|NEUTRAL|><|Speech|><|woitn|>it takes heat to bring out the odor but <|en|><|HAPPY|><|Speech|><|woitn|>a cold dip restores health and zest a salt pickle tastes fine with ham tacos al pastor are my favorite <|en|><|EMO_UNKNOWN|><|Speech|><|woitn|>a zestful food is the hot cross bun",
121
+ "segments": [
122
+ {
123
+ "start_time_s": 1.01,
124
+ "end_time_s": 3.93,
125
+ "text": "<|en|><|HAPPY|><|Speech|><|woitn|>the stale smell of old beer lingers"
126
+ },
127
+ {
128
+ "start_time_s": 4.21,
129
+ "end_time_s": 6.59,
130
+ "text": "<|en|><|NEUTRAL|><|Speech|><|woitn|>it takes heat to bring out the odor but"
131
+ },
132
+ {
133
+ "start_time_s": 6.87,
134
+ "end_time_s": 14.68,
135
+ "text": "<|en|><|HAPPY|><|Speech|><|woitn|>a cold dip restores health and zest a salt pickle tastes fine with ham tacos al pastor are my favorite"
136
+ },
137
+ {
138
+ "start_time_s": 14.96,
139
+ "end_time_s": 18.34,
140
+ "text": "<|en|><|EMO_UNKNOWN|><|Speech|><|woitn|>a zestful food is the hot cross bun"
141
+ }
142
+ ]
143
+ }
144
+ ```
145
+
SenseVoice/SenseVoiceSmall-RKNN2/am.mvn ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ <Nnet>
2
+ <Splice> 560 560
3
+ [ 0 ]
4
+ <AddShift> 560 560
5
+ <LearnRateCoef> 0 [ -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 
-13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 
-8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 ]
6
+ <Rescale> 560 560
7
+ <LearnRateCoef> 0 [ 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 
0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 
0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 ]
8
+ </Nnet>
SenseVoice/SenseVoiceSmall-RKNN2/chinese.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6f02d2c58b9a8a294a306ccb60bdf667587d74984915a3ec87a6de5e04bb020
3
+ size 1289994
SenseVoice/SenseVoiceSmall-RKNN2/chn_jpn_yue_eng_ko_spectok.bpe.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa87f86064c3730d799ddf7af3c04659151102cba548bce325cf06ba4da4e6a8
3
+ size 377341
SenseVoice/SenseVoiceSmall-RKNN2/convert_rknn.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+
4
+ import os
5
+ from rknn.api import RKNN
6
+ from math import exp
7
+ from sys import exit
8
+ import argparse
9
+ import onnxscript
10
+ from onnxscript.rewriter import pattern
11
+ import onnx.numpy_helper as onh
12
+ import numpy as np
13
+ import onnx
14
+ import onnxruntime as ort
15
+ from rknn.utils import onnx_edit
16
+
17
+ os.chdir(os.path.dirname(os.path.abspath(__file__)))
18
+
19
+ speech_length = 171
20
+
21
+ def convert_encoder():
22
+ rknn = RKNN(verbose=True)
23
+
24
+ ONNX_MODEL=f"sense-voice-encoder.onnx"
25
+ RKNN_MODEL=ONNX_MODEL.replace(".onnx",".rknn")
26
+ DATASET="dataset.txt"
27
+ QUANTIZE=False
28
+
29
+ #开局先给我来个大惊喜,rknn做第一步常量折叠的时候就会在这个子图里报错,所以要单独拿出来先跑一遍
30
+ #然后把这个子图的输出结果保存下来喂给rknn
31
+ onnx.utils.extract_model(ONNX_MODEL, "extract_model.onnx", ['speech_lengths'], ['/make_pad_mask/Cast_2_output_0'])
32
+ sess = ort.InferenceSession("extract_model.onnx", providers=['CPUExecutionProvider'])
33
+ extract_result = sess.run(None, {"speech_lengths": np.array([speech_length], dtype=np.int64)})[0]
34
+
35
+ # 删掉模型最后的多余transpose, 速度从365ms提升到350ms
36
+ ret = onnx_edit(model = ONNX_MODEL,
37
+ export_path = ONNX_MODEL.replace(".onnx", "_edited.onnx"),
38
+ # # 1, len, 25055 -> 1, 25055, 1, len # 这个是坏的, 我真服了,
39
+ # outputs_transform = {'encoder_out': 'a,b,c->a,c,1,b'},
40
+ outputs_transform = {'encoder_out': 'a,b,c->a,c,b'},
41
+ )
42
+ ONNX_MODEL = ONNX_MODEL.replace(".onnx", "_edited.onnx")
43
+
44
+ # pre-process config
45
+ print('--> Config model')
46
+ rknn.config(quantized_algorithm='normal', quantized_method='channel', target_platform='rk3588', optimization_level=3)
47
+ print('done')
48
+
49
+ # Load ONNX model
50
+ print("--> Loading model")
51
+ ret = rknn.load_onnx(
52
+ model=ONNX_MODEL,
53
+ inputs=["speech", "/make_pad_mask/Cast_2_output_0"],
54
+ input_size_list=[[1, speech_length, 560], [extract_result.shape[0], extract_result.shape[1]]],
55
+ input_initial_val=[None, extract_result],
56
+ # outputs=["output"]
57
+ )
58
+
59
+ if ret != 0:
60
+ print('Load model failed!')
61
+ exit(ret)
62
+ print('done')
63
+
64
+ # Build model
65
+ print('--> Building model')
66
+ ret = rknn.build(do_quantization=QUANTIZE, dataset=DATASET, rknn_batch_size=None)
67
+ if ret != 0:
68
+ print('Build model failed!')
69
+ exit(ret)
70
+ print('done')
71
+
72
+ # export
73
+ print('--> Export RKNN model')
74
+ ret = rknn.export_rknn(RKNN_MODEL)
75
+ if ret != 0:
76
+ print('Export RKNN model failed!')
77
+ exit(ret)
78
+ print('done')
79
+
80
+ # usage: python convert_rknn.py encoder|all
81
+
82
+ if __name__ == "__main__":
83
+ parser = argparse.ArgumentParser()
84
+ parser.add_argument("model", type=str, help="model to convert", choices=["encoder", "all"], nargs='?')
85
+ args = parser.parse_args()
86
+ if args.model is None:
87
+ args.model = "all"
88
+
89
+ if args.model == "encoder":
90
+ convert_encoder()
91
+ elif args.model == "all":
92
+ convert_encoder()
93
+ else:
94
+ print(f"Unknown model: {args.model}")
95
+ exit(1)
SenseVoice/SenseVoiceSmall-RKNN2/embedding.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83cf1fc5680fdf6d7edb411be5ce351cad4eca03b29a5bf5050aa19dfcc12267
3
+ size 35968
SenseVoice/SenseVoiceSmall-RKNN2/english.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9774197507f8ea811cb7f9f25b81c0e7b25bce55c125774b5ca4e832008727a1
3
+ size 587660
SenseVoice/SenseVoiceSmall-RKNN2/fsmn-am.mvn ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ <Nnet>
2
+ <Splice> 400 400
3
+ [ 0 ]
4
+ <AddShift> 400 400
5
+ <LearnRateCoef> 0 [ -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 
-13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 ]
6
+ <Rescale> 400 400
7
+ <LearnRateCoef> 0 [ 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 
0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 ]
8
+ </Nnet>
SenseVoice/SenseVoiceSmall-RKNN2/fsmn-config.yaml ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ WavFrontend:
2
+ frontend_conf:
3
+ fs: 16000
4
+ window: hamming
5
+ n_mels: 80
6
+ frame_length: 25
7
+ frame_shift: 10
8
+ dither: 0.0
9
+ lfr_m: 5
10
+ lfr_n: 1
11
+
12
+ FSMN:
13
+ use_cuda: False
14
+ CUDAExecutionProvider:
15
+ device_id: 0
16
+ arena_extend_strategy: kNextPowerOfTwo
17
+ cudnn_conv_algo_search: EXHAUSTIVE
18
+ do_copy_in_default_stream: true
19
+ encoder_conf:
20
+ input_dim: 400
21
+ input_affine_dim: 140
22
+ fsmn_layers: 4
23
+ linear_dim: 250
24
+ proj_dim: 128
25
+ lorder: 20
26
+ rorder: 0
27
+ lstride: 1
28
+ rstride: 0
29
+ output_affine_dim: 140
30
+ output_dim: 248
31
+
32
+ vadPostArgs:
33
+ sample_rate: 16000
34
+ detect_mode: 1
35
+ snr_mode: 0
36
+ max_end_silence_time: 800
37
+ max_start_silence_time: 3000
38
+ do_start_point_detection: True
39
+ do_end_point_detection: True
40
+ window_size_ms: 200
41
+ sil_to_speech_time_thres: 150
42
+ speech_to_sil_time_thres: 150
43
+ speech_2_noise_ratio: 1.0
44
+ do_extend: 1
45
+ lookback_time_start_point: 200
46
+ lookahead_time_end_point: 100
47
+ max_single_segment_time: 10000
48
+ snr_thres: -100.0
49
+ noise_frame_num_used_for_snr: 100
50
+ decibel_thres: -100.0
51
+ speech_noise_thres: 0.6
52
+ fe_prior_thres: 0.0001
53
+ silence_pdf_num: 1
54
+ sil_pdf_ids: [ 0 ]
55
+ speech_noise_thresh_low: -0.1
56
+ speech_noise_thresh_high: 0.3
57
+ output_frame_probs: False
58
+ frame_in_ms: 10
59
+ frame_length_ms: 25
SenseVoice/SenseVoiceSmall-RKNN2/fsmnvad-offline.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4bbd68b11519e916b6871ff6f8df15e2100936b256be9cb104cd63fb7c859965
3
+ size 1725472
SenseVoice/SenseVoiceSmall-RKNN2/librknnrt.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d31fc19c85b85f6091b2bd0f6af9d962d5264a4e410bfb536402ec92bac738e8
3
+ size 7726232
SenseVoice/SenseVoiceSmall-RKNN2/requirements.txt ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ cffi==1.17.1
2
+ coloredlogs==15.0.1
3
+ flatbuffers==25.2.10
4
+ humanfriendly==10.0
5
+ kaldi-native-fbank==1.21.2
6
+ mpmath==1.3.0
7
+ numpy==2.2.6
8
+ onnxruntime==1.22.0
9
+ packaging==25.0
10
+ protobuf==6.31.1
11
+ psutil==7.0.0
12
+ pycparser==2.22
13
+ PyYAML==6.0.2
14
+ # rknn-toolkit-lite2 @ file:./rknn_toolkit_lite2-2.3.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
15
+ ruamel.yaml==0.18.14
16
+ ruamel.yaml.clib==0.2.12
17
+ sentencepiece==0.2.0
18
+ soundfile==0.13.1
19
+ sympy==1.14.0
SenseVoice/SenseVoiceSmall-RKNN2/rknn_toolkit_lite2-2.3.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:821e80c95e6838308c133915660b1a6ae78bb8d079b2cbbd46a02dae61192d33
3
+ size 559386
SenseVoice/SenseVoiceSmall-RKNN2/sense-voice-encoder.rknn ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8db70c1a8d4887e35dff55ab0f5d8da283d32359bd1599ece51eb81f99a6f468
3
+ size 485687354
SenseVoice/SenseVoiceSmall-RKNN2/sensevoice_rknn.py ADDED
@@ -0,0 +1,1402 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # File: onnx/fsmn_vad_ort_session.py
2
+ # ```py
3
+
4
+ # -*- coding:utf-8 -*-
5
+ # @FileName :fsmn_vad_ort_session.py.py
6
+ # @Time :2024/8/31 16:45
7
+ # @Author :lovemefan
8
+ # @Email :lovemefan@outlook.com
9
+
10
+ import argparse
11
+ import logging
12
+ import math
13
+ import os
14
+ import time
15
+ import warnings
16
+ from enum import Enum
17
+ from pathlib import Path
18
+ from typing import Any, Dict, List, Tuple, Union
19
+
20
+ import kaldi_native_fbank as knf
21
+ import numpy as np
22
+ import sentencepiece as spm
23
+ import soundfile as sf
24
+ import yaml
25
+ from onnxruntime import (GraphOptimizationLevel, InferenceSession,
26
+ SessionOptions, get_available_providers, get_device)
27
+ from rknnlite.api.rknn_lite import RKNNLite
28
+
29
+ RKNN_INPUT_LEN = 171
30
+
31
+ SPEECH_SCALE = 1/2 # 因为是fp16推理,如果中间结果太大可能会溢出变inf,所以需要缩放一下
32
+
33
+ class VadOrtInferRuntimeSession:
34
+ def __init__(self, config, root_dir: Path):
35
+ sess_opt = SessionOptions()
36
+ sess_opt.log_severity_level = 4
37
+ sess_opt.enable_cpu_mem_arena = False
38
+ sess_opt.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
39
+
40
+ cuda_ep = "CUDAExecutionProvider"
41
+ cpu_ep = "CPUExecutionProvider"
42
+ cpu_provider_options = {
43
+ "arena_extend_strategy": "kSameAsRequested",
44
+ }
45
+
46
+ EP_list = []
47
+ if (
48
+ config["use_cuda"]
49
+ and get_device() == "GPU"
50
+ and cuda_ep in get_available_providers()
51
+ ):
52
+ EP_list = [(cuda_ep, config[cuda_ep])]
53
+ EP_list.append((cpu_ep, cpu_provider_options))
54
+
55
+ config["model_path"] = root_dir / str(config["model_path"])
56
+ self._verify_model(config["model_path"])
57
+ logging.info(f"Loading onnx model at {str(config['model_path'])}")
58
+ self.session = InferenceSession(
59
+ str(config["model_path"]), sess_options=sess_opt, providers=EP_list
60
+ )
61
+
62
+ if config["use_cuda"] and cuda_ep not in self.session.get_providers():
63
+ logging.warning(
64
+ f"{cuda_ep} is not available for current env, "
65
+ f"the inference part is automatically shifted to be "
66
+ f"executed under {cpu_ep}.\n "
67
+ "Please ensure the installed onnxruntime-gpu version"
68
+ " matches your cuda and cudnn version, "
69
+ "you can check their relations from the offical web site: "
70
+ "https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html",
71
+ RuntimeWarning,
72
+ )
73
+
74
+ def __call__(
75
+ self, input_content: List[Union[np.ndarray, np.ndarray]]
76
+ ) -> np.ndarray:
77
+ if isinstance(input_content, list):
78
+ input_dict = {
79
+ "speech": input_content[0],
80
+ "in_cache0": input_content[1],
81
+ "in_cache1": input_content[2],
82
+ "in_cache2": input_content[3],
83
+ "in_cache3": input_content[4],
84
+ }
85
+ else:
86
+ input_dict = {"speech": input_content}
87
+
88
+ return self.session.run(None, input_dict)
89
+
90
+ def get_input_names(
91
+ self,
92
+ ):
93
+ return [v.name for v in self.session.get_inputs()]
94
+
95
+ def get_output_names(
96
+ self,
97
+ ):
98
+ return [v.name for v in self.session.get_outputs()]
99
+
100
+ def get_character_list(self, key: str = "character"):
101
+ return self.meta_dict[key].splitlines()
102
+
103
+ def have_key(self, key: str = "character") -> bool:
104
+ self.meta_dict = self.session.get_modelmeta().custom_metadata_map
105
+ if key in self.meta_dict.keys():
106
+ return True
107
+ return False
108
+
109
+ @staticmethod
110
+ def _verify_model(model_path):
111
+ model_path = Path(model_path)
112
+ if not model_path.exists():
113
+ raise FileNotFoundError(f"{model_path} does not exists.")
114
+ if not model_path.is_file():
115
+ raise FileExistsError(f"{model_path} is not a file.")
116
+
117
+ # ```
118
+
119
+ # File: onnx/sense_voice_ort_session.py
120
+ # ```py
121
+ # -*- coding:utf-8 -*-
122
+ # @FileName :sense_voice_onnxruntime.py
123
+ # @Time :2024/7/17 20:53
124
+ # @Author :lovemefan
125
+ # @Email :lovemefan@outlook.com
126
+
127
+
128
+ formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
129
+ logging.basicConfig(format=formatter, level=logging.INFO)
130
+
131
+
132
+ class OrtInferRuntimeSession:
133
+ def __init__(self, model_file, device_id=-1, intra_op_num_threads=4):
134
+ device_id = str(device_id)
135
+ sess_opt = SessionOptions()
136
+ sess_opt.intra_op_num_threads = intra_op_num_threads
137
+ sess_opt.log_severity_level = 4
138
+ sess_opt.enable_cpu_mem_arena = False
139
+ sess_opt.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
140
+
141
+ cuda_ep = "CUDAExecutionProvider"
142
+ cuda_provider_options = {
143
+ "device_id": device_id,
144
+ "arena_extend_strategy": "kNextPowerOfTwo",
145
+ "cudnn_conv_algo_search": "EXHAUSTIVE",
146
+ "do_copy_in_default_stream": "true",
147
+ }
148
+ cpu_ep = "CPUExecutionProvider"
149
+ cpu_provider_options = {
150
+ "arena_extend_strategy": "kSameAsRequested",
151
+ }
152
+
153
+ EP_list = []
154
+ if (
155
+ device_id != "-1"
156
+ and get_device() == "GPU"
157
+ and cuda_ep in get_available_providers()
158
+ ):
159
+ EP_list = [(cuda_ep, cuda_provider_options)]
160
+ EP_list.append((cpu_ep, cpu_provider_options))
161
+
162
+ self._verify_model(model_file)
163
+
164
+ self.session = InferenceSession(
165
+ model_file, sess_options=sess_opt, providers=EP_list
166
+ )
167
+
168
+ # delete binary of model file to save memory
169
+ del model_file
170
+
171
+ if device_id != "-1" and cuda_ep not in self.session.get_providers():
172
+ warnings.warn(
173
+ f"{cuda_ep} is not avaiable for current env, the inference part is automatically shifted to be executed under {cpu_ep}.\n"
174
+ "Please ensure the installed onnxruntime-gpu version matches your cuda and cudnn version, "
175
+ "you can check their relations from the offical web site: "
176
+ "https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html",
177
+ RuntimeWarning,
178
+ )
179
+
180
+ def __call__(self, input_content) -> np.ndarray:
181
+ input_dict = dict(zip(self.get_input_names(), input_content))
182
+ try:
183
+ result = self.session.run(self.get_output_names(), input_dict)
184
+ return result
185
+ except Exception as e:
186
+ print(e)
187
+ raise RuntimeError(f"ONNXRuntime inferece failed. ") from e
188
+
189
+ def get_input_names(
190
+ self,
191
+ ):
192
+ return [v.name for v in self.session.get_inputs()]
193
+
194
+ def get_output_names(
195
+ self,
196
+ ):
197
+ return [v.name for v in self.session.get_outputs()]
198
+
199
+ def get_character_list(self, key: str = "character"):
200
+ return self.meta_dict[key].splitlines()
201
+
202
+ def have_key(self, key: str = "character") -> bool:
203
+ self.meta_dict = self.session.get_modelmeta().custom_metadata_map
204
+ if key in self.meta_dict.keys():
205
+ return True
206
+ return False
207
+
208
+ @staticmethod
209
+ def _verify_model(model_path):
210
+ model_path = Path(model_path)
211
+ if not model_path.exists():
212
+ raise FileNotFoundError(f"{model_path} does not exists.")
213
+ if not model_path.is_file():
214
+ raise FileExistsError(f"{model_path} is not a file.")
215
+
216
+
217
+ def log_softmax(x: np.ndarray) -> np.ndarray:
218
+ # Subtract the maximum value in each row for numerical stability
219
+ x_max = np.max(x, axis=-1, keepdims=True)
220
+ # Calculate the softmax of x
221
+ softmax = np.exp(x - x_max)
222
+ softmax_sum = np.sum(softmax, axis=-1, keepdims=True)
223
+ softmax = softmax / softmax_sum
224
+ # Calculate the log of the softmax values
225
+ return np.log(softmax)
226
+
227
+
228
+ class SenseVoiceInferenceSession:
229
+ def __init__(
230
+ self,
231
+ embedding_model_file,
232
+ encoder_model_file,
233
+ bpe_model_file,
234
+ device_id=-1,
235
+ intra_op_num_threads=4,
236
+ ):
237
+ logging.info(f"Loading model from {embedding_model_file}")
238
+
239
+ self.embedding = np.load(embedding_model_file)
240
+ logging.info(f"Loading model {encoder_model_file}")
241
+ start = time.time()
242
+ self.encoder = RKNNLite(verbose=False)
243
+ self.encoder.load_rknn(encoder_model_file)
244
+ self.encoder.init_runtime()
245
+
246
+ logging.info(
247
+ f"Loading {encoder_model_file} takes {time.time() - start:.2f} seconds"
248
+ )
249
+ self.blank_id = 0
250
+ self.sp = spm.SentencePieceProcessor()
251
+ self.sp.load(bpe_model_file)
252
+
253
+ def __call__(self, speech, language: int, use_itn: bool) -> np.ndarray:
254
+ language_query = self.embedding[[[language]]]
255
+
256
+ # 14 means with itn, 15 means without itn
257
+ text_norm_query = self.embedding[[[14 if use_itn else 15]]]
258
+ event_emo_query = self.embedding[[[1, 2]]]
259
+
260
+ # scale the speech
261
+ speech = speech * SPEECH_SCALE
262
+
263
+ input_content = np.concatenate(
264
+ [
265
+ language_query,
266
+ event_emo_query,
267
+ text_norm_query,
268
+ speech,
269
+ ],
270
+ axis=1,
271
+ ).astype(np.float32)
272
+ print(input_content.shape)
273
+ # pad [1, len, ...] to [1, RKNN_INPUT_LEN, ... ]
274
+ input_content = np.pad(input_content, ((0, 0), (0, RKNN_INPUT_LEN - input_content.shape[1]), (0, 0)))
275
+ print("padded shape:", input_content.shape)
276
+ start_time = time.time()
277
+ encoder_out = self.encoder.inference(inputs=[input_content])[0]
278
+ end_time = time.time()
279
+ print(f"encoder inference time: {end_time - start_time:.2f} seconds")
280
+ # print(encoder_out)
281
+ def unique_consecutive(arr):
282
+ if len(arr) == 0:
283
+ return arr
284
+ # Create a boolean mask where True indicates the element is different from the previous one
285
+ mask = np.append([True], arr[1:] != arr[:-1])
286
+ out = arr[mask]
287
+ out = out[out != self.blank_id]
288
+ return out.tolist()
289
+
290
+ #现在shape变成了1, n_vocab, n_seq. 这里axis需要改一下
291
+ # hypos = unique_consecutive(encoder_out[0].argmax(axis=-1))
292
+ hypos = unique_consecutive(encoder_out[0].argmax(axis=0))
293
+ text = self.sp.DecodeIds(hypos)
294
+ return text
295
+
296
+ # ```
297
+
298
+ # File: utils/frontend.py
299
+ # ```py
300
+ # -*- coding:utf-8 -*-
301
+ # @FileName :frontend.py
302
+ # @Time :2024/7/18 09:39
303
+ # @Author :lovemefan
304
+ # @Email :lovemefan@outlook.com
305
+
306
+ class WavFrontend:
307
+ """Conventional frontend structure for ASR."""
308
+
309
+ def __init__(
310
+ self,
311
+ cmvn_file: str = None,
312
+ fs: int = 16000,
313
+ window: str = "hamming",
314
+ n_mels: int = 80,
315
+ frame_length: int = 25,
316
+ frame_shift: int = 10,
317
+ lfr_m: int = 7,
318
+ lfr_n: int = 6,
319
+ dither: float = 0,
320
+ **kwargs,
321
+ ) -> None:
322
+ opts = knf.FbankOptions()
323
+ opts.frame_opts.samp_freq = fs
324
+ opts.frame_opts.dither = dither
325
+ opts.frame_opts.window_type = window
326
+ opts.frame_opts.frame_shift_ms = float(frame_shift)
327
+ opts.frame_opts.frame_length_ms = float(frame_length)
328
+ opts.mel_opts.num_bins = n_mels
329
+ opts.energy_floor = 0
330
+ opts.frame_opts.snip_edges = True
331
+ opts.mel_opts.debug_mel = False
332
+ self.opts = opts
333
+
334
+ self.lfr_m = lfr_m
335
+ self.lfr_n = lfr_n
336
+ self.cmvn_file = cmvn_file
337
+
338
+ if self.cmvn_file:
339
+ self.cmvn = self.load_cmvn()
340
+ self.fbank_fn = None
341
+ self.fbank_beg_idx = 0
342
+ self.reset_status()
343
+
344
+ def reset_status(self):
345
+ self.fbank_fn = knf.OnlineFbank(self.opts)
346
+ self.fbank_beg_idx = 0
347
+
348
+ def fbank(self, waveform: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
349
+ waveform = waveform * (1 << 15)
350
+ self.fbank_fn = knf.OnlineFbank(self.opts)
351
+ self.fbank_fn.accept_waveform(self.opts.frame_opts.samp_freq, waveform.tolist())
352
+ frames = self.fbank_fn.num_frames_ready
353
+ mat = np.empty([frames, self.opts.mel_opts.num_bins])
354
+ for i in range(frames):
355
+ mat[i, :] = self.fbank_fn.get_frame(i)
356
+ feat = mat.astype(np.float32)
357
+ feat_len = np.array(mat.shape[0]).astype(np.int32)
358
+ return feat, feat_len
359
+
360
+ def lfr_cmvn(self, feat: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
361
+ if self.lfr_m != 1 or self.lfr_n != 1:
362
+ feat = self.apply_lfr(feat, self.lfr_m, self.lfr_n)
363
+
364
+ if self.cmvn_file:
365
+ feat = self.apply_cmvn(feat)
366
+
367
+ feat_len = np.array(feat.shape[0]).astype(np.int32)
368
+ return feat, feat_len
369
+
370
+ def load_audio(self, filename: str) -> Tuple[np.ndarray, int]:
371
+ data, sample_rate = sf.read(
372
+ filename,
373
+ always_2d=True,
374
+ dtype="float32",
375
+ )
376
+ assert (
377
+ sample_rate == 16000
378
+ ), f"Only 16000 Hz is supported, but got {sample_rate}Hz"
379
+ self.sample_rate = sample_rate
380
+ data = data[:, 0] # use only the first channel
381
+ samples = np.ascontiguousarray(data)
382
+
383
+ return samples, sample_rate
384
+
385
+ @staticmethod
386
+ def apply_lfr(inputs: np.ndarray, lfr_m: int, lfr_n: int) -> np.ndarray:
387
+ LFR_inputs = []
388
+
389
+ T = inputs.shape[0]
390
+ T_lfr = int(np.ceil(T / lfr_n))
391
+ left_padding = np.tile(inputs[0], ((lfr_m - 1) // 2, 1))
392
+ inputs = np.vstack((left_padding, inputs))
393
+ T = T + (lfr_m - 1) // 2
394
+ for i in range(T_lfr):
395
+ if lfr_m <= T - i * lfr_n:
396
+ LFR_inputs.append(
397
+ (inputs[i * lfr_n : i * lfr_n + lfr_m]).reshape(1, -1)
398
+ )
399
+ else:
400
+ # process last LFR frame
401
+ num_padding = lfr_m - (T - i * lfr_n)
402
+ frame = inputs[i * lfr_n :].reshape(-1)
403
+ for _ in range(num_padding):
404
+ frame = np.hstack((frame, inputs[-1]))
405
+
406
+ LFR_inputs.append(frame)
407
+ LFR_outputs = np.vstack(LFR_inputs).astype(np.float32)
408
+ return LFR_outputs
409
+
410
+ def apply_cmvn(self, inputs: np.ndarray) -> np.ndarray:
411
+ """
412
+ Apply CMVN with mvn data
413
+ """
414
+ frame, dim = inputs.shape
415
+ means = np.tile(self.cmvn[0:1, :dim], (frame, 1))
416
+ vars = np.tile(self.cmvn[1:2, :dim], (frame, 1))
417
+ inputs = (inputs + means) * vars
418
+ return inputs
419
+
420
+ def get_features(self, inputs: Union[str, np.ndarray]) -> Tuple[np.ndarray, int]:
421
+ if isinstance(inputs, str):
422
+ inputs, _ = self.load_audio(inputs)
423
+
424
+ fbank, _ = self.fbank(inputs)
425
+ feats = self.apply_cmvn(self.apply_lfr(fbank, self.lfr_m, self.lfr_n))
426
+ return feats
427
+
428
+ def load_cmvn(
429
+ self,
430
+ ) -> np.ndarray:
431
+ with open(self.cmvn_file, "r", encoding="utf-8") as f:
432
+ lines = f.readlines()
433
+
434
+ means_list = []
435
+ vars_list = []
436
+ for i in range(len(lines)):
437
+ line_item = lines[i].split()
438
+ if line_item[0] == "<AddShift>":
439
+ line_item = lines[i + 1].split()
440
+ if line_item[0] == "<LearnRateCoef>":
441
+ add_shift_line = line_item[3 : (len(line_item) - 1)]
442
+ means_list = list(add_shift_line)
443
+ continue
444
+ elif line_item[0] == "<Rescale>":
445
+ line_item = lines[i + 1].split()
446
+ if line_item[0] == "<LearnRateCoef>":
447
+ rescale_line = line_item[3 : (len(line_item) - 1)]
448
+ vars_list = list(rescale_line)
449
+ continue
450
+
451
+ means = np.array(means_list).astype(np.float64)
452
+ vars = np.array(vars_list).astype(np.float64)
453
+ cmvn = np.array([means, vars])
454
+ return cmvn
455
+
456
+ # ```
457
+
458
+ # File: utils/fsmn_vad.py
459
+ # ```py
460
+ # -*- coding:utf-8 -*-
461
+ # @FileName :fsmn_vad.py
462
+ # @Time :2024/8/31 16:50
463
+ # @Author :lovemefan
464
+ # @Email :lovemefan@outlook.com
465
+
466
+
467
+
468
+ def read_yaml(yaml_path: Union[str, Path]) -> Dict:
469
+ if not Path(yaml_path).exists():
470
+ raise FileExistsError(f"The {yaml_path} does not exist.")
471
+
472
+ with open(str(yaml_path), "rb") as f:
473
+ data = yaml.load(f, Loader=yaml.Loader)
474
+ return data
475
+
476
+
477
+ class VadStateMachine(Enum):
478
+ kVadInStateStartPointNotDetected = 1
479
+ kVadInStateInSpeechSegment = 2
480
+ kVadInStateEndPointDetected = 3
481
+
482
+
483
+ class FrameState(Enum):
484
+ kFrameStateInvalid = -1
485
+ kFrameStateSpeech = 1
486
+ kFrameStateSil = 0
487
+
488
+
489
+ # final voice/unvoice state per frame
490
+ class AudioChangeState(Enum):
491
+ kChangeStateSpeech2Speech = 0
492
+ kChangeStateSpeech2Sil = 1
493
+ kChangeStateSil2Sil = 2
494
+ kChangeStateSil2Speech = 3
495
+ kChangeStateNoBegin = 4
496
+ kChangeStateInvalid = 5
497
+
498
+
499
+ class VadDetectMode(Enum):
500
+ kVadSingleUtteranceDetectMode = 0
501
+ kVadMutipleUtteranceDetectMode = 1
502
+
503
+
504
+ class VADXOptions:
505
+ def __init__(
506
+ self,
507
+ sample_rate: int = 16000,
508
+ detect_mode: int = VadDetectMode.kVadMutipleUtteranceDetectMode.value,
509
+ snr_mode: int = 0,
510
+ max_end_silence_time: int = 800,
511
+ max_start_silence_time: int = 3000,
512
+ do_start_point_detection: bool = True,
513
+ do_end_point_detection: bool = True,
514
+ window_size_ms: int = 200,
515
+ sil_to_speech_time_thres: int = 150,
516
+ speech_to_sil_time_thres: int = 150,
517
+ speech_2_noise_ratio: float = 1.0,
518
+ do_extend: int = 1,
519
+ lookback_time_start_point: int = 200,
520
+ lookahead_time_end_point: int = 100,
521
+ max_single_segment_time: int = 60000,
522
+ nn_eval_block_size: int = 8,
523
+ dcd_block_size: int = 4,
524
+ snr_thres: int = -100.0,
525
+ noise_frame_num_used_for_snr: int = 100,
526
+ decibel_thres: int = -100.0,
527
+ speech_noise_thres: float = 0.6,
528
+ fe_prior_thres: float = 1e-4,
529
+ silence_pdf_num: int = 1,
530
+ sil_pdf_ids: List[int] = [0],
531
+ speech_noise_thresh_low: float = -0.1,
532
+ speech_noise_thresh_high: float = 0.3,
533
+ output_frame_probs: bool = False,
534
+ frame_in_ms: int = 10,
535
+ frame_length_ms: int = 25,
536
+ ):
537
+ self.sample_rate = sample_rate
538
+ self.detect_mode = detect_mode
539
+ self.snr_mode = snr_mode
540
+ self.max_end_silence_time = max_end_silence_time
541
+ self.max_start_silence_time = max_start_silence_time
542
+ self.do_start_point_detection = do_start_point_detection
543
+ self.do_end_point_detection = do_end_point_detection
544
+ self.window_size_ms = window_size_ms
545
+ self.sil_to_speech_time_thres = sil_to_speech_time_thres
546
+ self.speech_to_sil_time_thres = speech_to_sil_time_thres
547
+ self.speech_2_noise_ratio = speech_2_noise_ratio
548
+ self.do_extend = do_extend
549
+ self.lookback_time_start_point = lookback_time_start_point
550
+ self.lookahead_time_end_point = lookahead_time_end_point
551
+ self.max_single_segment_time = max_single_segment_time
552
+ self.nn_eval_block_size = nn_eval_block_size
553
+ self.dcd_block_size = dcd_block_size
554
+ self.snr_thres = snr_thres
555
+ self.noise_frame_num_used_for_snr = noise_frame_num_used_for_snr
556
+ self.decibel_thres = decibel_thres
557
+ self.speech_noise_thres = speech_noise_thres
558
+ self.fe_prior_thres = fe_prior_thres
559
+ self.silence_pdf_num = silence_pdf_num
560
+ self.sil_pdf_ids = sil_pdf_ids
561
+ self.speech_noise_thresh_low = speech_noise_thresh_low
562
+ self.speech_noise_thresh_high = speech_noise_thresh_high
563
+ self.output_frame_probs = output_frame_probs
564
+ self.frame_in_ms = frame_in_ms
565
+ self.frame_length_ms = frame_length_ms
566
+
567
+
568
+ class E2EVadSpeechBufWithDoa(object):
569
+ def __init__(self):
570
+ self.start_ms = 0
571
+ self.end_ms = 0
572
+ self.buffer = []
573
+ self.contain_seg_start_point = False
574
+ self.contain_seg_end_point = False
575
+ self.doa = 0
576
+
577
+ def reset(self):
578
+ self.start_ms = 0
579
+ self.end_ms = 0
580
+ self.buffer = []
581
+ self.contain_seg_start_point = False
582
+ self.contain_seg_end_point = False
583
+ self.doa = 0
584
+
585
+
586
+ class E2EVadFrameProb(object):
587
+ def __init__(self):
588
+ self.noise_prob = 0.0
589
+ self.speech_prob = 0.0
590
+ self.score = 0.0
591
+ self.frame_id = 0
592
+ self.frm_state = 0
593
+
594
+
595
+ class WindowDetector(object):
596
+ def __init__(
597
+ self,
598
+ window_size_ms: int,
599
+ sil_to_speech_time: int,
600
+ speech_to_sil_time: int,
601
+ frame_size_ms: int,
602
+ ):
603
+ self.window_size_ms = window_size_ms
604
+ self.sil_to_speech_time = sil_to_speech_time
605
+ self.speech_to_sil_time = speech_to_sil_time
606
+ self.frame_size_ms = frame_size_ms
607
+
608
+ self.win_size_frame = int(window_size_ms / frame_size_ms)
609
+ self.win_sum = 0
610
+ self.win_state = [0] * self.win_size_frame # 初始化窗
611
+
612
+ self.cur_win_pos = 0
613
+ self.pre_frame_state = FrameState.kFrameStateSil
614
+ self.cur_frame_state = FrameState.kFrameStateSil
615
+ self.sil_to_speech_frmcnt_thres = int(sil_to_speech_time / frame_size_ms)
616
+ self.speech_to_sil_frmcnt_thres = int(speech_to_sil_time / frame_size_ms)
617
+
618
+ self.voice_last_frame_count = 0
619
+ self.noise_last_frame_count = 0
620
+ self.hydre_frame_count = 0
621
+
622
+ def reset(self) -> None:
623
+ self.cur_win_pos = 0
624
+ self.win_sum = 0
625
+ self.win_state = [0] * self.win_size_frame
626
+ self.pre_frame_state = FrameState.kFrameStateSil
627
+ self.cur_frame_state = FrameState.kFrameStateSil
628
+ self.voice_last_frame_count = 0
629
+ self.noise_last_frame_count = 0
630
+ self.hydre_frame_count = 0
631
+
632
+ def get_win_size(self) -> int:
633
+ return int(self.win_size_frame)
634
+
635
+ def detect_one_frame(
636
+ self, frameState: FrameState, frame_count: int
637
+ ) -> AudioChangeState:
638
+ cur_frame_state = FrameState.kFrameStateSil
639
+ if frameState == FrameState.kFrameStateSpeech:
640
+ cur_frame_state = 1
641
+ elif frameState == FrameState.kFrameStateSil:
642
+ cur_frame_state = 0
643
+ else:
644
+ return AudioChangeState.kChangeStateInvalid
645
+ self.win_sum -= self.win_state[self.cur_win_pos]
646
+ self.win_sum += cur_frame_state
647
+ self.win_state[self.cur_win_pos] = cur_frame_state
648
+ self.cur_win_pos = (self.cur_win_pos + 1) % self.win_size_frame
649
+
650
+ if (
651
+ self.pre_frame_state == FrameState.kFrameStateSil
652
+ and self.win_sum >= self.sil_to_speech_frmcnt_thres
653
+ ):
654
+ self.pre_frame_state = FrameState.kFrameStateSpeech
655
+ return AudioChangeState.kChangeStateSil2Speech
656
+
657
+ if (
658
+ self.pre_frame_state == FrameState.kFrameStateSpeech
659
+ and self.win_sum <= self.speech_to_sil_frmcnt_thres
660
+ ):
661
+ self.pre_frame_state = FrameState.kFrameStateSil
662
+ return AudioChangeState.kChangeStateSpeech2Sil
663
+
664
+ if self.pre_frame_state == FrameState.kFrameStateSil:
665
+ return AudioChangeState.kChangeStateSil2Sil
666
+ if self.pre_frame_state == FrameState.kFrameStateSpeech:
667
+ return AudioChangeState.kChangeStateSpeech2Speech
668
+ return AudioChangeState.kChangeStateInvalid
669
+
670
+ def frame_size_ms(self) -> int:
671
+ return int(self.frame_size_ms)
672
+
673
+
674
+ class E2EVadModel:
675
+ def __init__(self, config, vad_post_args: Dict[str, Any], root_dir: Path):
676
+ super(E2EVadModel, self).__init__()
677
+ self.vad_opts = VADXOptions(**vad_post_args)
678
+ self.windows_detector = WindowDetector(
679
+ self.vad_opts.window_size_ms,
680
+ self.vad_opts.sil_to_speech_time_thres,
681
+ self.vad_opts.speech_to_sil_time_thres,
682
+ self.vad_opts.frame_in_ms,
683
+ )
684
+ self.model = VadOrtInferRuntimeSession(config, root_dir)
685
+ self.all_reset_detection()
686
+
687
+ def all_reset_detection(self):
688
+ # init variables
689
+ self.is_final = False
690
+ self.data_buf_start_frame = 0
691
+ self.frm_cnt = 0
692
+ self.latest_confirmed_speech_frame = 0
693
+ self.lastest_confirmed_silence_frame = -1
694
+ self.continous_silence_frame_count = 0
695
+ self.vad_state_machine = VadStateMachine.kVadInStateStartPointNotDetected
696
+ self.confirmed_start_frame = -1
697
+ self.confirmed_end_frame = -1
698
+ self.number_end_time_detected = 0
699
+ self.sil_frame = 0
700
+ self.sil_pdf_ids = self.vad_opts.sil_pdf_ids
701
+ self.noise_average_decibel = -100.0
702
+ self.pre_end_silence_detected = False
703
+ self.next_seg = True
704
+
705
+ self.output_data_buf = []
706
+ self.output_data_buf_offset = 0
707
+ self.frame_probs = []
708
+ self.max_end_sil_frame_cnt_thresh = (
709
+ self.vad_opts.max_end_silence_time - self.vad_opts.speech_to_sil_time_thres
710
+ )
711
+ self.speech_noise_thres = self.vad_opts.speech_noise_thres
712
+ self.scores = None
713
+ self.scores_offset = 0
714
+ self.max_time_out = False
715
+ self.decibel = []
716
+ self.decibel_offset = 0
717
+ self.data_buf_size = 0
718
+ self.data_buf_all_size = 0
719
+ self.waveform = None
720
+ self.reset_detection()
721
+
722
+ def reset_detection(self):
723
+ self.continous_silence_frame_count = 0
724
+ self.latest_confirmed_speech_frame = 0
725
+ self.lastest_confirmed_silence_frame = -1
726
+ self.confirmed_start_frame = -1
727
+ self.confirmed_end_frame = -1
728
+ self.vad_state_machine = VadStateMachine.kVadInStateStartPointNotDetected
729
+ self.windows_detector.reset()
730
+ self.sil_frame = 0
731
+ self.frame_probs = []
732
+
733
+ def compute_decibel(self) -> None:
734
+ frame_sample_length = int(
735
+ self.vad_opts.frame_length_ms * self.vad_opts.sample_rate / 1000
736
+ )
737
+ frame_shift_length = int(
738
+ self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000
739
+ )
740
+ if self.data_buf_all_size == 0:
741
+ self.data_buf_all_size = len(self.waveform[0])
742
+ self.data_buf_size = self.data_buf_all_size
743
+ else:
744
+ self.data_buf_all_size += len(self.waveform[0])
745
+
746
+ for offset in range(
747
+ 0, self.waveform.shape[1] - frame_sample_length + 1, frame_shift_length
748
+ ):
749
+ self.decibel.append(
750
+ 10
751
+ * np.log10(
752
+ np.square(
753
+ self.waveform[0][offset : offset + frame_sample_length]
754
+ ).sum()
755
+ + 1e-6
756
+ )
757
+ )
758
+
759
+ def compute_scores(self, feats: np.ndarray) -> None:
760
+ scores = self.model(feats)
761
+ self.vad_opts.nn_eval_block_size = scores[0].shape[1]
762
+ self.frm_cnt += scores[0].shape[1] # count total frames
763
+ if isinstance(feats, list):
764
+ # return B * T * D
765
+ feats = feats[0]
766
+
767
+ assert (
768
+ scores[0].shape[1] == feats.shape[1]
769
+ ), "The shape between feats and scores does not match"
770
+
771
+ self.scores = scores[0] # the first calculation
772
+ self.scores_offset += self.scores.shape[1]
773
+
774
+ return scores[1:]
775
+
776
    def pop_data_buf_till_frame(self, frame_idx: int) -> None:  # need check again
        """Advance the logical data-buffer start up to ``frame_idx``.

        Consumes one frame-shift worth of samples per iteration, keeping
        ``data_buf_size`` equal to the total received samples minus the popped
        prefix.

        NOTE(review): if ``data_buf_size`` drops below one frame shift while
        ``data_buf_start_frame`` is still < ``frame_idx`` this loop would spin
        forever — TODO confirm callers always have enough buffered audio.
        """
        while self.data_buf_start_frame < frame_idx:
            if self.data_buf_size >= int(
                self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000
            ):
                self.data_buf_start_frame += 1
                self.data_buf_size = (
                    self.data_buf_all_size
                    - self.data_buf_start_frame
                    * int(self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000)
                )
787
+
788
    def pop_data_to_output_buf(
        self,
        start_frm: int,
        frm_cnt: int,
        first_frm_is_start_point: bool,
        last_frm_is_end_point: bool,
        end_point_is_sent_end: bool,
    ) -> None:
        """Account ``frm_cnt`` frames starting at ``start_frm`` to the output buffer.

        Opens a new ``E2EVadSpeechBufWithDoa`` when a segment starts (or none
        is open), advances the current segment's ``end_ms`` and sets the
        start/end-point flags.  The sample copying of the C++ original is a
        no-op in this Python port — only start/end bookkeeping is kept.
        """
        self.pop_data_buf_till_frame(start_frm)
        expected_sample_number = int(
            frm_cnt * self.vad_opts.sample_rate * self.vad_opts.frame_in_ms / 1000
        )
        if last_frm_is_end_point:
            # The final frame spans a whole analysis window, not just one shift.
            extra_sample = max(
                0,
                int(
                    self.vad_opts.frame_length_ms * self.vad_opts.sample_rate / 1000
                    - self.vad_opts.sample_rate * self.vad_opts.frame_in_ms / 1000
                ),
            )
            expected_sample_number += int(extra_sample)
        if end_point_is_sent_end:
            expected_sample_number = max(expected_sample_number, self.data_buf_size)
        if self.data_buf_size < expected_sample_number:
            logging.error("error in calling pop data_buf\n")

        if len(self.output_data_buf) == 0 or first_frm_is_start_point:
            # Open a fresh segment record.
            self.output_data_buf.append(E2EVadSpeechBufWithDoa())
            self.output_data_buf[-1].reset()
            self.output_data_buf[-1].start_ms = start_frm * self.vad_opts.frame_in_ms
            self.output_data_buf[-1].end_ms = self.output_data_buf[-1].start_ms
            self.output_data_buf[-1].doa = 0
        cur_seg = self.output_data_buf[-1]
        if cur_seg.end_ms != start_frm * self.vad_opts.frame_in_ms:
            logging.error("warning\n")
        out_pos = len(cur_seg.buffer)  # cur_seg.buffer is not modified at the moment
        data_to_pop = 0
        if end_point_is_sent_end:
            data_to_pop = expected_sample_number
        else:
            data_to_pop = int(
                frm_cnt * self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000
            )
        if data_to_pop > self.data_buf_size:
            logging.error("VAD data_to_pop is bigger than self.data_buf.size()!!!\n")
            data_to_pop = self.data_buf_size
            expected_sample_number = self.data_buf_size

        cur_seg.doa = 0
        # Sample copying from the C++ original is deliberately a no-op here;
        # the loops only advance the (unused) output cursor.
        for sample_cpy_out in range(0, data_to_pop):
            # cur_seg.buffer[out_pos ++] = data_buf_.back();
            out_pos += 1
        for sample_cpy_out in range(data_to_pop, expected_sample_number):
            # cur_seg.buffer[out_pos++] = data_buf_.back()
            out_pos += 1
        if cur_seg.end_ms != start_frm * self.vad_opts.frame_in_ms:
            logging.error("Something wrong with the VAD algorithm\n")
        self.data_buf_start_frame += frm_cnt
        cur_seg.end_ms = (start_frm + frm_cnt) * self.vad_opts.frame_in_ms
        if first_frm_is_start_point:
            cur_seg.contain_seg_start_point = True
        if last_frm_is_end_point:
            cur_seg.contain_seg_end_point = True
851
+
852
    def on_silence_detected(self, valid_frame: int):
        """Record ``valid_frame`` as the latest confirmed silence frame.

        While no speech start point has been found yet, the corresponding
        audio is discarded from the data buffer instead of being emitted.
        """
        self.lastest_confirmed_silence_frame = valid_frame
        if self.vad_state_machine == VadStateMachine.kVadInStateStartPointNotDetected:
            self.pop_data_buf_till_frame(valid_frame)
        # silence_detected_callback_
        # pass
858
+
859
    def on_voice_detected(self, valid_frame: int) -> None:
        """Mark ``valid_frame`` as confirmed speech and move one frame of
        bookkeeping into the output buffer."""
        self.latest_confirmed_speech_frame = valid_frame
        self.pop_data_to_output_buf(valid_frame, 1, False, False, False)
862
+
863
    def on_voice_start(self, start_frame: int, fake_result: bool = False) -> None:
        """Register ``start_frame`` as the confirmed start of a speech segment.

        ``fake_result`` is used for the zero-length decision emitted on
        silence timeout; in that case nothing is copied to the output buffer.
        Logs an error if a previous segment's start was never reset.
        """
        if self.vad_opts.do_start_point_detection:
            pass
        if self.confirmed_start_frame != -1:
            logging.error("not reset vad properly\n")
        else:
            self.confirmed_start_frame = start_frame

        if (
            not fake_result
            and self.vad_state_machine
            == VadStateMachine.kVadInStateStartPointNotDetected
        ):
            # Emit the first frame of the new segment, flagged as start point.
            self.pop_data_to_output_buf(
                self.confirmed_start_frame, 1, True, False, False
            )
879
+
880
    def on_voice_end(
        self, end_frame: int, fake_result: bool, is_last_frame: bool
    ) -> None:
        """Close the current speech segment at ``end_frame``.

        Flushes all not-yet-emitted speech frames up to (excluding)
        ``end_frame``, records the confirmed end frame, and — unless this is a
        ``fake_result`` zero-length decision — emits the final frame with the
        end-point flag set.  Always bumps ``number_end_time_detected``.
        """
        for t in range(self.latest_confirmed_speech_frame + 1, end_frame):
            self.on_voice_detected(t)
        if self.vad_opts.do_end_point_detection:
            pass
        if self.confirmed_end_frame != -1:
            logging.error("not reset vad properly\n")
        else:
            self.confirmed_end_frame = end_frame
        if not fake_result:
            self.sil_frame = 0
            self.pop_data_to_output_buf(
                self.confirmed_end_frame, 1, False, True, is_last_frame
            )
        self.number_end_time_detected += 1
897
+
898
+ def maybe_on_voice_end_last_frame(
899
+ self, is_final_frame: bool, cur_frm_idx: int
900
+ ) -> None:
901
+ if is_final_frame:
902
+ self.on_voice_end(cur_frm_idx, False, True)
903
+ self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
904
+
905
+ def get_latency(self) -> int:
906
+ return int(self.latency_frm_num_at_start_point() * self.vad_opts.frame_in_ms)
907
+
908
+ def latency_frm_num_at_start_point(self) -> int:
909
+ vad_latency = self.windows_detector.get_win_size()
910
+ if self.vad_opts.do_extend:
911
+ vad_latency += int(
912
+ self.vad_opts.lookback_time_start_point / self.vad_opts.frame_in_ms
913
+ )
914
+ return vad_latency
915
+
916
    def get_frame_state(self, t: int) -> FrameState:
        """Classify frame ``t`` as speech or silence.

        Combines an energy (decibel) gate, the NN silence-posterior score and
        an SNR threshold.  On silence frames the running noise-level estimate
        (``noise_average_decibel``) is updated as an exponential-style moving
        average over ``noise_frame_num_used_for_snr`` frames.
        """
        frame_state = FrameState.kFrameStateInvalid
        cur_decibel = self.decibel[t - self.decibel_offset]
        cur_snr = cur_decibel - self.noise_average_decibel
        # for each frame, calc log posterior probability of each state
        if cur_decibel < self.vad_opts.decibel_thres:
            # Too quiet: silence regardless of the NN score.
            frame_state = FrameState.kFrameStateSil
            self.detect_one_frame(frame_state, t, False)
            return frame_state

        sum_score = 0.0
        noise_prob = 0.0
        assert len(self.sil_pdf_ids) == self.vad_opts.silence_pdf_num
        if len(self.sil_pdf_ids) > 0:
            assert len(self.scores) == 1  # only batch_size = 1 is supported
            sil_pdf_scores = [
                self.scores[0][t - self.scores_offset][sil_pdf_id]
                for sil_pdf_id in self.sil_pdf_ids
            ]
            sum_score = sum(sil_pdf_scores)
            noise_prob = math.log(sum_score) * self.vad_opts.speech_2_noise_ratio
            total_score = 1.0
            # Remaining probability mass is attributed to speech.
            sum_score = total_score - sum_score
        speech_prob = math.log(sum_score)
        if self.vad_opts.output_frame_probs:
            frame_prob = E2EVadFrameProb()
            frame_prob.noise_prob = noise_prob
            frame_prob.speech_prob = speech_prob
            frame_prob.score = sum_score
            frame_prob.frame_id = t
            self.frame_probs.append(frame_prob)
        if math.exp(speech_prob) >= math.exp(noise_prob) + self.speech_noise_thres:
            # NN says speech; still require sufficient SNR and level.
            if (
                cur_snr >= self.vad_opts.snr_thres
                and cur_decibel >= self.vad_opts.decibel_thres
            ):
                frame_state = FrameState.kFrameStateSpeech
            else:
                frame_state = FrameState.kFrameStateSil
        else:
            frame_state = FrameState.kFrameStateSil
            # Fold this silence frame into the running noise estimate.
            if self.noise_average_decibel < -99.9:
                self.noise_average_decibel = cur_decibel
            else:
                self.noise_average_decibel = (
                    cur_decibel
                    + self.noise_average_decibel
                    * (self.vad_opts.noise_frame_num_used_for_snr - 1)
                ) / self.vad_opts.noise_frame_num_used_for_snr

        return frame_state
967
+
968
    def infer_offline(
        self,
        feats: np.ndarray,
        waveform: np.ndarray,
        in_cache: Dict[str, np.ndarray] = dict(),
        is_final: bool = False,
    ) -> Tuple[List[List[List[int]]], Dict[str, np.ndarray]]:
        """Run offline VAD over a (possibly final) chunk of an utterance.

        Returns ``(segments, in_cache)`` where ``segments[batch]`` is a list
        of ``[start_ms, end_ms]`` pairs for completed speech segments.  When
        ``is_final`` all detection state is reset for the next query.

        NOTE(review): the mutable default ``in_cache=dict()`` is shared across
        calls — harmless here since it is only passed through unchanged, but
        worth confirming before extending this method.
        """
        self.waveform = waveform
        self.compute_decibel()

        self.compute_scores(feats)
        if not is_final:
            self.detect_common_frames()
        else:
            self.detect_last_frames()
        segments = []
        for batch_num in range(0, feats.shape[0]):  # only support batch_size = 1 now
            segment_batch = []
            if len(self.output_data_buf) > 0:
                for i in range(self.output_data_buf_offset, len(self.output_data_buf)):
                    # Only emit segments that are fully delimited.
                    if (
                        not self.output_data_buf[i].contain_seg_start_point
                        or not self.output_data_buf[i].contain_seg_end_point
                    ):
                        continue
                    segment = [
                        self.output_data_buf[i].start_ms,
                        self.output_data_buf[i].end_ms,
                    ]
                    segment_batch.append(segment)
                    self.output_data_buf_offset += 1  # need update this parameter
            if segment_batch:
                segments.append(segment_batch)

        if is_final:
            # reset class variables and clear the dict for the next query
            self.all_reset_detection()
        return segments, in_cache
1006
+
1007
    def infer_online(
        self,
        feats: np.ndarray,
        waveform: np.ndarray,
        in_cache: list = None,
        is_final: bool = False,
        max_end_sil: int = 800,
    ) -> Tuple[List[List[List[int]]], Dict[str, np.ndarray]]:
        """Run streaming VAD over one chunk.

        ``in_cache`` carries the model's recurrent state between calls (the
        extra outputs returned by ``compute_scores``).  ``max_end_sil`` is the
        trailing-silence budget (ms) before a segment is force-closed.

        Returns ``(segments, in_cache)``; each emitted segment is a
        ``[start_ms, end_ms]`` pair where ``-1`` marks a boundary that is not
        yet known (segment still open across chunks).
        """
        feats = [feats]
        if in_cache is None:
            in_cache = []

        self.max_end_sil_frame_cnt_thresh = (
            max_end_sil - self.vad_opts.speech_to_sil_time_thres
        )
        self.waveform = waveform  # compute decibel for each frame
        feats.extend(in_cache)
        in_cache = self.compute_scores(feats)
        self.compute_decibel()

        if is_final:
            self.detect_last_frames()
        else:
            self.detect_common_frames()

        segments = []
        # only support batch_size = 1 now
        for batch_num in range(0, feats[0].shape[0]):
            if len(self.output_data_buf) > 0:
                for i in range(self.output_data_buf_offset, len(self.output_data_buf)):
                    if not self.output_data_buf[i].contain_seg_start_point:
                        continue
                    # A continuation chunk (next_seg False) may report an open
                    # segment's end only; otherwise wait for the end point.
                    if (
                        not self.next_seg
                        and not self.output_data_buf[i].contain_seg_end_point
                    ):
                        continue
                    start_ms = self.output_data_buf[i].start_ms if self.next_seg else -1
                    if self.output_data_buf[i].contain_seg_end_point:
                        end_ms = self.output_data_buf[i].end_ms
                        self.next_seg = True
                        self.output_data_buf_offset += 1
                    else:
                        end_ms = -1
                        self.next_seg = False
                    segments.append([start_ms, end_ms])

        return segments, in_cache
1055
+
1056
    def get_frames_state(
        self,
        feats: np.ndarray,
        waveform: np.ndarray,
        in_cache: list = None,
        is_final: bool = False,
        max_end_sil: int = 800,
    ):
        """Streaming helper: process one block and return per-frame states.

        Runs the same pipeline as ``infer_online`` (scores, decibels, state
        machine) but collects and returns the list of ``FrameState`` values
        for the block instead of emitting segments.  Returns an empty list if
        an end point was already detected.
        """
        feats = [feats]
        states = []
        if in_cache is None:
            in_cache = []

        self.max_end_sil_frame_cnt_thresh = (
            max_end_sil - self.vad_opts.speech_to_sil_time_thres
        )
        self.waveform = waveform  # compute decibel for each frame
        feats.extend(in_cache)
        in_cache = self.compute_scores(feats)
        self.compute_decibel()

        if self.vad_state_machine == VadStateMachine.kVadInStateEndPointDetected:
            return states

        # Walk the block oldest-to-newest (i counts down to the newest frame).
        for i in range(self.vad_opts.nn_eval_block_size - 1, -1, -1):
            frame_state = FrameState.kFrameStateInvalid
            frame_state = self.get_frame_state(self.frm_cnt - 1 - i)
            states.append(frame_state)
            if i == 0 and is_final:
                logging.info("last frame detected")
                self.detect_one_frame(frame_state, self.frm_cnt - 1, True)
            else:
                self.detect_one_frame(frame_state, self.frm_cnt - 1 - i, False)

        return states
1091
+
1092
+ def detect_common_frames(self) -> int:
1093
+ if self.vad_state_machine == VadStateMachine.kVadInStateEndPointDetected:
1094
+ return 0
1095
+ for i in range(self.vad_opts.nn_eval_block_size - 1, -1, -1):
1096
+ frame_state = FrameState.kFrameStateInvalid
1097
+ frame_state = self.get_frame_state(self.frm_cnt - 1 - i)
1098
+ # print(f"cur frame: {self.frm_cnt - 1 - i}, state is {frame_state}")
1099
+ self.detect_one_frame(frame_state, self.frm_cnt - 1 - i, False)
1100
+
1101
+ self.decibel = self.decibel[self.vad_opts.nn_eval_block_size - 1 :]
1102
+ self.decibel_offset = self.frm_cnt - 1 - i
1103
+ return 0
1104
+
1105
+ def detect_last_frames(self) -> int:
1106
+ if self.vad_state_machine == VadStateMachine.kVadInStateEndPointDetected:
1107
+ return 0
1108
+ for i in range(self.vad_opts.nn_eval_block_size - 1, -1, -1):
1109
+ frame_state = FrameState.kFrameStateInvalid
1110
+ frame_state = self.get_frame_state(self.frm_cnt - 1 - i)
1111
+ if i != 0:
1112
+ self.detect_one_frame(frame_state, self.frm_cnt - 1 - i, False)
1113
+ else:
1114
+ self.detect_one_frame(frame_state, self.frm_cnt - 1, True)
1115
+
1116
+ return 0
1117
+
1118
    def detect_one_frame(
        self, cur_frm_state: FrameState, cur_frm_idx: int, is_final_frame: bool
    ) -> None:
        """Feed one classified frame into the VAD state machine.

        The frame state is first passed through the sliding-window detector,
        which yields a sil/speech *change* event; each event branch then
        updates segment start/end bookkeeping.  In multi-utterance detect
        mode the detection state is reset after each end point so the next
        utterance can be found.
        """
        tmp_cur_frm_state = FrameState.kFrameStateInvalid
        if cur_frm_state == FrameState.kFrameStateSpeech:
            # fabs(1.0) is always 1.0, so this gate only flips when
            # fe_prior_thres >= 1.0 (kept as in the reference implementation).
            if math.fabs(1.0) > float(self.vad_opts.fe_prior_thres):
                tmp_cur_frm_state = FrameState.kFrameStateSpeech
            else:
                tmp_cur_frm_state = FrameState.kFrameStateSil
        elif cur_frm_state == FrameState.kFrameStateSil:
            tmp_cur_frm_state = FrameState.kFrameStateSil
        state_change = self.windows_detector.detect_one_frame(
            tmp_cur_frm_state, cur_frm_idx
        )
        frm_shift_in_ms = self.vad_opts.frame_in_ms
        if AudioChangeState.kChangeStateSil2Speech == state_change:
            # Silence -> speech: confirm a segment start or keep extending.
            self.continous_silence_frame_count = 0
            self.pre_end_silence_detected = False

            if (
                self.vad_state_machine
                == VadStateMachine.kVadInStateStartPointNotDetected
            ):
                # Back-date the start by the detector latency (bounded by the
                # amount of buffered audio).
                start_frame = max(
                    self.data_buf_start_frame,
                    cur_frm_idx - self.latency_frm_num_at_start_point(),
                )
                self.on_voice_start(start_frame)
                self.vad_state_machine = VadStateMachine.kVadInStateInSpeechSegment
                for t in range(start_frame + 1, cur_frm_idx + 1):
                    self.on_voice_detected(t)
            elif self.vad_state_machine == VadStateMachine.kVadInStateInSpeechSegment:
                for t in range(self.latest_confirmed_speech_frame + 1, cur_frm_idx):
                    self.on_voice_detected(t)
                if (
                    cur_frm_idx - self.confirmed_start_frame + 1
                    > self.vad_opts.max_single_segment_time / frm_shift_in_ms
                ):
                    # Segment exceeded the maximum length: force an end point.
                    self.on_voice_end(cur_frm_idx, False, False)
                    self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
                elif not is_final_frame:
                    self.on_voice_detected(cur_frm_idx)
                else:
                    self.maybe_on_voice_end_last_frame(is_final_frame, cur_frm_idx)
            else:
                pass
        elif AudioChangeState.kChangeStateSpeech2Sil == state_change:
            # Speech -> silence: keep the segment open until silence persists.
            self.continous_silence_frame_count = 0
            if (
                self.vad_state_machine
                == VadStateMachine.kVadInStateStartPointNotDetected
            ):
                pass
            elif self.vad_state_machine == VadStateMachine.kVadInStateInSpeechSegment:
                if (
                    cur_frm_idx - self.confirmed_start_frame + 1
                    > self.vad_opts.max_single_segment_time / frm_shift_in_ms
                ):
                    self.on_voice_end(cur_frm_idx, False, False)
                    self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
                elif not is_final_frame:
                    self.on_voice_detected(cur_frm_idx)
                else:
                    self.maybe_on_voice_end_last_frame(is_final_frame, cur_frm_idx)
            else:
                pass
        elif AudioChangeState.kChangeStateSpeech2Speech == state_change:
            # Sustained speech: only the max-length guard can end the segment.
            self.continous_silence_frame_count = 0
            if self.vad_state_machine == VadStateMachine.kVadInStateInSpeechSegment:
                if (
                    cur_frm_idx - self.confirmed_start_frame + 1
                    > self.vad_opts.max_single_segment_time / frm_shift_in_ms
                ):
                    self.max_time_out = True
                    self.on_voice_end(cur_frm_idx, False, False)
                    self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
                elif not is_final_frame:
                    self.on_voice_detected(cur_frm_idx)
                else:
                    self.maybe_on_voice_end_last_frame(is_final_frame, cur_frm_idx)
            else:
                pass
        elif AudioChangeState.kChangeStateSil2Sil == state_change:
            self.continous_silence_frame_count += 1
            if (
                self.vad_state_machine
                == VadStateMachine.kVadInStateStartPointNotDetected
            ):
                # silence timeout, return zero length decision
                if (
                    (
                        self.vad_opts.detect_mode
                        == VadDetectMode.kVadSingleUtteranceDetectMode.value
                    )
                    and (
                        self.continous_silence_frame_count * frm_shift_in_ms
                        > self.vad_opts.max_start_silence_time
                    )
                ) or (is_final_frame and self.number_end_time_detected == 0):
                    for t in range(
                        self.lastest_confirmed_silence_frame + 1, cur_frm_idx
                    ):
                        self.on_silence_detected(t)
                    # Emit a fake zero-length segment so callers terminate.
                    self.on_voice_start(0, True)
                    self.on_voice_end(0, True, False)
                    self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
                else:
                    if cur_frm_idx >= self.latency_frm_num_at_start_point():
                        self.on_silence_detected(
                            cur_frm_idx - self.latency_frm_num_at_start_point()
                        )
            elif self.vad_state_machine == VadStateMachine.kVadInStateInSpeechSegment:
                # In-speech silence long enough -> close the segment (with
                # lookback); otherwise keep emitting (do_extend) or wait.
                if (
                    self.continous_silence_frame_count * frm_shift_in_ms
                    >= self.max_end_sil_frame_cnt_thresh
                ):
                    lookback_frame = int(
                        self.max_end_sil_frame_cnt_thresh / frm_shift_in_ms
                    )
                    if self.vad_opts.do_extend:
                        lookback_frame -= int(
                            self.vad_opts.lookahead_time_end_point / frm_shift_in_ms
                        )
                        lookback_frame -= 1
                        lookback_frame = max(0, lookback_frame)
                    self.on_voice_end(cur_frm_idx - lookback_frame, False, False)
                    self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
                elif (
                    cur_frm_idx - self.confirmed_start_frame + 1
                    > self.vad_opts.max_single_segment_time / frm_shift_in_ms
                ):
                    self.on_voice_end(cur_frm_idx, False, False)
                    self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
                elif self.vad_opts.do_extend and not is_final_frame:
                    if self.continous_silence_frame_count <= int(
                        self.vad_opts.lookahead_time_end_point / frm_shift_in_ms
                    ):
                        self.on_voice_detected(cur_frm_idx)
                else:
                    self.maybe_on_voice_end_last_frame(is_final_frame, cur_frm_idx)
            else:
                pass

        if (
            self.vad_state_machine == VadStateMachine.kVadInStateEndPointDetected
            and self.vad_opts.detect_mode
            == VadDetectMode.kVadMutipleUtteranceDetectMode.value
        ):
            # Multi-utterance mode: get ready to detect the next utterance.
            self.reset_detection()
1267
+
1268
+
1269
class FSMNVad(object):
    """Offline FSMN-based voice activity detection.

    Loads the fbank/CMVN frontend and the FSMN VAD model from ``config_dir``
    and exposes :meth:`segments_offline` to split a 16 kHz mono waveform into
    speech segments.
    """

    def __init__(self, config_dir: str):
        config_dir = Path(config_dir)
        self.config = read_yaml(config_dir / "fsmn-config.yaml")
        self.frontend = WavFrontend(
            cmvn_file=config_dir / "fsmn-am.mvn",
            **self.config["WavFrontend"]["frontend_conf"],
        )
        self.config["FSMN"]["model_path"] = config_dir / "fsmnvad-offline.onnx"

        self.vad = E2EVadModel(
            self.config["FSMN"], self.config["vadPostArgs"], config_dir
        )

    def set_parameters(self, mode):
        # Kept for API compatibility; runtime parameter tuning is not supported.
        pass

    def extract_feature(self, waveform):
        """Compute LFR+CMVN fbank features for ``waveform`` (float32 samples)."""
        fbank, _ = self.frontend.fbank(waveform)
        feats, feats_len = self.frontend.lfr_cmvn(fbank)
        return feats.astype(np.float32), feats_len

    def is_speech(self, buf, sample_rate=16000):
        assert sample_rate == 16000, "only support 16k sample rate"

    def segments_offline(self, waveform_path: Union[str, Path, np.ndarray]):
        """Get speech segments ([start_ms, end_ms] pairs) of the audio.

        Accepts either an in-memory waveform (np.ndarray, 16 kHz) or a path
        to a 16 kHz audio file.

        Raises:
            FileNotFoundError: if ``waveform_path`` does not exist or is not
                a regular file.  (Bug fix: the original raised
                ``FileExistsError`` for a *missing* file, and formatted the
                ``Path`` class instead of the offending path.)
        """
        if isinstance(waveform_path, np.ndarray):
            waveform = waveform_path
        else:
            if not os.path.exists(waveform_path):
                raise FileNotFoundError(f"{waveform_path} is not exist.")
            if os.path.isfile(waveform_path):
                logging.info(f"load audio {waveform_path}")
                waveform, _sample_rate = sf.read(
                    waveform_path,
                    dtype="float32",
                )
            else:
                raise FileNotFoundError(str(waveform_path))
            assert (
                _sample_rate == 16000
            ), f"only support 16k sample rate, current sample rate is {_sample_rate}"

        feats, feats_len = self.extract_feature(waveform)
        waveform = waveform[None, ...]
        segments_part, in_cache = self.vad.infer_offline(
            feats[None, ...], waveform, is_final=True
        )
        return segments_part[0]
1320
+
1321
+ # ```
1322
+
1323
+ # File: sense_voice.py
1324
+ # ```py
1325
+ # -*- coding:utf-8 -*-
1326
+ # @FileName :sense_voice.py.py
1327
+ # @Time :2024/7/18 15:40
1328
+ # @Author :lovemefan
1329
+ # @Email :lovemefan@outlook.com
1330
+
1331
# Mapping from CLI language names to SenseVoice language-token ids.
languages = {"auto": 0, "zh": 3, "en": 4, "yue": 7, "ja": 11, "ko": 12, "nospeech": 13}
# Root-logger format: timestamp, level, source location, message.
formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
logging.basicConfig(format=formatter, level=logging.INFO)
1334
+
1335
def main():
    """CLI entry point: VAD-segment an audio file and run SenseVoice ASR on it.

    Reads the audio with soundfile, runs FSMN VAD per channel, feeds each
    speech segment through the RKNN SenseVoice encoder and logs the results
    plus an overall RTF figure.
    """
    arg_parser = argparse.ArgumentParser(description="Sense Voice")
    arg_parser.add_argument("-a", "--audio_file", required=True, type=str, help="Model")
    download_model_path = os.path.dirname(__file__)
    arg_parser.add_argument(
        "-dp",
        "--download_path",
        default=download_model_path,
        type=str,
        help="dir path of resource downloaded",
    )
    arg_parser.add_argument("-d", "--device", default=-1, type=int, help="Device")
    arg_parser.add_argument(
        "-n", "--num_threads", default=4, type=int, help="Num threads"
    )
    arg_parser.add_argument(
        "-l",
        "--language",
        choices=languages.keys(),
        default="auto",
        type=str,
        help="Language",
    )
    arg_parser.add_argument("--use_itn", action="store_true", help="Use ITN")
    args = arg_parser.parse_args()

    # BUG FIX: the original parsed --download_path but ignored it, always
    # loading from the script directory; honour the argument (its default is
    # still the script directory, so behaviour is unchanged for old callers).
    resource_dir = args.download_path

    front = WavFrontend(os.path.join(resource_dir, "am.mvn"))

    model = SenseVoiceInferenceSession(
        os.path.join(resource_dir, "embedding.npy"),
        os.path.join(
            resource_dir,
            "sense-voice-encoder.rknn",
        ),
        os.path.join(resource_dir, "chn_jpn_yue_eng_ko_spectok.bpe.model"),
        args.device,
        args.num_threads,
    )
    waveform, _sample_rate = sf.read(
        args.audio_file,
        dtype="float32",
        always_2d=True,
    )

    logging.info(
        f"Audio {args.audio_file} is {len(waveform) / _sample_rate} seconds, "
        f"{waveform.shape[1]} channel"
    )
    # load vad model
    start = time.time()
    vad = FSMNVad(resource_dir)
    for channel_id, channel_data in enumerate(waveform.T):
        segments = vad.segments_offline(channel_data)
        for part in segments:
            # VAD times are milliseconds; 16 samples per ms at 16 kHz.
            audio_feats = front.get_features(channel_data[part[0] * 16 : part[1] * 16])
            asr_result = model(
                audio_feats[None, ...],
                language=languages[args.language],
                use_itn=args.use_itn,
            )
            logging.info(
                f"[Channel {channel_id}] [{part[0] / 1000}s - {part[1] / 1000}s] {asr_result}"
            )
        vad.vad.all_reset_detection()
    decoding_time = time.time() - start
    logging.info(f"Decoder audio takes {decoding_time} seconds")
    logging.info(
        f"The RTF is {decoding_time / (waveform.shape[1] * len(waveform) / _sample_rate)}."
    )


if __name__ == "__main__":
    main()
1402
+
SenseVoice/SenseVoiceSmall-RKNN2/server.py ADDED
@@ -0,0 +1,246 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import time
import logging
import re
# NOTE(review): debug leftover — this prints logging's *private* level table
# (`logging._nameToLevel`) at import time; consider removing it.
print(f"Initial logging._nameToLevel: {logging._nameToLevel}")
from pathlib import Path
from typing import List, Dict, Any, Optional

import soundfile as sf
import numpy as np
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel

# Ensure sensevoice_rknn.py is in the same directory or PYTHONPATH
# Add the directory of this script to sys.path if sensevoice_rknn is not found directly
import sys
SCRIPT_DIR = Path(__file__).resolve().parent
if str(SCRIPT_DIR) not in sys.path:
    sys.path.append(str(SCRIPT_DIR))

try:
    from sensevoice_rknn import WavFrontend, SenseVoiceInferenceSession, FSMNVad, languages
except ImportError as e:
    logging.error(f"Error importing from sensevoice_rknn.py: {e}")
    logging.error("Please ensure sensevoice_rknn.py is in the same directory as server.py or in your PYTHONPATH.")
    # Fallback for critical components if import fails, to allow FastAPI to at least start and show an error
    class WavFrontend:
        # Stub that fails loudly whenever the real frontend could not be imported.
        def __init__(self, *args, **kwargs): raise NotImplementedError("WavFrontend not loaded")
        def get_features(self, *args, **kwargs): raise NotImplementedError("WavFrontend not loaded")
    class SenseVoiceInferenceSession:
        # Stub mirroring the ASR session's constructor/call interface.
        def __init__(self, *args, **kwargs): raise NotImplementedError("SenseVoiceInferenceSession not loaded")
        def __call__(self, *args, **kwargs): raise NotImplementedError("SenseVoiceInferenceSession not loaded")
    class FSMNVad:
        # Stub mirroring the VAD wrapper's interface.
        def __init__(self, *args, **kwargs): raise NotImplementedError("FSMNVad not loaded")
        def segments_offline(self, *args, **kwargs): raise NotImplementedError("FSMNVad not loaded")
    class Vad:
        def all_reset_detection(self, *args, **kwargs): raise NotImplementedError("FSMNVad not loaded")
    vad = Vad()

    languages = {"en": 4}  # Default fallback
41
+
42
app = FastAPI()

# Logging will be handled by Uvicorn's default configuration or a custom log_config if provided to uvicorn.run
# Get a logger instance for application-specific logs if needed
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)  # Set level for this specific logger

# --- Model Configuration & Loading ---
MODEL_BASE_PATH = Path(__file__).resolve().parent

# These paths should match those used in sensevoice_rknn.py's main function
# or be configurable if they differ.
MVN_PATH = MODEL_BASE_PATH / "am.mvn"
EMBEDDING_NPY_PATH = MODEL_BASE_PATH / "embedding.npy"
ENCODER_RKNN_PATH = MODEL_BASE_PATH / "sense-voice-encoder.rknn"
BPE_MODEL_PATH = MODEL_BASE_PATH / "chn_jpn_yue_eng_ko_spectok.bpe.model"
VAD_CONFIG_DIR = MODEL_BASE_PATH  # Assuming fsmn-config.yaml and fsmnvad-offline.onnx are here

# Global model instances, populated once by load_models() at startup.
# None means "not loaded"; /transcribe answers 503 in that case.
w_frontend: Optional[WavFrontend] = None
asr_model: Optional[SenseVoiceInferenceSession] = None
vad_model: Optional[FSMNVad] = None
64
+
65
@app.on_event("startup")
def load_models():
    """Load the frontend, ASR and VAD models once at server startup.

    On failure the globals remain ``None`` and /transcribe answers 503, so
    the server still starts and reports a useful error instead of crashing.
    Uses the module-level ``logger`` (consistency fix: the original mixed the
    root ``logging`` calls with the dedicated module logger).
    """
    global w_frontend, asr_model, vad_model
    logger.info("Loading models...")
    start_time = time.time()
    try:
        if not MVN_PATH.exists():
            raise FileNotFoundError(f"CMVN file not found: {MVN_PATH}")
        w_frontend = WavFrontend(cmvn_file=str(MVN_PATH))

        if not EMBEDDING_NPY_PATH.exists() or not ENCODER_RKNN_PATH.exists() or not BPE_MODEL_PATH.exists():
            raise FileNotFoundError(
                f"One or more ASR model files not found: "
                f"Embedding: {EMBEDDING_NPY_PATH}, Encoder: {ENCODER_RKNN_PATH}, BPE: {BPE_MODEL_PATH}"
            )
        asr_model = SenseVoiceInferenceSession(
            embedding_model_file=str(EMBEDDING_NPY_PATH),
            encoder_model_file=str(ENCODER_RKNN_PATH),
            bpe_model_file=str(BPE_MODEL_PATH),
            # Defaults match sensevoice_rknn.py's main().
            device_id=-1,
            intra_op_num_threads=4,
        )

        # Check for VAD model files (fsmn-config.yaml, fsmnvad-offline.onnx)
        if not (VAD_CONFIG_DIR / "fsmn-config.yaml").exists() or not (VAD_CONFIG_DIR / "fsmnvad-offline.onnx").exists():
            raise FileNotFoundError(f"VAD config or model not found in {VAD_CONFIG_DIR}")
        vad_model = FSMNVad(config_dir=str(VAD_CONFIG_DIR))

        logger.info(f"Models loaded successfully in {time.time() - start_time:.2f} seconds.")
    except FileNotFoundError as e:
        logger.error(f"Model loading failed: {e}")
        # Keep models as None, endpoints will raise errors
    except Exception as e:
        logger.error(f"An unexpected error occurred during model loading: {e}")
        # Keep models as None
101
+
102
class TranscribeRequest(BaseModel):
    # Request body for POST /transcribe.
    audio_file_path: str  # server-local path of the audio file to transcribe
    language: str = "en"  # Default to English; key into the `languages` map
    use_itn: bool = False  # forwarded to the ASR model's use_itn flag
106
+
107
class Segment(BaseModel):
    # One transcribed VAD speech segment; times are in seconds.
    start_time_s: float
    end_time_s: float
    text: str
111
+
112
class TranscribeResponse(BaseModel):
    # Structured response shape.  Currently unused: /transcribe is declared
    # with response_model=str and returns only the joined transcription text.
    full_transcription: str
    segments: List[Segment]
115
+
116
@app.post("/transcribe", response_model=str)
async def transcribe_audio(request: TranscribeRequest):
    """Transcribe a server-local audio file and return the joined text.

    Pipeline: read the file with soundfile, run FSMN VAD per channel, run
    SenseVoice ASR per speech segment, strip the model's ``<|...|>`` tag
    tokens, and join all segment texts with spaces.

    NOTE(review): ``detailed_segments`` (per-segment timing) is collected but
    never returned because the endpoint is declared with
    ``response_model=str`` — switch to ``TranscribeResponse`` if clients need
    the segment breakdown.
    """
    if w_frontend is None or asr_model is None or vad_model is None:
        logging.error("Models not loaded. Transcription cannot proceed.")
        raise HTTPException(status_code=503, detail="Models are not loaded. Please check server logs.")

    audio_path = Path(request.audio_file_path)
    if not audio_path.exists() or not audio_path.is_file():
        logging.error(f"Audio file not found: {audio_path}")
        raise HTTPException(status_code=404, detail=f"Audio file not found: {audio_path}")

    try:
        waveform, sample_rate = sf.read(
            str(audio_path),
            dtype="float32",
            always_2d=True
        )
    except Exception as e:
        logging.error(f"Error reading audio file {audio_path}: {e}")
        raise HTTPException(status_code=400, detail=f"Could not read audio file: {e}")

    if sample_rate != 16000:
        # Basic resampling could be added here if needed, or just raise an error
        logging.warning(f"Audio sample rate is {sample_rate}Hz, expected 16000Hz. Results may be suboptimal.")
        # For now, we proceed but log a warning. For critical applications, convert or reject.

    logging.info(f"Processing audio: {audio_path}, Duration: {len(waveform) / sample_rate:.2f}s, Channels: {waveform.shape[1]}")

    lang_code = languages.get(request.language.lower())
    if lang_code is None:
        logging.warning(f"Unsupported language: {request.language}. Defaulting to 'en'. Supported: {list(languages.keys())}")
        lang_code = languages.get("en", 0)  # Fallback to 'en' or 'auto' if 'en' isn't in languages

    all_segments_text: List[str] = []
    detailed_segments: List[Segment] = []
    processing_start_time = time.time()

    for channel_id in range(waveform.shape[1]):
        channel_data = waveform[:, channel_id]
        logging.info(f"Processing channel {channel_id + 1}/{waveform.shape[1]}")

        try:
            # Ensure channel_data is 1D for VAD if it expects that
            speech_segments = vad_model.segments_offline(channel_data)  # segments_offline expects 1D array
        except Exception as e:
            logging.error(f"VAD processing failed for channel {channel_id}: {e}")
            # Optionally skip this channel or raise an error for the whole request
            continue  # Skip to next channel

        for part_idx, part in enumerate(speech_segments):
            start_sample = int(part[0] * 16)  # VAD returns ms, convert to samples (16 samples/ms for 16kHz)
            end_sample = int(part[1] * 16)
            segment_audio = channel_data[start_sample:end_sample]

            if len(segment_audio) == 0:
                logging.info(f"Empty audio segment for channel {channel_id}, part {part_idx}. Skipping.")
                continue

            try:
                # Ensure get_features expects 1D array
                audio_feats = w_frontend.get_features(segment_audio)
                # ASR model expects batch dimension, add [None, ...]
                asr_result_text_raw = asr_model(
                    audio_feats[None, ...],
                    language=lang_code,
                    use_itn=request.use_itn,
                )
                # Remove tags like <|en|>, <|HAPPY|>, etc.
                asr_result_text_cleaned = re.sub(r"<\|[^\|]+\|>", "", asr_result_text_raw).strip()

                segment_start_s = part[0] / 1000.0
                segment_end_s = part[1] / 1000.0
                logging.info(f"[Ch{channel_id}] [{segment_start_s:.2f}s - {segment_end_s:.2f}s] Raw: {asr_result_text_raw} Cleaned: {asr_result_text_cleaned}")
                all_segments_text.append(asr_result_text_cleaned)
                detailed_segments.append(Segment(start_time_s=segment_start_s, end_time_s=segment_end_s, text=asr_result_text_cleaned))
            except Exception as e:
                logging.error(f"ASR processing failed for segment {part_idx} in channel {channel_id}: {e}")
                # Optionally add a placeholder or skip this segment's text
                detailed_segments.append(Segment(start_time_s=part[0]/1000.0, end_time_s=part[1]/1000.0, text="[ASR_ERROR]"))

        vad_model.vad.all_reset_detection()  # Reset VAD state for next channel or call

    full_transcription = " ".join(all_segments_text).strip()
    logging.info(f"Transcription complete in {time.time() - processing_start_time:.2f}s. Result: {full_transcription}")

    return full_transcription
202
+
203
if __name__ == "__main__":
    import uvicorn

    # Minimal logging dictConfig for uvicorn.
    # BUG FIX: the original placed the application-logger entry (keyed by
    # __name__) at the TOP LEVEL of this dict, where logging.config.dictConfig
    # never reads it; logger definitions must live under "loggers".
    MINIMAL_LOGGING_CONFIG = {
        "version": 1,
        "disable_existing_loggers": False,  # Let other loggers (like our app logger) exist
        "formatters": {
            "default": {
                "()": "uvicorn.logging.DefaultFormatter",
                "fmt": "%(levelprefix)s %(message)s",
                "use_colors": None,
            },
        },
        "handlers": {
            "default": {
                "formatter": "default",
                "class": "logging.StreamHandler",
                "stream": "ext://sys.stderr",
            },
        },
        "loggers": {
            "uvicorn": {  # Uvicorn's own operational logs
                "handlers": ["default"],
                "level": logging.INFO,  # Explicitly use integer
                "propagate": False,
            },
            "uvicorn.error": {  # Logs for errors within Uvicorn
                "handlers": ["default"],
                "level": logging.INFO,  # Explicitly use integer
                "propagate": False,
            },
            # We are deliberately not configuring uvicorn.access here for simplicity
            # It might default to INFO or be silent if not configured and no parent handler catches it.
            # Application logger for this module (moved inside "loggers").
            __name__: {
                "handlers": ["default"],
                "level": logging.INFO,
                "propagate": False,
            },
        },
    }

    logger.info(f"Attempting to run Uvicorn with minimal explicit log_config.")
    uvicorn.run(app, host="0.0.0.0", port=8000, log_config=MINIMAL_LOGGING_CONFIG)
SenseVoice/SenseVoiceSmall-RKNN2/source.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://huggingface.co/ThomasTheMaker/SenseVoiceSmall-RKNN2
SenseVoice/SenseVoiceSmall-RKNN2/wget-log ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ --2025-06-18 23:24:33-- https://storage.googleapis.com/kagglesdsdata/datasets/829978/1417968/harvard.wav?X-Goog-Algorithm=GOOG4-RSA-SHA256
2
+ Resolving storage.googleapis.com (storage.googleapis.com)... 2607:f8b0:4008:805::201b, 2607:f8b0:4008:80e::201b, 2607:f8b0:4008:804::201b, ...
3
+ Connecting to storage.googleapis.com (storage.googleapis.com)|2607:f8b0:4008:805::201b|:443... connected.
4
+ HTTP request sent, awaiting response... 400 Bad Request
5
+ 2025-06-18 23:24:33 ERROR 400: Bad Request.
6
+