lxowalle committed
Commit ee4406b · 0 Parent(s)

* support for maixcam2
.gitattributes ADDED
@@ -0,0 +1,39 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ sensevoice.axmodel filter=lfs diff=lfs merge=lfs -text
+ *.axmodel filter=lfs diff=lfs merge=lfs -text
+ *.wav filter=lfs diff=lfs merge=lfs -text
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
+ __pycache__
+ .gradio
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 祈Inory
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,96 @@
+ ---
+ license: mit
+ language:
+ - en
+ pipeline_tag: automatic-speech-recognition
+ ---
+ # sensevoice.axera
+ FunASR SenseVoice on Axera. Official repo: https://github.com/FunAudioLLM/SenseVoice
+
+ ## TODO
+
+ - [x] Support AX630C
+ - [ ] Support C++
+ - [x] Support FastAPI
+
+ ## Features
+ - Speech recognition
+ - Automatic language detection (Chinese, English, Cantonese, Japanese, Korean)
+ - Emotion recognition
+ - Automatic punctuation
+ - Streaming recognition
+
+ ## Supported platforms
+
+ - [x] AX650N
+ - [x] AX630C
+
+ ## Installation
+ ```
+ pip3 install -r requirements.txt
+ ```
+ If disk space is limited, pass --prefix to install to a different location.
+
+
+ ## Usage
+ ```
+ # The first run downloads the model from Hugging Face automatically and saves it under models/
+ python3 main.py -i <input audio file>
+ ```
+ Command-line arguments:
+ | Argument | Description | Default |
+ | --- | --- | --- |
+ | --input/-i | Input audio file | |
+ | --language/-l | Recognition language: auto, zh, en, yue, ja, ko | auto |
+ | --streaming | Enable streaming recognition | |
+
+
+ ### Examples
+ Test audio files are provided under example/.
+
+ Cantonese test:
+ ```
+ python3 main.py -i example/yue.mp3
+ ```
+ Output:
+ ```
+ RTF: 0.03026517820946964 Latency: 0.15689468383789062s Total length: 5.184s
+ ['呢几个字。', '都表达唔到,我想讲嘅意。', '思。']
+ ```
+
+ Streaming recognition:
+
+ ```
+ python3 main.py -i example/zh.mp3 --streaming
+ ```
+ Output:
+ ```
+ {'timestamps': [540], 'text': '开'}
+ {'timestamps': [540, 780, 1080], 'text': '开放时'}
+ {'timestamps': [540, 780, 1080, 1260, 1740], 'text': '开放时间早'}
+ {'timestamps': [540, 780, 1080, 1260, 1740, 1920, 2340], 'text': '开放时间早上9'}
+ {'timestamps': [540, 780, 1080, 1260, 1740, 1920, 2340, 2640], 'text': '开放时间早上9点'}
+ {'timestamps': [540, 780, 1080, 1260, 1740, 1920, 2340, 2640, 3060], 'text': '开放时间早上9点至'}
+ {'timestamps': [540, 780, 1080, 1260, 1740, 1920, 2340, 2640, 3060, 3780, 4020], 'text': '开放时间早上9点至下午'}
+ {'timestamps': [540, 780, 1080, 1260, 1740, 1920, 2340, 2640, 3060, 3780, 4020, 4440, 4620], 'text': '开放时间早上9点至下午五点'}
+ RTF: 0.03678379235444246
+
+ ```
+
+ ## Accuracy
+
+ Measured with WER (Word Error Rate):
+
+ **WER = 0.0389**
+
+ ### Reproducing the test results
+
+ ```
+ ./download_dataset.sh
+ python test_wer.py -d datasets -l zh
+ ```
+
+ ## Technical discussion
+
+ - GitHub issues
+ - QQ group: 139953715
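
The pipeline added in SenseVoiceAx.py below can also be driven directly from Python rather than through main.py. A minimal sketch, assuming the non-streaming sensevoice_ax630c model files are laid out as main.py expects:

```python
from SenseVoiceAx import SenseVoiceAx

# Non-streaming pipeline; paths follow the layout used by main.py in this commit.
pipeline = SenseVoiceAx(
    "sensevoice_ax630c/sensevoice.axmodel",
    max_len=256,
    beam_size=3,
    language="auto",
    use_itn=True,
    streaming=False,
)
print(pipeline.infer("example/zh.mp3", print_rtf=True))
```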
SenseVoiceAx.py ADDED
@@ -0,0 +1,415 @@
+ import axengine as axe
+ import numpy as np
+ import librosa
+ from frontend import WavFrontend
+ import os
+ import time
+ from typing import List, Union, Optional
+ from asr_decoder import CTCDecoder
+ from tokenizer import SentencepiecesTokenizer
+ from online_fbank import OnlineFbank
+ import torch
+
+
+ def sequence_mask(lengths, maxlen=None, dtype=np.float32):
+     # If maxlen is not given, use the maximum of lengths
+     if maxlen is None:
+         maxlen = np.max(lengths)
+
+     # Create a row vector from 0 to maxlen-1
+     row_vector = np.arange(0, maxlen, 1)
+
+     # Turn lengths into a column vector
+     matrix = np.expand_dims(lengths, axis=-1)
+
+     # Compare to build the mask
+     mask = row_vector < matrix
+     if mask.shape[-1] < lengths[0]:
+         mask = np.concatenate(
+             [
+                 mask,
+                 np.zeros(
+                     (mask.shape[0], lengths[0] - mask.shape[-1]), dtype=np.float32
+                 ),
+             ],
+             axis=-1,
+         )
+
+     # Return the mask with the requested dtype
+     return mask.astype(dtype)[None, ...]
+
+
+ def unique_consecutive_np(arr):
+     """
+     Find consecutive unique values in an array, mimicking torch.unique_consecutive(yseq, dim=-1)
+
+     Args:
+         arr: 1-D numpy array
+
+     Returns:
+         unique_values: array with consecutive duplicates removed
+     """
+     if len(arr) == 0:
+         return np.array([])
+
+     if len(arr) == 1:
+         return arr.copy()
+
+     # Find the positions where the value changes
+     diff = np.diff(arr)
+     change_positions = np.where(diff != 0)[0] + 1
+
+     # Prepend the start position
+     start_positions = np.concatenate(([0], change_positions))
+
+     # Take the first value of each consecutive run
+     unique_values = arr[start_positions]
+
+     return unique_values
+
+
+ class SenseVoiceAx:
+     """SenseVoice axmodel runner"""
+
+     def __init__(
+         self,
+         model_path: str,
+         max_len: int = 256,
+         beam_size: int = 3,
+         language: str = "auto",
+         hot_words: Optional[List[str]] = None,
+         use_itn: bool = True,
+         streaming: bool = False,
+     ):
+         """
+         Initialize SenseVoiceAx
+
+         Args:
+             model_path: Path of the axmodel
+             max_len: Fixed input shape of the axmodel
+             beam_size: Max number of hypotheses to hold after each decode step
+             language: Supports auto, zh (Chinese), en (English), yue (Cantonese), ja (Japanese), ko (Korean)
+             hot_words: Words that may otherwise fail to be recognized:
+                 special words/phrases (aka hotwords) such as rare words, personalized information, etc.
+             use_itn: Enable Inverse Text Normalization if True.
+                 ITN converts ASR model output into its written form to improve readability;
+                 for example, the ITN module replaces "one hundred and twenty-three dollars" transcribed by an ASR model with "$123".
+             streaming: Process audio in small sequential segments ("chunks") and emit text on the fly.
+                 Use the stream_infer method if streaming is True, otherwise infer.
+
+         """
+         model_path_root = os.path.dirname(model_path)
+         emb_path = os.path.join(model_path_root, "../embeddings.npy")
+         cmvn_file = os.path.join(model_path_root, "../am.mvn")
+         bpe_model = os.path.join(
+             model_path_root, "../chn_jpn_yue_eng_ko_spectok.bpe.model"
+         )
+         if streaming:
+             self.position_encoding = np.load(
+                 os.path.join(model_path_root, "../pe_streaming.npy")
+             )
+         else:
+             self.position_encoding = np.load(
+                 os.path.join(model_path_root, "../pe_nonstream.npy")
+             )
+
+         self.streaming = streaming
+         self.tokenizer = SentencepiecesTokenizer(bpemodel=bpe_model)
+
+         self.frontend = WavFrontend(
+             cmvn_file=cmvn_file,
+             fs=16000,
+             window="hamming",
+             n_mels=80,
+             frame_length=25,
+             frame_shift=10,
+             lfr_m=7,
+             lfr_n=6,
+         )
+         self.model = axe.InferenceSession(model_path)
+         self.sample_rate = 16000
+         self.blank_id = 0
+         self.max_len = max_len
+         self.padding = 16
+         self.input_size = 560
+
+         self.lid_dict = {
+             "auto": 0,
+             "zh": 3,
+             "en": 4,
+             "yue": 7,
+             "ja": 11,
+             "ko": 12,
+             "nospeech": 13,
+         }
+         self.lid_int_dict = {
+             24884: 3,
+             24885: 4,
+             24888: 7,
+             24892: 11,
+             24896: 12,
+             24992: 13,
+         }
+         self.textnorm_dict = {"withitn": 14, "woitn": 15}
+         self.textnorm_int_dict = {25016: 14, 25017: 15}
+         self.emo_dict = {
+             "unk": 25009,
+             "happy": 25001,
+             "sad": 25002,
+             "angry": 25003,
+             "neutral": 25004,
+         }
+
+         self.load_embeddings(emb_path, language, use_itn)
+         self.language = language
+
+         # decoder
+         if beam_size > 1 and hot_words is not None:
+             self.beam_size = beam_size
+             symbol_table = {}
+             for i in range(self.tokenizer.get_vocab_size()):
+                 symbol_table[self.tokenizer.decode(i)] = i
+             self.decoder = CTCDecoder(hot_words, symbol_table, bpe_model)
+         else:
+             self.beam_size = 1
+             self.decoder = CTCDecoder()
+
+         if streaming:
+             self.cur_idx = -1
+             self.chunk_size = max_len - self.padding
+             self.caches_shape = (max_len, self.input_size)
+             self.caches = np.zeros(self.caches_shape, dtype=np.float32)
+             self.zeros = np.zeros((1, self.input_size), dtype=np.float32)
+             self.neg_mean, self.inv_stddev = (
+                 self.frontend.cmvn[0, :],
+                 self.frontend.cmvn[1, :],
+             )
+
+             self.fbank = OnlineFbank(window_type="hamming")
+             self.masks = sequence_mask(
+                 np.array([self.max_len], dtype=np.int32),
+                 maxlen=self.max_len,
+                 dtype=np.float32,
+             )
+
+     @property
+     def language_options(self):
+         return list(self.lid_dict.keys())
+
+     @property
+     def textnorm_options(self):
+         return list(self.textnorm_dict.keys())
+
+     def load_embeddings(self, emb_path, language, use_itn):
+         self.embeddings = np.load(emb_path, allow_pickle=True).item()
+         self.language_query = self.embeddings[language]
+         self.textnorm_query = (
+             self.embeddings["withitn"] if use_itn else self.embeddings["woitn"]
+         )
+         self.event_emo_query = self.embeddings["event_emo"]
+         self.input_query = np.concatenate(
+             (self.textnorm_query, self.language_query, self.event_emo_query), axis=1
+         )
+         self.query_num = self.input_query.shape[1]
+
+     def choose_language(self, language):
+         self.language_query = self.embeddings[language]
+         self.input_query = np.concatenate(
+             (self.textnorm_query, self.language_query, self.event_emo_query), axis=1
+         )
+         self.language = language
+
+     def load_data(self, filepath: str) -> np.ndarray:
+         waveform, _ = librosa.load(filepath, sr=self.sample_rate)
+         return waveform.flatten()
+
+     @staticmethod
+     def pad_feats(feats: List[np.ndarray], max_feat_len: int) -> np.ndarray:
+         def pad_feat(feat: np.ndarray, cur_len: int) -> np.ndarray:
+             pad_width = ((0, max_feat_len - cur_len), (0, 0))
+             return np.pad(feat, pad_width, "constant", constant_values=0)
+
+         feat_res = [pad_feat(feat, feat.shape[0]) for feat in feats]
+         feats = np.array(feat_res).astype(np.float32)
+         return feats
+
+     def preprocess(self, waveform):
+         feats, feats_len = [], []
+         for wf in [waveform]:
+             speech, _ = self.frontend.fbank(wf)
+             feat, feat_len = self.frontend.lfr_cmvn(speech)
+             feats.append(feat)
+             feats_len.append(feat_len)
+
+         feats = self.pad_feats(feats, np.max(feats_len))
+         feats_len = np.array(feats_len).astype(np.int32)
+         return feats, feats_len
+
+     def postprocess(self, ctc_logits, encoder_out_lens):
+         # Slice out the valid frames (the first 4 are query tokens)
+         x = ctc_logits[0, 4 : encoder_out_lens[0], :]
+
+         # Argmax over the vocabulary
+         yseq = np.argmax(x, axis=-1)
+
+         # Remove consecutive duplicates
+         yseq = unique_consecutive_np(yseq)
+
+         # Build a mask and filter out blank_id
+         mask = yseq != self.blank_id
+         token_int = yseq[mask].tolist()
+
+         return token_int
+
+     def infer_waveform(self, waveform: np.ndarray, language="auto"):
+         if language != self.language:
+             self.choose_language(language)
+
+         # start = time.time()
+         feat, feat_len = self.preprocess(waveform)
+         # print(f"Preprocess take {time.time() - start}s")
+
+         slice_len = self.max_len - self.query_num
+         slice_num = int(np.ceil(feat.shape[1] / slice_len))
+
+         asr_res = []
+         for i in range(slice_num):
+             if i == 0:
+                 sub_feat = feat[:, i * slice_len : (i + 1) * slice_len, :]
+             else:
+                 sub_feat = feat[
+                     :,
+                     i * slice_len - self.padding : (i + 1) * slice_len - self.padding,
+                     :,
+                 ]
+             # concat query
+             sub_feat = np.concatenate([self.input_query, sub_feat], axis=1)
+             real_len = sub_feat.shape[1]
+             if real_len < self.max_len:
+                 sub_feat = np.concatenate(
+                     [
+                         sub_feat,
+                         np.zeros(
+                             (1, self.max_len - real_len, sub_feat.shape[-1]),
+                             dtype=np.float32,
+                         ),
+                     ],
+                     axis=1,
+                 )
+
+             masks = sequence_mask(
+                 np.array([self.max_len], dtype=np.int32),
+                 maxlen=real_len,
+                 dtype=np.float32,
+             )
+
+             # start = time.time()
+             outputs = self.model.run(
+                 None,
+                 {
+                     "speech": sub_feat,
+                     "masks": masks,
+                     "position_encoding": self.position_encoding,
+                 },
+             )
+             ctc_logits, encoder_out_lens = outputs
+
+             token_int = self.postprocess(ctc_logits, encoder_out_lens)
+
+             if self.tokenizer is not None:
+                 asr_res.append(self.tokenizer.tokens2text(token_int))
+             else:
+                 asr_res.append(token_int)
+
+         return asr_res
+
+     def infer(
+         self, filepath_or_data: Union[np.ndarray, str], language="auto", print_rtf=False
+     ):
+         assert not self.streaming, "This method is for the non-streaming model"
+
+         if isinstance(filepath_or_data, str):
+             waveform = self.load_data(filepath_or_data)
+         else:
+             waveform = filepath_or_data
+
+         total_time = waveform.shape[-1] / self.sample_rate
+
+         start = time.time()
+         asr_res = self.infer_waveform(waveform, language)
+         latency = time.time() - start
+
+         if print_rtf:
+             rtf = latency / total_time
+             print(f"RTF: {rtf} Latency: {latency}s Total length: {total_time}s")
+         return "".join(asr_res)
+
+     def decode(self, times, tokens):
+         times_ms = []
+         for step, token in zip(times, tokens):
+             if len(self.tokenizer.decode(token).strip()) == 0:
+                 continue
+             times_ms.append(step * 60)
+         return times_ms, self.tokenizer.decode(tokens)
+
+     def reset(self):
+         self.cur_idx = -1
+         self.decoder.reset()
+         self.fbank = OnlineFbank(window_type="hamming")
+         self.caches = np.zeros(self.caches_shape)
+
+     def get_size(self):
+         effective_size = self.cur_idx + 1 - self.padding
+         if effective_size <= 0:
+             return 0
+         return effective_size % self.chunk_size or self.chunk_size
+
+     def stream_infer(self, audio, is_last, language="auto"):
+         assert self.streaming, "This method is for the streaming model"
+
+         if language != self.language:
+             self.choose_language(language)
+
+         self.fbank.accept_waveform(audio, is_last)
+         features = self.fbank.get_lfr_frames(
+             neg_mean=self.neg_mean, inv_stddev=self.inv_stddev
+         )
+
+         if is_last and len(features) == 0:
+             features = self.zeros
+
+         for idx, feature in enumerate(features):
+             is_last = is_last and idx == features.shape[0] - 1
+             self.caches = np.roll(self.caches, -1, axis=0)
+             self.caches[-1, :] = feature
+             self.cur_idx += 1
+             cur_size = self.get_size()
+             if cur_size != self.chunk_size and not is_last:
+                 continue
+
+             speech = self.caches[None, ...]
+             outputs = self.model.run(
+                 None,
+                 {
+                     "speech": speech,
+                     "masks": self.masks,
+                     "position_encoding": self.position_encoding,
+                 },
+             )
+             ctc_logits, encoder_out_lens = outputs
+             probs = ctc_logits[0, 4 : encoder_out_lens[0]]
+             probs = torch.from_numpy(probs)
+
+             if cur_size != self.chunk_size:
+                 probs = probs[self.chunk_size - cur_size :]
+             if not is_last:
+                 probs = probs[: self.chunk_size]
+             if self.beam_size > 1:
+                 res = self.decoder.ctc_prefix_beam_search(
+                     probs, beam_size=self.beam_size, is_last=is_last
+                 )
+                 times_ms, text = self.decode(res["times"][0], res["tokens"][0])
+             else:
+                 res = self.decoder.ctc_greedy_search(probs, is_last=is_last)
+                 times_ms, text = self.decode(res["times"], res["tokens"])
+             yield {"timestamps": times_ms, "text": text}
am.mvn ADDED
@@ -0,0 +1,8 @@
+ <Nnet>
+ <Splice> 560 560
+ [ 0 ]
+ <AddShift> 560 560
+ <LearnRateCoef> 0 [ -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 
-13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 -8.311879 -8.600912 -9.615928 -10.43595 -11.21292 -11.88333 -12.36243 -12.63706 -12.8818 -12.83066 -12.89103 -12.95666 -13.19763 -13.40598 -13.49113 -13.5546 -13.55639 -13.51915 -13.68284 -13.53289 -13.42107 -13.65519 -13.50713 -13.75251 -13.76715 -13.87408 -13.73109 -13.70412 -13.56073 -13.53488 -13.54895 -13.56228 -13.59408 -13.62047 -13.64198 -13.66109 -13.62669 -13.58297 -13.57387 -13.4739 -13.53063 -13.48348 -13.61047 -13.64716 -13.71546 -13.79184 -13.90614 -14.03098 -14.18205 -14.35881 -14.48419 -14.60172 -14.70591 -14.83362 -14.92122 -15.00622 -15.05122 -15.03119 -14.99028 -14.92302 -14.86927 -14.82691 -14.7972 -14.76909 -14.71356 -14.61277 -14.51696 -14.42252 -14.36405 -14.30451 -14.23161 -14.19851 -14.16633 -14.15649 -14.10504 -13.99518 -13.79562 -13.3996 -12.7767 -11.71208 ]
+ <Rescale> 560 560
+ <LearnRateCoef> 0 [ 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 
0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 0.155775 0.154484 0.1527379 0.1518718 0.1506028 0.1489256 0.147067 0.1447061 0.1436307 0.1443568 0.1451849 0.1455157 0.1452821 0.1445717 0.1439195 0.1435867 0.1436018 0.1438781 0.1442086 0.1448844 0.1454756 0.145663 0.146268 0.1467386 0.1472724 0.147664 0.1480913 0.1483739 0.1488841 0.1493636 0.1497088 0.1500379 0.1502916 0.1505389 0.1506787 0.1507102 0.1505992 0.1505445 0.1505938 0.1508133 0.1509569 0.1512396 0.1514625 0.1516195 0.1516156 0.1515561 0.1514966 0.1513976 0.1512612 0.151076 0.1510596 0.1510431 0.151077 0.1511168 0.1511917 0.151023 0.1508045 0.1505885 0.1503493 0.1502373 0.1501726 0.1500762 0.1500065 0.1499782 0.150057 0.1502658 0.150469 0.1505335 0.1505505 0.1505328 0.1504275 0.1502438 0.1499674 0.1497118 0.1494661 0.1493102 0.1493681 0.1495501 0.1499738 0.1509654 ]
+ </Nnet>
chn_jpn_yue_eng_ko_spectok.bpe.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:aa87f86064c3730d799ddf7af3c04659151102cba548bce325cf06ba4da4e6a8
+ size 377341
client.py ADDED
@@ -0,0 +1,146 @@
+ import requests, json, os, time
+ import librosa
+
+ class SensevoiceClient:
+     def __init__(self, model="", url="http://0.0.0.0:12347", language="auto", stream=False):
+         self.model = model
+         self.url = url
+         self.stream = stream
+         self.language = language
+
+     def _check_service(self):
+         try:
+             response = requests.get(self.url + '/status')
+             if response.status_code == 200:
+                 return True
+         except Exception:
+             pass
+         return False
+
+     def _start_service(self):
+         if not self._check_service():
+             os.system("systemctl start sensevoice.service")
+
+         while not self._check_service():
+             print("Waiting for service to start...")
+             time.sleep(1)
+
+         return True
+
+     def _stop_service(self):
+         os.system("systemctl stop sensevoice.service")
+
+     def _get_status(self):
+         try:
+             response = requests.get(self.url + '/status')
+             if response.status_code == 200:
+                 res = json.loads(response.text)
+                 return res["status"]
+         except Exception:
+             pass
+         return "not loaded"
+
+     def _start_model(self):
+         try:
+             data = {
+                 "model_path": self.model,
+                 "sample_rate": 16000,
+                 "language": self.language,
+                 "stream": self.stream
+             }
+             response = requests.post(self.url + '/start_model', json=data)
+             if response.status_code == 200:
+                 res = json.loads(response.text)
+                 return res["status"] == 'loaded'
+         except Exception:
+             pass
+         return False
+
+     def _stop_model(self):
+         try:
+             response = requests.post(self.url + '/_stop_model')
+             if response.status_code == 200:
+                 res = json.loads(response.text)
+                 return res["status"] == 'not loaded'
+         except Exception:
+             pass
+         return False
+
+     def start(self):
+         if self._start_service():
+             print("Service started successfully.")
+         else:
+             print("Failed to start service.")
+             return False
+
+         if self._start_model():
+             print("Model started successfully.")
+         else:
+             print("Failed to start model.")
+             return False
+         return True
+
+     def stop_model(self):
+         self._stop_model()
+
+     def stop(self):
+         self._stop_model()
+         self._stop_service()
+
+     def get_wave_form(self, path):
+         waveform, _ = librosa.load(path, sr=16000)
+         return waveform
+
+     def refer(self, filepath):
+         if self.stream:
+             print("Streaming mode, use refer_stream() instead.")
+             return ""
+         waveform = self.get_wave_form(filepath)
+         data = {
+             "audio_data": waveform.tolist(),
+             "sample_rate": 16000,
+             "launguage": "auto"  # key spelling kept as-is to match the service API
+         }
+         try:
+             response = requests.post(self.url + '/asr', json=data)
+             if response.status_code == 200:
+                 res = json.loads(response.text)
+                 return res.get("text", "")
+             else:
+                 print(f"Request failed: {response.status_code}")
+                 return ""
+         except Exception as e:
+             print("Request failed:", e)
+             return ""
+
+     def refer_stream(self, filepath):
+         if not self.stream:
+             print("Non-streaming mode, use refer() instead.")
+             return
+         waveform = self.get_wave_form(filepath)
+         data = {
+             "audio_data": waveform.tolist(),
+             "sample_rate": 16000,
+             "launguage": "auto",  # key spelling kept as-is to match the service API
+             "step": 0.1,
+         }
+         print('start post')
+         try:
+             response = requests.post(self.url + '/asr_stream', json=data, stream=True)
+             for line in response.iter_lines():
+                 if line:
+                     chunk = json.loads(line)
+                     yield chunk.get("text", "")
+         except Exception as e:
+             print("Request failed:", e)
+             return
+
+
+ if __name__ == "__main__":
+     stream = True
+     client = SensevoiceClient(model="/root/models/sensevoice-maixcam2/model.mud", stream=stream)
+     if client.start() is False:
+         print("Failed to start service or model.")
+         exit()
+     if not stream:
+         print('start refer')
+         text = client.refer("example/zh.mp3")
+         print(text)
+     else:
+         print('start refer stream')
+         for text in client.refer_stream("example/zh.mp3"):
+             print(text)
config.json ADDED
File without changes
download_dataset.sh ADDED
@@ -0,0 +1,2 @@
+ wget https://github.com/ml-inory/whisper.axera/releases/download/v1.0/datasets.zip
+ unzip datasets.zip -d ./
download_utils.py ADDED
@@ -0,0 +1,33 @@
+ import os
+
+ # Speed up HF downloads by using a mirror endpoint
+ os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
+ from huggingface_hub import snapshot_download
+
+ current_file_path = os.path.dirname(__file__)
+ REPO_ROOT = "AXERA-TECH"
+ CACHE_PATH = os.path.join(current_file_path, "models")
+
+
+ def download_model(model_name: str) -> str:
+     """
+     Download a model from AXERA-TECH's Hugging Face space.
+
+     model_name: str
+         Available model names can be found at https://huggingface.co/AXERA-TECH.
+
+     Returns:
+         str: Path to model_name
+
+     """
+     os.makedirs(CACHE_PATH, exist_ok=True)
+
+     model_path = os.path.join(CACHE_PATH, model_name)
+     if not os.path.exists(model_path):
+         print(f"Downloading {model_name}...")
+         snapshot_download(
+             repo_id=f"{REPO_ROOT}/{model_name}",
+             local_dir=os.path.join(CACHE_PATH, model_name),
+         )
+
+     return model_path
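
A usage sketch for download_model. The model name below is illustrative only, not a confirmed repo; real names are listed at https://huggingface.co/AXERA-TECH:

```python
# "SenseVoice" is a hypothetical repo name used purely for illustration.
model_dir = download_model("SenseVoice")
print(f"Model files cached under: {model_dir}")
```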
embeddings.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2a453244ab037744531b97bcb8574c8442301dac11f6406fdab208dddb83b93e
+ size 25523
example/en.mp3 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f10378336a4e584f3f63799e62f99d5add3c2a401b51d3abe7d3a3a82f255ada
+ size 57441
example/ja.mp3 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:496dbc43b289e1d0d0cb916df9737450bca56acd8aaca046a7a2472363b1be53
+ size 57837
example/ko.mp3 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8612f62db8319a6cb4ab4b1d2039bfc32f174f89611889ddafdeb5c0a6070b5f
+ size 27909
example/yue.mp3 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5098eebc13530a66e4eac1f30d3246e65c9cfc4e096665f9d395aca8eff0d181
+ size 31246
example/zh.mp3 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0e64de19e4ff9a02e682955c9112f32d2317cfdbb5bc2f3504664044c993f195
+ size 44973
frontend.py ADDED
@@ -0,0 +1,460 @@
+ # -*- encoding: utf-8 -*-
+ from pathlib import Path
+ from typing import Any, Dict, Iterable, List, NamedTuple, Set, Tuple, Union
+ import copy
+
+ import numpy as np
+ import kaldi_native_fbank as knf
+
+
+ class WavFrontend:
+     """Conventional frontend structure for ASR."""
+
+     def __init__(
+         self,
+         cmvn_file: str = None,
+         fs: int = 16000,
+         window: str = "hamming",
+         n_mels: int = 80,
+         frame_length: int = 25,
+         frame_shift: int = 10,
+         lfr_m: int = 1,
+         lfr_n: int = 1,
+         dither: float = 1.0,
+         **kwargs,
+     ) -> None:
+
+         opts = knf.FbankOptions()
+         opts.frame_opts.samp_freq = fs
+         opts.frame_opts.dither = dither
+         opts.frame_opts.window_type = window
+         opts.frame_opts.frame_shift_ms = float(frame_shift)
+         opts.frame_opts.frame_length_ms = float(frame_length)
+         opts.mel_opts.num_bins = n_mels
+         opts.energy_floor = 0
+         opts.frame_opts.snip_edges = True
+         opts.mel_opts.debug_mel = False
+         self.opts = opts
+
+         self.lfr_m = lfr_m
+         self.lfr_n = lfr_n
+         self.cmvn_file = cmvn_file
+
+         if self.cmvn_file:
+             self.cmvn = self.load_cmvn()
+         self.fbank_fn = None
+         self.fbank_beg_idx = 0
+         self.reset_status()
+
+     def fbank(self, waveform: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+         waveform = waveform * (1 << 15)
+         self.fbank_fn = knf.OnlineFbank(self.opts)
+         self.fbank_fn.accept_waveform(self.opts.frame_opts.samp_freq, waveform.tolist())
+         frames = self.fbank_fn.num_frames_ready
+         mat = np.empty([frames, self.opts.mel_opts.num_bins])
+         for i in range(frames):
+             mat[i, :] = self.fbank_fn.get_frame(i)
+         feat = mat.astype(np.float32)
+         feat_len = np.array(mat.shape[0]).astype(np.int32)
+         return feat, feat_len
+
+     def fbank_online(self, waveform: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+         waveform = waveform * (1 << 15)
+         # self.fbank_fn = knf.OnlineFbank(self.opts)
+         self.fbank_fn.accept_waveform(self.opts.frame_opts.samp_freq, waveform.tolist())
+         frames = self.fbank_fn.num_frames_ready
+         mat = np.empty([frames, self.opts.mel_opts.num_bins])
+         for i in range(self.fbank_beg_idx, frames):
+             mat[i, :] = self.fbank_fn.get_frame(i)
+         # self.fbank_beg_idx += (frames-self.fbank_beg_idx)
+         feat = mat.astype(np.float32)
+         feat_len = np.array(mat.shape[0]).astype(np.int32)
+         return feat, feat_len
+
+     def reset_status(self):
+         self.fbank_fn = knf.OnlineFbank(self.opts)
+         self.fbank_beg_idx = 0
+
+     def lfr_cmvn(self, feat: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+         if self.lfr_m != 1 or self.lfr_n != 1:
+             feat = self.apply_lfr(feat, self.lfr_m, self.lfr_n)
+
+         if self.cmvn_file:
+             feat = self.apply_cmvn(feat)
+
+         feat_len = np.array(feat.shape[0]).astype(np.int32)
+         return feat, feat_len
+
+     @staticmethod
+     def apply_lfr(inputs: np.ndarray, lfr_m: int, lfr_n: int) -> np.ndarray:
+         LFR_inputs = []
+
+         T = inputs.shape[0]
+         T_lfr = int(np.ceil(T / lfr_n))
+         left_padding = np.tile(inputs[0], ((lfr_m - 1) // 2, 1))
+         inputs = np.vstack((left_padding, inputs))
+         T = T + (lfr_m - 1) // 2
+         for i in range(T_lfr):
+             if lfr_m <= T - i * lfr_n:
+                 LFR_inputs.append(
+                     (inputs[i * lfr_n : i * lfr_n + lfr_m]).reshape(1, -1)
+                 )
+             else:
+                 # process last LFR frame
+                 num_padding = lfr_m - (T - i * lfr_n)
+                 frame = inputs[i * lfr_n :].reshape(-1)
+                 for _ in range(num_padding):
+                     frame = np.hstack((frame, inputs[-1]))
+
+                 LFR_inputs.append(frame)
+         LFR_outputs = np.vstack(LFR_inputs).astype(np.float32)
+         return LFR_outputs
+
+     def apply_cmvn(self, inputs: np.ndarray) -> np.ndarray:
+         """
+         Apply CMVN with mvn data
+         """
+         frame, dim = inputs.shape
+         means = np.tile(self.cmvn[0:1, :dim], (frame, 1))
+         vars = np.tile(self.cmvn[1:2, :dim], (frame, 1))
+         inputs = (inputs + means) * vars
+         return inputs
+
+     def load_cmvn(
+         self,
+     ) -> np.ndarray:
+         with open(self.cmvn_file, "r", encoding="utf-8") as f:
+             lines = f.readlines()
+
+         means_list = []
+         vars_list = []
+         for i in range(len(lines)):
+             line_item = lines[i].split()
+             if line_item[0] == "<AddShift>":
+                 line_item = lines[i + 1].split()
+                 if line_item[0] == "<LearnRateCoef>":
+                     add_shift_line = line_item[3 : (len(line_item) - 1)]
+                     means_list = list(add_shift_line)
+                     continue
+             elif line_item[0] == "<Rescale>":
+                 line_item = lines[i + 1].split()
+                 if line_item[0] == "<LearnRateCoef>":
+                     rescale_line = line_item[3 : (len(line_item) - 1)]
+                     vars_list = list(rescale_line)
+                     continue
+
+         means = np.array(means_list).astype(np.float64)
+         vars = np.array(vars_list).astype(np.float64)
+         cmvn = np.array([means, vars])
+         return cmvn
+
+
+ class WavFrontendOnline(WavFrontend):
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+         # self.fbank_fn = knf.OnlineFbank(self.opts)
+         # add variables
+         self.frame_sample_length = int(
+             self.opts.frame_opts.frame_length_ms * self.opts.frame_opts.samp_freq / 1000
+         )
+         self.frame_shift_sample_length = int(
+             self.opts.frame_opts.frame_shift_ms * self.opts.frame_opts.samp_freq / 1000
+         )
+         self.waveform = None
+         self.reserve_waveforms = None
+         self.input_cache = None
+         self.lfr_splice_cache = []
+
+     @staticmethod
+     # inputs has catted the cache
+     def apply_lfr(
+         inputs: np.ndarray, lfr_m: int, lfr_n: int, is_final: bool = False
+     ) -> Tuple[np.ndarray, np.ndarray, int]:
+         """
+         Apply lfr with data
+         """
+
+         LFR_inputs = []
+         T = inputs.shape[0]  # include the right context
+         T_lfr = int(
+             np.ceil((T - (lfr_m - 1) // 2) / lfr_n)
+         )  # minus the right context: (lfr_m - 1) // 2
+         splice_idx = T_lfr
+         for i in range(T_lfr):
+             if lfr_m <= T - i * lfr_n:
+                 LFR_inputs.append(
+                     (inputs[i * lfr_n : i * lfr_n + lfr_m]).reshape(1, -1)
+                 )
+             else:  # process last LFR frame
+                 if is_final:
+                     num_padding = lfr_m - (T - i * lfr_n)
+                     frame = (inputs[i * lfr_n :]).reshape(-1)
+                     for _ in range(num_padding):
+                         frame = np.hstack((frame, inputs[-1]))
+                     LFR_inputs.append(frame)
+                 else:
+                     # update splice_idx and break the loop
+                     splice_idx = i
+                     break
+         splice_idx = min(T - 1, splice_idx * lfr_n)
+         lfr_splice_cache = inputs[splice_idx:, :]
+         LFR_outputs = np.vstack(LFR_inputs)
+         return LFR_outputs.astype(np.float32), lfr_splice_cache, splice_idx
+
+     @staticmethod
+     def compute_frame_num(
+         sample_length: int, frame_sample_length: int, frame_shift_sample_length: int
+     ) -> int:
+         frame_num = int(
+             (sample_length - frame_sample_length) / frame_shift_sample_length + 1
+         )
+         return (
+             frame_num if frame_num >= 1 and sample_length >= frame_sample_length else 0
+         )
+
+     def fbank(
+         self, input: np.ndarray, input_lengths: np.ndarray
+     ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+         self.fbank_fn = knf.OnlineFbank(self.opts)
+         batch_size = input.shape[0]
+         if self.input_cache is None:
+             self.input_cache = np.empty((batch_size, 0), dtype=np.float32)
+         input = np.concatenate((self.input_cache, input), axis=1)
+         frame_num = self.compute_frame_num(
+             input.shape[-1], self.frame_sample_length, self.frame_shift_sample_length
+         )
+         # update self.in_cache
+         self.input_cache = input[
+             :, -(input.shape[-1] - frame_num * self.frame_shift_sample_length) :
+         ]
+         waveforms = np.empty(0, dtype=np.float32)
+         feats_pad = np.empty(0, dtype=np.float32)
+         feats_lens = np.empty(0, dtype=np.int32)
+         if frame_num:
+             waveforms = []
+             feats = []
+             feats_lens = []
+             for i in range(batch_size):
+                 waveform = input[i]
+                 waveforms.append(
+                     waveform[
+                         : (
+                             (frame_num - 1) * self.frame_shift_sample_length
+                             + self.frame_sample_length
+                         )
+                     ]
+                 )
+                 waveform = waveform * (1 << 15)
+
+                 self.fbank_fn.accept_waveform(
+                     self.opts.frame_opts.samp_freq, waveform.tolist()
+                 )
+                 frames = self.fbank_fn.num_frames_ready
+                 mat = np.empty([frames, self.opts.mel_opts.num_bins])
+                 # use a separate index so the batch index i is not clobbered
+                 for j in range(frames):
+                     mat[j, :] = self.fbank_fn.get_frame(j)
+                 feat = mat.astype(np.float32)
+                 feat_len = np.array(mat.shape[0]).astype(np.int32)
+                 feats.append(feat)
+                 feats_lens.append(feat_len)
+
+             waveforms = np.stack(waveforms)
+             feats_lens = np.array(feats_lens)
+             feats_pad = np.array(feats)
+         self.fbanks = feats_pad
+         self.fbanks_lens = copy.deepcopy(feats_lens)
+         return waveforms, feats_pad, feats_lens
+
+     def get_fbank(self) -> Tuple[np.ndarray, np.ndarray]:
+         return self.fbanks, self.fbanks_lens
+
+     def lfr_cmvn(
+         self, input: np.ndarray, input_lengths: np.ndarray, is_final: bool = False
+     ) -> Tuple[np.ndarray, np.ndarray, List[int]]:
+         batch_size = input.shape[0]
+         feats = []
+         feats_lens = []
+         lfr_splice_frame_idxs = []
+         for i in range(batch_size):
+             mat = input[i, : input_lengths[i], :]
+             lfr_splice_frame_idx = -1
+             if self.lfr_m != 1 or self.lfr_n != 1:
+                 # update self.lfr_splice_cache in self.apply_lfr
+                 mat, self.lfr_splice_cache[i], lfr_splice_frame_idx = self.apply_lfr(
+                     mat, self.lfr_m, self.lfr_n, is_final
+                 )
+             if self.cmvn_file is not None:
+                 mat = self.apply_cmvn(mat)
+             feat_length = mat.shape[0]
+             feats.append(mat)
+             feats_lens.append(feat_length)
+             lfr_splice_frame_idxs.append(lfr_splice_frame_idx)
+
+         feats_lens = np.array(feats_lens)
+         feats_pad = np.array(feats)
+         return feats_pad, feats_lens, lfr_splice_frame_idxs
+
+     def extract_fbank(
+         self, input: np.ndarray, input_lengths: np.ndarray, is_final: bool = False
+     ) -> Tuple[np.ndarray, np.ndarray]:
+         batch_size = input.shape[0]
+         assert (
+             batch_size == 1
+         ), "online feature extraction currently supports batch size 1 only"
+         waveforms, feats, feats_lengths = self.fbank(
+             input, input_lengths
+         )  # input shape: B T D
+         if feats.shape[0]:
+             self.waveforms = (
+                 waveforms
+                 if self.reserve_waveforms is None
+                 else np.concatenate((self.reserve_waveforms, waveforms), axis=1)
+             )
+             if not self.lfr_splice_cache:
+                 for i in range(batch_size):
+                     self.lfr_splice_cache.append(
+                         np.expand_dims(feats[i][0, :], axis=0).repeat(
+                             (self.lfr_m - 1) // 2, axis=0
+                         )
+                     )
+
+             if feats_lengths[0] + self.lfr_splice_cache[0].shape[0] >= self.lfr_m:
+                 lfr_splice_cache_np = np.stack(self.lfr_splice_cache)  # B T D
+                 feats = np.concatenate((lfr_splice_cache_np, feats), axis=1)
+                 feats_lengths += lfr_splice_cache_np[0].shape[0]
+                 frame_from_waveforms = int(
+                     (self.waveforms.shape[1] - self.frame_sample_length)
+                     / self.frame_shift_sample_length
+                     + 1
+                 )
+                 minus_frame = (
+                     (self.lfr_m - 1) // 2 if self.reserve_waveforms is None else 0
+                 )
+                 feats, feats_lengths, lfr_splice_frame_idxs = self.lfr_cmvn(
+                     feats, feats_lengths, is_final
+                 )
+                 if self.lfr_m == 1:
+                     self.reserve_waveforms = None
+                 else:
+                     reserve_frame_idx = lfr_splice_frame_idxs[0] - minus_frame
+                     # print('reserve_frame_idx: ' + str(reserve_frame_idx))
+                     # print('frame_frame: ' + str(frame_from_waveforms))
+                     self.reserve_waveforms = self.waveforms[
+                         :,
+                         reserve_frame_idx
+                         * self.frame_shift_sample_length : frame_from_waveforms
+                         * self.frame_shift_sample_length,
+                     ]
+                     sample_length = (
+                         frame_from_waveforms - 1
+                     ) * self.frame_shift_sample_length + self.frame_sample_length
+                     self.waveforms = self.waveforms[:, :sample_length]
+             else:
+                 # update self.reserve_waveforms and self.lfr_splice_cache
+                 self.reserve_waveforms = self.waveforms[
+                     :, : -(self.frame_sample_length - self.frame_shift_sample_length)
+                 ]
+                 for i in range(batch_size):
+                     self.lfr_splice_cache[i] = np.concatenate(
+                         (self.lfr_splice_cache[i], feats[i]), axis=0
+                     )
+                 return np.empty(0, dtype=np.float32), feats_lengths
+         else:
+             if is_final:
+                 self.waveforms = (
+                     waveforms
+                     if self.reserve_waveforms is None
+                     else self.reserve_waveforms
+                 )
+                 feats = np.stack(self.lfr_splice_cache)
+                 feats_lengths = np.zeros(batch_size, dtype=np.int32) + feats.shape[1]
+                 feats, feats_lengths, _ = self.lfr_cmvn(feats, feats_lengths, is_final)
+         if is_final:
+             self.cache_reset()
+         return feats, feats_lengths
+
+     def get_waveforms(self):
+         return self.waveforms
+
+     def cache_reset(self):
+         self.fbank_fn = knf.OnlineFbank(self.opts)
+         self.reserve_waveforms = None
+         self.input_cache = None
+         self.lfr_splice_cache = []
+
+
+ def load_bytes(input):
+     middle_data = np.frombuffer(input, dtype=np.int16)
+     middle_data = np.asarray(middle_data)
+     if middle_data.dtype.kind not in "iu":
+         raise TypeError("'middle_data' must be an array of integers")
+     dtype = np.dtype("float32")
+     if dtype.kind != "f":
+         raise TypeError("'dtype' must be a floating point type")
+
+     i = np.iinfo(middle_data.dtype)
+     abs_max = 2 ** (i.bits - 1)
+     offset = i.min + abs_max
+     array = np.frombuffer(
+         (middle_data.astype(dtype) - offset) / abs_max, dtype=np.float32
+     )
+     return array
+
+
+ class SinusoidalPositionEncoderOnline:
+     """Streaming positional encoding."""
+
+     def encode(
+         self,
+         positions: np.ndarray = None,
+         depth: int = None,
+         dtype: np.dtype = np.float32,
+     ):
+         batch_size = positions.shape[0]
+         positions = positions.astype(dtype)
+         log_timescale_increment = np.log(np.array([10000], dtype=dtype)) / (
+             depth / 2 - 1
+         )
+         inv_timescales = np.exp(
+             np.arange(depth / 2).astype(dtype) * (-log_timescale_increment)
+         )
+         inv_timescales = np.reshape(inv_timescales, [batch_size, -1])
+         scaled_time = np.reshape(positions, [1, -1, 1]) * np.reshape(
+             inv_timescales, [1, 1, -1]
+         )
+         encoding = np.concatenate((np.sin(scaled_time), np.cos(scaled_time)), axis=2)
+         return encoding.astype(dtype)
+
+     def forward(self, x, start_idx=0):
+         batch_size, timesteps, input_dim = x.shape
+         positions = np.arange(1, timesteps + 1 + start_idx)[None, :]
+         position_encoding = self.encode(positions, input_dim, x.dtype)
+
+         return x + position_encoding[:, start_idx : start_idx + timesteps]
+
+
+ def test():
+     path = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav"
+     import librosa
+
+     cmvn_file = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/am.mvn"
+     config_file = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/config.yaml"
+     from funasr.runtime.python.onnxruntime.rapid_paraformer.utils.utils import read_yaml
+
+     config = read_yaml(config_file)
+     waveform, _ = librosa.load(path, sr=None)
+     frontend = WavFrontend(
+         cmvn_file=cmvn_file,
+         **config["frontend_conf"],
+     )
+     speech, _ = frontend.fbank_online(waveform)  # 1d, (sample,), numpy
+     feat, feat_len = frontend.lfr_cmvn(
+         speech
+     )  # 2d, (frame, 450), np.float32 -> torch, torch.from_numpy(), dtype, (1, frame, 450)
+
+     frontend.reset_status()  # clear cache
+     return feat, feat_len
+
+
+ if __name__ == "__main__":
+     test()
gradio_demo.py ADDED
@@ -0,0 +1,62 @@
+ import gradio as gr
+ import os
+ from SenseVoiceAx import SenseVoiceAx
+ from print_utils import rich_transcription_postprocess
+
+ max_len = 256
+
+ model_path = os.path.join("sensevoice_ax650", "sensevoice.axmodel")
+
+ assert os.path.exists(model_path), f"model {model_path} does not exist"
+
+ pipeline = SenseVoiceAx(
+     model_path,
+     max_len=max_len,
+     beam_size=3,
+     language="auto",
+     hot_words=None,
+     use_itn=True,
+     streaming=False,
+ )
+
+
+ def speech_to_text(audio_path, lang):
+     """
+     audio_path: path to the audio file
+     lang: language type, one of "auto", "zh", "en", "yue", "ja", "ko"
+     """
+     if not audio_path:
+         return "No audio"
+
+     pipeline.choose_language(language=lang)
+     asr_res = pipeline.infer(audio_path, print_rtf=False)
+
+     return asr_res
+
+
+ def main():
+     with gr.Blocks() as demo:
+         with gr.Row():
+             output_text = gr.Textbox(label="Recognition result", lines=5)
+
+         with gr.Row():
+             audio_input = gr.Audio(
+                 sources=["upload"], type="filepath", label="Record or upload audio", format="mp3"
+             )
+             lang_dropdown = gr.Dropdown(
+                 choices=["auto", "zh", "en", "yue", "ja", "ko"],
+                 value="auto",
+                 label="Select audio language",
+             )
+
+         audio_input.change(
+             fn=speech_to_text, inputs=[audio_input, lang_dropdown], outputs=output_text
+         )
+
+     demo.launch(
+         server_name="0.0.0.0",
+     )
+
+
+ if __name__ == "__main__":
+     main()
main.py ADDED
@@ -0,0 +1,79 @@
+ import os
+ import argparse
+ from SenseVoiceAx import SenseVoiceAx
+ import librosa
+ import numpy as np
+ import time
+
+
+ def get_args():
+     parser = argparse.ArgumentParser()
+     parser.add_argument(
+         "--input", "-i", required=True, type=str, help="Input audio file"
+     )
+     parser.add_argument(
+         "--language",
+         "-l",
+         required=False,
+         type=str,
+         default="auto",
+         choices=["auto", "zh", "en", "yue", "ja", "ko"],
+     )
+     parser.add_argument("--streaming", action="store_true")
+     return parser.parse_args()
+
+
+ def main():
+     args = get_args()
+
+     input_audio = args.input
+     language = args.language
+     use_itn = True  # punctuation prediction (inverse text normalization)
+     if not args.streaming:
+         max_len = 256
+         model_path = os.path.join("sensevoice_ax630c", "sensevoice.axmodel")
+     else:
+         max_len = 26
+         model_path = os.path.join("sensevoice_ax630c", "streaming_sensevoice.axmodel")
+
+     assert os.path.exists(model_path), f"model {model_path} does not exist"
+
+     print(f"input_audio: {input_audio}")
+     print(f"language: {language}")
+     print(f"use_itn: {use_itn}")
+     print(f"model_path: {model_path}")
+     print(f"streaming: {args.streaming}")
+
+     pipeline = SenseVoiceAx(
+         model_path,
+         max_len=max_len,
+         beam_size=3,
+         language=language,  # honor the CLI flag instead of hard-coding "auto"
+         hot_words=None,
+         use_itn=use_itn,
+         streaming=args.streaming,
+     )
+
+     if not args.streaming:
+         asr_res = pipeline.infer(input_audio, print_rtf=True)
+         print("ASR result: " + asr_res)
+     else:
+         samples, sr = librosa.load(input_audio, sr=16000)
+         samples = (samples * 32768).tolist()
+         duration = len(samples) / 16000
+
+         start = time.time()
+         step = int(0.1 * sr)
+         for i in range(0, len(samples), step):
+             is_last = i + step >= len(samples)
+             for res in pipeline.stream_infer(samples[i : i + step], is_last):
+                 print(res)
+
+         end = time.time()
+         cost_time = end - start
+
+         print(f"RTF: {cost_time / duration}")
+
+
+ if __name__ == "__main__":
+     main()
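
Example invocations for both modes (input.wav is a placeholder path; the model files under sensevoice_ax630c/ must already be present):

    # offline (whole-file) recognition with RTF reporting
    python main.py -i input.wav -l zh
    # simulated streaming, feeding 100 ms chunks
    python main.py -i input.wav --streaming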
model.mud ADDED
@@ -0,0 +1,15 @@
+ [basic]
+ type = axmodel
+ model_npu = sensevoice_ax630c/sensevoice.axmodel
+ model_vnpu =
+
+ [extra]
+ model_type = sensevoice
+ input_cache = true
+ output_cache = true
+ beam_size = 3
+ language = auto
+ hot_words = None
+ use_itn = True
+ stream_model = sensevoice_ax630c/streaming_sensevoice.axmodel
+ server_url = http://127.0.0.1:12345
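
This .mud file is plain INI, which server.py (added below) reads via configparser and then coerces true/false, None/empty, and digit strings into native types. A quick sketch of reading it directly, for reference:

    import configparser

    cfg = configparser.ConfigParser()
    cfg.read("model.mud", encoding="utf-8")
    print(cfg["extra"]["beam_size"])  # "3" as a raw string here;
    # server.py's parse_config_file_to_json turns it into int 3,
    # "True" into bool True, and empty/None values into None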
pe_nonstream.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0f1c9c550bd62fa164a959517f52d46a28591812fafdf002df0df2bd998f44b5
+ size 573568
pe_streaming.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:54fec2fe2670168d36678c5857e65c459c634e6b6d6df928b7d415399ce2c291
+ size 58368
print_utils.py ADDED
@@ -0,0 +1,131 @@
+ emo_dict = {
+     "<|HAPPY|>": "😊",
+     "<|SAD|>": "😔",
+     "<|ANGRY|>": "😡",
+     "<|NEUTRAL|>": "",
+     "<|FEARFUL|>": "😰",
+     "<|DISGUSTED|>": "🤢",
+     "<|SURPRISED|>": "😮",
+ }
+
+ event_dict = {
+     "<|BGM|>": "🎼",
+     "<|Speech|>": "",
+     "<|Applause|>": "👏",
+     "<|Laughter|>": "😀",
+     "<|Cry|>": "😭",
+     "<|Sneeze|>": "🤧",
+     "<|Breath|>": "",
+     "<|Cough|>": "🤧",
+ }
+
+ lang_dict = {
+     "<|zh|>": "<|lang|>",
+     "<|en|>": "<|lang|>",
+     "<|yue|>": "<|lang|>",
+     "<|ja|>": "<|lang|>",
+     "<|ko|>": "<|lang|>",
+     "<|nospeech|>": "<|lang|>",
+ }
+
+ emoji_dict = {
+     "<|nospeech|><|Event_UNK|>": "❓",
+     "<|zh|>": "",
+     "<|en|>": "",
+     "<|yue|>": "",
+     "<|ja|>": "",
+     "<|ko|>": "",
+     "<|nospeech|>": "",
+     "<|HAPPY|>": "😊",
+     "<|SAD|>": "😔",
+     "<|ANGRY|>": "😡",
+     "<|NEUTRAL|>": "",
+     "<|BGM|>": "🎼",
+     "<|Speech|>": "",
+     "<|Applause|>": "👏",
+     "<|Laughter|>": "😀",
+     "<|FEARFUL|>": "😰",
+     "<|DISGUSTED|>": "🤢",
+     "<|SURPRISED|>": "😮",
+     "<|Cry|>": "😭",
+     "<|EMO_UNKNOWN|>": "",
+     "<|Sneeze|>": "🤧",
+     "<|Breath|>": "",
+     "<|Cough|>": "😷",
+     "<|Sing|>": "",
+     "<|Speech_Noise|>": "",
+     "<|withitn|>": "",
+     "<|woitn|>": "",
+     "<|GBG|>": "",
+     "<|Event_UNK|>": "",
+ }
+
+ emo_set = {"😊", "😔", "😡", "😰", "🤢", "😮"}
+ event_set = {
+     "🎼",
+     "👏",
+     "😀",
+     "😭",
+     "🤧",
+     "😷",
+ }
+
+
+ def format_str_v2(s):
+     sptk_dict = {}
+     for sptk in emoji_dict:
+         sptk_dict[sptk] = s.count(sptk)
+         s = s.replace(sptk, "")
+     emo = "<|NEUTRAL|>"
+     for e in emo_dict:
+         if sptk_dict[e] > sptk_dict[emo]:
+             emo = e
+     for e in event_dict:
+         if sptk_dict[e] > 0:
+             s = event_dict[e] + s
+     s = s + emo_dict[emo]
+
+     for emoji in emo_set.union(event_set):
+         s = s.replace(" " + emoji, emoji)
+         s = s.replace(emoji + " ", emoji)
+     return s.strip()
+
+
+ def rich_transcription_postprocess(s):
+     def get_emo(s):
+         return s[-1] if s[-1] in emo_set else None
+
+     def get_event(s):
+         return s[0] if s[0] in event_set else None
+
+     s = s.replace("<|nospeech|><|Event_UNK|>", "❓")
+     for lang in lang_dict:
+         s = s.replace(lang, "<|lang|>")
+     s_list = [format_str_v2(s_i).strip(" ") for s_i in s.split("<|lang|>")]
+     new_s = " " + s_list[0]
+     cur_ent_event = get_event(new_s)
+     for i in range(1, len(s_list)):
+         if len(s_list[i]) == 0:
+             continue
+         if get_event(s_list[i]) == cur_ent_event and get_event(s_list[i]) is not None:
+             s_list[i] = s_list[i][1:]
+         cur_ent_event = get_event(s_list[i])
+         if get_emo(s_list[i]) is not None and get_emo(s_list[i]) == get_emo(new_s):
+             new_s = new_s[:-1]
+         new_s += s_list[i].strip()
+     new_s = new_s.replace("The.", " ")
+     return new_s.strip()
+
+
+ def rich_print_asr_res(asr_res, will_print=True, remove_punc=False):
+     res = "".join([rich_transcription_postprocess(i) for i in asr_res])
+
+     if remove_punc:
+         res = res.replace(",", "")
+         res = res.replace("。", "")
+
+     if will_print:
+         print(res)
+
+     return res
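
As a quick sanity check of the postprocessing above (the tagged string is illustrative, not real model output):

    from print_utils import rich_transcription_postprocess

    raw = "<|zh|><|HAPPY|><|Speech|><|withitn|>你好,世界。"
    print(rich_transcription_postprocess(raw))  # -> 你好,世界。😊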
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ huggingface_hub
+ numpy<2
+ kaldi-native-fbank
+ librosa==0.9.1
+ sentencepiece
+ fastapi
+ gradio
+ emoji
+ asr-decoder
+ online-fbank
+ torch
sensevoice_ax630c/sensevoice.axmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:67d290cf7cebf45db5f37b2e93b8bdfff44dc35110bb29d84204a5f9eae9fd4d
+ size 256550253
sensevoice_ax630c/streaming_sensevoice.axmodel ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ba1ddd60841297903bfdae059ad88092d0fd1c543e1d80d7f64199d4e27b8263
+ size 249023211
server.py ADDED
@@ -0,0 +1,270 @@
+ import numpy as np
+ from fastapi import FastAPI, HTTPException, Body
+ from fastapi.responses import JSONResponse, StreamingResponse
+ from typing import List, Optional
+ import logging
+ import json
+ import configparser
+ from SenseVoiceAx import SenseVoiceAx
+ import os
+ import librosa
+
+ # set up logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ app = FastAPI(title="ASR Server", description="Automatic Speech Recognition API")
+
+ # module-level state for the loaded model
+ asr_model = None
+ asr_model_is_loaded = False
+ mud_configs = None
+
+
+ def parse_config_file_to_json(file_path):
+     """Read an INI-style config file and parse it into a dict."""
+     if not os.path.exists(file_path):
+         raise FileNotFoundError(f"Config file does not exist: {file_path}")
+
+     config = configparser.ConfigParser()
+     config.read(file_path, encoding="utf-8")
+
+     result = {}
+     for section in config.sections():
+         result[section] = {}
+         for key, value in config[section].items():
+             # simple type conversion
+             value = value.strip()
+
+             if value.lower() == "true":
+                 result[section][key] = True
+             elif value.lower() == "false":
+                 result[section][key] = False
+             elif value.lower() == "none" or value == "":
+                 result[section][key] = None
+             elif value.isdigit():
+                 result[section][key] = int(value)
+             else:
+                 result[section][key] = value
+
+     return result
+
+
+ @app.on_event("startup")
+ async def load_model():
+     pass
+
+
+ def validate_audio_data(audio_data: List[float]) -> np.ndarray:
+     """
+     Validate audio data and convert it to a numpy array.
+
+     Args:
+         audio_data: audio samples as a list of floats
+
+     Returns:
+         the validated numpy array
+     """
+     try:
+         # convert to a numpy array
+         np_array = np.array(audio_data, dtype=np.float32)
+
+         # validate the data
+         if np_array.ndim != 1:
+             raise ValueError("Audio data must be 1-dimensional")
+
+         if len(np_array) == 0:
+             raise ValueError("Audio data cannot be empty")
+
+         return np_array
+     except Exception as e:
+         raise ValueError(f"Invalid audio data: {str(e)}")
+
+
+ @app.get("/get_language", summary="Get current language")
+ async def get_language():
+     return JSONResponse(content={"language": asr_model.language})
+
+
+ @app.get(
+     "/get_language_options",
+     summary="Get possible language options, possible options include [auto, zh, en, yue, ja, ko]",
+ )
+ async def get_language_options():
+     return JSONResponse(content={"language_options": asr_model.language_options})
+
+
+ @app.get("/status", summary="Get ASR model status")
+ async def get_status():
+     global asr_model_is_loaded
+     return JSONResponse(content={"status": "loaded" if asr_model_is_loaded else "not loaded"})
+
+
+ @app.post("/start_model", summary="Load model")
+ async def start_model(
+     model_path: str = Body(
+         "sensevoice_ax630c/sensevoice.axmodel",
+         description="Path to the model file",
+     ),
+     language: str = Body("auto", description="Language"),
+     stream: bool = Body(False, description="streaming or not"),
+ ):
+     """
+     Load the ASR model described by a .mud config file.
+     """
+     global asr_model
+     global asr_model_is_loaded
+     global mud_configs
+     logger.info("Loading ASR model...")
+
+     if asr_model_is_loaded:
+         return JSONResponse(content={"status": "loaded"})
+
+     try:
+         mud_configs = parse_config_file_to_json(model_path)
+         axmodel_path = mud_configs.get("basic", {}).get("model_npu", None)
+         streaming_axmodel_path = mud_configs.get("extra", {}).get("stream_model", None)
+         model_dir_path = os.path.dirname(model_path)
+         if stream:
+             if streaming_axmodel_path is None:
+                 logger.error("stream_model is not set in the config")
+                 raise HTTPException(status_code=400, detail="stream_model is not set in the config")
+             model_path = os.path.join(model_dir_path, streaming_axmodel_path)
+         else:
+             if axmodel_path is None:
+                 logger.error("model_npu is not set in the config")
+                 raise HTTPException(status_code=400, detail="model_npu is not set in the config")
+             model_path = os.path.join(model_dir_path, axmodel_path)
+
+         # load the model
+         beam_size = mud_configs.get("extra", {}).get("beam_size", 3)
+         hot_words = mud_configs.get("extra", {}).get("hot_words", None)
+         use_itn = mud_configs.get("extra", {}).get("use_itn", True)  # inverse text normalization
+         streaming = stream
+         max_len = 26 if streaming else 256
+
+         print(f"model path: {model_path}")
+         print(f"max_len: {max_len}")
+         print(f"beam_size: {beam_size}")
+         print(f"language: {language}")
+         print(f"hot_words: {hot_words}")
+         print(f"use_itn: {use_itn}")
+         print(f"streaming: {streaming}")
+
+         if not os.path.exists(model_path):
+             raise HTTPException(status_code=400, detail=f"model {model_path} does not exist")
+
+         asr_model = SenseVoiceAx(
+             model_path,
+             max_len=max_len,
+             beam_size=beam_size,
+             language=language,
+             hot_words=hot_words,
+             use_itn=use_itn,
+             streaming=streaming,
+         )
+         asr_model_is_loaded = True
+
+         logger.info("ASR model loaded successfully")
+     except Exception as e:
+         logger.error(f"Failed to load ASR model: {str(e)}")
+         raise
+
+     return JSONResponse(content={"status": "loaded"})
+
+
+ @app.post("/stop_model", summary="Unload model")
+ async def stop_model():
+     global asr_model
+     global asr_model_is_loaded
+     del asr_model
+     asr_model = None
+     asr_model_is_loaded = False
+     return JSONResponse(content={"status": "not loaded"})
+
+
+ @app.post("/asr", summary="Recognize speech from numpy audio data")
+ async def recognize_speech(
+     audio_data: List[float] = Body(
+         ..., embed=True, description="Audio data as list of floats"
+     ),
+     sample_rate: Optional[int] = Body(16000, description="Audio sample rate in Hz"),
+     language: Optional[str] = Body("auto", description="Language"),
+ ):
+     """
+     Accept audio data as a list of floats and return the recognition result.
+
+     Args:
+         audio_data: audio samples as a list of floats
+         sample_rate: audio sample rate (default 16000 Hz)
+
+     Returns:
+         JSON containing the recognized text
+     """
+     try:
+         # check that the model is loaded
+         if asr_model is None:
+             raise HTTPException(status_code=503, detail="ASR model not loaded")
+
+         logger.info(f"Received audio data with length: {len(audio_data)}")
+
+         # validate and convert the data
+         np_audio = validate_audio_data(audio_data)
+         if sample_rate != asr_model.sample_rate:
+             np_audio = librosa.resample(np_audio, orig_sr=sample_rate, target_sr=asr_model.sample_rate)
+
+         # run recognition
+         result = asr_model.infer_waveform(np_audio, language)
+
+         return JSONResponse(content={"text": result})
+
+     except ValueError as e:
+         logger.error(f"Validation error: {str(e)}")
+         raise HTTPException(status_code=400, detail=str(e))
+     except Exception as e:
+         logger.error(f"Recognition error: {str(e)}")
+         raise HTTPException(status_code=500, detail=str(e))
+
+
+ @app.post("/asr_stream", summary="Recognize speech from numpy audio data, streaming the results")
+ async def recognize_speech_stream(
+     audio_data: List[float] = Body(
+         ..., embed=True, description="Audio data as list of floats"
+     ),
+     sample_rate: Optional[int] = Body(16000, description="Audio sample rate in Hz"),
+     language: Optional[str] = Body("auto", description="Language"),
+     step: Optional[float] = Body(0.1, description="step in seconds"),
+ ):
+     """
+     Accept audio data as a list of floats and stream back partial recognition results.
+
+     Args:
+         audio_data: audio samples as a list of floats
+         sample_rate: audio sample rate (default 16000 Hz)
+
+     Returns:
+         streaming JSON lines containing the recognized text
+     """
+     try:
+         # check that the model is loaded
+         if asr_model is None:
+             raise HTTPException(status_code=503, detail="ASR model not loaded")
+
+         logger.info(f"Received audio data with length: {len(audio_data)}")
+
+         # validate and convert the data
+         np_audio = validate_audio_data(audio_data)
+         if sample_rate != asr_model.sample_rate:
+             np_audio = librosa.resample(np_audio, orig_sr=sample_rate, target_sr=asr_model.sample_rate)
+
+         # run streaming recognition
+         def stream_infer(np_audio, step):
+             samples = (np_audio * 32768).tolist()
+
+             step = int(step * asr_model.sample_rate)  # audio is resampled to the model rate above
+             for i in range(0, len(samples), step):
+                 is_last = i + step >= len(samples)
+                 for res in asr_model.stream_infer(samples[i : i + step], is_last, language):
+                     yield json.dumps(res) + "\n"
+
+         return StreamingResponse(stream_infer(np_audio, step), media_type="application/json")
+     except ValueError as e:
+         logger.error(f"Validation error: {str(e)}")
+         raise HTTPException(status_code=400, detail=str(e))
+     except Exception as e:
+         logger.error(f"Recognition error: {str(e)}")
+         raise HTTPException(status_code=500, detail=str(e))
+
+
+ if __name__ == "__main__":
+     import uvicorn
+
+     uvicorn.run(app, host="0.0.0.0", port=12347)
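
A minimal client sketch against this server (file names are placeholders and requests is assumed available; the endpoint names and body fields match the handlers above, and FastAPI expects all Body parameters in one JSON object):

    import requests
    import librosa

    base = "http://127.0.0.1:12347"
    # /start_model expects the .mud config path, not the raw .axmodel
    requests.post(f"{base}/start_model",
                  json={"model_path": "model.mud", "language": "auto", "stream": False})

    audio, sr = librosa.load("test.wav", sr=16000)
    r = requests.post(f"{base}/asr",
                      json={"audio_data": audio.tolist(), "sample_rate": sr, "language": "auto"})
    print(r.json()["text"])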
test_wer.py ADDED
@@ -0,0 +1,296 @@
+ import os
+ import argparse
+ from SenseVoiceAx import SenseVoiceAx
+ from tokenizer import SentencepiecesTokenizer
+ from print_utils import rich_transcription_postprocess, rich_print_asr_res
+ from download_utils import download_model
+ import logging
+ import re
+ import emoji
+
+
+ def setup_logging():
+     """Configure logging to write to both the console and a file."""
+     # directory containing this script
+     script_dir = os.path.dirname(os.path.abspath(__file__))
+     log_file = os.path.join(script_dir, "test_wer.log")
+
+     # log format
+     log_format = "%(asctime)s - %(levelname)s - %(message)s"
+     date_format = "%Y-%m-%d %H:%M:%S"
+
+     # create the root logger
+     logger = logging.getLogger()
+     logger.setLevel(logging.INFO)
+
+     # remove existing handlers
+     for handler in logger.handlers[:]:
+         logger.removeHandler(handler)
+
+     # file handler
+     file_handler = logging.FileHandler(log_file, mode="w", encoding="utf-8")
+     file_handler.setLevel(logging.INFO)
+     file_formatter = logging.Formatter(log_format, date_format)
+     file_handler.setFormatter(file_formatter)
+
+     # console handler
+     console_handler = logging.StreamHandler()
+     console_handler.setLevel(logging.INFO)
+     console_formatter = logging.Formatter(log_format, date_format)
+     console_handler.setFormatter(console_formatter)
+
+     # attach the handlers
+     logger.addHandler(file_handler)
+     logger.addHandler(console_handler)
+
+     return logger
+
+
+ class AIShellDataset:
+     def __init__(self, gt_path: str):
+         """
+         Initialize the dataset.
+
+         Args:
+             gt_path: path to the ground-truth transcript file
+         """
+         self.gt_path = gt_path
+         self.dataset_dir = os.path.dirname(gt_path)
+         self.voice_dir = os.path.join(self.dataset_dir, "aishell_S0764")
+
+         # check that the required file and folder exist
+         assert os.path.exists(gt_path), f"gt file does not exist: {gt_path}"
+         assert os.path.exists(self.voice_dir), f"aishell_S0764 folder does not exist: {self.voice_dir}"
+
+         # load the data
+         self.data = []
+         with open(gt_path, "r", encoding="utf-8") as f:
+             for line in f:
+                 line = line.strip()
+                 audio_path, gt = line.split(" ", 1)  # the transcript itself may contain spaces
+                 audio_path = os.path.join(self.voice_dir, audio_path + ".wav")
+                 self.data.append({"audio_path": audio_path, "gt": gt})
+
+         # log via logging instead of print
+         logger = logging.getLogger()
+         logger.info(f"Loaded {len(self.data)} entries")
+
+     def __iter__(self):
+         """Return an iterator over the dataset."""
+         self.index = 0
+         return self
+
+     def __next__(self):
+         """Return the next item."""
+         if self.index >= len(self.data):
+             raise StopIteration
+
+         item = self.data[self.index]
+         audio_path = item["audio_path"]
+         ground_truth = item["gt"]
+
+         self.index += 1
+         return audio_path, ground_truth
+
+     def __len__(self):
+         """Return the dataset size."""
+         return len(self.data)
+
+
+ class CommonVoiceDataset:
+     """Common Voice dataset parser"""
+
+     def __init__(self, tsv_path: str):
+         """
+         Initialize the dataset.
+
+         Args:
+             tsv_path: path to the dataset .tsv file
+         """
+         self.tsv_path = tsv_path
+         self.dataset_dir = os.path.dirname(tsv_path)
+         self.voice_dir = os.path.join(self.dataset_dir, "clips")
+
+         # check that the required file and folder exist
+         assert os.path.exists(tsv_path), f"tsv file does not exist: {tsv_path}"
+         assert os.path.exists(self.voice_dir), f"clips folder does not exist: {self.voice_dir}"
+
+         # load the data
+         self.data = []
+         with open(tsv_path, "r", encoding="utf-8") as f:
+             f.readline()  # skip the header row
+             for line in f:
+                 line = line.strip()
+                 splits = line.split("\t")
+                 audio_path = splits[1]
+                 gt = splits[3]
+                 audio_path = os.path.join(self.voice_dir, audio_path)
+                 self.data.append({"audio_path": audio_path, "gt": gt})
+
+         # log via logging instead of print
+         logger = logging.getLogger()
+         logger.info(f"Loaded {len(self.data)} entries")
+
+     def __iter__(self):
+         """Return an iterator over the dataset."""
+         self.index = 0
+         return self
+
+     def __next__(self):
+         """Return the next item."""
+         if self.index >= len(self.data):
+             raise StopIteration
+
+         item = self.data[self.index]
+         audio_path = item["audio_path"]
+         ground_truth = item["gt"]
+
+         self.index += 1
+         return audio_path, ground_truth
+
+     def __len__(self):
+         """Return the dataset size."""
+         return len(self.data)
+
+
+ def get_args():
+     parser = argparse.ArgumentParser()
+     parser.add_argument(
+         "--dataset",
+         "-d",
+         type=str,
+         required=True,
+         choices=["aishell", "common_voice"],
+         help="Test dataset",
+     )
+     parser.add_argument(
+         "--gt_path",
+         "-g",
+         type=str,
+         required=True,
+         help="Test dataset ground truth file",
+     )
+     parser.add_argument(
+         "--language",
+         "-l",
+         required=False,
+         type=str,
+         default="auto",
+         choices=["auto", "zh", "en", "yue", "ja", "ko"],
+     )
+     parser.add_argument(
+         "--max_num", type=int, default=-1, required=False, help="Maximum test data num"
+     )
+     return parser.parse_args()
+
+
+ def min_distance(word1: str, word2: str) -> int:
+     """Edit (Levenshtein) distance between two strings, via dynamic programming."""
+     row = len(word1) + 1
+     column = len(word2) + 1
+
+     cache = [[0] * column for i in range(row)]
+
+     for i in range(row):
+         for j in range(column):
+             if i == 0 and j == 0:
+                 cache[i][j] = 0
+             elif i == 0 and j != 0:
+                 cache[i][j] = j
+             elif j == 0 and i != 0:
+                 cache[i][j] = i
+             else:
+                 if word1[i - 1] == word2[j - 1]:
+                     cache[i][j] = cache[i - 1][j - 1]
+                 else:
+                     replace = cache[i - 1][j - 1] + 1
+                     insert = cache[i][j - 1] + 1
+                     remove = cache[i - 1][j] + 1
+
+                     cache[i][j] = min(replace, insert, remove)
+
+     return cache[row - 1][column - 1]
+
+
+ def remove_punctuation(text):
+     # regex matching punctuation, including common Chinese punctuation
+     pattern = r"[^\w\s]|_"
+
+     # replace every match with the empty string
+     cleaned_text = re.sub(pattern, "", text)
+
+     return cleaned_text
+
+
+ def main():
+     logger = setup_logging()
+     args = get_args()
+
+     language = args.language
+     use_itn = False  # punctuation prediction off, so WER is computed on bare text
+     max_num = args.max_num
+
+     dataset_type = args.dataset.lower()
+     if dataset_type == "aishell":
+         dataset = AIShellDataset(args.gt_path)
+     elif dataset_type == "common_voice":
+         dataset = CommonVoiceDataset(args.gt_path)
+     else:
+         raise ValueError(f"Unknown dataset type {dataset_type}")
+
+     # model_path_root = download_model("SenseVoice")
+     model_path = os.path.join("sensevoice_ax650", "sensevoice.axmodel")
+     bpemodel = "chn_jpn_yue_eng_ko_spectok.bpe.model"
+
+     assert os.path.exists(model_path), f"model {model_path} does not exist"
+
+     logger.info(f"dataset: {args.dataset}")
+     logger.info(f"language: {language}")
+     logger.info(f"use_itn: {use_itn}")
+     logger.info(f"model_path: {model_path}")
+
+     tokenizer = SentencepiecesTokenizer(bpemodel=bpemodel)
+     pipeline = SenseVoiceAx(
+         model_path, language=language, use_itn=use_itn, tokenizer=tokenizer, max_len=256
+     )
+
+     # iterate over the dataset
+     hyp = []
+     references = []
+     all_character_error_num = 0
+     all_character_num = 0
+     max_data_num = max_num if max_num > 0 else len(dataset)
+     for n, (audio_path, reference) in enumerate(dataset):
+         reference = remove_punctuation(reference).lower()
+
+         asr_res = pipeline.infer(audio_path, print_rtf=False)
+         hypothesis = rich_print_asr_res(
+             asr_res, will_print=False, remove_punc=True
+         ).lower()
+         hypothesis = emoji.replace_emoji(hypothesis, replace="")
+
+         character_error_num = min_distance(reference, hypothesis)
+         character_num = len(reference)
+         character_error_rate = character_error_num / character_num * 100
+
+         all_character_error_num += character_error_num
+         all_character_num += character_num
+
+         hyp.append(hypothesis)
+         references.append(reference)
+
+         line_content = f"({n+1}/{max_data_num}) {os.path.basename(audio_path)} gt: {reference} predict: {hypothesis} WER: {character_error_rate}%"
+         logger.info(line_content)
+
+         if n + 1 >= max_data_num:
+             break
+
+     total_character_error_rate = all_character_error_num / all_character_num * 100
+
+     logger.info(f"Total WER: {total_character_error_rate}%")
+
+
+ if __name__ == "__main__":
+     main()
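
Example run (the ground-truth path is a placeholder; the aishell_S0764 clips are expected next to it, as the dataset class asserts):

    python test_wer.py -d aishell -g /path/to/aishell_gt.txt -l zh --max_num 100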
tokenizer.py ADDED
@@ -0,0 +1,135 @@
+ import sentencepiece as spm
+
+ from pathlib import Path
+ from typing import Dict, Iterable, List, Union
+
+ import json
+ from abc import abstractmethod
+ from abc import ABC
+ import numpy as np
+
+
+ class BaseTokenizer(ABC):
+     def __init__(
+         self,
+         token_list: Union[Path, str, Iterable[str]] = None,
+         unk_symbol: str = "<unk>",
+         **kwargs,
+     ):
+         if token_list is not None:
+             if isinstance(token_list, (Path, str)) and str(token_list).endswith(".txt"):
+                 token_list = Path(token_list)
+                 self.token_list_repr = str(token_list)
+                 self.token_list: List[str] = []
+
+                 with token_list.open("r", encoding="utf-8") as f:
+                     for idx, line in enumerate(f):
+                         line = line.rstrip()
+                         self.token_list.append(line)
+             elif isinstance(token_list, (Path, str)) and str(token_list).endswith(".json"):
+                 token_list = Path(token_list)
+                 self.token_list_repr = str(token_list)
+                 self.token_list: List[str] = []
+
+                 with open(token_list, "r", encoding="utf-8") as f:
+                     self.token_list = json.load(f)
+             else:
+                 self.token_list: List[str] = list(token_list)
+                 self.token_list_repr = ""
+                 for i, t in enumerate(self.token_list):
+                     if i == 3:
+                         break
+                     self.token_list_repr += f"{t}, "
+                 self.token_list_repr += f"... (NVocab={(len(self.token_list))})"
+
+             self.token2id: Dict[str, int] = {}
+             for i, t in enumerate(self.token_list):
+                 if t in self.token2id:
+                     raise RuntimeError(f'Symbol "{t}" is duplicated')
+                 self.token2id[t] = i
+
+             self.unk_symbol = unk_symbol
+             if self.unk_symbol not in self.token2id:
+                 raise RuntimeError(
+                     f"Unknown symbol '{unk_symbol}' doesn't exist in the token_list"
+                 )
+             self.unk_id = self.token2id[self.unk_symbol]
+
+     def encode(self, text, **kwargs):
+         tokens = self.text2tokens(text)
+         text_ints = self.tokens2ids(tokens)
+
+         return text_ints
+
+     def decode(self, text_ints):
+         token = self.ids2tokens(text_ints)
+         text = self.tokens2text(token)
+         return text
+
+     def get_num_vocabulary_size(self) -> int:
+         return len(self.token_list)
+
+     def ids2tokens(self, integers: Union[np.ndarray, Iterable[int]]) -> List[str]:
+         if isinstance(integers, np.ndarray) and integers.ndim != 1:
+             raise ValueError(f"Must be 1 dim ndarray, but got {integers.ndim}")
+         return [self.token_list[i] for i in integers]
+
+     def tokens2ids(self, tokens: Iterable[str]) -> List[int]:
+         return [self.token2id.get(i, self.unk_id) for i in tokens]
+
+     @abstractmethod
+     def text2tokens(self, line: str) -> List[str]:
+         raise NotImplementedError
+
+     @abstractmethod
+     def tokens2text(self, tokens: Iterable[str]) -> str:
+         raise NotImplementedError
+
+
+ class SentencepiecesTokenizer(BaseTokenizer):
+     def __init__(self, bpemodel: Union[Path, str], **kwargs):
+         super().__init__(**kwargs)
+         self.bpemodel = str(bpemodel)
+         # NOTE(kamo):
+         # Don't build SentencePieceProcessor in __init__()
+         # because it's not picklable and it may cause the following error,
+         # "TypeError: can't pickle SwigPyObject objects",
+         # when giving it as an argument of "multiprocessing.Process()".
+         self.sp = None
+         self._build_sentence_piece_processor()
+
+     def __repr__(self):
+         return f'{self.__class__.__name__}(model="{self.bpemodel}")'
+
+     def _build_sentence_piece_processor(self):
+         # Build SentencePieceProcessor lazily.
+         if self.sp is None:
+             self.sp = spm.SentencePieceProcessor()
+             self.sp.load(self.bpemodel)
+
+     def text2tokens(self, line: str) -> List[str]:
+         self._build_sentence_piece_processor()
+         return self.sp.EncodeAsPieces(line)
+
+     def tokens2text(self, tokens: Iterable[str]) -> str:
+         self._build_sentence_piece_processor()
+         return self.sp.DecodePieces(list(tokens))
+
+     def encode(self, line: str, **kwargs) -> List[int]:
+         self._build_sentence_piece_processor()
+         return self.sp.EncodeAsIds(line)
+
+     def decode(self, line: List[int], **kwargs):
+         self._build_sentence_piece_processor()
+         return self.sp.DecodeIds(line)
+
+     def get_vocab_size(self):
+         return self.sp.GetPieceSize()
+
+     # delegate straight to SentencePiece: here ids2tokens maps ids to text,
+     # and tokens2ids maps text to ids
+     def ids2tokens(self, *args, **kwargs):
+         return self.decode(*args, **kwargs)
+
+     def tokens2ids(self, *args, **kwargs):
+         return self.encode(*args, **kwargs)
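
Typical round-trip usage (the .bpe.model file name matches the one referenced in test_wer.py):

    from tokenizer import SentencepiecesTokenizer

    tok = SentencepiecesTokenizer(bpemodel="chn_jpn_yue_eng_ko_spectok.bpe.model")
    ids = tok.encode("hello world")  # text -> list of token ids
    print(tok.decode(ids))           # ids -> text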