change structure

- LICENSE +21 -0
- README.md +50 -3
- SenseVoiceAx.py +192 -0
- download_utils.py +29 -0
- auto.npy → embeddings/auto.npy +0 -0
- en.npy → embeddings/en.npy +0 -0
- event_emo.npy → embeddings/event_emo.npy +0 -0
- ja.npy → embeddings/ja.npy +0 -0
- ko.npy → embeddings/ko.npy +0 -0
- nospeech.npy → embeddings/nospeech.npy +0 -0
- position_encoding.npy → embeddings/position_encoding.npy +0 -0
- withitn.npy → embeddings/withitn.npy +0 -0
- woitn.npy → embeddings/woitn.npy +0 -0
- yue.npy → embeddings/yue.npy +0 -0
- zh.npy → embeddings/zh.npy +0 -0
- frontend.py +429 -0
- main.py +40 -0
- print_utils.py +121 -0
- requirements.txt +5 -0
- sensevoice.axmodel → sensevoice_ax650/sensevoice.axmodel +0 -0
- tokenizer.py +133 -0
LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 祈Inory

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
README.md
CHANGED
@@ -1,3 +1,50 @@
# sensevoice.axera
FunASR SenseVoice on Axera, official repo: https://github.com/FunAudioLLM/SenseVoice

## Features
- Speech recognition
- Automatic language identification
- Emotion recognition
- Automatic punctuation

## Supported Platforms

- [x] AX650N
- [ ] AX630C

## Installation
```
pip3 install -r requirements.txt
```
If there is not enough space, you can use --prefix to install to a different location.


## Usage
```
# The first run automatically downloads the model from Hugging Face and saves it under models/
python3 main.py -i <input audio file>
```
Runtime options:
| Option | Description | Default |
| --- | --- | --- |
| --input/-i | Input audio file | |
| --language/-l | Recognition language; supports auto, zh, en, yue, ja, ko | auto |


### Example
Test audio files are provided under example/.

For example, a Cantonese test:
```
python3 main.py -i example/yue.mp3
```
Output:
```
RTF: 0.03026517820946964 Latency: 0.15689468383789062s Total length: 5.184s
['呢几个字。', '都表达唔到,我想讲嘅意。', '思。']
```

## Technical Discussion

- GitHub issues
- QQ group: 139953715
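For reference, the same pipeline can also be driven from Python instead of the CLI. A minimal sketch mirroring main.py; it assumes sensevoice_ax650/sensevoice.axmodel, the embeddings/ directory, am.mvn, and chn_jpn_yue_eng_ko_spectok.bpe.model are present in the working directory, and that axengine is installed on the board:

```
from SenseVoiceAx import SenseVoiceAx
from tokenizer import SentencepiecesTokenizer
from print_utils import rich_transcription_postprocess

# Build the tokenizer and the AX650 inference pipeline (paths as used by main.py)
tokenizer = SentencepiecesTokenizer(bpemodel="chn_jpn_yue_eng_ko_spectok.bpe.model")
pipeline = SenseVoiceAx("sensevoice_ax650/sensevoice.axmodel", language="auto", use_itn=True, tokenizer=tokenizer)

# Run recognition on one audio file and strip the rich tags for display
asr_res = pipeline.infer("example/yue.mp3", print_rtf=True)
print([rich_transcription_postprocess(r) for r in asr_res])
```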
SenseVoiceAx.py
ADDED
@@ -0,0 +1,192 @@
import axengine as axe
import numpy as np
import librosa
from frontend import WavFrontend
import os
import time
from typing import Any, Dict, Iterable, List, NamedTuple, Set, Tuple, Union

def sequence_mask(lengths, maxlen=None, dtype=np.float32):
    # If maxlen is not given, use the maximum value in lengths
    if maxlen is None:
        maxlen = np.max(lengths)

    # Create a row vector from 0 to maxlen-1
    row_vector = np.arange(0, maxlen, 1)

    # Turn lengths into a column vector
    matrix = np.expand_dims(lengths, axis=-1)

    # Compare to build the mask
    mask = row_vector < matrix

    # Return the mask in the requested dtype
    return mask.astype(dtype)[None, ...]

def unique_consecutive_np(x, dim=None, return_inverse=False, return_counts=False):
    if dim is None:
        # Default case: flatten, then drop consecutive duplicates
        x_flat = x.ravel()
        mask = np.concatenate(([True], x_flat[1:] != x_flat[:-1]))
        unique_data = x_flat[mask]
    else:
        # Deduplicate along the given dimension
        axis = dim if dim >= 0 else x.ndim + dim
        if axis >= x.ndim:
            raise ValueError(f"dim {dim} is out of range for array of dimension {x.ndim}")

        # Use np.diff to check whether adjacent elements differ
        mask = np.ones(x.shape[axis], dtype=bool)
        if x.shape[axis] > 1:
            # Compare each element with the previous one
            diff = np.diff(x, axis=axis)
            mask[1:] = np.any(diff != 0, axis=tuple(range(diff.ndim))[axis:])

        # Use the mask indices to extract the unique elements
        unique_data = np.take(x, np.where(mask)[0], axis=axis)

    # Handle return_inverse and return_counts
    results = (unique_data,)

    if return_inverse:
        if dim is None:
            inv_idx = np.cumsum(mask) - 1
        else:
            inv_idx = np.cumsum(mask) - 1
            # Reshape to match the input
            inv_idx = np.expand_dims(inv_idx, axis=axis)
            inv_idx = np.broadcast_to(inv_idx, x.shape)
        results += (inv_idx,)

    if return_counts:
        if dim is None:
            counts = np.diff(np.where(np.concatenate((mask, [True])))[0])
        else:
            counts = np.diff(np.where(np.concatenate((mask, [True])))[0])
        results += (counts,)

    return results[0] if len(results) == 1 else results

class SenseVoiceAx:
    def __init__(self, model_path, language="auto", use_itn=True, tokenizer=None):
        model_path_root = os.path.join(os.path.dirname(model_path), "../embeddings")
        self.frontend = WavFrontend(cmvn_file="am.mvn",
                                    fs=16000,
                                    window="hamming",
                                    n_mels=80,
                                    frame_length=25,
                                    frame_shift=10,
                                    lfr_m=7,
                                    lfr_n=6,)
        self.model = axe.InferenceSession(model_path)
        self.sample_rate = 16000
        self.tokenizer = tokenizer
        self.blank_id = 0
        self.max_len = 34

        self.lid_dict = {"auto": 0, "zh": 3, "en": 4, "yue": 7, "ja": 11, "ko": 12, "nospeech": 13}
        self.lid_int_dict = {24884: 3, 24885: 4, 24888: 7, 24892: 11, 24896: 12, 24992: 13}
        self.textnorm_dict = {"withitn": 14, "woitn": 15}
        self.textnorm_int_dict = {25016: 14, 25017: 15}
        self.emo_dict = {"unk": 25009, "happy": 25001, "sad": 25002, "angry": 25003, "neutral": 25004}

        self.position_encoding = np.load(f"{model_path_root}/position_encoding.npy")
        language_query = np.load(f"{model_path_root}/{language}.npy")
        textnorm_query = np.load(f"{model_path_root}/withitn.npy") if use_itn else np.load(f"{model_path_root}/woitn.npy")
        event_emo_query = np.load(f"{model_path_root}/event_emo.npy")
        self.input_query = np.concatenate((textnorm_query, language_query, event_emo_query), axis=1)
        self.query_num = self.input_query.shape[1]
        self.masks = sequence_mask(np.array([self.max_len], dtype=np.int32), dtype=np.float32)

    def load_data(self, filepath: str) -> np.ndarray:
        waveform, _ = librosa.load(filepath, sr=self.sample_rate)
        return waveform.flatten()

    @staticmethod
    def pad_feats(feats: List[np.ndarray], max_feat_len: int) -> np.ndarray:
        def pad_feat(feat: np.ndarray, cur_len: int) -> np.ndarray:
            pad_width = ((0, max_feat_len - cur_len), (0, 0))
            return np.pad(feat, pad_width, "constant", constant_values=0)

        feat_res = [pad_feat(feat, feat.shape[0]) for feat in feats]
        feats = np.array(feat_res).astype(np.float32)
        return feats

    def preprocess(self, waveform):
        feats, feats_len = [], []
        for wf in [waveform]:
            speech, _ = self.frontend.fbank(wf)
            feat, feat_len = self.frontend.lfr_cmvn(speech)
            feats.append(feat)
            feats_len.append(feat_len)

        feats = self.pad_feats(feats, np.max(feats_len))
        feats_len = np.array(feats_len).astype(np.int32)
        return feats, feats_len

    def postprocess(self, ctc_logits, encoder_out_lens):
        # Slice out the valid frames
        x = ctc_logits[0, :encoder_out_lens[0], :]

        # Greedy decoding: take the argmax per frame
        yseq = np.argmax(x, axis=-1)

        # Collapse consecutive repeated tokens
        yseq = unique_consecutive_np(yseq, dim=-1)

        # Build a mask and filter out blank_id
        mask = yseq != self.blank_id
        token_int = yseq[mask].tolist()

        return token_int

    def infer_waveform(self, waveform: np.ndarray):
        feat, feat_len = self.preprocess(waveform)

        slice_len = self.max_len - self.query_num
        slice_num = int(np.ceil(feat.shape[1] / slice_len))

        asr_res = []
        for i in range(slice_num):
            sub_feat = feat[:, i*slice_len:(i+1)*slice_len, :]
            # concat query
            sub_feat = np.concatenate([self.input_query, sub_feat], axis=1)

            if sub_feat.shape[1] < self.max_len:
                sub_feat = np.concatenate([
                    sub_feat,
                    np.zeros((1, self.max_len - sub_feat.shape[1], sub_feat.shape[-1]), dtype=np.float32)
                ],
                axis=1)

            outputs = self.model.run(None, {"speech": sub_feat,
                                            "masks": self.masks,
                                            "position_encoding": self.position_encoding})
            ctc_logits, encoder_out_lens = outputs

            token_int = self.postprocess(ctc_logits, encoder_out_lens)
            if self.tokenizer is not None:
                asr_res.append(self.tokenizer.tokens2text(token_int))
            else:
                asr_res.append(token_int)

        return asr_res

    def infer(self, filepath_or_data: Union[np.ndarray, str], print_rtf=True):
        if isinstance(filepath_or_data, str):
            waveform = self.load_data(filepath_or_data)
        else:
            waveform = filepath_or_data

        total_time = waveform.shape[-1] / self.sample_rate

        start = time.time()
        asr_res = self.infer_waveform(waveform)
        latency = time.time() - start

        if print_rtf:
            rtf = latency / total_time
            print(f"RTF: {rtf} Latency: {latency}s Total length: {total_time}s")
        return asr_res
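The CTC greedy decoding in postprocess boils down to collapsing repeated argmax ids and then dropping the blank id. A standalone numpy illustration with made-up token ids (not taken from the model):

```
import numpy as np

# Hypothetical per-frame argmax ids; 0 is the CTC blank, as in SenseVoiceAx.blank_id
yseq = np.array([0, 0, 7, 7, 7, 0, 3, 3, 0, 0, 5])

# Collapse consecutive duplicates (what unique_consecutive_np(yseq, dim=-1) returns)
collapsed = yseq[np.concatenate(([True], yseq[1:] != yseq[:-1]))]

# Drop blanks to obtain the final token ids fed to the tokenizer
token_int = collapsed[collapsed != 0].tolist()
print(token_int)  # -> [7, 3, 5]
```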
download_utils.py
ADDED
@@ -0,0 +1,29 @@
import os
# Speed up HF downloads by using a mirror URL
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
from huggingface_hub import snapshot_download

current_file_path = os.path.dirname(__file__)
REPO_ROOT = "AXERA-TECH"
CACHE_PATH = os.path.join(current_file_path, "models")

def download_model(model_name: str) -> str:
    """
    Download a model from the AXERA-TECH Hugging Face organization.

    model_name: str
        Available model names can be found at https://huggingface.co/AXERA-TECH.

    Returns:
        str: Local path to the downloaded model directory.

    """
    os.makedirs(CACHE_PATH, exist_ok=True)

    model_path = os.path.join(CACHE_PATH, model_name)
    if not os.path.exists(model_path):
        print(f"Downloading {model_name}...")
        snapshot_download(repo_id=f"{REPO_ROOT}/{model_name}",
                          local_dir=os.path.join(CACHE_PATH, model_name))

    return model_path
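A minimal usage sketch of download_model; the model name below is a placeholder, actual repo names are listed at https://huggingface.co/AXERA-TECH:

```
from download_utils import download_model

# Downloads AXERA-TECH/<model_name> into ./models/<model_name> on first call;
# later calls just return the cached path ("SenseVoice" is a hypothetical name here)
model_dir = download_model("SenseVoice")
print(model_dir)
```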
auto.npy → embeddings/auto.npy
RENAMED
File without changes
en.npy → embeddings/en.npy
RENAMED
File without changes
event_emo.npy → embeddings/event_emo.npy
RENAMED
File without changes
ja.npy → embeddings/ja.npy
RENAMED
File without changes
ko.npy → embeddings/ko.npy
RENAMED
File without changes
nospeech.npy → embeddings/nospeech.npy
RENAMED
File without changes
position_encoding.npy → embeddings/position_encoding.npy
RENAMED
File without changes
withitn.npy → embeddings/withitn.npy
RENAMED
File without changes
woitn.npy → embeddings/woitn.npy
RENAMED
File without changes
yue.npy → embeddings/yue.npy
RENAMED
File without changes
zh.npy → embeddings/zh.npy
RENAMED
File without changes
frontend.py
ADDED
@@ -0,0 +1,429 @@
# -*- encoding: utf-8 -*-
from pathlib import Path
from typing import Any, Dict, Iterable, List, NamedTuple, Set, Tuple, Union
import copy

import numpy as np
import kaldi_native_fbank as knf


class WavFrontend:
    """Conventional frontend structure for ASR."""

    def __init__(
        self,
        cmvn_file: str = None,
        fs: int = 16000,
        window: str = "hamming",
        n_mels: int = 80,
        frame_length: int = 25,
        frame_shift: int = 10,
        lfr_m: int = 1,
        lfr_n: int = 1,
        dither: float = 1.0,
        **kwargs,
    ) -> None:

        opts = knf.FbankOptions()
        opts.frame_opts.samp_freq = fs
        opts.frame_opts.dither = dither
        opts.frame_opts.window_type = window
        opts.frame_opts.frame_shift_ms = float(frame_shift)
        opts.frame_opts.frame_length_ms = float(frame_length)
        opts.mel_opts.num_bins = n_mels
        opts.energy_floor = 0
        opts.frame_opts.snip_edges = True
        opts.mel_opts.debug_mel = False
        self.opts = opts

        self.lfr_m = lfr_m
        self.lfr_n = lfr_n
        self.cmvn_file = cmvn_file

        if self.cmvn_file:
            self.cmvn = self.load_cmvn()
        self.fbank_fn = None
        self.fbank_beg_idx = 0
        self.reset_status()

    def fbank(self, waveform: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        waveform = waveform * (1 << 15)
        self.fbank_fn = knf.OnlineFbank(self.opts)
        self.fbank_fn.accept_waveform(self.opts.frame_opts.samp_freq, waveform.tolist())
        frames = self.fbank_fn.num_frames_ready
        mat = np.empty([frames, self.opts.mel_opts.num_bins])
        for i in range(frames):
            mat[i, :] = self.fbank_fn.get_frame(i)
        feat = mat.astype(np.float32)
        feat_len = np.array(mat.shape[0]).astype(np.int32)
        return feat, feat_len

    def fbank_online(self, waveform: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        waveform = waveform * (1 << 15)
        # self.fbank_fn = knf.OnlineFbank(self.opts)
        self.fbank_fn.accept_waveform(self.opts.frame_opts.samp_freq, waveform.tolist())
        frames = self.fbank_fn.num_frames_ready
        mat = np.empty([frames, self.opts.mel_opts.num_bins])
        for i in range(self.fbank_beg_idx, frames):
            mat[i, :] = self.fbank_fn.get_frame(i)
        # self.fbank_beg_idx += (frames-self.fbank_beg_idx)
        feat = mat.astype(np.float32)
        feat_len = np.array(mat.shape[0]).astype(np.int32)
        return feat, feat_len

    def reset_status(self):
        self.fbank_fn = knf.OnlineFbank(self.opts)
        self.fbank_beg_idx = 0

    def lfr_cmvn(self, feat: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        if self.lfr_m != 1 or self.lfr_n != 1:
            feat = self.apply_lfr(feat, self.lfr_m, self.lfr_n)

        if self.cmvn_file:
            feat = self.apply_cmvn(feat)

        feat_len = np.array(feat.shape[0]).astype(np.int32)
        return feat, feat_len

    @staticmethod
    def apply_lfr(inputs: np.ndarray, lfr_m: int, lfr_n: int) -> np.ndarray:
        LFR_inputs = []

        T = inputs.shape[0]
        T_lfr = int(np.ceil(T / lfr_n))
        left_padding = np.tile(inputs[0], ((lfr_m - 1) // 2, 1))
        inputs = np.vstack((left_padding, inputs))
        T = T + (lfr_m - 1) // 2
        for i in range(T_lfr):
            if lfr_m <= T - i * lfr_n:
                LFR_inputs.append((inputs[i * lfr_n : i * lfr_n + lfr_m]).reshape(1, -1))
            else:
                # process last LFR frame
                num_padding = lfr_m - (T - i * lfr_n)
                frame = inputs[i * lfr_n :].reshape(-1)
                for _ in range(num_padding):
                    frame = np.hstack((frame, inputs[-1]))

                LFR_inputs.append(frame)
        LFR_outputs = np.vstack(LFR_inputs).astype(np.float32)
        return LFR_outputs

    def apply_cmvn(self, inputs: np.ndarray) -> np.ndarray:
        """
        Apply CMVN with mvn data
        """
        frame, dim = inputs.shape
        means = np.tile(self.cmvn[0:1, :dim], (frame, 1))
        vars = np.tile(self.cmvn[1:2, :dim], (frame, 1))
        inputs = (inputs + means) * vars
        return inputs

    def load_cmvn(
        self,
    ) -> np.ndarray:
        with open(self.cmvn_file, "r", encoding="utf-8") as f:
            lines = f.readlines()

        means_list = []
        vars_list = []
        for i in range(len(lines)):
            line_item = lines[i].split()
            if line_item[0] == "<AddShift>":
                line_item = lines[i + 1].split()
                if line_item[0] == "<LearnRateCoef>":
                    add_shift_line = line_item[3 : (len(line_item) - 1)]
                    means_list = list(add_shift_line)
                    continue
            elif line_item[0] == "<Rescale>":
                line_item = lines[i + 1].split()
                if line_item[0] == "<LearnRateCoef>":
                    rescale_line = line_item[3 : (len(line_item) - 1)]
                    vars_list = list(rescale_line)
                    continue

        means = np.array(means_list).astype(np.float64)
        vars = np.array(vars_list).astype(np.float64)
        cmvn = np.array([means, vars])
        return cmvn


class WavFrontendOnline(WavFrontend):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # self.fbank_fn = knf.OnlineFbank(self.opts)
        # add variables
        self.frame_sample_length = int(
            self.opts.frame_opts.frame_length_ms * self.opts.frame_opts.samp_freq / 1000
        )
        self.frame_shift_sample_length = int(
            self.opts.frame_opts.frame_shift_ms * self.opts.frame_opts.samp_freq / 1000
        )
        self.waveform = None
        self.reserve_waveforms = None
        self.input_cache = None
        self.lfr_splice_cache = []

    @staticmethod
    # inputs has catted the cache
    def apply_lfr(
        inputs: np.ndarray, lfr_m: int, lfr_n: int, is_final: bool = False
    ) -> Tuple[np.ndarray, np.ndarray, int]:
        """
        Apply lfr with data
        """

        LFR_inputs = []
        T = inputs.shape[0]  # include the right context
        T_lfr = int(
            np.ceil((T - (lfr_m - 1) // 2) / lfr_n)
        )  # minus the right context: (lfr_m - 1) // 2
        splice_idx = T_lfr
        for i in range(T_lfr):
            if lfr_m <= T - i * lfr_n:
                LFR_inputs.append((inputs[i * lfr_n : i * lfr_n + lfr_m]).reshape(1, -1))
            else:  # process last LFR frame
                if is_final:
                    num_padding = lfr_m - (T - i * lfr_n)
                    frame = (inputs[i * lfr_n :]).reshape(-1)
                    for _ in range(num_padding):
                        frame = np.hstack((frame, inputs[-1]))
                    LFR_inputs.append(frame)
                else:
                    # update splice_idx and break the circle
                    splice_idx = i
                    break
        splice_idx = min(T - 1, splice_idx * lfr_n)
        lfr_splice_cache = inputs[splice_idx:, :]
        LFR_outputs = np.vstack(LFR_inputs)
        return LFR_outputs.astype(np.float32), lfr_splice_cache, splice_idx

    @staticmethod
    def compute_frame_num(
        sample_length: int, frame_sample_length: int, frame_shift_sample_length: int
    ) -> int:
        frame_num = int((sample_length - frame_sample_length) / frame_shift_sample_length + 1)
        return frame_num if frame_num >= 1 and sample_length >= frame_sample_length else 0

    def fbank(
        self, input: np.ndarray, input_lengths: np.ndarray
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        self.fbank_fn = knf.OnlineFbank(self.opts)
        batch_size = input.shape[0]
        if self.input_cache is None:
            self.input_cache = np.empty((batch_size, 0), dtype=np.float32)
        input = np.concatenate((self.input_cache, input), axis=1)
        frame_num = self.compute_frame_num(
            input.shape[-1], self.frame_sample_length, self.frame_shift_sample_length
        )
        # update self.in_cache
        self.input_cache = input[
            :, -(input.shape[-1] - frame_num * self.frame_shift_sample_length) :
        ]
        waveforms = np.empty(0, dtype=np.float32)
        feats_pad = np.empty(0, dtype=np.float32)
        feats_lens = np.empty(0, dtype=np.int32)
        if frame_num:
            waveforms = []
            feats = []
            feats_lens = []
            for i in range(batch_size):
                waveform = input[i]
                waveforms.append(
                    waveform[
                        : (
                            (frame_num - 1) * self.frame_shift_sample_length
                            + self.frame_sample_length
                        )
                    ]
                )
                waveform = waveform * (1 << 15)

                self.fbank_fn.accept_waveform(self.opts.frame_opts.samp_freq, waveform.tolist())
                frames = self.fbank_fn.num_frames_ready
                mat = np.empty([frames, self.opts.mel_opts.num_bins])
                for i in range(frames):
                    mat[i, :] = self.fbank_fn.get_frame(i)
                feat = mat.astype(np.float32)
                feat_len = np.array(mat.shape[0]).astype(np.int32)
                feats.append(feat)
                feats_lens.append(feat_len)

            waveforms = np.stack(waveforms)
            feats_lens = np.array(feats_lens)
            feats_pad = np.array(feats)
        self.fbanks = feats_pad
        self.fbanks_lens = copy.deepcopy(feats_lens)
        return waveforms, feats_pad, feats_lens

    def get_fbank(self) -> Tuple[np.ndarray, np.ndarray]:
        return self.fbanks, self.fbanks_lens

    def lfr_cmvn(
        self, input: np.ndarray, input_lengths: np.ndarray, is_final: bool = False
    ) -> Tuple[np.ndarray, np.ndarray, List[int]]:
        batch_size = input.shape[0]
        feats = []
        feats_lens = []
        lfr_splice_frame_idxs = []
        for i in range(batch_size):
            mat = input[i, : input_lengths[i], :]
            lfr_splice_frame_idx = -1
            if self.lfr_m != 1 or self.lfr_n != 1:
                # update self.lfr_splice_cache in self.apply_lfr
                mat, self.lfr_splice_cache[i], lfr_splice_frame_idx = self.apply_lfr(
                    mat, self.lfr_m, self.lfr_n, is_final
                )
            if self.cmvn_file is not None:
                mat = self.apply_cmvn(mat)
            feat_length = mat.shape[0]
            feats.append(mat)
            feats_lens.append(feat_length)
            lfr_splice_frame_idxs.append(lfr_splice_frame_idx)

        feats_lens = np.array(feats_lens)
        feats_pad = np.array(feats)
        return feats_pad, feats_lens, lfr_splice_frame_idxs

    def extract_fbank(
        self, input: np.ndarray, input_lengths: np.ndarray, is_final: bool = False
    ) -> Tuple[np.ndarray, np.ndarray]:
        batch_size = input.shape[0]
        assert (
            batch_size == 1
        ), "we support to extract feature online only when the batch size is equal to 1 now"
        waveforms, feats, feats_lengths = self.fbank(input, input_lengths)  # input shape: B T D
        if feats.shape[0]:
            self.waveforms = (
                waveforms
                if self.reserve_waveforms is None
                else np.concatenate((self.reserve_waveforms, waveforms), axis=1)
            )
            if not self.lfr_splice_cache:
                for i in range(batch_size):
                    self.lfr_splice_cache.append(
                        np.expand_dims(feats[i][0, :], axis=0).repeat((self.lfr_m - 1) // 2, axis=0)
                    )

            if feats_lengths[0] + self.lfr_splice_cache[0].shape[0] >= self.lfr_m:
                lfr_splice_cache_np = np.stack(self.lfr_splice_cache)  # B T D
                feats = np.concatenate((lfr_splice_cache_np, feats), axis=1)
                feats_lengths += lfr_splice_cache_np[0].shape[0]
                frame_from_waveforms = int(
                    (self.waveforms.shape[1] - self.frame_sample_length)
                    / self.frame_shift_sample_length
                    + 1
                )
                minus_frame = (self.lfr_m - 1) // 2 if self.reserve_waveforms is None else 0
                feats, feats_lengths, lfr_splice_frame_idxs = self.lfr_cmvn(
                    feats, feats_lengths, is_final
                )
                if self.lfr_m == 1:
                    self.reserve_waveforms = None
                else:
                    reserve_frame_idx = lfr_splice_frame_idxs[0] - minus_frame
                    # print('reserve_frame_idx: ' + str(reserve_frame_idx))
                    # print('frame_frame: ' + str(frame_from_waveforms))
                    self.reserve_waveforms = self.waveforms[
                        :,
                        reserve_frame_idx
                        * self.frame_shift_sample_length : frame_from_waveforms
                        * self.frame_shift_sample_length,
                    ]
                    sample_length = (
                        frame_from_waveforms - 1
                    ) * self.frame_shift_sample_length + self.frame_sample_length
                    self.waveforms = self.waveforms[:, :sample_length]
            else:
                # update self.reserve_waveforms and self.lfr_splice_cache
                self.reserve_waveforms = self.waveforms[
                    :, : -(self.frame_sample_length - self.frame_shift_sample_length)
                ]
                for i in range(batch_size):
                    self.lfr_splice_cache[i] = np.concatenate(
                        (self.lfr_splice_cache[i], feats[i]), axis=0
                    )
                return np.empty(0, dtype=np.float32), feats_lengths
        else:
            if is_final:
                self.waveforms = (
                    waveforms if self.reserve_waveforms is None else self.reserve_waveforms
                )
                feats = np.stack(self.lfr_splice_cache)
                feats_lengths = np.zeros(batch_size, dtype=np.int32) + feats.shape[1]
                feats, feats_lengths, _ = self.lfr_cmvn(feats, feats_lengths, is_final)
        if is_final:
            self.cache_reset()
        return feats, feats_lengths

    def get_waveforms(self):
        return self.waveforms

    def cache_reset(self):
        self.fbank_fn = knf.OnlineFbank(self.opts)
        self.reserve_waveforms = None
        self.input_cache = None
        self.lfr_splice_cache = []


def load_bytes(input):
    middle_data = np.frombuffer(input, dtype=np.int16)
    middle_data = np.asarray(middle_data)
    if middle_data.dtype.kind not in "iu":
        raise TypeError("'middle_data' must be an array of integers")
    dtype = np.dtype("float32")
    if dtype.kind != "f":
        raise TypeError("'dtype' must be a floating point type")

    i = np.iinfo(middle_data.dtype)
    abs_max = 2 ** (i.bits - 1)
    offset = i.min + abs_max
    array = np.frombuffer((middle_data.astype(dtype) - offset) / abs_max, dtype=np.float32)
    return array


class SinusoidalPositionEncoderOnline:
    """Streaming Positional encoding."""

    def encode(self, positions: np.ndarray = None, depth: int = None, dtype: np.dtype = np.float32):
        batch_size = positions.shape[0]
        positions = positions.astype(dtype)
        log_timescale_increment = np.log(np.array([10000], dtype=dtype)) / (depth / 2 - 1)
        inv_timescales = np.exp(np.arange(depth / 2).astype(dtype) * (-log_timescale_increment))
        inv_timescales = np.reshape(inv_timescales, [batch_size, -1])
        scaled_time = np.reshape(positions, [1, -1, 1]) * np.reshape(inv_timescales, [1, 1, -1])
        encoding = np.concatenate((np.sin(scaled_time), np.cos(scaled_time)), axis=2)
        return encoding.astype(dtype)

    def forward(self, x, start_idx=0):
        batch_size, timesteps, input_dim = x.shape
        positions = np.arange(1, timesteps + 1 + start_idx)[None, :]
        position_encoding = self.encode(positions, input_dim, x.dtype)

        return x + position_encoding[:, start_idx : start_idx + timesteps]


def test():
    path = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav"
    import librosa

    cmvn_file = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/am.mvn"
    config_file = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/config.yaml"
    from funasr.runtime.python.onnxruntime.rapid_paraformer.utils.utils import read_yaml

    config = read_yaml(config_file)
    waveform, _ = librosa.load(path, sr=None)
    frontend = WavFrontend(
        cmvn_file=cmvn_file,
        **config["frontend_conf"],
    )
    speech, _ = frontend.fbank_online(waveform)  # 1d, (sample,), numpy
    feat, feat_len = frontend.lfr_cmvn(
        speech
    )  # 2d, (frame, 450), np.float32 -> torch, torch.from_numpy(), dtype, (1, frame, 450)

    frontend.reset_status()  # clear cache
    return feat, feat_len


if __name__ == "__main__":
    test()
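A small self-contained sketch of the offline WavFrontend on a synthetic waveform; cmvn_file is left unset so no am.mvn is needed, and lfr_m=7, lfr_n=6 match the values SenseVoiceAx.py passes:

```
import numpy as np
from frontend import WavFrontend

# One second of a 440 Hz tone at 16 kHz, in the [-1, 1] float range librosa would produce
sr = 16000
waveform = 0.1 * np.sin(2 * np.pi * 440 * np.arange(sr) / sr).astype(np.float32)

frontend = WavFrontend(cmvn_file=None, fs=sr, n_mels=80, lfr_m=7, lfr_n=6)
speech, _ = frontend.fbank(waveform)        # (frames, 80) log-mel features
feat, feat_len = frontend.lfr_cmvn(speech)  # LFR-stacked features, (ceil(frames/6), 560)
print(speech.shape, feat.shape, int(feat_len))
```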
main.py
ADDED
@@ -0,0 +1,40 @@
import os, sys
import argparse
from SenseVoiceAx import SenseVoiceAx
from tokenizer import SentencepiecesTokenizer
from print_utils import rich_transcription_postprocess, rich_print_asr_res
from download_utils import download_model


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", "-i", required=True, type=str, help="Input audio file")
    parser.add_argument("--language", "-l", required=False, type=str, default="auto", choices=["auto", "zh", "en", "yue", "ja", "ko"])
    return parser.parse_args()


def main():
    args = get_args()

    input_audio = args.input
    language = args.language
    use_itn = True  # punctuation prediction (inverse text normalization)

    model_path = os.path.join("sensevoice_ax650", "sensevoice.axmodel")
    bpemodel = "chn_jpn_yue_eng_ko_spectok.bpe.model"

    assert os.path.exists(model_path), f"model {model_path} does not exist"

    print(f"input_audio: {input_audio}")
    print(f"language: {language}")
    print(f"use_itn: {use_itn}")
    print(f"model_path: {model_path}")

    tokenizer = SentencepiecesTokenizer(bpemodel=bpemodel)
    pipeline = SenseVoiceAx(model_path, language, use_itn, tokenizer=tokenizer)
    asr_res = pipeline.infer(input_audio, print_rtf=True)
    print([rich_transcription_postprocess(i) for i in asr_res])
    # rich_print_asr_res(asr_res)

if __name__ == "__main__":
    main()
print_utils.py
ADDED
@@ -0,0 +1,121 @@
emo_dict = {
    "<|HAPPY|>": "😊",
    "<|SAD|>": "😔",
    "<|ANGRY|>": "😡",
    "<|NEUTRAL|>": "",
    "<|FEARFUL|>": "😰",
    "<|DISGUSTED|>": "🤢",
    "<|SURPRISED|>": "😮",
}

event_dict = {
    "<|BGM|>": "🎼",
    "<|Speech|>": "",
    "<|Applause|>": "👏",
    "<|Laughter|>": "😀",
    "<|Cry|>": "😭",
    "<|Sneeze|>": "🤧",
    "<|Breath|>": "",
    "<|Cough|>": "🤧",
}

lang_dict = {
    "<|zh|>": "<|lang|>",
    "<|en|>": "<|lang|>",
    "<|yue|>": "<|lang|>",
    "<|ja|>": "<|lang|>",
    "<|ko|>": "<|lang|>",
    "<|nospeech|>": "<|lang|>",
}

emoji_dict = {
    "<|nospeech|><|Event_UNK|>": "❓",
    "<|zh|>": "",
    "<|en|>": "",
    "<|yue|>": "",
    "<|ja|>": "",
    "<|ko|>": "",
    "<|nospeech|>": "",
    "<|HAPPY|>": "😊",
    "<|SAD|>": "😔",
    "<|ANGRY|>": "😡",
    "<|NEUTRAL|>": "",
    "<|BGM|>": "🎼",
    "<|Speech|>": "",
    "<|Applause|>": "👏",
    "<|Laughter|>": "😀",
    "<|FEARFUL|>": "😰",
    "<|DISGUSTED|>": "🤢",
    "<|SURPRISED|>": "😮",
    "<|Cry|>": "😭",
    "<|EMO_UNKNOWN|>": "",
    "<|Sneeze|>": "🤧",
    "<|Breath|>": "",
    "<|Cough|>": "😷",
    "<|Sing|>": "",
    "<|Speech_Noise|>": "",
    "<|withitn|>": "",
    "<|woitn|>": "",
    "<|GBG|>": "",
    "<|Event_UNK|>": "",
}

emo_set = {"😊", "😔", "😡", "😰", "🤢", "😮"}
event_set = {
    "🎼",
    "👏",
    "😀",
    "😭",
    "🤧",
    "😷",
}


def format_str_v2(s):
    sptk_dict = {}
    for sptk in emoji_dict:
        sptk_dict[sptk] = s.count(sptk)
        s = s.replace(sptk, "")
    emo = "<|NEUTRAL|>"
    for e in emo_dict:
        if sptk_dict[e] > sptk_dict[emo]:
            emo = e
    for e in event_dict:
        if sptk_dict[e] > 0:
            s = event_dict[e] + s
    s = s + emo_dict[emo]

    for emoji in emo_set.union(event_set):
        s = s.replace(" " + emoji, emoji)
        s = s.replace(emoji + " ", emoji)
    return s.strip()

def rich_transcription_postprocess(s):
    def get_emo(s):
        return s[-1] if s[-1] in emo_set else None

    def get_event(s):
        return s[0] if s[0] in event_set else None

    s = s.replace("<|nospeech|><|Event_UNK|>", "❓")
    for lang in lang_dict:
        s = s.replace(lang, "<|lang|>")
    s_list = [format_str_v2(s_i).strip(" ") for s_i in s.split("<|lang|>")]
    new_s = " " + s_list[0]
    cur_ent_event = get_event(new_s)
    for i in range(1, len(s_list)):
        if len(s_list[i]) == 0:
            continue
        if get_event(s_list[i]) == cur_ent_event and get_event(s_list[i]) != None:
            s_list[i] = s_list[i][1:]
        # else:
        cur_ent_event = get_event(s_list[i])
        if get_emo(s_list[i]) != None and get_emo(s_list[i]) == get_emo(new_s):
            new_s = new_s[:-1]
        new_s += s_list[i].strip().lstrip()
    new_s = new_s.replace("The.", " ")
    return new_s.strip()

def rich_print_asr_res(asr_res):
    res = "".join([rich_transcription_postprocess(i) for i in asr_res])
    print(res)
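A quick sketch of the tag-to-emoji postprocessing on a made-up SenseVoice-style output string:

```
from print_utils import rich_transcription_postprocess

# Hypothetical raw model output carrying language/emotion/event/itn tags
raw = "<|zh|><|HAPPY|><|Speech|><|withitn|>今天天气真好。"
print(rich_transcription_postprocess(raw))  # tags removed, emotion rendered as an emoji
```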
requirements.txt
ADDED
@@ -0,0 +1,5 @@
huggingface_hub
numpy<2
kaldi-native-fbank
librosa==0.9.1
sentencepiece
sensevoice.axmodel → sensevoice_ax650/sensevoice.axmodel
RENAMED
File without changes
tokenizer.py
ADDED
@@ -0,0 +1,133 @@
import sentencepiece as spm

from pathlib import Path
from typing import Any, Dict, Iterable, List, NamedTuple, Set, Tuple, Union

import json
from abc import abstractmethod
from abc import ABC
import numpy as np


class BaseTokenizer(ABC):
    def __init__(
        self,
        token_list: Union[Path, str, Iterable[str]] = None,
        unk_symbol: str = "<unk>",
        **kwargs,
    ):

        if token_list is not None:
            if isinstance(token_list, (Path, str)) and token_list.endswith(".txt"):
                token_list = Path(token_list)
                self.token_list_repr = str(token_list)
                self.token_list: List[str] = []

                with token_list.open("r", encoding="utf-8") as f:
                    for idx, line in enumerate(f):
                        line = line.rstrip()
                        self.token_list.append(line)
            elif isinstance(token_list, (Path, str)) and token_list.endswith(".json"):
                token_list = Path(token_list)
                self.token_list_repr = str(token_list)
                self.token_list: List[str] = []

                with open(token_list, "r", encoding="utf-8") as f:
                    self.token_list = json.load(f)

            else:
                self.token_list: List[str] = list(token_list)
                self.token_list_repr = ""
                for i, t in enumerate(self.token_list):
                    if i == 3:
                        break
                    self.token_list_repr += f"{t}, "
                self.token_list_repr += f"... (NVocab={(len(self.token_list))})"

            self.token2id: Dict[str, int] = {}
            for i, t in enumerate(self.token_list):
                if t in self.token2id:
                    raise RuntimeError(f'Symbol "{t}" is duplicated')
                self.token2id[t] = i

            self.unk_symbol = unk_symbol
            if self.unk_symbol not in self.token2id:
                raise RuntimeError(f"Unknown symbol '{unk_symbol}' doesn't exist in the token_list")
            self.unk_id = self.token2id[self.unk_symbol]

    def encode(self, text, **kwargs):
        tokens = self.text2tokens(text)
        text_ints = self.tokens2ids(tokens)

        return text_ints

    def decode(self, text_ints):
        token = self.ids2tokens(text_ints)
        text = self.tokens2text(token)
        return text

    def get_num_vocabulary_size(self) -> int:
        return len(self.token_list)

    def ids2tokens(self, integers: Union[np.ndarray, Iterable[int]]) -> List[str]:
        if isinstance(integers, np.ndarray) and integers.ndim != 1:
            raise ValueError(f"Must be 1 dim ndarray, but got {integers.ndim}")
        return [self.token_list[i] for i in integers]

    def tokens2ids(self, tokens: Iterable[str]) -> List[int]:
        return [self.token2id.get(i, self.unk_id) for i in tokens]

    @abstractmethod
    def text2tokens(self, line: str) -> List[str]:
        raise NotImplementedError

    @abstractmethod
    def tokens2text(self, tokens: Iterable[str]) -> str:
        raise NotImplementedError


class SentencepiecesTokenizer(BaseTokenizer):
    def __init__(self, bpemodel: Union[Path, str], **kwargs):
        super().__init__(**kwargs)
        self.bpemodel = str(bpemodel)
        # NOTE(kamo):
        # Don't build SentencePieceProcessor in __init__()
        # because it's not picklable and it may cause following error,
        # "TypeError: can't pickle SwigPyObject objects",
        # when giving it as argument of "multiprocessing.Process()".
        self.sp = None
        self._build_sentence_piece_processor()

    def __repr__(self):
        return f'{self.__class__.__name__}(model="{self.bpemodel}")'

    def _build_sentence_piece_processor(self):
        # Build SentencePieceProcessor lazily.
        if self.sp is None:
            self.sp = spm.SentencePieceProcessor()
            self.sp.load(self.bpemodel)

    def text2tokens(self, line: str) -> List[str]:
        self._build_sentence_piece_processor()
        return self.sp.EncodeAsPieces(line)

    def tokens2text(self, tokens: Iterable[str]) -> str:
        self._build_sentence_piece_processor()
        return self.sp.DecodePieces(list(tokens))

    def encode(self, line: str, **kwargs) -> List[int]:
        self._build_sentence_piece_processor()
        return self.sp.EncodeAsIds(line)

    def decode(self, line: List[int], **kwargs):
        self._build_sentence_piece_processor()
        return self.sp.DecodeIds(line)

    def get_vocab_size(self):
        return self.sp.GetPieceSize()

    def ids2tokens(self, *args, **kwargs):
        return self.decode(*args, **kwargs)

    def tokens2ids(self, *args, **kwargs):
        return self.encode(*args, **kwargs)
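A short usage sketch of SentencepiecesTokenizer, assuming the chn_jpn_yue_eng_ko_spectok.bpe.model file referenced by main.py is in the working directory:

```
from tokenizer import SentencepiecesTokenizer

tokenizer = SentencepiecesTokenizer(bpemodel="chn_jpn_yue_eng_ko_spectok.bpe.model")

# encode()/decode() round-trip text through SentencePiece ids;
# SenseVoiceAx feeds its decoded CTC token ids to tokens2text()
ids = tokenizer.encode("hello world")
print(ids)
print(tokenizer.decode(ids))
```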