lxowalle committed on
Commit
6c5434c
·
1 Parent(s): 5a18609

* optimize models path

Browse files
README.md CHANGED
@@ -4,93 +4,6 @@ language:
4
  - en
5
  pipeline_tag: automatic-speech-recognition
6
  ---
7
- # sensevoice.axera
8
- FunASR SenseVoice on Axera, official repo: https://github.com/FunAudioLLM/SenseVoice
9
 
10
- ## TODO
11
-
12
- - [x] 支持AX630C
13
- - [ ] 支持C++
14
- - [x] 支持FastAPI
15
-
16
- ## 功能
17
- - 语音识别
18
- - 自动识别语言(支持中文、英文、粤语、日语、韩语)
19
- - 情感识别
20
- - 自动标点
21
- - 支持流式识别
22
-
23
- ## 支持平台
24
-
25
- - [x] AX650N
26
- - [x] AX630C
27
-
28
- ## 环境安装
29
- ```
30
- pip3 install -r requirements.txt
31
- ```
32
- 如果空间不足可以使用 --prefix 指定别的安装路径
33
-
34
-
35
- ## 使用
36
- ```
37
- # 首次运行会自动从huggingface上下载模型, 保存到models中
38
- python3 main.py -i 输入音频文件
39
- ```
40
- 运行参数说明:
41
- | 参数名称 | 说明 | 默认值 |
42
- | --- | --- | --- |
43
- | --input/-i | 输入音频文件 | |
44
- | --language/-l | 识别语言,支持auto, zh, en, yue, ja, ko | auto |
45
- | --streaming | 流式识别 | |
46
-
47
-
48
- ### 示例:
49
- example下有测试音频
50
-
51
- 如 粤语测试
52
- ```
53
- python3 main.py -i example/yue.mp3
54
- ```
55
- 输出
56
- ```
57
- RTF: 0.03026517820946964 Latency: 0.15689468383789062s Total length: 5.184s
58
- ['呢几个字。', '都表达唔到,我想讲嘅意。', '思。']
59
- ```
60
-
61
- 流式识别
62
-
63
- ```
64
- python3 main.py -i example/zh.mp3 --streaming
65
- ```
66
- 输出
67
- ```
68
- {'timestamps': [540], 'text': '开'}
69
- {'timestamps': [540, 780, 1080], 'text': '开放时'}
70
- {'timestamps': [540, 780, 1080, 1260, 1740], 'text': '开放时间早'}
71
- {'timestamps': [540, 780, 1080, 1260, 1740, 1920, 2340], 'text': '开放时间早上9'}
72
- {'timestamps': [540, 780, 1080, 1260, 1740, 1920, 2340, 2640], 'text': '开放时间早上9点'}
73
- {'timestamps': [540, 780, 1080, 1260, 1740, 1920, 2340, 2640, 3060], 'text': '开放时间早上9点至'}
74
- {'timestamps': [540, 780, 1080, 1260, 1740, 1920, 2340, 2640, 3060, 3780, 4020], 'text': '开放时间早上9点至下午'}
75
- {'timestamps': [540, 780, 1080, 1260, 1740, 1920, 2340, 2640, 3060, 3780, 4020, 4440, 4620], 'text': '开放时间早上9点至下午五点'}
76
- RTF: 0.03678379235444246
77
-
78
- ```
79
-
80
- ## 准确率
81
-
82
- 使用WER(Word-Error-Rate)作为评价标准
83
-
84
- **WER = 0.0389**
85
-
86
- ### 复现测试结果
87
-
88
- ```
89
- ./download_datasets.sh
90
- python test_wer.py -d datasets -l zh
91
- ```
92
-
93
- ## 技术讨论
94
-
95
- - Github issues
96
- - QQ 群: 139953715
 
4
  - en
5
  pipeline_tag: automatic-speech-recognition
6
  ---
7
+ # SenseVoice
 
8
 
9
+ # Refer [here](https://wiki.sipeed.com/maixpy/doc/en/mllm/asr_sensevoice.html)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
README_ZH.md ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ language:
4
+ - en
5
+ pipeline_tag: automatic-speech-recognition
6
+ ---
7
+ # SenseVoice
8
+
9
+ 使用文档参考[这里](https://wiki.sipeed.com/maixpy/doc/zh/mllm/asr_sensevoice.html)
config.json DELETED
File without changes
download_dataset.sh DELETED
@@ -1,2 +0,0 @@
1
- wget https://github.com/ml-inory/whisper.axera/releases/download/v1.0/datasets.zip
2
- unzip datasets.zip -d ./
 
 
 
download_utils.py DELETED
@@ -1,33 +0,0 @@
1
- import os
2
-
3
- # Speed up hf download using mirror url
4
- os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
5
- from huggingface_hub import snapshot_download
6
-
7
- current_file_path = os.path.dirname(__file__)
8
- REPO_ROOT = "AXERA-TECH"
9
- CACHE_PATH = os.path.join(current_file_path, "models")
10
-
11
-
12
- def download_model(model_name: str) -> str:
13
- """
14
- Download model from AXERA-TECH's huggingface space.
15
-
16
- model_name: str
17
- Available model names could be checked on https://huggingface.co/AXERA-TECH.
18
-
19
- Returns:
20
- str: Path to model_name
21
-
22
- """
23
- os.makedirs(CACHE_PATH, exist_ok=True)
24
-
25
- model_path = os.path.join(CACHE_PATH, model_name)
26
- if not os.path.exists(model_path):
27
- print(f"Downloading {model_name}...")
28
- snapshot_download(
29
- repo_id=f"{REPO_ROOT}/{model_name}",
30
- local_dir=os.path.join(CACHE_PATH, model_name),
31
- )
32
-
33
- return model_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
example/en.mp3 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f10378336a4e584f3f63799e62f99d5add3c2a401b51d3abe7d3a3a82f255ada
3
- size 57441
 
 
 
 
example/ja.mp3 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:496dbc43b289e1d0d0cb916df9737450bca56acd8aaca046a7a2472363b1be53
3
- size 57837
 
 
 
 
example/ko.mp3 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:8612f62db8319a6cb4ab4b1d2039bfc32f174f89611889ddafdeb5c0a6070b5f
3
- size 27909
 
 
 
 
example/yue.mp3 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:5098eebc13530a66e4eac1f30d3246e65c9cfc4e096665f9d395aca8eff0d181
3
- size 31246
 
 
 
 
example/zh.mp3 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:0e64de19e4ff9a02e682955c9112f32d2317cfdbb5bc2f3504664044c993f195
3
- size 44973
 
 
 
 
gradio_demo.py DELETED
@@ -1,62 +0,0 @@
1
- import gradio as gr
2
- import os
3
- from SenseVoiceAx import SenseVoiceAx
4
- from print_utils import rich_transcription_postprocess
5
-
6
- max_len = 256
7
-
8
- model_path = os.path.join("sensevoice_ax650", "sensevoice.axmodel")
9
-
10
- assert os.path.exists(model_path), f"model {model_path} not exist"
11
-
12
- pipeline = SenseVoiceAx(
13
- model_path,
14
- max_len=max_len,
15
- beam_size=3,
16
- language="auto",
17
- hot_words=None,
18
- use_itn=True,
19
- streaming=False,
20
- )
21
-
22
-
23
- def speech_to_text(audio_path, lang):
24
- """
25
- audio_path: 音频文件路径
26
- lang: 语言类型 "auto", "zh", "en", "yue", "ja", "ko"
27
- """
28
- if not audio_path:
29
- return "无音频"
30
-
31
- pipeline.choose_language(language=lang)
32
- asr_res = pipeline.infer(audio_path, print_rtf=False)
33
-
34
- return asr_res
35
-
36
-
37
- def main():
38
- with gr.Blocks() as demo:
39
- with gr.Row():
40
- output_text = gr.Textbox(label="识别结果", lines=5)
41
-
42
- with gr.Row():
43
- audio_input = gr.Audio(
44
- sources=["upload"], type="filepath", label="录制或上传音频", format="mp3"
45
- )
46
- lang_dropdown = gr.Dropdown(
47
- choices=["auto", "zh", "en", "yue", "ja", "ko"],
48
- value="auto",
49
- label="选择音频语言",
50
- )
51
-
52
- audio_input.change(
53
- fn=speech_to_text, inputs=[audio_input, lang_dropdown], outputs=output_text
54
- )
55
-
56
- demo.launch(
57
- server_name="0.0.0.0",
58
- )
59
-
60
-
61
- if __name__ == "__main__":
62
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
main.py DELETED
@@ -1,79 +0,0 @@
1
- import os
2
- import argparse
3
- from SenseVoiceAx import SenseVoiceAx
4
- import librosa
5
- import numpy as np
6
- import time
7
-
8
-
9
- def get_args():
10
- parser = argparse.ArgumentParser()
11
- parser.add_argument(
12
- "--input", "-i", required=True, type=str, help="Input audio file"
13
- )
14
- parser.add_argument(
15
- "--language",
16
- "-l",
17
- required=False,
18
- type=str,
19
- default="auto",
20
- choices=["auto", "zh", "en", "yue", "ja", "ko"],
21
- )
22
- parser.add_argument("--streaming", action="store_true")
23
- return parser.parse_args()
24
-
25
-
26
- def main():
27
- args = get_args()
28
-
29
- input_audio = args.input
30
- language = args.language
31
- use_itn = True # 标点符号预测
32
- if not args.streaming:
33
- max_len = 256
34
- model_path = os.path.join("sensevoice_ax630c", "sensevoice.axmodel")
35
- else:
36
- max_len = 26
37
- model_path = os.path.join("sensevoice_ax630c", "streaming_sensevoice.axmodel")
38
-
39
- assert os.path.exists(model_path), f"model {model_path} not exist"
40
-
41
- print(f"input_audio: {input_audio}")
42
- print(f"language: {language}")
43
- print(f"use_itn: {use_itn}")
44
- print(f"model_path: {model_path}")
45
- print(f"streaming: {args.streaming}")
46
-
47
- pipeline = SenseVoiceAx(
48
- model_path,
49
- max_len=max_len,
50
- beam_size=3,
51
- language="auto",
52
- hot_words=None,
53
- use_itn=True,
54
- streaming=args.streaming,
55
- )
56
-
57
- if not args.streaming:
58
- asr_res = pipeline.infer(input_audio, print_rtf=True)
59
- print("ASR result: " + asr_res)
60
- else:
61
- samples, sr = librosa.load(input_audio, sr=16000)
62
- samples = (samples * 32768).tolist()
63
- duration = len(samples) / 16000
64
-
65
- start = time.time()
66
- step = int(0.1 * sr)
67
- for i in range(0, len(samples), step):
68
- is_last = i + step >= len(samples)
69
- for res in pipeline.stream_infer(samples[i : i + step], is_last):
70
- print(res)
71
-
72
- end = time.time()
73
- cost_time = end - start
74
-
75
- print(f"RTF: {cost_time / duration}")
76
-
77
-
78
- if __name__ == "__main__":
79
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
print_utils.py DELETED
@@ -1,131 +0,0 @@
1
- emo_dict = {
2
- "<|HAPPY|>": "😊",
3
- "<|SAD|>": "😔",
4
- "<|ANGRY|>": "😡",
5
- "<|NEUTRAL|>": "",
6
- "<|FEARFUL|>": "😰",
7
- "<|DISGUSTED|>": "🤢",
8
- "<|SURPRISED|>": "😮",
9
- }
10
-
11
- event_dict = {
12
- "<|BGM|>": "🎼",
13
- "<|Speech|>": "",
14
- "<|Applause|>": "👏",
15
- "<|Laughter|>": "😀",
16
- "<|Cry|>": "😭",
17
- "<|Sneeze|>": "🤧",
18
- "<|Breath|>": "",
19
- "<|Cough|>": "🤧",
20
- }
21
-
22
- lang_dict = {
23
- "<|zh|>": "<|lang|>",
24
- "<|en|>": "<|lang|>",
25
- "<|yue|>": "<|lang|>",
26
- "<|ja|>": "<|lang|>",
27
- "<|ko|>": "<|lang|>",
28
- "<|nospeech|>": "<|lang|>",
29
- }
30
-
31
- emoji_dict = {
32
- "<|nospeech|><|Event_UNK|>": "❓",
33
- "<|zh|>": "",
34
- "<|en|>": "",
35
- "<|yue|>": "",
36
- "<|ja|>": "",
37
- "<|ko|>": "",
38
- "<|nospeech|>": "",
39
- "<|HAPPY|>": "😊",
40
- "<|SAD|>": "😔",
41
- "<|ANGRY|>": "😡",
42
- "<|NEUTRAL|>": "",
43
- "<|BGM|>": "🎼",
44
- "<|Speech|>": "",
45
- "<|Applause|>": "👏",
46
- "<|Laughter|>": "😀",
47
- "<|FEARFUL|>": "😰",
48
- "<|DISGUSTED|>": "🤢",
49
- "<|SURPRISED|>": "😮",
50
- "<|Cry|>": "😭",
51
- "<|EMO_UNKNOWN|>": "",
52
- "<|Sneeze|>": "🤧",
53
- "<|Breath|>": "",
54
- "<|Cough|>": "😷",
55
- "<|Sing|>": "",
56
- "<|Speech_Noise|>": "",
57
- "<|withitn|>": "",
58
- "<|woitn|>": "",
59
- "<|GBG|>": "",
60
- "<|Event_UNK|>": "",
61
- }
62
-
63
- emo_set = {"😊", "😔", "😡", "😰", "🤢", "😮"}
64
- event_set = {
65
- "🎼",
66
- "👏",
67
- "😀",
68
- "😭",
69
- "🤧",
70
- "😷",
71
- }
72
-
73
-
74
- def format_str_v2(s):
75
- sptk_dict = {}
76
- for sptk in emoji_dict:
77
- sptk_dict[sptk] = s.count(sptk)
78
- s = s.replace(sptk, "")
79
- emo = "<|NEUTRAL|>"
80
- for e in emo_dict:
81
- if sptk_dict[e] > sptk_dict[emo]:
82
- emo = e
83
- for e in event_dict:
84
- if sptk_dict[e] > 0:
85
- s = event_dict[e] + s
86
- s = s + emo_dict[emo]
87
-
88
- for emoji in emo_set.union(event_set):
89
- s = s.replace(" " + emoji, emoji)
90
- s = s.replace(emoji + " ", emoji)
91
- return s.strip()
92
-
93
-
94
- def rich_transcription_postprocess(s):
95
- def get_emo(s):
96
- return s[-1] if s[-1] in emo_set else None
97
-
98
- def get_event(s):
99
- return s[0] if s[0] in event_set else None
100
-
101
- s = s.replace("<|nospeech|><|Event_UNK|>", "❓")
102
- for lang in lang_dict:
103
- s = s.replace(lang, "<|lang|>")
104
- s_list = [format_str_v2(s_i).strip(" ") for s_i in s.split("<|lang|>")]
105
- new_s = " " + s_list[0]
106
- cur_ent_event = get_event(new_s)
107
- for i in range(1, len(s_list)):
108
- if len(s_list[i]) == 0:
109
- continue
110
- if get_event(s_list[i]) == cur_ent_event and get_event(s_list[i]) != None:
111
- s_list[i] = s_list[i][1:]
112
- # else:
113
- cur_ent_event = get_event(s_list[i])
114
- if get_emo(s_list[i]) != None and get_emo(s_list[i]) == get_emo(new_s):
115
- new_s = new_s[:-1]
116
- new_s += s_list[i].strip().lstrip()
117
- new_s = new_s.replace("The.", " ")
118
- return new_s.strip()
119
-
120
-
121
- def rich_print_asr_res(asr_res, will_print=True, remove_punc=False):
122
- res = "".join([rich_transcription_postprocess(i) for i in asr_res])
123
-
124
- if remove_punc:
125
- res = res.replace(",", "")
126
- res = res.replace("。", "")
127
-
128
- if will_print:
129
- print(res)
130
-
131
- return res
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt DELETED
@@ -1,11 +0,0 @@
1
- huggingface_hub
2
- numpy<2
3
- kaldi-native-fbank
4
- librosa==0.9.1
5
- sentencepiece
6
- fastapi
7
- gradio
8
- emoji
9
- asr-decoder
10
- online-fbank
11
- torch
 
 
 
 
 
 
 
 
 
 
 
 
SenseVoiceAx.py → sensevoice-maixcam2/SenseVoiceAx.py RENAMED
@@ -281,18 +281,18 @@ class SenseVoiceAx:
281
 
282
  """
283
  model_path_root = os.path.dirname(model_path)
284
- emb_path = os.path.join(model_path_root, "../embeddings.npy")
285
- cmvn_file = os.path.join(model_path_root, "../am.mvn")
286
  bpe_model = os.path.join(
287
- model_path_root, "../chn_jpn_yue_eng_ko_spectok.bpe.model"
288
  )
289
  if streaming:
290
  self.position_encoding = np.load(
291
- os.path.join(model_path_root, "../pe_streaming.npy")
292
  )
293
  else:
294
  self.position_encoding = np.load(
295
- os.path.join(model_path_root, "../pe_nonstream.npy")
296
  )
297
 
298
  self.streaming = streaming
@@ -553,7 +553,7 @@ class SenseVoiceAx:
553
  self.cur_idx = -1
554
  self.decoder.reset()
555
  self.fbank = OnlineFbank(window_type="hamming")
556
- self.caches = np.zeros(self.caches_shape)
557
 
558
  def get_size(self):
559
  effective_size = self.cur_idx + 1 - self.padding
 
281
 
282
  """
283
  model_path_root = os.path.dirname(model_path)
284
+ emb_path = os.path.join(model_path_root, "embeddings.npy")
285
+ cmvn_file = os.path.join(model_path_root, "am.mvn")
286
  bpe_model = os.path.join(
287
+ model_path_root, "chn_jpn_yue_eng_ko_spectok.bpe.model"
288
  )
289
  if streaming:
290
  self.position_encoding = np.load(
291
+ os.path.join(model_path_root, "pe_streaming.npy")
292
  )
293
  else:
294
  self.position_encoding = np.load(
295
+ os.path.join(model_path_root, "pe_nonstream.npy")
296
  )
297
 
298
  self.streaming = streaming
 
553
  self.cur_idx = -1
554
  self.decoder.reset()
555
  self.fbank = OnlineFbank(window_type="hamming")
556
+ self.caches = np.zeros(self.caches_shape, dtype=np.float32)
557
 
558
  def get_size(self):
559
  effective_size = self.cur_idx + 1 - self.padding
am.mvn → sensevoice-maixcam2/am.mvn RENAMED
File without changes
chn_jpn_yue_eng_ko_spectok.bpe.model → sensevoice-maixcam2/chn_jpn_yue_eng_ko_spectok.bpe.model RENAMED
File without changes
client.py → sensevoice-maixcam2/client.py RENAMED
@@ -6,7 +6,7 @@ import requests, json, os
6
  import wave
7
  import numpy as np
8
  import threading
9
- from maix import app, time
10
 
11
  class Sensevoice:
12
  def __init__(self, model = "", url="http://0.0.0.0:12347", lauguage="auto", stream=False):
@@ -26,6 +26,17 @@ class Sensevoice:
26
  if not os.path.exists(model):
27
  raise ValueError(f'Model {self.model} is not existed!')
28
 
 
 
 
 
 
 
 
 
 
 
 
29
  def _check_service(self):
30
  try:
31
  response = requests.get(self.url + '/status')
@@ -75,6 +86,15 @@ class Sensevoice:
75
  except:
76
  return "not loaded"
77
 
 
 
 
 
 
 
 
 
 
78
  def _start_model(self):
79
  try:
80
  data = {
@@ -83,6 +103,7 @@ class Sensevoice:
83
  "language": self.launguage,
84
  "stream": self.stream
85
  }
 
86
  response = requests.post(self.url + '/start_model', json=data)
87
  if response.status_code == 200:
88
  res = json.loads(response.text)
@@ -129,6 +150,9 @@ class Sensevoice:
129
 
130
  def is_ready(self, block=False):
131
  while not app.need_exit():
 
 
 
132
  if self._get_status() == "loaded":
133
  return True
134
  else:
@@ -136,10 +160,6 @@ class Sensevoice:
136
  time.sleep(1)
137
  else:
138
  return False
139
-
140
- if self.thread_is_exit:
141
- return True if self.thread_exit_code == 0 else False
142
-
143
  return False
144
 
145
  def stop(self):
@@ -225,17 +245,20 @@ class Sensevoice:
225
  }
226
 
227
  try:
 
228
  response = requests.post(self.url + '/asr', json=data)
229
  if response.status_code == 200:
230
  res = json.loads(response.text)
231
  text = res.get("text", "")
232
  if len(text) > 0:
233
- return text[0]
234
  else:
235
- return ""
236
  else:
237
  print(f"Requests failed: {response.status_code}")
238
- return ""
 
 
239
  except Exception as e:
240
  print("Requests failed:", e)
241
  return ""
@@ -258,13 +281,15 @@ class Sensevoice:
258
  "launguage": "auto",
259
  "step": 0.1,
260
  }
261
- print('start post')
262
  try:
263
  response = requests.post(self.url + '/asr_stream', json=data, stream=True)
264
  for line in response.iter_lines():
265
  if line:
266
  chunk = json.loads(line)
267
  yield chunk.get("text", "")
 
 
268
  except Exception as e:
269
  print("Requests failed:", e)
270
  return ""
 
6
  import wave
7
  import numpy as np
8
  import threading
9
+ from maix import app, time, nn
10
 
11
  class Sensevoice:
12
  def __init__(self, model = "", url="http://0.0.0.0:12347", lauguage="auto", stream=False):
 
26
  if not os.path.exists(model):
27
  raise ValueError(f'Model {self.model} is not existed!')
28
 
29
+ self.model_mud = nn.MUD(model)
30
+ self.model_configs = self.model_mud.items
31
+ server_path = self.model_configs.get('extra', {}).get('server', '/root/models/sensevoice_maixcam2/server.py')
32
+ model_dir = os.path.dirname(self.model)
33
+ service_env_path = self.model_configs.get('extra', {}).get('sensevoice_service_env')
34
+ self._create_service_environment_file(model_dir, os.path.join(model_dir, server_path), os.path.join(model_dir, service_env_path))
35
+
36
+ def _create_service_environment_file(self, working_dir: str, server_path: str, env_file_path: str = "/tmp/sensevoice.service.env"):
37
+ os.system(f"echo 'WORK_DIR={os.path.realpath(working_dir)}' > {os.path.realpath(env_file_path)}")
38
+ os.system(f"echo 'SERVER_FILE={os.path.realpath(server_path)}' >> {os.path.realpath(env_file_path)}")
39
+
40
  def _check_service(self):
41
  try:
42
  response = requests.get(self.url + '/status')
 
86
  except:
87
  return "not loaded"
88
 
89
+ def _reset_model(self):
90
+ try:
91
+ response = requests.post(self.url + '/reset_model')
92
+ if response.status_code == 200:
93
+ res = json.loads(response.text)
94
+ return res["status"]
95
+ except:
96
+ return "not loaded"
97
+
98
  def _start_model(self):
99
  try:
100
  data = {
 
103
  "language": self.launguage,
104
  "stream": self.stream
105
  }
106
+
107
  response = requests.post(self.url + '/start_model', json=data)
108
  if response.status_code == 200:
109
  res = json.loads(response.text)
 
150
 
151
  def is_ready(self, block=False):
152
  while not app.need_exit():
153
+ if self.thread_is_exit:
154
+ return True if self.thread_exit_code == 0 else False
155
+
156
  if self._get_status() == "loaded":
157
  return True
158
  else:
 
160
  time.sleep(1)
161
  else:
162
  return False
 
 
 
 
163
  return False
164
 
165
  def stop(self):
 
245
  }
246
 
247
  try:
248
+ result = ""
249
  response = requests.post(self.url + '/asr', json=data)
250
  if response.status_code == 200:
251
  res = json.loads(response.text)
252
  text = res.get("text", "")
253
  if len(text) > 0:
254
+ result = text[0]
255
  else:
256
+ result = ""
257
  else:
258
  print(f"Requests failed: {response.status_code}")
259
+ result = ""
260
+
261
+ return result
262
  except Exception as e:
263
  print("Requests failed:", e)
264
  return ""
 
281
  "launguage": "auto",
282
  "step": 0.1,
283
  }
284
+
285
  try:
286
  response = requests.post(self.url + '/asr_stream', json=data, stream=True)
287
  for line in response.iter_lines():
288
  if line:
289
  chunk = json.loads(line)
290
  yield chunk.get("text", "")
291
+
292
+ self._reset_model()
293
  except Exception as e:
294
  print("Requests failed:", e)
295
  return ""
embeddings.npy → sensevoice-maixcam2/embeddings.npy RENAMED
File without changes
frontend.py → sensevoice-maixcam2/frontend.py RENAMED
File without changes
model.mud → sensevoice-maixcam2/model.mud RENAMED
@@ -1,6 +1,6 @@
1
  [basic]
2
  type = axmodel
3
- model_npu = sensevoice_ax630c/sensevoice.axmodel
4
  model_vnpu =
5
 
6
  [extra]
@@ -11,5 +11,6 @@ beam_size = 3
11
  language = auto
12
  hot_words = None,
13
  use_itn = True
14
- stream_model = sensevoice_ax630c/streaming_sensevoice.axmodel
15
- server_url = http://127.0.0.1:12345
 
 
1
  [basic]
2
  type = axmodel
3
+ model_npu = sensevoice.axmodel
4
  model_vnpu =
5
 
6
  [extra]
 
11
  language = auto
12
  hot_words = None,
13
  use_itn = True
14
+ stream_model = streaming_sensevoice.axmodel
15
+ sensevoice_service_env = /tmp/sensevoice.service.env
16
+ server = server.py
pe_nonstream.npy → sensevoice-maixcam2/pe_nonstream.npy RENAMED
File without changes
pe_streaming.npy → sensevoice-maixcam2/pe_streaming.npy RENAMED
File without changes
{sensevoice_ax630c → sensevoice-maixcam2}/sensevoice.axmodel RENAMED
File without changes
server.py → sensevoice-maixcam2/server.py RENAMED
@@ -1,6 +1,7 @@
1
  import numpy as np
2
  from fastapi import FastAPI, HTTPException, Body
3
  from fastapi.responses import JSONResponse, StreamingResponse
 
4
  from typing import List, Optional
5
  import logging
6
  import json
@@ -15,13 +16,12 @@ import time
15
  logging.basicConfig(level=logging.INFO)
16
  logger = logging.getLogger(__name__)
17
 
18
- app = FastAPI(title="ASR Server", description="Automatic Speech Recognition API")
19
-
20
  # 全局变量存储模型
21
  asr_model = None
22
  asr_model_is_loaded = False
23
  mud_configs = None
24
 
 
25
  def parse_config_file_to_json(file_path):
26
  """从文件读取配置并解析为JSON"""
27
  if not os.path.exists(file_path):
@@ -96,9 +96,12 @@ async def handle_heartbeat():
96
  last_get_heartbeat_s = time.time() # 模型未加载时,重置获取心跳的时间
97
  await asyncio.sleep(10) # 每隔 10 秒检查一次
98
 
99
- @app.on_event("startup")
100
- async def load_model():
101
  asyncio.create_task(handle_heartbeat())
 
 
 
102
 
103
  def validate_audio_data(audio_data: List[float]) -> np.ndarray:
104
  """
@@ -147,6 +150,15 @@ async def get_status():
147
  global asr_model_is_loaded
148
  return JSONResponse(content={"status": "loaded" if asr_model_is_loaded else "not loaded"})
149
 
 
 
 
 
 
 
 
 
 
150
  @app.post("/start_model", summary="Load model")
151
  async def start_model(
152
  model_path: str = Body(
@@ -161,10 +173,14 @@ async def start_model(
161
  """
162
  global asr_model
163
  global asr_model_is_loaded
164
- logger.info("Loading ASR model...")
165
 
166
  if asr_model_is_loaded:
167
- return JSONResponse(content={"status": "loaded"})
 
 
 
 
 
168
 
169
  try:
170
  mud_configs = parse_config_file_to_json(model_path)
 
1
  import numpy as np
2
  from fastapi import FastAPI, HTTPException, Body
3
  from fastapi.responses import JSONResponse, StreamingResponse
4
+ from contextlib import asynccontextmanager
5
  from typing import List, Optional
6
  import logging
7
  import json
 
16
  logging.basicConfig(level=logging.INFO)
17
  logger = logging.getLogger(__name__)
18
 
 
 
19
  # 全局变量存储模型
20
  asr_model = None
21
  asr_model_is_loaded = False
22
  mud_configs = None
23
 
24
+
25
  def parse_config_file_to_json(file_path):
26
  """从文件读取配置并解析为JSON"""
27
  if not os.path.exists(file_path):
 
96
  last_get_heartbeat_s = time.time() # 模型未加载时,重置获取心跳的时间
97
  await asyncio.sleep(10) # 每隔 10 秒检查一次
98
 
99
+ @asynccontextmanager
100
+ async def lifespan(app: FastAPI):
101
  asyncio.create_task(handle_heartbeat())
102
+ yield
103
+
104
+ app = FastAPI(title="ASR Server", lifespan = lifespan, description="Automatic Speech Recognition API")
105
 
106
  def validate_audio_data(audio_data: List[float]) -> np.ndarray:
107
  """
 
150
  global asr_model_is_loaded
151
  return JSONResponse(content={"status": "loaded" if asr_model_is_loaded else "not loaded"})
152
 
153
+ @app.post("/reset_model", summary="Reset model")
154
+ async def reset_model():
155
+ global asr_model
156
+ global asr_model_is_loaded
157
+ if not asr_model_is_loaded:
158
+ return JSONResponse(content={"status": "not loaded"})
159
+ asr_model.reset()
160
+ return JSONResponse(content={"status": "ok"})
161
+
162
  @app.post("/start_model", summary="Load model")
163
  async def start_model(
164
  model_path: str = Body(
 
173
  """
174
  global asr_model
175
  global asr_model_is_loaded
 
176
 
177
  if asr_model_is_loaded:
178
+ if asr_model.streaming == stream:
179
+ return JSONResponse(content={"status": "loaded"})
180
+ else:
181
+ await try_stop_model()
182
+
183
+ logger.info("Loading ASR model...")
184
 
185
  try:
186
  mud_configs = parse_config_file_to_json(model_path)
{sensevoice_ax630c → sensevoice-maixcam2}/streaming_sensevoice.axmodel RENAMED
File without changes
tokenizer.py → sensevoice-maixcam2/tokenizer.py RENAMED
File without changes
test_wer.py DELETED
@@ -1,296 +0,0 @@
1
- import os, sys
2
- import argparse
3
- from SenseVoiceAx import SenseVoiceAx
4
- from tokenizer import SentencepiecesTokenizer
5
- from print_utils import rich_transcription_postprocess, rich_print_asr_res
6
- from download_utils import download_model
7
- import logging
8
- import re
9
- import emoji
10
-
11
-
12
- def setup_logging():
13
- """配置日志系统,同时输出到控制台和文件"""
14
- # 获取脚本所在目录
15
- script_dir = os.path.dirname(os.path.abspath(__file__))
16
- log_file = os.path.join(script_dir, "test_wer.log")
17
-
18
- # 配置日志格式
19
- log_format = "%(asctime)s - %(levelname)s - %(message)s"
20
- date_format = "%Y-%m-%d %H:%M:%S"
21
-
22
- # 创建logger
23
- logger = logging.getLogger()
24
- logger.setLevel(logging.INFO)
25
-
26
- # 清除现有的handler
27
- for handler in logger.handlers[:]:
28
- logger.removeHandler(handler)
29
-
30
- # 创建文件handler
31
- file_handler = logging.FileHandler(log_file, mode="w", encoding="utf-8")
32
- file_handler.setLevel(logging.INFO)
33
- file_formatter = logging.Formatter(log_format, date_format)
34
- file_handler.setFormatter(file_formatter)
35
-
36
- # 创建控制台handler
37
- console_handler = logging.StreamHandler()
38
- console_handler.setLevel(logging.INFO)
39
- console_formatter = logging.Formatter(log_format, date_format)
40
- console_handler.setFormatter(console_formatter)
41
-
42
- # 添加handler到logger
43
- logger.addHandler(file_handler)
44
- logger.addHandler(console_handler)
45
-
46
- return logger
47
-
48
-
49
- class AIShellDataset:
50
- def __init__(self, gt_path: str):
51
- """
52
- 初始化数据集
53
-
54
- Args:
55
- json_path: voice.json文件的路径
56
- """
57
- self.gt_path = gt_path
58
- self.dataset_dir = os.path.dirname(gt_path)
59
- self.voice_dir = os.path.join(self.dataset_dir, "aishell_S0764")
60
-
61
- # 检查必要文件和文件夹是否存在
62
- assert os.path.exists(gt_path), f"gt文件不存在: {gt_path}"
63
- assert os.path.exists(self.voice_dir), f"aishell_S0764文件夹不存在: {self.voice_dir}"
64
-
65
- # 加载数据
66
- self.data = []
67
- with open(gt_path, "r", encoding="utf-8") as f:
68
- for line in f:
69
- line = line.strip()
70
- audio_path, gt = line.split(" ")
71
- audio_path = os.path.join(self.voice_dir, audio_path + ".wav")
72
- self.data.append({"audio_path": audio_path, "gt": gt})
73
-
74
- # 使用logging而不是print
75
- logger = logging.getLogger()
76
- logger.info(f"加载了 {len(self.data)} 条数据")
77
-
78
- def __iter__(self):
79
- """返回迭代器"""
80
- self.index = 0
81
- return self
82
-
83
- def __next__(self):
84
- """返回下一个数据项"""
85
- if self.index >= len(self.data):
86
- raise StopIteration
87
-
88
- item = self.data[self.index]
89
- audio_path = item["audio_path"]
90
- ground_truth = item["gt"]
91
-
92
- self.index += 1
93
- return audio_path, ground_truth
94
-
95
- def __len__(self):
96
- """返回数据集大小"""
97
- return len(self.data)
98
-
99
-
100
- class CommonVoiceDataset:
101
- """Common Voice数据集解析器"""
102
-
103
- def __init__(self, tsv_path: str):
104
- """
105
- 初始化数据集
106
-
107
- Args:
108
- json_path: voice.json文件的路径
109
- """
110
- self.tsv_path = tsv_path
111
- self.dataset_dir = os.path.dirname(tsv_path)
112
- self.voice_dir = os.path.join(self.dataset_dir, "clips")
113
-
114
- # 检查必要文件和文件夹是否存在
115
- assert os.path.exists(tsv_path), f"{tsv_path}文件不存在: {tsv_path}"
116
- assert os.path.exists(self.voice_dir), f"voice文件夹不存在: {self.voice_dir}"
117
-
118
- # 加载JSON数据
119
- self.data = []
120
- with open(tsv_path, "r", encoding="utf-8") as f:
121
- f.readline()
122
- for line in f:
123
- line = line.strip()
124
- splits = line.split("\t")
125
- audio_path = splits[1]
126
- gt = splits[3]
127
- audio_path = os.path.join(self.voice_dir, audio_path)
128
- self.data.append({"audio_path": audio_path, "gt": gt})
129
-
130
- # 使用logging而不是print
131
- logger = logging.getLogger()
132
- logger.info(f"加载了 {len(self.data)} 条数据")
133
-
134
- def __iter__(self):
135
- """返回迭代器"""
136
- self.index = 0
137
- return self
138
-
139
- def __next__(self):
140
- """返回下一个数据项"""
141
- if self.index >= len(self.data):
142
- raise StopIteration
143
-
144
- item = self.data[self.index]
145
- audio_path = item["audio_path"]
146
- ground_truth = item["gt"]
147
-
148
- self.index += 1
149
- return audio_path, ground_truth
150
-
151
- def __len__(self):
152
- """返回数据集大小"""
153
- return len(self.data)
154
-
155
-
156
- def get_args():
157
- parser = argparse.ArgumentParser()
158
- parser.add_argument(
159
- "--dataset",
160
- "-d",
161
- type=str,
162
- required=True,
163
- choices=["aishell", "common_voice"],
164
- help="Test dataset",
165
- )
166
- parser.add_argument(
167
- "--gt_path",
168
- "-g",
169
- type=str,
170
- required=True,
171
- help="Test dataset ground truth file",
172
- )
173
- parser.add_argument(
174
- "--language",
175
- "-l",
176
- required=False,
177
- type=str,
178
- default="auto",
179
- choices=["auto", "zh", "en", "yue", "ja", "ko"],
180
- )
181
- parser.add_argument(
182
- "--max_num", type=int, default=-1, required=False, help="Maximum test data num"
183
- )
184
- return parser.parse_args()
185
-
186
-
187
- def min_distance(word1: str, word2: str) -> int:
188
-
189
- row = len(word1) + 1
190
- column = len(word2) + 1
191
-
192
- cache = [[0] * column for i in range(row)]
193
-
194
- for i in range(row):
195
- for j in range(column):
196
-
197
- if i == 0 and j == 0:
198
- cache[i][j] = 0
199
- elif i == 0 and j != 0:
200
- cache[i][j] = j
201
- elif j == 0 and i != 0:
202
- cache[i][j] = i
203
- else:
204
- if word1[i - 1] == word2[j - 1]:
205
- cache[i][j] = cache[i - 1][j - 1]
206
- else:
207
- replace = cache[i - 1][j - 1] + 1
208
- insert = cache[i][j - 1] + 1
209
- remove = cache[i - 1][j] + 1
210
-
211
- cache[i][j] = min(replace, insert, remove)
212
-
213
- return cache[row - 1][column - 1]
214
-
215
-
216
- def remove_punctuation(text):
217
- # 定义正则表达式模式,匹配所有标点符号
218
- # 这个模式包括常见的标点符号和中文标点
219
- pattern = r"[^\w\s]|_"
220
-
221
- # 使用sub方法将所有匹配的标点符号替换为空字符串
222
- cleaned_text = re.sub(pattern, "", text)
223
-
224
- return cleaned_text
225
-
226
-
227
- def main():
228
- logger = setup_logging()
229
- args = get_args()
230
-
231
- language = args.language
232
- use_itn = False # 标点符号预测
233
- max_num = args.max_num
234
-
235
- dataset_type = args.dataset.lower()
236
- if dataset_type == "aishell":
237
- dataset = AIShellDataset(args.gt_path)
238
- elif dataset_type == "common_voice":
239
- dataset = CommonVoiceDataset(args.gt_path)
240
- else:
241
- raise ValueError(f"Unknown dataset type {dataset_type}")
242
-
243
- # model_path_root = download_model("SenseVoice")
244
- model_path = os.path.join("sensevoice_ax650", "sensevoice.axmodel")
245
- bpemodel = "chn_jpn_yue_eng_ko_spectok.bpe.model"
246
-
247
- assert os.path.exists(model_path), f"model {model_path} not exist"
248
-
249
- logger.info(f"dataset: {args.dataset}")
250
- logger.info(f"language: {language}")
251
- logger.info(f"use_itn: {use_itn}")
252
- logger.info(f"model_path: {model_path}")
253
-
254
- tokenizer = SentencepiecesTokenizer(bpemodel=bpemodel)
255
- pipeline = SenseVoiceAx(
256
- model_path, language=language, use_itn=use_itn, tokenizer=tokenizer, max_len=256
257
- )
258
-
259
- # Iterate over dataset
260
- hyp = []
261
- references = []
262
- all_character_error_num = 0
263
- all_character_num = 0
264
- max_data_num = max_num if max_num > 0 else len(dataset)
265
- for n, (audio_path, reference) in enumerate(dataset):
266
- reference = remove_punctuation(reference).lower()
267
-
268
- asr_res = pipeline.infer(audio_path, print_rtf=False)
269
- hypothesis = rich_print_asr_res(
270
- asr_res, will_print=False, remove_punc=True
271
- ).lower()
272
- hypothesis = emoji.replace_emoji(hypothesis, replace="")
273
-
274
- character_error_num = min_distance(reference, hypothesis)
275
- character_num = len(reference)
276
- character_error_rate = character_error_num / character_num * 100
277
-
278
- all_character_error_num += character_error_num
279
- all_character_num += character_num
280
-
281
- hyp.append(hypothesis)
282
- references.append(reference)
283
-
284
- line_content = f"({n+1}/{max_data_num}) {os.path.basename(audio_path)} gt: {reference} predict: {hypothesis} WER: {character_error_rate}%"
285
- logger.info(line_content)
286
-
287
- if n + 1 >= max_data_num:
288
- break
289
-
290
- total_character_error_rate = all_character_error_num / all_character_num * 100
291
-
292
- logger.info(f"Total WER: {total_character_error_rate}%")
293
-
294
-
295
- if __name__ == "__main__":
296
- main()