AXERA-TECH
/

SenseVoice

inoryQwQ committed on Nov 21, 2025

Commit

b3b007c

1 Parent(s): d1ae526

fix server and gradio

Files changed (3) hide show

SenseVoiceAx.py CHANGED Viewed

@@ -69,8 +69,8 @@ def unique_consecutive_np(arr):
 class SenseVoiceAx:
-    """ SenseVoice axmodel runner """
     def __init__(
         self,
         model_path: str,
@@ -89,13 +89,13 @@ class SenseVoiceAx:
             max_len:    Fixed shape of input of axmodel
             beam_size:  Max number of hypos to hold after each decode step
             language:   Support auto, zh(Chinese), en(English), yue(Cantonese), ja(Japanese), ko(Korean)
-            hot_words:  Words that may fail to be recognized,
-                        special words/phrases (aka hotwords) like rare words, personalized information etc.
-            use_itn:    Allow Inverse Text Normalization if True,
                         ITN converts ASR model output into its written form to improve text readability,
                         For example, the ITN module replaces “one hundred and twenty-three dollars” transcribed by an ASR model with “$123.”
             streaming:  Processes audio in small segments or "chunks" sequentially and outputs text on the fly.
-                        Use stream_infer method if streaming is true otherwise infer.
         """
         model_path_root = os.path.dirname(model_path)

 class SenseVoiceAx:
+    """SenseVoice axmodel runner"""
     def __init__(
         self,
         model_path: str,
             max_len:    Fixed shape of input of axmodel
             beam_size:  Max number of hypos to hold after each decode step
             language:   Support auto, zh(Chinese), en(English), yue(Cantonese), ja(Japanese), ko(Korean)
+            hot_words:  Words that may fail to be recognized,
+                        special words/phrases (aka hotwords) like rare words, personalized information etc.
+            use_itn:    Allow Inverse Text Normalization if True,
                         ITN converts ASR model output into its written form to improve text readability,
                         For example, the ITN module replaces “one hundred and twenty-three dollars” transcribed by an ASR model with “$123.”
             streaming:  Processes audio in small segments or "chunks" sequentially and outputs text on the fly.
+                        Use stream_infer method if streaming is true otherwise infer.
         """
         model_path_root = os.path.dirname(model_path)

gradio_demo.py CHANGED Viewed

@@ -1,21 +1,22 @@
 import gradio as gr
 import os
 from SenseVoiceAx import SenseVoiceAx
-from tokenizer import SentencepiecesTokenizer
 from print_utils import rich_transcription_postprocess
-from download_utils import download_model
-use_itn = True  # 标点符号预测
 max_len = 256
 model_path = os.path.join("sensevoice_ax650", "sensevoice.axmodel")
-bpemodel = "chn_jpn_yue_eng_ko_spectok.bpe.model"
 assert os.path.exists(model_path), f"model {model_path} not exist"
-tokenizer = SentencepiecesTokenizer(bpemodel=bpemodel)
 pipeline = SenseVoiceAx(
-    model_path, max_len=max_len, language="auto", use_itn=use_itn, tokenizer=tokenizer
 )
@@ -28,10 +29,9 @@ def speech_to_text(audio_path, lang):
         return "无音频"
     pipeline.choose_language(language=lang)
-    asr_res = pipeline.infer(audio_path, print_rtf=True)
-    res = " ".join([rich_transcription_postprocess(i) for i in asr_res])
-    return res
 def main():

 import gradio as gr
 import os
 from SenseVoiceAx import SenseVoiceAx
 from print_utils import rich_transcription_postprocess
 max_len = 256
 model_path = os.path.join("sensevoice_ax650", "sensevoice.axmodel")
 assert os.path.exists(model_path), f"model {model_path} not exist"
 pipeline = SenseVoiceAx(
+    model_path,
+    max_len=max_len,
+    beam_size=3,
+    language="auto",
+    hot_words=None,
+    use_itn=True,
+    streaming=False,
 )
         return "无音频"
     pipeline.choose_language(language=lang)
+    asr_res = pipeline.infer(audio_path, print_rtf=False)
+    return asr_res
 def main():

server.py CHANGED Viewed

@@ -3,11 +3,7 @@ from fastapi import FastAPI, HTTPException, Body
 from fastapi.responses import JSONResponse
 from typing import List, Optional
 import logging
-import json
 from SenseVoiceAx import SenseVoiceAx
-from tokenizer import SentencepiecesTokenizer
-from print_utils import rich_transcription_postprocess, rich_print_asr_res
-from download_utils import download_model
 import os
 import librosa
@@ -32,11 +28,10 @@ async def load_model():
     try:
         # 模型加载
         language = "auto"
-        use_itn = True  # 标点符号预测
         max_len = 256
         model_path = os.path.join("sensevoice_ax650", "sensevoice.axmodel")
-        bpemodel = "chn_jpn_yue_eng_ko_spectok.bpe.model"
         assert os.path.exists(model_path), f"model {model_path} not exist"
@@ -44,13 +39,14 @@ async def load_model():
         print(f"use_itn: {use_itn}")
         print(f"model_path: {model_path}")
-        tokenizer = SentencepiecesTokenizer(bpemodel=bpemodel)
         asr_model = SenseVoiceAx(
             model_path,
             max_len=max_len,
-            language=language,
             use_itn=use_itn,
-            tokenizer=tokenizer,
         )
         logger.info("ASR model loaded successfully")

 from fastapi.responses import JSONResponse
 from typing import List, Optional
 import logging
 from SenseVoiceAx import SenseVoiceAx
 import os
 import librosa
     try:
         # 模型加载
         language = "auto"
+        use_itn = True  # 逆文本规范
         max_len = 256
         model_path = os.path.join("sensevoice_ax650", "sensevoice.axmodel")
         assert os.path.exists(model_path), f"model {model_path} not exist"
         print(f"use_itn: {use_itn}")
         print(f"model_path: {model_path}")
         asr_model = SenseVoiceAx(
             model_path,
             max_len=max_len,
+            beam_size=3,
+            language="auto",
+            hot_words=None,
             use_itn=use_itn,
+            streaming=False,
         )
         logger.info("ASR model loaded successfully")