File size: 1,658 Bytes
1e495f3
 
 
 
 
 
 
 
 
 
b295d06
1e495f3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b295d06
1e495f3
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
from pathlib import Path
import time
import csv
import numpy as np
from funasr_onnx import SeacoParaformer, CT_Transformer, Fsmn_vad

from lib.utils import Timer, read_audio

# Default root directory that holds the exported model sub-folders.
# NOTE(review): this is an absolute path on one developer's machine; pass
# `model_dir` explicitly when running anywhere else.
MODEL_DIR = Path("/Users/jeqin/work/code/Translator/python_server/moyoyo_asr_models")


class FunAsrOnnx:
    """Speech-to-text wrapper around two funasr_onnx models.

    A ``SeacoParaformer`` model produces the raw transcript and a
    ``CT_Transformer`` model post-processes that transcript (judging by the
    model folder name it restores punctuation). No VAD model is loaded; the
    audio is handed to the ASR model as-is.
    """

    def __init__(self, model_dir=MODEL_DIR, quantize=True):
        """Load both models and run a single warm-up inference.

        Args:
            model_dir: Directory containing the two model sub-folders. Accepts
                a ``str`` or any path-like object; it is coerced to ``Path``.
            quantize: Forwarded unchanged to both model constructors.
        """
        # Coerce so that callers may pass a plain string; the `/` joins below
        # would otherwise raise TypeError for `str`.
        model_dir = Path(model_dir)
        asr_model_path = model_dir / 'speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch'
        punc_model_path = model_dir / 'punc_ct-transformer_cn-en-common-vocab471067-large'
        with Timer("load FunASR"):
            self.asr_model = SeacoParaformer(asr_model_path, quantize=quantize)
            self.punc_model = CT_Transformer(punc_model_path, quantize=quantize)
        self._warm_up()

    def _warm_up(self):
        """Run the ASR model once on synthetic audio and discard the output."""
        # 16000 float32 samples of random noise, i.e. one second at 16 kHz.
        fake_audio = np.random.randn(16000).astype(np.float32)
        self.asr_model(fake_audio, hotwords="")

    def transcribe(self, audio: np.ndarray):
        """Transcribe ``audio`` and post-process the text.

        Args:
            audio: Waveform passed straight to the ASR model.
                NOTE(review): presumably 1-D float32 at 16 kHz, matching the
                warm-up input -- confirm against ``read_audio``.

        Returns:
            A ``(text, duration)`` tuple: ``text`` is the first element of the
            punctuation model's result, ``duration`` is whatever the project's
            ``Timer`` reports for the inference (presumably seconds -- confirm).
        """
        with Timer("FunASR inference") as t:
            asr_res = self.asr_model(audio, hotwords="")
            # Only the first result entry is used.
            asr_text = asr_res[0]["preds"]
            result = self.punc_model(asr_text)
            text = result[0]
        # `t.duration` is read after the `with` block has exited.
        return text, t.duration

if __name__ == '__main__':
    # Manual smoke test: load the models, transcribe one local recording and
    # print the transcript followed by the reported inference time.
    recognizer = FunAsrOnnx()
    sample_path = Path("/Users/jeqin/work/code/TestTranslator/test_data/recordings/1.wav")
    waveform = read_audio(sample_path)
    transcript, elapsed = recognizer.transcribe(waveform)
    print(transcript)
    print(elapsed)