add test_models
- lib/models/__init__.py +0 -0
- lib/models/funasr.py +42 -0
- lib/models/kokoro.py +113 -0
- lib/models/llm.py +91 -0
- lib/models/whisper.py +68 -0
- scripts/asr_utils.py +0 -41
- test_data/audios.py +49 -0
- test_data/texts.py +18 -0
- test_data/{recordings/text → texts}/test_translation_en.txt +0 -0
- test_data/{recordings/text → texts}/test_translation_zh.txt +0 -0
- tests/test_models/__init__.py +0 -0
- tests/test_models/conftest.py +12 -0
- tests/test_models/test_funasr.py +22 -0
- tests/test_models/test_llm.py +30 -0
- tests/test_models/test_tts.py +31 -0
- tests/test_models/test_whisper.py +22 -0
lib/models/__init__.py
ADDED
File without changes
lib/models/funasr.py
ADDED
@@ -0,0 +1,42 @@
+from pathlib import Path
+import time
+import csv
+import numpy as np
+from funasr_onnx import SeacoParaformer, CT_Transformer, Fsmn_vad
+
+from lib.utils import Timer, read_audio
+
+MODEL_DIR = Path("/Users/jeqin/work/code/Translator/python_server/moyoyo_asr_models")
+
+class FunASR:
+    def __init__(self, model_dir=MODEL_DIR, quantize=True):
+        asr_model_path = model_dir / 'speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch'
+        # vad_model_path = model_dir / 'speech_fsmn_vad_zh-cn-16k-common-pytorch'
+        punc_model_path = model_dir / 'punc_ct-transformer_cn-en-common-vocab471067-large'
+        t0 = time.time()
+        # vad_model = Fsmn_vad(vad_model_path, quantize=quantize)
+        with Timer("load FunASR"):
+            self.asr_model = SeacoParaformer(asr_model_path, quantize=quantize)
+            self.punc_model = CT_Transformer(punc_model_path, quantize=quantize)
+            self._warm_up()
+
+    def _warm_up(self):
+        # Generate 1 second of fake 16 kHz audio data
+        fake_audio = np.random.randn(16000).astype(np.float32)
+        self.asr_model(fake_audio, hotwords="")
+
+    def transcribe(self, audio: np.ndarray):
+        with Timer("FunASR inference") as t:
+            asr_res = self.asr_model(audio, hotwords="")
+            asr_text = asr_res[0]["preds"]
+            result = self.punc_model(asr_text)
+            text = result[0]
+        return text, t.duration
+
+if __name__ == '__main__':
+    funasr = FunASR()
+    audio = read_audio(Path("/Users/jeqin/work/code/TestTranslator/test_data/recordings/1.wav"))
+    text, time_cost = funasr.transcribe(audio)
+    print(text)
+    print(time_cost)
+
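Note: `Timer` and `read_audio` come from `lib.utils`, which is not part of this commit. A minimal sketch consistent with how they are used above (names and behavior are assumptions, not the actual lib.utils code):

import time
import soundfile

class Timer:
    # Context manager that reports and stores elapsed wall-clock time in .duration.
    def __init__(self, name: str):
        self.name = name
        self.duration = 0.0

    def __enter__(self):
        self._start = time.perf_counter()
        return self

    def __exit__(self, *exc):
        self.duration = time.perf_counter() - self._start
        print(f"{self.name}: {self.duration:.3f}s")

def read_audio(path):
    # Load a waveform as float32; the ASR models above expect mono 16 kHz input.
    samples, _sr = soundfile.read(path, dtype="float32")
    return samples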
lib/models/kokoro.py
ADDED
@@ -0,0 +1,113 @@
+import os
+from pathlib import Path
+from kokoro_onnx import Kokoro
+from misaki import espeak, en, zh
+from misaki.espeak import EspeakG2P
+from functools import lru_cache
+from logging import getLogger
+import librosa
+import onnxruntime
+
+from lib.utils import Timer, write_audio
+
+
+logger = getLogger(__name__)
+providers = onnxruntime.get_available_providers()
+MODEL_DIR = Path("/Users/jeqin/work/code/Translator/python_server/moyoyo_asr_models/kokoro")
+
+def create_session(model_path):
+    # See list of providers https://github.com/microsoft/onnxruntime/issues/22101#issuecomment-2357667377
+    providers = onnxruntime.get_available_providers()
+    print(f"Available onnx runtime providers: {providers}")
+
+    # See session options https://onnxruntime.ai/docs/performance/tune-performance/threading.html#thread-management
+    sess_options = onnxruntime.SessionOptions()
+    cpu_count = os.cpu_count() // 2
+    print(f"Setting threads to CPU cores count: {cpu_count}")
+    sess_options.intra_op_num_threads = cpu_count
+    session = onnxruntime.InferenceSession(
+        model_path, providers=["CPUExecutionProvider"], sess_options=sess_options
+    )
+    return session
+
+
+class KokoroTTS:
+    language_voice_mapping = {
+        "JP": "jf_alpha",
+        "JA": "jf_alpha",
+        "ZH": "zf_xiaoyi",
+        "EN": "af_heart",
+        "FR": "ff_siwis",
+        "IT": "im_nicola",
+        "HI": "hf_alpha",
+        "PT": "im_nicola",
+        "ES": "im_nicola"
+    }
+    language_word_mapping = {
+        "ZH": "你好",
+        "EN": "hello",
+        "FR": "Bonjour",
+        "IT": "Ciao",
+        "HI": "हेलो",
+        "PT": "Olá",
+        "ES": "Hola"
+    }
+
+    def __init__(self, model_path: str, voice_model_path: str, vocab_config=None, g2p=None, voice=None):
+        self._session = create_session(model_path)
+        self.model = Kokoro.from_session(self._session, voice_model_path, vocab_config=vocab_config)
+        self.g2p = g2p
+        self.voice = voice
+
+    @classmethod
+    def from_language(cls, language: str, model_dir: Path = MODEL_DIR):
+        model_path: str = str(model_dir / "kokoro-quant.onnx")
+        voice_model_path: str = str(model_dir / "voices-v1.0.bin")
+        voice = cls.language_voice_mapping.get(language.upper())
+        warm_up_text = cls.language_word_mapping.get(language.upper())
+        logger.info(f"[TTS] language: {language}")
+        if not voice:
+            raise ValueError(f"Unsupported language: {language}, voice: {voice}")
+        vocab_config = None
+        if language.upper() == "ZH":
+            g2p = zh.ZHG2P()
+            vocab_config = model_dir / "zh_config.json"
+        elif language.upper() == 'EN':
+            fallback = espeak.EspeakFallback(british=False)
+            g2p = en.G2P(trf=False, british=False, fallback=fallback)
+        elif language.upper() == "HI":
+            g2p = EspeakG2P(language="hi")
+        elif language.upper() == "IT":
+            g2p = EspeakG2P(language="it")
+        elif language.upper() == "PT":
+            g2p = EspeakG2P(language="pt-br")
+        elif language.upper() == "ES":
+            g2p = EspeakG2P(language="es")
+        elif language.upper() == "FR":
+            g2p = EspeakG2P(language="fr-fr")
+        else:
+            g2p = EspeakG2P(language.lower())
+        with Timer("load tts"):
+            tts = cls(model_path, voice_model_path, vocab_config=vocab_config, g2p=g2p, voice=voice)
+            tts.generate(warm_up_text)
+        return tts
+
+    def generate(self, text, speed=1.2):
+        with Timer("tts inference") as t:
+            phonemes, _ = self.g2p(text)
+            samples, sample_rate = self.model.create(phonemes, self.voice, is_phonemes=True, speed=speed)
+        return samples, sample_rate, t.duration
+        # return librosa.resample(samples, target_sr=44100, orig_sr=sample_rate)
+
+    async def stream(self, text, speed=1.2):
+        phonemes, _ = self.g2p(text)
+        stream = self.model.create_stream(phonemes, self.voice, is_phonemes=True, speed=speed)
+        async for samples, sample_rate in stream:
+            yield samples, sample_rate
+
+
+if __name__ == '__main__':
+    tts = KokoroTTS.from_language(language="ZH")
+    samples, sr, time_cost = tts.generate("今天天气怎么样?")
+    write_audio("tts_out.wav", samples, sr)
+    print(time_cost)
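The `stream` method is an async generator that the `__main__` block above never exercises. A hedged usage sketch (the output file name and chunk handling are illustrative, not part of the commit):

import asyncio
import numpy as np
from lib.utils import write_audio

async def demo():
    tts = KokoroTTS.from_language(language="ZH")
    chunks = []
    sample_rate = None
    async for samples, sample_rate in tts.stream("今天天气怎么样?"):
        chunks.append(samples)
    # Concatenate the streamed chunks and write a single file.
    write_audio("tts_stream_out.wav", np.concatenate(chunks), sample_rate)

asyncio.run(demo())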
lib/models/llm.py
ADDED
@@ -0,0 +1,91 @@
+from logging import getLogger
+from pathlib import Path
+from llama_cpp import Llama
+from functools import lru_cache
+
+from lib.utils import Timer
+
+logger = getLogger(__name__)
+LLM_SYS_PROMPT_EN = """
+你是一名专业的同声传译员,正在为 GOSIM 会议提供英中翻译服务。你的任务是准确、流畅地翻译发言内容。
+
+请遵循以下要求:
+1. 语言风格:翻译成中文时,请使用自然、流畅、符合现代汉语口语习惯的表达方式。避免生硬、逐字翻译的痕迹,要让听众容易理解。
+2. 专业术语:**请优先参考下方提供的术语对照表进行翻译。** 对于对照表中未包含的术语,如果该术语有公认的标准翻译,请使用标准翻译;如果没有或不确定,请保留英文原文。不要用通俗词汇替代专业术语。
+3. 专有名词:对于专有名词,如会议名称 "GOSIM"、人名、公司名、项目名、特定技术名称等,请保留其原始英文不做翻译。
+4. 流畅性与准确性:在追求口语化的同时,务必保证信息传达的准确性。
+5. 输出:请直接输出翻译结果,不要添加任何额外的解释或说明。
+
+**专业术语对照表:**
+* driver: 驱动
+* bus: 总线
+* mask: 掩码
+* preemption: 抢占
+* register: 寄存器
+* Library: 库
+* biases: 偏移
+* OpenAGI: OpenAGI
+* LLaMA Factory: LLaMA Factory
+* OPENGL: OPENGL
+
+现在,请将以下内容翻译成中文:
+"""
+
+LLM_SYS_PROMPT_ZH = """
+你是一位中英文翻译专家。请将以下中文文本翻译成英文,遵循以下要求:
+
+翻译要求:
+- 保留原文英文内容:以下内容请保持原始英文形式,不进行翻译或改写:
+  - 技术术语与专业词汇
+  - 产品名称、品牌名称
+  - 代码片段、函数名、变量名
+  - 专有名词、缩写、首字母缩略词(如 API、NLP、RAG 等)
+- 翻译符合英文表达习惯,流畅自然,不生硬直译。
+- 保持专业性与准确性,清晰传达原意。
+- 如遇原文表达模糊或逻辑不清的情况,允许适度调整语序或措辞,以增强英文表述的清晰度和逻辑性。
+
+注意:
+若难以确定某个词汇是否需要翻译,请优先保留原始英文形式。
+不需添加额外解释或注释,仅翻译正文内容。
+特别注意,翻译的内容只能包含英文,不能包含其他的语言。
+
+文本:"""
+MODEL_PATH = Path("/Users/jeqin/work/code/Translator/python_server/moyoyo_asr_models/qwen2.5-1.5b-instruct-q5_0.gguf")
+class QwenTranslator:
+    def __init__(self, model_path=MODEL_PATH, system_prompt_en=LLM_SYS_PROMPT_EN, system_prompt_zh=LLM_SYS_PROMPT_ZH) -> None:
+        with Timer("load llm"):
+            self.llm = Llama(
+                model_path=str(model_path),
+                chat_format="chatml",
+                verbose=False)
+        self.sys_prompt_en = system_prompt_en
+        self.sys_prompt_zh = system_prompt_zh
+        self._warmup()
+
+    def to_message(self, prompt, src_lang, dst_lang):
+        """Build the chat messages for the translation prompt."""
+        return [
+            {"role": "system", "content": self.sys_prompt_en if src_lang == "en" else self.sys_prompt_zh},
+            {"role": "user", "content": prompt},
+        ]
+
+    def _warmup(self):
+        self.translate(prompt="hello", src_lang="en", dst_lang="zh")
+
+    @lru_cache(maxsize=10)
+    def translate(self, prompt, src_lang, dst_lang) -> tuple[str, float]:
+        message = self.to_message(prompt, src_lang, dst_lang)
+        with Timer("llm inference") as t:
+            output = self.llm.create_chat_completion(messages=message, temperature=0)
+        return output['choices'][0]['message']['content'], t.duration
+
+
+if __name__ == '__main__':
+    model_dir = Path("/Users/jeqin/work/code/Translator/moyoyo_asr_models")
+    qwen2 = (model_dir / "qwen2.5-1.5b-instruct-q5_0.gguf").as_posix()
+    qwen3 = (model_dir / "Qwen_Qwen3-1.7B-Q4_K_M.gguf").as_posix()
+
+    translator = QwenTranslator(qwen3)
+    text, time_cost = translator.translate("今天天气怎么样?", "zh", "en")
+    print(text)
+    print(time_cost)
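Because `translate` is wrapped in `@lru_cache(maxsize=10)`, a repeated prompt returns the cached `(text, duration)` tuple, so the reported time_cost reflects the original inference rather than the cache hit. A quick illustration of that caveat:

translator = QwenTranslator()
text1, t1 = translator.translate("hello", src_lang="en", dst_lang="zh")
text2, t2 = translator.translate("hello", src_lang="en", dst_lang="zh")
# Cache hit: identical tuple, including the stale duration from the first call.
assert (text1, t1) == (text2, t2)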
lib/models/whisper.py
ADDED
@@ -0,0 +1,68 @@
+from pywhispercpp.model import Model
+import soundfile
+import numpy as np
+from logging import getLogger
+from pathlib import Path
+
+from lib.utils import Timer, read_audio
+
+logger = getLogger(__name__)
+
+MODEL_DIR = Path("/Users/jeqin/work/code/Translator/python_server/moyoyo_asr_models")
+WHISPER_PROMPT_ZH = "以下是简体中文普通话的句子。"
+WHISPER_PROMPT_EN = ""  # "The following is an English sentence."
+
+class WhisperCPP:
+    def __init__(self, model_dir=MODEL_DIR, source_lang: str = 'en') -> None:
+        whisper_model = 'large-v3-turbo-q5_0'
+        with Timer("load whisper"):
+            self.model = Model(
+                model=whisper_model,
+                models_dir=str(model_dir),
+                print_realtime=False,
+                print_progress=False,
+                print_timestamps=False,
+                translate=False,
+                # beam_search=1,
+                temperature=0.,
+                no_context=True
+            )
+        self._warmup()
+
+    def _warmup(self):
+        fake_audio = np.random.randn(16000).astype(np.float32)
+        self.model.transcribe(fake_audio, print_progress=False)
+
+    @staticmethod
+    def config_language(language):
+        if language == "zh":
+            return WHISPER_PROMPT_ZH
+        elif language == "en":
+            return WHISPER_PROMPT_EN
+        raise ValueError(f"Unsupported language: {language}")
+
+    def transcribe(self, audio: np.ndarray, language):
+        prompt = self.config_language(language)
+        try:
+            with Timer("whisper inference") as t:
+                segments = self.model.transcribe(
+                    audio,
+                    initial_prompt=prompt,
+                    language=language,
+                    # token_timestamps=True,
+                    split_on_word=True,
+                    # max_len=max_len
+                )
+            text = "".join([s.text for s in segments])
+            return text, t.duration
+        except Exception as e:
+            logger.error(e)
+            return "", 0.0
+
+if __name__ == '__main__':
+    from lib.utils import read_audio
+    whisper = WhisperCPP()
+    audio = read_audio(Path("/Users/jeqin/work/code/TestTranslator/test_data/recordings/1.wav"))
+    text, time_cost = whisper.transcribe(audio, "zh")
+    print(text)
+    print(time_cost)
scripts/asr_utils.py
CHANGED
@@ -7,17 +7,6 @@ from pathlib import Path
 import subprocess
 from subprocess import CompletedProcess
 
-
-def cmd(command: str, check=True, capture_output=False) -> CompletedProcess:
-    print(command)
-    if capture_output:
-        ret = subprocess.run(command, shell=True, check=check, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
-                             universal_newlines=True)
-    else:
-        ret = subprocess.run(command, shell=True, check=check)
-    print(ret.stdout)
-    return ret
-
 def add_text_index():
     text_file = '../test_data/text/test_asr_zh.txt'
     index = 1
@@ -89,37 +78,7 @@ def get_origin_text_dict():
         text_dict[idx] = text
     return text_dict
 
-def read_dataset(file):
-    """line sample: {"audio": {"path": "dataset/audio/data_aishell/wav/test/S0916/BAC009S0916W0158.wav"}, "sentence": "顾客体验的核心是真善美", "duration": 3.22, "sentences": [{"start": 0, "end": 3.22, "text": "顾客体验的核心是真善美"}]}"""
-    with open(file) as f:
-        lines = f.readlines()
-    for line in lines:
-        line = line.strip()
-        if not line:
-            continue
-        data = json.loads(line)
-
-        yield data["audio"]["path"], data["sentence"], data["duration"]
 
-def read_emilia(folder: Path, count_limit=None):
-    """Read the Emilia dataset; yields (audio path, text, duration).
-    Sample JSON file:
-    {"id": "ZH_B00000_S00110_W000000", "wav": "ZH_B00000/ZH_B00000_S00110/mp3/ZH_B00000_S00110_W000000.mp3", "text": "\u628a\u63e1\u6700\u524d\u6cbf\u7684\u91d1\u878d\u9886\u57df\u548c\u533a\u5757\u94fe\u6700\u65b0\u8d44\u8baf\u3002\u6211\u4eec\u4e00\u8d77\u6765\u4e86\u89e3\u4e00\u4e0b\u4eca\u5929\u5e02\u573a\u4e0a\u6709\u53d1\u751f\u54ea\u4e9b\u91cd\u8981\u4e8b\u4ef6\u3002", "duration": 7.963, "speaker": "ZH_B00000_S00110", "language": "zh", "dnsmos": 3.3808}"""
-    count = 0
-    for json_file in sorted(folder.glob("*.json")):
-        count += 1
-        if count_limit and count > count_limit:
-            break
-        with open(json_file, encoding="utf-8") as f:
-            data = json.load(f)
-        text = data["text"]
-        duration = data["duration"]
-        wav_path = folder / f'{json_file.stem}.wav'
-        if not wav_path.exists():
-            mp3_path = folder / f'{json_file.stem}.mp3'
-            command = f"ffmpeg -i {mp3_path} -ac 1 -ar 16000 {wav_path}"
-            cmd(command)
-        yield wav_path, text, duration
 
 
 
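The removed `cmd` helper is imported from `lib.utils` in test_data/audios.py below, so it has presumably moved there. A sketch of it as it would look in lib.utils (destination assumed; the `print(ret.stdout)` is guarded here so it no longer prints None when output isn't captured):

import subprocess
from subprocess import CompletedProcess

def cmd(command: str, check=True, capture_output=False) -> CompletedProcess:
    # Echo the command, run it through the shell, and optionally capture output.
    print(command)
    if capture_output:
        ret = subprocess.run(command, shell=True, check=check, stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT, universal_newlines=True)
        print(ret.stdout)
    else:
        ret = subprocess.run(command, shell=True, check=check)
    return ret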
test_data/audios.py
ADDED
@@ -0,0 +1,49 @@
+from pathlib import Path
+import json
+
+from lib.utils import cmd
+from environment import TEST_DATA
+
+
+def read_recording(folder: Path = Path("./recordings"), count_limit=None):
+    pass
+
+def read_dataset(file: Path = Path("./dataset_aishell/dataset.txt"), count_limit=None):
+    """line sample: {"audio": {"path": "dataset/audio/data_aishell/wav/test/S0916/BAC009S0916W0158.wav"}, "sentence": "顾客体验的核心是真善美", "duration": 3.22, "sentences": [{"start": 0, "end": 3.22, "text": "顾客体验的核心是真善美"}]}"""
+    with open(file) as f:
+        lines = f.readlines()
+    count = 0
+    for line in lines:
+        if count_limit and count > count_limit:
+            break
+        count += 1
+        line = line.strip()
+        if not line:
+            continue
+        data = json.loads(line)
+
+        yield data["audio"]["path"], data["sentence"], data["duration"]
+
+def read_emilia(folder: Path = TEST_DATA / "ZH-B000000", count_limit=None):
+    """Read the Emilia dataset; yields (audio path, text, duration).
+    Sample JSON file:
+    {"id": "ZH_B00000_S00110_W000000", "wav": "ZH_B00000/ZH_B00000_S00110/mp3/ZH_B00000_S00110_W000000.mp3", "text": "\u628a\u63e1\u6700\u524d\u6cbf\u7684\u91d1\u878d\u9886\u57df\u548c\u533a\u5757\u94fe\u6700\u65b0\u8d44\u8baf\u3002\u6211\u4eec\u4e00\u8d77\u6765\u4e86\u89e3\u4e00\u4e0b\u4eca\u5929\u5e02\u573a\u4e0a\u6709\u53d1\u751f\u54ea\u4e9b\u91cd\u8981\u4e8b\u4ef6\u3002", "duration": 7.963, "speaker": "ZH_B00000_S00110", "language": "zh", "dnsmos": 3.3808}"""
+    count = 0
+    for json_file in sorted(folder.glob("*.json")):
+        count += 1
+        if count_limit and count > count_limit:
+            break
+        with open(json_file, encoding="utf-8") as f:
+            data = json.load(f)
+        text = data["text"]
+        duration = data["duration"]
+        wav_path = folder / f'{json_file.stem}.wav'
+        if not wav_path.exists():
+            mp3_path = folder / f'{json_file.stem}.mp3'
+            command = f"ffmpeg -i {mp3_path} -ac 1 -ar 16000 {wav_path}"
+            cmd(command)
+        yield wav_path, text, duration
+
+if __name__ == '__main__':
+    for res in read_dataset(count_limit=3):
+        print(res)
test_data/texts.py
ADDED
@@ -0,0 +1,18 @@
+from environment import TEST_DATA
+
+def read_translation(language, count_limit=None):
+    if language == "zh":
+        text_file = TEST_DATA / "texts" / "test_translation_zh.txt"
+    elif language == "en":
+        text_file = TEST_DATA / "texts" / "test_translation_en.txt"
+    else:
+        raise ValueError(f"unsupported language: {language}")
+    count = 0
+    with open(text_file, encoding="utf-8") as f:
+        for line in f:
+            if not line.strip():
+                continue
+            count += 1
+            if count_limit is not None and count > count_limit:
+                break
+            yield line.strip()
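A quick usage sketch of `read_translation` (assuming the renamed text files below are in place):

for line in read_translation("zh", count_limit=2):
    print(line)  # first two non-empty lines of test_translation_zh.txt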
test_data/{recordings/text → texts}/test_translation_en.txt
RENAMED
File without changes
test_data/{recordings/text → texts}/test_translation_zh.txt
RENAMED
File without changes
tests/test_models/__init__.py
ADDED
File without changes
tests/test_models/conftest.py
ADDED
@@ -0,0 +1,12 @@
+import platform
+from pytest import fixture
+
+@fixture(scope="session")
+def get_platform():
+    processor = platform.processor()
+    if processor.startswith("Intel"):
+        return "intel"
+    elif processor.startswith("arm"):
+        return "apple"
+    else:
+        raise ValueError(f"Unsupported platform: {processor}")
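`platform.processor()` is inconsistent across systems (it can be empty on Linux, and returns strings like 'i386' or 'arm' on macOS), so the string checks above are tuned to the author's machines. A more portable variant, offered as an assumption rather than a drop-in fix, could key off `platform.machine()`:

import platform

def detect_platform() -> str:
    # Normalize the machine architecture instead of the processor string.
    machine = platform.machine().lower()
    if machine in ("x86_64", "amd64", "i386", "i686"):
        return "intel"
    if machine in ("arm64", "aarch64") or machine.startswith("arm"):
        return "apple"
    raise ValueError(f"Unsupported platform: {machine}")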
tests/test_models/test_funasr.py
ADDED
@@ -0,0 +1,22 @@
+import pytest
+from lib.models.funasr import FunASR
+from lib.utils import read_audio, save_csv
+from test_data.audios import read_emilia
+from environment import REPORTS_DIR
+
+@pytest.fixture(scope="module")
+def asr(get_platform) -> FunASR:
+    if get_platform == "apple":
+        return FunASR()
+    elif get_platform == "intel":
+        pass
+
+def test_inference(asr: FunASR):
+    # TODO: evaluate CER
+    report = []
+    for audio_file, text, duration in read_emilia(count_limit=100):
+        print(audio_file)
+        audio = read_audio(audio_file)
+        asr_text, time_cost = asr.transcribe(audio)
+        report.append([audio_file, duration, text, asr_text, time_cost])
+    save_csv(REPORTS_DIR / "funasr.csv", ["audio", "duration", "ref", "asr", "time"], report)
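`save_csv` and `REPORTS_DIR` also live outside this commit (in `lib.utils` and `environment`). A minimal `save_csv` sketch matching its call sites here (the signature is an assumption):

import csv
from pathlib import Path

def save_csv(path: Path, header: list, rows: list) -> None:
    # Write one header row followed by the report rows.
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(rows)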
tests/test_models/test_llm.py
ADDED
@@ -0,0 +1,30 @@
+import pytest
+from lib.models.llm import QwenTranslator
+from test_data.texts import read_translation
+from lib.utils import save_csv
+from environment import REPORTS_DIR
+
+@pytest.fixture(scope="module")
+def llm(get_platform) -> QwenTranslator:
+    if get_platform == "apple":
+        return QwenTranslator()
+    elif get_platform == "intel":
+        pass
+
+def test_llm_zh(llm: QwenTranslator):
+    report = []
+    for src in read_translation("zh"):
+        dst, time_cost = llm.translate(src, src_lang="zh", dst_lang="en")
+        print("Prompt:", src)
+        print("Response:", dst)
+        report.append([src, dst, time_cost])
+    save_csv(REPORTS_DIR / "translation_zh.csv", ["src", "dst", "time"], report)
+
+def test_llm_en(llm: QwenTranslator):
+    report = []
+    for src in read_translation("en"):
+        dst, time_cost = llm.translate(src, src_lang="en", dst_lang="zh")
+        print("Prompt:", src)
+        print("Response:", dst)
+        report.append([src, dst, time_cost])
+    save_csv(REPORTS_DIR / "translation_en.csv", ["src", "dst", "time"], report)
tests/test_models/test_tts.py
ADDED
@@ -0,0 +1,31 @@
+import pytest
+from lib.models.kokoro import KokoroTTS
+from test_data.texts import read_translation
+from lib.utils import save_csv
+from environment import REPORTS_DIR
+
+
+@pytest.fixture(scope="module")
+def tts(get_platform) -> KokoroTTS:
+    if get_platform == "apple":
+        pass
+    elif get_platform == "intel":
+        pass
+
+
+def test_tts_zh():
+    tts = KokoroTTS.from_language("zh")
+    report = []
+    for text in read_translation("zh"):
+        samples, sr, time_cost = tts.generate(text)
+        report.append([text, time_cost])
+    save_csv(REPORTS_DIR / "tts_zh.csv", ["text", "time"], report)
+
+
+def test_tts_en():
+    tts = KokoroTTS.from_language("en")
+    report = []
+    for text in read_translation("en"):
+        samples, sr, time_cost = tts.generate(text)
+        report.append([text, time_cost])
+    save_csv(REPORTS_DIR / "tts_en.csv", ["text", "time"], report)
tests/test_models/test_whisper.py
ADDED
@@ -0,0 +1,22 @@
+import pytest
+from lib.models.whisper import WhisperCPP
+from lib.utils import read_audio, save_csv
+from test_data.audios import read_emilia
+from environment import REPORTS_DIR
+
+@pytest.fixture(scope="module")
+def whisper(get_platform) -> WhisperCPP:
+    if get_platform == "apple":
+        return WhisperCPP()
+    elif get_platform == "intel":
+        pass
+
+def test_inference(whisper: WhisperCPP):
+    # TODO: evaluate CER
+    report = []
+    for audio_file, text, duration in read_emilia(count_limit=100):
+        print(audio_file)
+        audio = read_audio(audio_file)
+        asr_text, time_cost = whisper.transcribe(audio, "zh")
+        report.append([audio_file, duration, text, asr_text, time_cost])
+    save_csv(REPORTS_DIR / "whisper.csv", ["audio", "duration", "ref", "asr", "time"], report)
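Both ASR tests carry a `# TODO: evaluate CER` note. A self-contained character-error-rate helper that could back that TODO (a sketch, not part of the commit):

def cer(ref: str, hyp: str) -> float:
    # Character error rate: Levenshtein distance over reference length (one-row DP).
    m, n = len(ref), len(hyp)
    dp = list(range(n + 1))
    for i in range(1, m + 1):
        prev, dp[0] = dp[0], i
        for j in range(1, n + 1):
            cur = dp[j]
            dp[j] = min(dp[j] + 1,                              # deletion
                        dp[j - 1] + 1,                          # insertion
                        prev + (ref[i - 1] != hyp[j - 1]))      # substitution
            prev = cur
    return dp[n] / max(m, 1)

print(cer("顾客体验的核心是真善美", "顾客体验的核心是真善美"))  # 0.0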