yujuanqin committed on
Commit
b295d06
·
1 Parent(s): fe108d8

support test_models on Intel

Browse files
environment.py CHANGED
@@ -3,7 +3,7 @@ from enum import Enum
3
 
4
 
5
 
6
- PROJECT_DIR = Path("/Users/jeqin/work/code/TestTranslator")
7
  APP_PATH = Path("/Applications/YoYo Translator.app/Contents/MacOS/YoYo Translator")
8
  APP_LOG = Path('/tmp/translator.log')
9
 
 
3
 
4
 
5
 
6
+ PROJECT_DIR = Path(__file__).parent
7
  APP_PATH = Path("/Applications/YoYo Translator.app/Contents/MacOS/YoYo Translator")
8
  APP_LOG = Path('/tmp/translator.log')
9
 
lib/models/intel/__init__.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import sys
from pathlib import Path

# Directory holding the s2ts native binary and the DLLs it links against.
# NOTE(review): machine-specific hard-coded path — consider an env-var override.
BINARY_DIR = Path(r"D:\yujuan\yoyo-translator-win\resources\bin")


def load_s2ts_lib():
    """Make the s2ts native library importable and loadable.

    Adds BINARY_DIR to ``sys.path`` (so the Python wrapper module can be
    imported) and to the ``PATH`` environment variable (so Windows can
    resolve its DLL dependencies), then sets runtime workarounds.
    """
    binary_dir = str(BINARY_DIR)
    if binary_dir not in sys.path:
        sys.path.append(binary_dir)
    # os.environ.get("PATH") can be None in a stripped environment; default
    # to "" so the concatenation below cannot raise TypeError.
    current_path = os.environ.get("PATH", "")
    # Avoid growing PATH with duplicates when called more than once.
    if binary_dir not in current_path.split(";"):
        os.environ["PATH"] = current_path + f";{binary_dir}"
    # Cap the instruction set oneDNN may use on the target Intel CPUs.
    os.environ["ONEDNN_MAX_CPU_ISA"] = "AVX2_VNNI"
    # Tolerate duplicate OpenMP runtimes (common when mixing native libs).
    os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"


load_s2ts_lib()
lib/models/intel/funasr.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ import numpy as np
3
+ from lib.utils import Timer
4
+ from s2ts import S2TS
5
+ from s2ts import TaskExecInfo as CTaskExecInfo
6
+
7
+ MODEL_DIR = Path(r"D:\yujuan\yoyo-translator-win\models\funasr")
8
+
9
class FunAsrOv:
    """Chinese ASR wrapper around the native s2ts FunASR backend."""

    def __init__(self, model_dir=MODEL_DIR):
        # Model loading is timed so startup cost shows up in the logs.
        with Timer("load FunASR"):
            self.instance = S2TS()
            ok = self.instance.start_fun_asr(
                f'{model_dir}/model_files',
                f'{model_dir}/punc',
                f"{model_dir}/hotword.bin",
            )
            print(f"model load {'success' if ok else 'failed'}")
        self._warm_up()

    def _warm_up(self):
        # Run one throwaway inference on random audio (16000 float32
        # samples) so the first real call does not pay first-use cost.
        dummy = np.random.randn(16000).astype(np.float32)
        self.transcribe(dummy)

    def transcribe(self, audio: np.ndarray):
        """Transcribe mono float32 audio; returns ``(text, seconds_elapsed)``."""
        request = CTaskExecInfo()
        request.audio_data = audio.tolist()
        request.audio_language = "zh"
        with Timer("FunASR inference") as t:
            self.instance.put_asr(request)
            result: CTaskExecInfo = self.instance.get_asr(0)
        text = "".join(word.text for word in result.words)
        return text, t.duration
28
+
29
+
lib/models/intel/kokoro.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from kokoro_onnx import Kokoro
3
+ from misaki import espeak, ja, en, zh
4
+ from misaki.espeak import EspeakG2P
5
+ import re
6
+ from functools import lru_cache
7
+ from loguru import logger
8
+ import onnxruntime
9
+ import os
10
+ from lib.utils import Timer, write_audio
11
+
12
+ providers = onnxruntime.get_available_providers()
13
+ print(f"Available onnx runtime providers: {providers}")
14
+ MODEL_DIR = Path(r"D:\yujuan\yoyo-translator-win\models\kokoro")
15
+
16
def create_session(model_path):
    """Build an ONNX Runtime inference session for *model_path*.

    Registers every provider onnxruntime reports as available and pins
    the intra-op thread pool to half the logical CPU count.
    """
    # See list of providers https://github.com/microsoft/onnxruntime/issues/22101#issuecomment-2357667377
    providers = onnxruntime.get_available_providers()
    print(f"Available onnx runtime providers: {providers}")

    # See session options https://onnxruntime.ai/docs/performance/tune-performance/threading.html#thread-management
    sess_options = onnxruntime.SessionOptions()
    # os.cpu_count() may return None, and 1 // 2 would be 0; clamp to >= 1
    # so we always request at least one intra-op thread explicitly.
    cpu_count = max(1, (os.cpu_count() or 2) // 2)
    print(f"Setting threads to CPU cores count: {cpu_count}")
    sess_options.intra_op_num_threads = cpu_count
    session = onnxruntime.InferenceSession(
        model_path, providers=providers, sess_options=sess_options
    )
    return session
30
+
31
+
32
class KokoroTTS:
    """Text-to-speech built on kokoro-onnx with per-language G2P front ends."""

    # Maps upper-cased language codes to the kokoro voice used for them.
    language_voice_mapping = {
        "JP": "jf_alpha",
        "JA": "jf_alpha",
        "ZH": "zf_xiaoyi",
        "EN": "af_heart",
        "FR": "ff_siwis",
        "IT": "im_nicola",
        "HI": "hf_alpha",
        "PT": "im_nicola",
        "ES": "im_nicola"
    }

    def __init__(self, model_path: str, voice_model_path: str, vocab_config=None, gcp=None, voice=None):
        self._session = create_session(model_path)
        self.model = Kokoro.from_session(self._session, voice_model_path, vocab_config=vocab_config)
        self.g2p = gcp
        self.voice = voice

    @classmethod
    def from_language(cls, language: str, model_dir: Path = MODEL_DIR):
        """Build a TTS instance for *language*, warming it up where applicable."""
        model_path: str = str(model_dir / "kokoro-quant.onnx")
        voice_model_path: str = str(model_dir / "voices-v1.0.bin")
        lang = language.upper()
        voice = cls.language_voice_mapping.get(lang)
        logger.info(f"[TTS] language: {language}")
        if not voice:
            raise ValueError(f"Unsupported language: {language}, voice: {voice}")
        # Languages served by a plain espeak G2P:
        # upper-case code -> (espeak language code, warm-up text).
        espeak_langs = {
            "HI": ("hi", "हेलो"),
            "IT": ("it", "Ciao"),
            "PT": ("pt-br", "Olá"),
            "ES": ("es", "Hola"),
            "FR": ("fr-fr", "Bonjour"),
        }
        if lang == "ZH":
            tts = cls(model_path, voice_model_path, vocab_config=model_dir / "zh_config.json",
                      gcp=zh.ZHG2P(), voice=voice)
            tts.generate("你好")
        elif lang in ("JP", "JA"):
            # Japanese gets no warm-up call (matches original behavior).
            tts = cls(model_path, voice_model_path, vocab_config=model_dir / "ja_config.json",
                      gcp=ja.JAG2P(), voice=voice)
        elif lang == "EN":
            fallback = espeak.EspeakFallback(british=False)
            tts = cls(model_path, voice_model_path,
                      gcp=en.G2P(trf=False, british=False, fallback=fallback), voice=voice)
            tts.generate("hello")
        elif lang in espeak_langs:
            espeak_code, warmup_text = espeak_langs[lang]
            tts = cls(model_path, voice_model_path, gcp=EspeakG2P(language=espeak_code), voice=voice)
            tts.generate(warmup_text)
        else:
            # Fallback for any mapped language not special-cased above.
            tts = cls(model_path, voice_model_path, gcp=EspeakG2P(language.lower()), voice=voice)
        return tts

    def generate(self, text, speed=1.2):
        """Synthesize *text*; returns ``(samples, sample_rate, seconds_elapsed)``."""
        with Timer("tts inference") as t:
            phonemes, _ = self.g2p(text)
            samples, sample_rate = self.model.create(phonemes, self.voice, is_phonemes=True, speed=speed)

        return samples, sample_rate, t.duration

    async def stream(self, text, speed=1.2):
        """Asynchronously yield ``(samples, sample_rate)`` chunks for *text*."""
        phonemes, _ = self.g2p(text)
        chunk_stream = self.model.create_stream(phonemes, self.voice, is_phonemes=True, speed=speed)
        async for samples, sample_rate in chunk_stream:
            yield samples, sample_rate
107
+
108
+
109
@lru_cache
def get_model(language):
    """Return a cached KokoroTTS for *language* (one instance per language).

    Bug fix: the previous call passed ``model_dir_path=`` (the parameter is
    named ``model_dir``) and used ``resource_path``, which is not defined in
    this module — either alone raised at call time. Use the module-level
    MODEL_DIR constant instead.
    """
    return KokoroTTS.from_language(language=language, model_dir=MODEL_DIR)
lib/models/intel/llm.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ import numpy as np
3
+ from lib.utils import Timer
4
+ from s2ts import S2TS
5
+ from s2ts import TaskExecInfo as CTaskExecInfo
6
+
7
+ MODEL_DIR = Path(r"D:\yujuan\yoyo-translator-win\models\llm\Qwen3-1.7B-int8-ov")
8
+
9
class QwenOv:
    """Translation wrapper around the native s2ts Qwen GenAI backend."""

    def __init__(self, model_dir=MODEL_DIR):
        # Model loading is timed so startup cost shows up in the logs.
        with Timer("load LLM"):
            self.instance = S2TS()
            ok = self.instance.start_translate_genai(str(model_dir))
            print(f"model load {'success' if ok else 'failed'}")
        self._warm_up()

    def _warm_up(self):
        # One throwaway translation so the first real call is not slow.
        self.translate("How are you?", "en", "zh")

    def translate(self, prompt, src_lang, dst_lang):
        """Translate *prompt* from *src_lang* to *dst_lang*.

        Returns ``(translated_text, seconds_elapsed)``.
        """
        request = CTaskExecInfo()
        request.transcribe_content = prompt
        request.audio_language = src_lang
        request.translate_language = dst_lang

        with Timer("LLM inference") as t:
            self.instance.put_llm(request)
            result: CTaskExecInfo = self.instance.get_llm(0)
        return result.translate_content, t.duration
29
+
30
+
lib/models/intel/whisper.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ import numpy as np
3
+ from lib.utils import Timer
4
+ from s2ts import S2TS
5
+ from s2ts import TaskExecInfo as CTaskExecInfo
6
+
7
+ MODEL_DIR = Path(r"D:\yujuan\yoyo-translator-win\models\whisper-large-v3-turbo-int8")
8
+
9
class WhisperOv:
    """Whisper ASR wrapper around the native s2ts GenAI backend."""

    def __init__(self, model_dir=MODEL_DIR):
        # Model loading is timed so startup cost shows up in the logs.
        with Timer("load Whisper"):
            self.instance = S2TS()
            ok = self.instance.start_asr_genai("en", str(model_dir), False, "")
            print(f"model load {'success' if ok else 'failed'}")
        self._warm_up()

    def _warm_up(self):
        # Run one throwaway inference on random audio (16000 float32
        # samples) so the first real call does not pay first-use cost.
        dummy = np.random.randn(16000).astype(np.float32)
        self.transcribe(dummy, "en")

    def transcribe(self, audio: np.ndarray, language):
        """Transcribe mono float32 audio in *language*.

        Returns ``(text, seconds_elapsed)``.
        """
        request = CTaskExecInfo()
        request.audio_data = audio.tolist()
        request.audio_language = language

        with Timer("Whisper inference") as t:
            self.instance.put_asr(request)
            result: CTaskExecInfo = self.instance.get_asr(0)
        return "".join(word.text for word in result.words), t.duration
29
+
30
+
lib/models/mac/__init__.py ADDED
File without changes
lib/models/{funasr.py → mac/funasr.py} RENAMED
@@ -8,7 +8,7 @@ from lib.utils import Timer, read_audio
8
 
9
  MODEL_DIR = Path("/Users/jeqin/work/code/Translator/python_server/moyoyo_asr_models")
10
 
11
- class FunASR:
12
  def __init__(self, model_dir=MODEL_DIR, quantize=True):
13
  asr_model_path = model_dir / 'speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch'
14
  # vad_model_path = model_dir / 'speech_fsmn_vad_zh-cn-16k-common-pytorch'
@@ -34,7 +34,7 @@ class FunASR:
34
  return text, t.duration
35
 
36
  if __name__ == '__main__':
37
- funasr = FunASR()
38
  audio = read_audio(Path("/Users/jeqin/work/code/TestTranslator/test_data/recordings/1.wav"))
39
  text, time_cost =funasr.transcribe(audio)
40
  print(text)
 
8
 
9
  MODEL_DIR = Path("/Users/jeqin/work/code/Translator/python_server/moyoyo_asr_models")
10
 
11
+ class FunAsrOnnx:
12
  def __init__(self, model_dir=MODEL_DIR, quantize=True):
13
  asr_model_path = model_dir / 'speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch'
14
  # vad_model_path = model_dir / 'speech_fsmn_vad_zh-cn-16k-common-pytorch'
 
34
  return text, t.duration
35
 
36
  if __name__ == '__main__':
37
+ funasr = FunAsrOnnx()
38
  audio = read_audio(Path("/Users/jeqin/work/code/TestTranslator/test_data/recordings/1.wav"))
39
  text, time_cost =funasr.transcribe(audio)
40
  print(text)
lib/models/{kokoro.py → mac/kokoro.py} RENAMED
@@ -3,9 +3,7 @@ from pathlib import Path
3
  from kokoro_onnx import Kokoro
4
  from misaki import espeak, en, zh
5
  from misaki.espeak import EspeakG2P
6
- from functools import lru_cache
7
  from logging import getLogger
8
- import librosa
9
  import onnxruntime
10
 
11
  from lib.utils import Timer, write_audio
 
3
  from kokoro_onnx import Kokoro
4
  from misaki import espeak, en, zh
5
  from misaki.espeak import EspeakG2P
 
6
  from logging import getLogger
 
7
  import onnxruntime
8
 
9
  from lib.utils import Timer, write_audio
lib/models/{llm.py → mac/llm.py} RENAMED
File without changes
lib/models/{whisper.py → mac/whisper.py} RENAMED
File without changes
tests/test_models/test_funasr.py CHANGED
@@ -1,17 +1,19 @@
1
  import pytest
2
- from lib.models.funasr import FunASR
3
  from lib.utils import read_audio, save_csv
4
  from test_data.audios import read_emilia
5
  from environment import REPORTS_DIR
6
 
7
  @pytest.fixture(scope="module")
8
- def asr(get_platform)-> FunASR:
9
  if get_platform == "apple":
10
- return FunASR()
 
11
  elif get_platform == "intel":
12
- pass
 
13
 
14
- def test_inference(asr: FunASR):
 
15
  #TODO: 测试CER
16
  report = []
17
  for audio_file, text, duration in read_emilia(count_limit=100):
 
1
  import pytest
 
2
  from lib.utils import read_audio, save_csv
3
  from test_data.audios import read_emilia
4
  from environment import REPORTS_DIR
5
 
6
  @pytest.fixture(scope="module")
7
+ def asr(get_platform):
8
  if get_platform == "apple":
9
+ from lib.models.mac.funasr import FunAsrOnnx
10
+ return FunAsrOnnx()
11
  elif get_platform == "intel":
12
+ from lib.models.intel.funasr import FunAsrOv
13
+ return FunAsrOv()
14
 
15
+
16
+ def test_inference(asr):
17
  #TODO: 测试CER
18
  report = []
19
  for audio_file, text, duration in read_emilia(count_limit=100):
tests/test_models/test_llm.py CHANGED
@@ -1,17 +1,18 @@
1
  import pytest
2
- from lib.models.llm import QwenTranslator
3
  from test_data.texts import read_translation
4
  from lib.utils import save_csv
5
  from environment import REPORTS_DIR
6
 
7
  @pytest.fixture(scope="module")
8
- def llm(get_platform)-> QwenTranslator:
9
  if get_platform == "apple":
 
10
  return QwenTranslator()
11
  elif get_platform == "intel":
12
- pass
 
13
 
14
- def test_llm_zh(llm: QwenTranslator):
15
  report = []
16
  for src in read_translation("zh"):
17
  dst, time_cost = llm.translate(src, src_lang="zh", dst_lang="en")
@@ -20,7 +21,7 @@ def test_llm_zh(llm: QwenTranslator):
20
  report.append([src, dst, time_cost])
21
  save_csv(REPORTS_DIR/"translation_zh.csv", ["src", "dst", "time"], report)
22
 
23
- def test_llm_en(llm: QwenTranslator):
24
  report = []
25
  for src in read_translation("en"):
26
  dst, time_cost = llm.translate(src, src_lang="en", dst_lang="zh")
 
1
  import pytest
 
2
  from test_data.texts import read_translation
3
  from lib.utils import save_csv
4
  from environment import REPORTS_DIR
5
 
6
  @pytest.fixture(scope="module")
7
+ def llm(get_platform):
8
  if get_platform == "apple":
9
+ from lib.models.mac.llm import QwenTranslator
10
  return QwenTranslator()
11
  elif get_platform == "intel":
12
+ from lib.models.intel.llm import QwenOv
13
+ return QwenOv()
14
 
15
+ def test_llm_zh(llm):
16
  report = []
17
  for src in read_translation("zh"):
18
  dst, time_cost = llm.translate(src, src_lang="zh", dst_lang="en")
 
21
  report.append([src, dst, time_cost])
22
  save_csv(REPORTS_DIR/"translation_zh.csv", ["src", "dst", "time"], report)
23
 
24
+ def test_llm_en(llm):
25
  report = []
26
  for src in read_translation("en"):
27
  dst, time_cost = llm.translate(src, src_lang="en", dst_lang="zh")
tests/test_models/test_tts.py CHANGED
@@ -1,20 +1,21 @@
1
  import pytest
2
- from lib.models.kokoro import KokoroTTS
3
  from test_data.texts import read_translation
4
  from lib.utils import save_csv
5
  from environment import REPORTS_DIR
6
 
7
 
8
  @pytest.fixture(scope="module")
9
- def llm(get_platform) -> KokoroTTS:
10
  if get_platform == "apple":
11
- pass
 
12
  elif get_platform == "intel":
13
- pass
 
14
 
15
 
16
- def test_tts_zh():
17
- tts = KokoroTTS.from_language("zh")
18
  report = []
19
  for text in read_translation("zh"):
20
  samples, sr, time_cost = tts.generate(text)
@@ -22,10 +23,10 @@ def test_tts_zh():
22
  save_csv(REPORTS_DIR / "tts_zh.csv", ["text", "time"], report)
23
 
24
 
25
- def test_tts_en():
26
- tts = KokoroTTS.from_language("en")
27
  report = []
28
  for text in read_translation("en"):
29
- samples, sr, time_cost = tts.generate(text)
30
  report.append([text, time_cost])
31
  save_csv(REPORTS_DIR / "tts_en.csv", ["text", "time"], report)
 
1
  import pytest
 
2
  from test_data.texts import read_translation
3
  from lib.utils import save_csv
4
  from environment import REPORTS_DIR
5
 
6
 
7
  @pytest.fixture(scope="module")
8
+ def tts(get_platform):
9
  if get_platform == "apple":
10
+ from lib.models.mac.kokoro import KokoroTTS
11
+ return KokoroTTS
12
  elif get_platform == "intel":
13
+ from lib.models.intel.kokoro import KokoroTTS
14
+ return KokoroTTS
15
 
16
 
17
+ def test_tts_zh(tts):
18
+ tts = tts.from_language("zh")
19
  report = []
20
  for text in read_translation("zh"):
21
  samples, sr, time_cost = tts.generate(text)
 
23
  save_csv(REPORTS_DIR / "tts_zh.csv", ["text", "time"], report)
24
 
25
 
26
+ def test_tts_en(tts):
27
+ tts = tts.from_language("en")
28
  report = []
29
  for text in read_translation("en"):
30
+ samples, sr, time_cost = tts.generate(text, speed=1.4)
31
  report.append([text, time_cost])
32
  save_csv(REPORTS_DIR / "tts_en.csv", ["text", "time"], report)
tests/test_models/test_whisper.py CHANGED
@@ -1,17 +1,18 @@
1
  import pytest
2
- from lib.models.whisper import WhisperCPP
3
  from lib.utils import read_audio, save_csv
4
  from test_data.audios import read_emilia
5
  from environment import REPORTS_DIR
6
 
7
  @pytest.fixture(scope="module")
8
- def whisper(get_platform)-> WhisperCPP:
9
  if get_platform == "apple":
 
10
  return WhisperCPP()
11
  elif get_platform == "intel":
12
- pass
 
13
 
14
- def test_inference(whisper: WhisperCPP):
15
  #TODO: 测试CER
16
  report = []
17
  for audio_file, text, duration in read_emilia(count_limit=100):
 
1
  import pytest
 
2
  from lib.utils import read_audio, save_csv
3
  from test_data.audios import read_emilia
4
  from environment import REPORTS_DIR
5
 
6
  @pytest.fixture(scope="module")
7
+ def whisper(get_platform):
8
  if get_platform == "apple":
9
+ from lib.models.mac.whisper import WhisperCPP
10
  return WhisperCPP()
11
  elif get_platform == "intel":
12
+ from lib.models.intel.whisper import WhisperOv
13
+ return WhisperOv()
14
 
15
+ def test_inference(whisper):
16
  #TODO: 测试CER
17
  report = []
18
  for audio_file, text, duration in read_emilia(count_limit=100):