qqc1989 committed on Mar 16, 2025

Commit

566fca0

verified ·

1 Parent(s): fedb44c

Upload 30 files

Browse files

Files changed (31) hide show

.gitattributes +1 -0
cpp/TSCharacters.ocd2 +0 -0
cpp/TSPhrases.ocd2 +0 -0
cpp/t2s.json +22 -0
cpp/whisper +3 -0
models-ax630c/base-decoder-loop.axmodel +3 -0
models-ax630c/base-decoder-main.axmodel +3 -0
models-ax630c/base-encoder.axmodel +3 -0
models-ax630c/base-positional_embedding.bin +3 -0
models-ax630c/base-tokens.txt +0 -0
models-ax650/small-decoder-loop.axmodel +3 -0
models-ax650/small-decoder-main.axmodel +3 -0
models-ax650/small-encoder.axmodel +3 -0
models-ax650/small-positional_embedding.bin +3 -0
models-ax650/small-tokens.txt +0 -0
models-onnx/base-decoder-loop.onnx +3 -0
models-onnx/base-decoder-main.onnx +3 -0
models-onnx/base-encoder.onnx +3 -0
models-onnx/base-positional_embedding.bin +3 -0
models-onnx/base-tokens.txt +0 -0
models-onnx/small-positional_embedding.bin +3 -0
models-onnx/small-tokens.txt +0 -0
models-onnx/tiny-decoder-loop.onnx +3 -0
models-onnx/tiny-decoder-main.onnx +3 -0
models-onnx/tiny-encoder.onnx +3 -0
models-onnx/tiny-positional_embedding.bin +3 -0
models-onnx/tiny-tokens.txt +0 -0
python/languages.py +102 -0
python/requirements.txt +4 -0
python/whisper.py +240 -0
python/whisper_onnx.py +239 -0

.gitattributes CHANGED Viewed

@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 *.axmodel filter=lfs diff=lfs merge=lfs -text

 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 *.axmodel filter=lfs diff=lfs merge=lfs -text
+cpp/whisper filter=lfs diff=lfs merge=lfs -text

cpp/TSCharacters.ocd2 ADDED Viewed

Binary file (46.1 kB). View file

cpp/TSPhrases.ocd2 ADDED Viewed

Binary file (9.78 kB). View file

cpp/t2s.json ADDED Viewed

	@@ -0,0 +1,22 @@

+{
+  "name": "Traditional Chinese to Simplified Chinese",
+  "segmentation": {
+    "type": "mmseg",
+    "dict": {
+      "type": "ocd2",
+      "file": "TSPhrases.ocd2"
+    }
+  },
+  "conversion_chain": [{
+    "dict": {
+      "type": "group",
+      "dicts": [{
+        "type": "ocd2",
+        "file": "TSPhrases.ocd2"
+      }, {
+        "type": "ocd2",
+        "file": "TSCharacters.ocd2"
+      }]
+    }
+  }]
+}

cpp/whisper ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b2cbd2f10e309e8bdc3c63989b0637311fe3eb2a39d03c76c17bc66ac86405bc
+size 489848

models-ax630c/base-decoder-loop.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4b12160aaa1ca31248a32ce05713fd72e273b16444389853c1f52990cf5130eb
+size 130364397

models-ax630c/base-decoder-main.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:940f273d111e3aee53cdb692a384a29556981aa146afbb2f558f6aac262c0621
+size 135675471

models-ax630c/base-encoder.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b9f89ed5bbe31bcf98aa0e479ced1699b39816db2d3e2e2ff84c6e887af2b79b
+size 56024079

models-ax630c/base-positional_embedding.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:88fa1cdbf2b06f86b0ecb7be0fccfc39e906502986572b8cf5319c250e857169
+size 917504

models-ax630c/base-tokens.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

models-ax650/small-decoder-loop.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b472a0f3539d17fece09e92bf6cd69ebf391928a6050896bbf86b558a25def22
+size 269002567

models-ax650/small-decoder-main.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f3bfc577f60c35192d8ce8cc24f9ca4aa84af72756ba11af9d178d337cb7eb1c
+size 285531695

models-ax650/small-encoder.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7b3bc8db9762f9b2dfe78bffbc8070fb877b2572c5288253573e49a8c7b37948
+size 139705612

models-ax650/small-positional_embedding.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c10bc44f2bd94bdf1b7aa03581309fa536132b3fe79bfe22c9a6934a42cd8b58
+size 1376256

models-ax650/small-tokens.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

models-onnx/base-decoder-loop.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1616a829b7d3d643616633551204b8d0f008fb7a7dc38919eda2e8c6c6ed9714
+size 194571088

models-onnx/base-decoder-main.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1096b83590016bdbe74c66c7ccad1c0120abd6d37214560b1dfe4cd886a0e683
+size 205485892

models-onnx/base-encoder.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dd4b51bd569e9b2b2d83a8ed56f3618811f0c593aa95c010069df675027b5f2b
+size 95026988

models-onnx/base-positional_embedding.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:88fa1cdbf2b06f86b0ecb7be0fccfc39e906502986572b8cf5319c250e857169
+size 917504

models-onnx/base-tokens.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

models-onnx/small-positional_embedding.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c10bc44f2bd94bdf1b7aa03581309fa536132b3fe79bfe22c9a6934a42cd8b58
+size 1376256

models-onnx/small-tokens.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

models-onnx/tiny-decoder-loop.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5cbb3533939e2dfdf567b27762b12cf0956b7d7982bfb915228d24789f483058
+size 112843354

models-onnx/tiny-decoder-main.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:59ced1cf4e9a6f2aef0a2457f64f846e5682033abb4b894ba7680a60c792ad73
+size 118301861

models-onnx/tiny-encoder.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a8030a6d1f3615b8a5e000995fee88357768c7dbaad05a79f853a4040c97087b
+size 37606186

models-onnx/tiny-positional_embedding.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c13450ae630323a0bdd39b1226f92a7ac251131a909c7efdb7d2f5516736eb83
+size 688128

models-onnx/tiny-tokens.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

python/languages.py ADDED Viewed

	@@ -0,0 +1,102 @@

+# Mapping from language code to language name.
+# NOTE: key order is significant. whisper.py and whisper_onnx.py derive a
+# language's token id as WHISPER_SOT + 1 + (position of the code in this dict),
+# so entries must not be reordered or inserted in the middle.
+WHISPER_LANGUAGES = {
+    "en": "english",
+    "zh": "chinese",
+    "de": "german",
+    "es": "spanish",
+    "ru": "russian",
+    "ko": "korean",
+    "fr": "french",
+    "ja": "japanese",
+    "pt": "portuguese",
+    "tr": "turkish",
+    "pl": "polish",
+    "ca": "catalan",
+    "nl": "dutch",
+    "ar": "arabic",
+    "sv": "swedish",
+    "it": "italian",
+    "id": "indonesian",
+    "hi": "hindi",
+    "fi": "finnish",
+    "vi": "vietnamese",
+    "he": "hebrew",
+    "uk": "ukrainian",
+    "el": "greek",
+    "ms": "malay",
+    "cs": "czech",
+    "ro": "romanian",
+    "da": "danish",
+    "hu": "hungarian",
+    "ta": "tamil",
+    "no": "norwegian",
+    "th": "thai",
+    "ur": "urdu",
+    "hr": "croatian",
+    "bg": "bulgarian",
+    "lt": "lithuanian",
+    "la": "latin",
+    "mi": "maori",
+    "ml": "malayalam",
+    "cy": "welsh",
+    "sk": "slovak",
+    "te": "telugu",
+    "fa": "persian",
+    "lv": "latvian",
+    "bn": "bengali",
+    "sr": "serbian",
+    "az": "azerbaijani",
+    "sl": "slovenian",
+    "kn": "kannada",
+    "et": "estonian",
+    "mk": "macedonian",
+    "br": "breton",
+    "eu": "basque",
+    "is": "icelandic",
+    "hy": "armenian",
+    "ne": "nepali",
+    "mn": "mongolian",
+    "bs": "bosnian",
+    "kk": "kazakh",
+    "sq": "albanian",
+    "sw": "swahili",
+    "gl": "galician",
+    "mr": "marathi",
+    "pa": "punjabi",
+    "si": "sinhala",
+    "km": "khmer",
+    "sn": "shona",
+    "yo": "yoruba",
+    "so": "somali",
+    "af": "afrikaans",
+    "oc": "occitan",
+    "ka": "georgian",
+    "be": "belarusian",
+    "tg": "tajik",
+    "sd": "sindhi",
+    "gu": "gujarati",
+    "am": "amharic",
+    "yi": "yiddish",
+    "lo": "lao",
+    "uz": "uzbek",
+    "fo": "faroese",
+    "ht": "haitian creole",
+    "ps": "pashto",
+    "tk": "turkmen",
+    "nn": "nynorsk",
+    "mt": "maltese",
+    "sa": "sanskrit",
+    "lb": "luxembourgish",
+    "my": "myanmar",
+    "bo": "tibetan",
+    "tl": "tagalog",
+    "mg": "malagasy",
+    "as": "assamese",
+    "tt": "tatar",
+    "haw": "hawaiian",
+    "ln": "lingala",
+    "ha": "hausa",
+    "ba": "bashkir",
+    "jw": "javanese",
+    "su": "sundanese",
+    "yue": "cantonese",
+}

python/requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+numpy==1.26.4
+soundfile
+librosa
+zhconv

python/whisper.py ADDED Viewed

	@@ -0,0 +1,240 @@

+import argparse
+import axengine as axe
+import numpy as np
+import librosa
+import os
+from typing import Tuple
+import soundfile as sf
+import base64
+import zhconv
+import time
+from languages import WHISPER_LANGUAGES
+# Audio front-end parameters handed to librosa by load_audio() and compute_feature().
+WHISPER_N_MELS      = 80
+WHISPER_SAMPLE_RATE = 16000
+WHISPER_N_FFT       = 480
+WHISPER_HOP_LENGTH  = 160
+# Token ids referenced by SOT_SEQUENCE, supress_tokens() and the decode loop in main().
+# NOTE(review): presumably ids from the multilingual Whisper vocabulary - confirm against *-tokens.txt.
+WHISPER_SOT           = 50258
+WHISPER_EOT           = 50257
+WHISPER_BLANK         = 220
+WHISPER_NO_TIMESTAMPS = 50363
+WHISPER_NO_SPEECH     = 50362
+WHISPER_TRANSLATE     = 50358
+WHISPER_TRANSCRIBE    = 50359
+WHISPER_VOCAB_SIZE    = 51865
+# Bounds the number of decode iterations and sets the length of the mask built in main().
+WHISPER_N_TEXT_CTX    = 448
+NEG_INF = float("-inf")
+# Prompt fed to decoder_main: [SOT, language, transcribe, no-timestamps].
+# The language slot is WHISPER_SOT + 1 + position of the code in WHISPER_LANGUAGES;
+# it starts as "zh" and is overwritten in place by choose_language().
+SOT_SEQUENCE = np.array([WHISPER_SOT,WHISPER_SOT + 1 + tuple(WHISPER_LANGUAGES).index("zh"),WHISPER_TRANSCRIBE,WHISPER_NO_TIMESTAMPS], dtype=np.int32)
+# Per-model width used by main() to slice one position's values out of the flat
+# positional-embedding array.
+WHISPER_N_TEXT_STATE_MAP = {
+    "tiny": 384,
+    "base": 512,
+    "small": 768
+}
+def get_args():
+    parser = argparse.ArgumentParser(
+        prog="whisper",
+        description="Run Whisper on input audio file"
+    )
+    parser.add_argument("--wav", "-w", type=str, required=True, help="Input audio file")
+    parser.add_argument("--model_type", "-t", type=str, choices=["tiny", "base", "small"], required=True, help="model type, only support tiny, base and small currently")
+    parser.add_argument("--model_path", "-p", type=str, required=False, default="../models", help="model path for *.axmodel, tokens.txt, positional_embedding.bin")
+    parser.add_argument("--language", "-l", type=str, required=False, default="zh", help="Target language, support en, zh, ja, and others. See languages.py for more options.")
+    return parser.parse_args()
+def print_args(args):
+    print(f"wav: {args.wav}")
+    print(f"model_type: {args.model_type}")
+    print(f"model_path: {args.model_path}")
+    print(f"language: {args.language}")
+def load_audio(filename: str) -> Tuple[np.ndarray, int]:
+    data, sample_rate = sf.read(
+        filename,
+        always_2d=True,
+        dtype="float32",
+    )
+    data = data[:, 0]  # use only the first channel
+    data = librosa.resample(data, orig_sr=sample_rate, target_sr=WHISPER_SAMPLE_RATE)
+    samples = np.ascontiguousarray(data)
+    return samples, sample_rate
+def load_models(model_path, model_type):
+    encoder_path = f"{model_type}-encoder.axmodel"
+    decoder_main_path = f"{model_type}-decoder-main.axmodel"
+    decoder_loop_path = f"{model_type}-decoder-loop.axmodel"
+    pe_path = f"{model_type}-positional_embedding.bin"
+    token_path = f"{model_type}-tokens.txt"
+    required_files = [os.path.join(model_path, i) for i in (encoder_path, decoder_main_path, decoder_loop_path, pe_path, token_path)]
+    # Check file existence
+    for i, file_path in enumerate(required_files):
+        assert os.path.exists(file_path), f"{file_path} NOT exist"
+    # Load encoder
+    encoder = axe.InferenceSession(required_files[0])
+    # Load decoder main
+    decoder_main = axe.InferenceSession(required_files[1])
+    # Load decoder loop
+    decoder_loop = axe.InferenceSession(required_files[2])
+    # Load position embedding
+    pe = np.fromfile(required_files[3], dtype=np.float32)
+    # Load tokens
+    tokens = []
+    with open(required_files[4], "r") as f:
+        for line in f:
+            line = line.strip()
+            tokens.append(line.split(" ")[0])
+    return encoder, decoder_main, decoder_loop, pe, tokens
+def compute_feature(wav_path, n_mels = WHISPER_N_MELS, padding = 480000):
+    audio, sr = load_audio(wav_path)
+    audio = np.concatenate((audio, np.zeros((padding,), dtype=np.float32)), axis=-1)
+    mel = librosa.feature.melspectrogram(y=audio, sr=sr, n_fft=WHISPER_N_FFT, hop_length=WHISPER_HOP_LENGTH, window="hann", center=True, pad_mode="reflect", power=2.0, n_mels=n_mels)
+    log_spec = np.log10(np.maximum(mel, 1e-10))
+    log_spec = np.maximum(log_spec, log_spec.max() - 8.0)
+    mel = (log_spec + 4.0) / 4.0
+    # We pad 1500 frames at the end so that it is able to detect eot
+    # You can use another value instead of 1500.
+    # mel = np.concatenate((mel, np.zeros((n_mels, 1500), dtype=np.float32)), axis=-1)
+    target = 3000
+    if mel.shape[1] > target:
+        # -50 so that there are some zero tail paddings.
+        mel = mel[:, : target]
+        mel[:, -50:] = 0
+    # We don't need to pad it to 30 seconds now!
+    if mel.shape[1] < target:
+        mel = np.concatenate((mel, np.zeros((n_mels, target - mel.shape[1]), dtype=np.float32)), axis=-1)
+    return mel
+def supress_tokens(logits, is_initial):
+    if is_initial:
+        logits[WHISPER_EOT] = NEG_INF
+        logits[WHISPER_BLANK] = NEG_INF
+    logits[WHISPER_NO_TIMESTAMPS] = NEG_INF
+    logits[WHISPER_SOT] = NEG_INF
+    logits[WHISPER_NO_SPEECH] = NEG_INF
+    logits[WHISPER_TRANSLATE] = NEG_INF
+    return logits
+def choose_language(lang):
+    if lang not in WHISPER_LANGUAGES.keys():
+        raise Exception(f"Unknown language: {lang}. Check languages.py for correct options.")
+    SOT_SEQUENCE[1] = WHISPER_SOT + 1 + tuple(WHISPER_LANGUAGES.keys()).index(lang)
+def main():
+    """Transcribe one audio file: parse args, load models, run encoder and decoders, print the text."""
+    args = get_args()
+    print_args(args)
+    # Check wav existence
+    wav_path = args.wav
+    assert os.path.exists(wav_path), f"{wav_path} NOT exist"
+    # Choose language (updates the language slot of the global SOT_SEQUENCE)
+    choose_language(args.language)
+    # Load models and other stuff
+    start = time.time()
+    encoder, decoder_main, decoder_loop, pe, token_table = load_models(args.model_path, args.model_type)
+    print(f"Load models take {(time.time() - start) * 1000}ms")
+    WHISPER_N_TEXT_STATE = WHISPER_N_TEXT_STATE_MAP[args.model_type]
+    # Preprocess
+    start = time.time()
+    mel = compute_feature(wav_path, n_mels=WHISPER_N_MELS)
+    print(f"Preprocess wav take {(time.time() - start) * 1000}ms")
+    # mel.tofile("mel.bin")
+    # Run encoder (a leading batch axis is added to mel)
+    start = time.time()
+    x = encoder.run(None, input_feed={"mel": mel[None, ...]})
+    n_layer_cross_k, n_layer_cross_v = x
+    print(f"Run encoder take {(time.time() - start) * 1000}ms")
+    # n_layer_cross_k.tofile("n_layer_cross_k.bin")
+    # n_layer_cross_v.tofile("n_layer_cross_v.bin")
+    # Run decoder_main on the whole SOT prompt
+    start = time.time()
+    x = decoder_main.run(None, input_feed={
+        "tokens": SOT_SEQUENCE[None, ...],
+        "n_layer_cross_k": n_layer_cross_k,
+        "n_layer_cross_v": n_layer_cross_v
+    })
+    logits, n_layer_self_k_cache, n_layer_self_v_cache = x
+    print(f"Run decoder_main take {(time.time() - start) * 1000}ms")
+    # Decode token: use the logits of the last prompt position
+    logits = logits[0, -1, :]
+    logits = supress_tokens(logits, is_initial=True)
+    # logits.tofile("logits.bin")
+    max_token_id = np.argmax(logits)
+    output_tokens = []
+    print(f"First token: {max_token_id}")
+    # Position embedding offset: the next position after the prompt
+    offset = SOT_SEQUENCE.shape[0]
+    # Autoregressively run decoder until token meets EOT
+    for i in range(WHISPER_N_TEXT_CTX - SOT_SEQUENCE.shape[0]):
+        if max_token_id == WHISPER_EOT:
+            break
+        output_tokens.append(max_token_id)
+        # Only the last (offset + 1) mask entries stay 0; everything before is -inf.
+        # NOTE(review): presumably the self-attention cache is right-aligned - confirm against the model export.
+        mask = np.zeros((WHISPER_N_TEXT_CTX,), dtype=np.float32)
+        mask[: WHISPER_N_TEXT_CTX - offset - 1] = NEG_INF
+        # Run decoder_loop on the most recent token only
+        start = time.time()
+        x = decoder_loop.run(None, input_feed={
+            "tokens": np.array([[output_tokens[-1]]], dtype=np.int32),
+            "in_n_layer_self_k_cache": n_layer_self_k_cache,
+            "in_n_layer_self_v_cache": n_layer_self_v_cache,
+            "n_layer_cross_k": n_layer_cross_k,
+            "n_layer_cross_v": n_layer_cross_v,
+            # pe is a flat array: take the WHISPER_N_TEXT_STATE values belonging to position `offset`
+            "positional_embedding": pe[offset * WHISPER_N_TEXT_STATE : (offset + 1) * WHISPER_N_TEXT_STATE][None, ...],
+            "mask": mask
+        })
+        logits, n_layer_self_k_cache, n_layer_self_v_cache = x
+        print(f"Run decoder_loop take {(time.time() - start) * 1000}ms")
+        # Decode token
+        offset += 1
+        logits = supress_tokens(logits.flatten(), is_initial=False)
+        max_token_id = np.argmax(logits)
+        print(f"Iter {i} \t Token: {max_token_id}")
+    # Token table entries are base64: concatenate the raw bytes first, decode to text once at the end.
+    s = b""
+    for i in output_tokens:
+        s += base64.b64decode(token_table[i])
+    # print(s.decode().strip())
+    pd = s.decode().strip()
+    # For Chinese output, convert the text to the 'zh-hans' variant with zhconv.
+    if args.language == "zh":
+        pd = zhconv.convert(pd, 'zh-hans')
+    print(f"Result: {pd}")
+if __name__ == "__main__":
+    main()

python/whisper_onnx.py ADDED Viewed

	@@ -0,0 +1,239 @@

+import argparse
+import onnxruntime as ort
+import numpy as np
+import librosa
+import os
+from typing import Tuple
+import soundfile as sf
+import base64
+import zhconv
+import time
+import torch
+from torch.nn import functional as F
+from languages import WHISPER_LANGUAGES
+# Audio front-end parameters handed to librosa by load_audio() and compute_feature().
+WHISPER_N_MELS      = 80
+WHISPER_SAMPLE_RATE = 16000
+WHISPER_N_FFT       = 480
+WHISPER_HOP_LENGTH  = 160
+# Token ids referenced by SOT_SEQUENCE, supress_tokens() and the decode loop in main().
+# NOTE(review): presumably ids from the multilingual Whisper vocabulary - confirm against *-tokens.txt.
+WHISPER_SOT           = 50258
+WHISPER_EOT           = 50257
+WHISPER_BLANK         = 220
+WHISPER_NO_TIMESTAMPS = 50363
+WHISPER_NO_SPEECH     = 50362
+WHISPER_TRANSLATE     = 50358
+WHISPER_TRANSCRIBE    = 50359
+WHISPER_VOCAB_SIZE    = 51865
+# Bounds the number of decode iterations and sets the length of the mask built in main().
+WHISPER_N_TEXT_CTX    = 448
+NEG_INF = float("-inf")
+# Prompt fed to decoder_main: [SOT, language, transcribe, no-timestamps].
+# The language slot is WHISPER_SOT + 1 + position of the code in WHISPER_LANGUAGES;
+# it starts as "zh" and is overwritten in place by choose_language().
+SOT_SEQUENCE = np.array([WHISPER_SOT,WHISPER_SOT + 1 + tuple(WHISPER_LANGUAGES).index("zh"),WHISPER_TRANSCRIBE,WHISPER_NO_TIMESTAMPS], dtype=np.int64)
+# Per-model width used by main() to slice one position's values out of the flat
+# positional-embedding array.
+WHISPER_N_TEXT_STATE_MAP = {
+    "tiny": 384,
+    "base": 512,
+    "small": 768
+}
+def get_args():
+    parser = argparse.ArgumentParser(
+        prog="whisper",
+        description="Run Whisper on input audio file"
+    )
+    parser.add_argument("--wav", "-w", type=str, required=True, help="Input audio file")
+    parser.add_argument("--model_type", "-t", type=str, choices=["tiny", "base", "small"], required=True, help="model type, only support tiny/base/small currently")
+    parser.add_argument("--model_path", "-p", type=str, required=False, default="../models", help="model path for *.axmodel, tokens.txt, positional_embedding.bin")
+    parser.add_argument("--language", "-l", type=str, required=False, default="zh", help="Target language, support en, zh, ja, and others. See languages.py for more options.")
+    return parser.parse_args()
+def print_args(args):
+    print(f"wav: {args.wav}")
+    print(f"model_type: {args.model_type}")
+    print(f"model_path: {args.model_path}")
+    print(f"language: {args.language}")
+def load_audio(filename: str) -> Tuple[np.ndarray, int]:
+    data, sample_rate = sf.read(
+        filename,
+        always_2d=True,
+        dtype="float32",
+    )
+    data = data[:, 0]  # use only the first channel
+    data = librosa.resample(data, orig_sr=sample_rate, target_sr=WHISPER_SAMPLE_RATE)
+    samples = np.ascontiguousarray(data)
+    return samples, sample_rate
+def load_models(model_path, model_type):
+    encoder_path = f"{model_type}-encoder.onnx"
+    decoder_main_path = f"{model_type}-decoder-main.onnx"
+    decoder_loop_path = f"{model_type}-decoder-loop.onnx"
+    pe_path = f"{model_type}-positional_embedding.bin"
+    token_path = f"{model_type}-tokens.txt"
+    required_files = [os.path.join(model_path, i) for i in (encoder_path, decoder_main_path, decoder_loop_path, pe_path, token_path)]
+    # Check file existence
+    for i, file_path in enumerate(required_files):
+        assert os.path.exists(file_path), f"{file_path} NOT exist"
+    # Load encoder
+    encoder = ort.InferenceSession(required_files[0], providers=['CPUExecutionProvider'])
+    # Load decoder main
+    decoder_main = ort.InferenceSession(required_files[1], providers=['CPUExecutionProvider'])
+    # Load decoder loop
+    decoder_loop = ort.InferenceSession(required_files[2], providers=['CPUExecutionProvider'])
+    # Load position embedding
+    pe = np.fromfile(required_files[3], dtype=np.float32)
+    # Load tokens
+    tokens = []
+    with open(required_files[4], "r") as f:
+        for line in f:
+            line = line.strip()
+            tokens.append(line.split(" ")[0])
+    return encoder, decoder_main, decoder_loop, pe, tokens
+def compute_feature(wav_path, n_mels = WHISPER_N_MELS, padding = 480000):
+    audio, sr = load_audio(wav_path)
+    audio = np.concatenate((audio, np.zeros((padding,), dtype=np.float32)), axis=-1)
+    mel = librosa.feature.melspectrogram(y=audio, sr=sr, n_fft=WHISPER_N_FFT, hop_length=WHISPER_HOP_LENGTH, window="hann", center=True, pad_mode="reflect", power=2.0, n_mels=n_mels)
+    log_spec = np.log10(np.maximum(mel, 1e-10))
+    log_spec = np.maximum(log_spec, log_spec.max() - 8.0)
+    mel = (log_spec + 4.0) / 4.0
+    # We pad 1500 frames at the end so that it is able to detect eot
+    # You can use another value instead of 1500.
+    # mel = np.concatenate((mel, np.zeros((n_mels, 1500), dtype=np.float32)), axis=-1)
+    target = 3000
+    if mel.shape[1] > target:
+        # -50 so that there are some zero tail paddings.
+        mel = mel[:, : target]
+        mel[:, -50:] = 0
+    # We don't need to pad it to 30 seconds now!
+    if mel.shape[1] < target:
+        mel = np.concatenate((mel, np.zeros((n_mels, target - mel.shape[1]), dtype=np.float32)), axis=-1)
+    return mel
+def supress_tokens(logits, is_initial):
+    if is_initial:
+        logits[WHISPER_EOT] = NEG_INF
+        logits[WHISPER_BLANK] = NEG_INF
+    logits[WHISPER_NO_TIMESTAMPS] = NEG_INF
+    logits[WHISPER_SOT] = NEG_INF
+    logits[WHISPER_NO_SPEECH] = NEG_INF
+    logits[WHISPER_TRANSLATE] = NEG_INF
+    return logits
+def choose_language(lang):
+    if lang not in WHISPER_LANGUAGES.keys():
+        raise Exception(f"Unknown language: {lang}. Check languages.py for correct options.")
+    SOT_SEQUENCE[1] = WHISPER_SOT + 1 + tuple(WHISPER_LANGUAGES.keys()).index(lang)
+def main():
+    """Transcribe one audio file: parse args, load models, run encoder and decoders, print the text."""
+    args = get_args()
+    print_args(args)
+    # Check wav existence
+    wav_path = args.wav
+    assert os.path.exists(wav_path), f"{wav_path} NOT exist"
+    # Choose language (updates the language slot of the global SOT_SEQUENCE)
+    choose_language(args.language)
+    # Load models and other stuff
+    encoder, decoder_main, decoder_loop, pe, token_table = load_models(args.model_path, args.model_type)
+    WHISPER_N_TEXT_STATE = WHISPER_N_TEXT_STATE_MAP[args.model_type]
+    # Preprocess
+    mel = compute_feature(wav_path, n_mels=WHISPER_N_MELS)
+    # mel.tofile("mel.bin")
+    # mel = np.load("../mel.npy")[..., :3000]
+    # Run encoder (a leading batch axis is added to mel)
+    start = time.time()
+    x = encoder.run(None, input_feed={"mel": mel[None, ...]})
+    n_layer_cross_k, n_layer_cross_v = x
+    print(f"Run encoder take {(time.time() - start) * 1000}ms")
+    # n_layer_cross_k.tofile("n_layer_cross_k.bin")
+    # n_layer_cross_v.tofile("n_layer_cross_v.bin")
+    # Run decoder_main on the whole SOT prompt
+    start = time.time()
+    x = decoder_main.run(None, input_feed={
+        "tokens": SOT_SEQUENCE[None, ...],
+        "n_layer_cross_k": n_layer_cross_k,
+        "n_layer_cross_v": n_layer_cross_v
+    })
+    logits, n_layer_self_k_cache, n_layer_self_v_cache = x
+    print(f"Run decoder_main take {(time.time() - start) * 1000}ms")
+    # Decode token: use the logits of the last prompt position
+    logits = logits[0, -1, :]
+    logits = supress_tokens(logits, is_initial=True)
+    # logits.tofile("logits.bin")
+    max_token_id = np.argmax(logits)
+    output_tokens = []
+    print(f"First token: {max_token_id}")
+    # Position embedding offset: the next position after the prompt
+    offset = SOT_SEQUENCE.shape[0]
+    # Autoregressively run decoder until token meets EOT
+    for i in range(WHISPER_N_TEXT_CTX - SOT_SEQUENCE.shape[0]):
+        if max_token_id == WHISPER_EOT:
+            break
+        output_tokens.append(max_token_id)
+        # Only the last (offset + 1) mask entries stay 0; everything before is -inf.
+        # NOTE(review): presumably the self-attention cache is right-aligned - confirm against the model export.
+        mask = np.zeros((WHISPER_N_TEXT_CTX,), dtype=np.float32)
+        mask[: WHISPER_N_TEXT_CTX - offset - 1] = NEG_INF
+        # Run decoder_loop on the most recent token only
+        start = time.time()
+        x = decoder_loop.run(None, input_feed={
+            "tokens": np.array([[output_tokens[-1]]], dtype=np.int64),
+            "in_n_layer_self_k_cache": n_layer_self_k_cache,
+            "in_n_layer_self_v_cache": n_layer_self_v_cache,
+            "n_layer_cross_k": n_layer_cross_k,
+            "n_layer_cross_v": n_layer_cross_v,
+            # pe is a flat array: take the WHISPER_N_TEXT_STATE values belonging to position `offset`
+            "positional_embedding": pe[offset * WHISPER_N_TEXT_STATE : (offset + 1) * WHISPER_N_TEXT_STATE][None, ...],
+            "mask": mask
+        })
+        logits, n_layer_self_k_cache, n_layer_self_v_cache = x
+        print(f"Run decoder_loop take {(time.time() - start) * 1000}ms")
+        # Decode token
+        offset += 1
+        logits = supress_tokens(logits.flatten(), is_initial=False)
+        max_token_id = np.argmax(logits)
+        print(f"Iter {i} \t Token: {max_token_id}")
+    # Token table entries are base64: concatenate the raw bytes first, decode to text once at the end.
+    s = b""
+    for i in output_tokens:
+        s += base64.b64decode(token_table[i])
+    # print(s.decode().strip())
+    pd = s.decode().strip()
+    # For Chinese output, convert the text to the 'zh-hans' variant with zhconv.
+    if args.language == "zh":
+        pd = zhconv.convert(pd, 'zh-hans')
+    print(f"Result: {pd}")
+if __name__ == "__main__":
+    main()