inoryQwQ committed
Commit 798e40d · 1 parent: 33455e8

Update cpp bins, python scripts, add English readme

.gitattributes CHANGED
@@ -51,3 +51,7 @@ ax620e/install/whisper filter=lfs diff=lfs merge=lfs -text
 *axcl_aarch64/whisper filter=lfs diff=lfs merge=lfs -text
 *ax650/install/whisper filter=lfs diff=lfs merge=lfs -text
 *ax620e/install/whisper filter=lfs diff=lfs merge=lfs -text
+cpp/ax630c/lib/libax_whisper.so filter=lfs diff=lfs merge=lfs -text
+cpp/ax650/lib/libax_whisper.so filter=lfs diff=lfs merge=lfs -text
+cpp/ax650/whisper_svr filter=lfs diff=lfs merge=lfs -text
+cpp/ax630c/whisper_svr filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
+__pycache__
README.md CHANGED
@@ -5,6 +5,10 @@ pipeline_tag: automatic-speech-recognition
 
 # Whisper
 
+<div align="center">
+<a href="README_EN.md">English</a> | <a href="README.md">中文</a>
+</div>
+
 OpenAI Whisper on Axera
 
 - 目前支持 C++ 和 Python 两种语言
@@ -13,6 +17,12 @@ OpenAI Whisper on Axera
 
 - 如需自行转换请参考[模型转换](https://github.com/ml-inory/whisper.axera/blob/main/model_convert/README.md)
 
+
+## Update
+
+- 2026/01/14: 更简单的模型结构,现在只需要 encoder 和 decoder,去掉原来的 decoder_main 和 decoder_loop;支持从 HuggingFace 导出模型
+
+
 ## 支持平台
 
 - [x] AX650N
@@ -20,6 +30,21 @@ OpenAI Whisper on Axera
 
 ## 模型转换
 
+目前支持的模型规模:
+- tiny
+- base
+- small
+- medium
+- turbo
+
+
+目前测试过的语言:
+- English
+- Chinese
+- Japanese
+- Korean
+- Malay
+
 [模型转换](https://github.com/ml-inory/whisper.axera/blob/main/model_convert/README.md)
 
 ## 上板部署
@@ -87,60 +112,22 @@ python3 main.py --model_type small --model_path ../models-ax650 --wav ../demo.wav
 输出结果
 
 ```
-root@ax650:/mnt/qtang/whisper.axera/python# python3 main.py --wav ../demo.wav --model_type small --model_path ../models/ --language zh
+(whisper) root@ax650:/mnt/data/Github/whisper.axera/python# python whisper_cli.py -t tiny -w ../demo.wav
 [INFO] Available providers: ['AxEngineExecutionProvider']
-wav: ../demo.wav
-model_type: small
-model_path: ../models/
-language: zh
+{'wav': '../demo.wav', 'model_type': 'tiny', 'model_path': '../models-ax650', 'language': 'zh', 'task': 'transcribe'}
 [INFO] Using provider: AxEngineExecutionProvider
 [INFO] Chip type: ChipType.MC50
 [INFO] VNPU type: VNPUType.DISABLED
-[INFO] Engine version: 2.10.1s
+[INFO] Engine version: 2.12.0s
 [INFO] Model type: 2 (triple core)
-[INFO] Compiler version: 3.2-patch1 117f5fd4
+[INFO] Compiler version: 5.0 76f70fdc
 [INFO] Using provider: AxEngineExecutionProvider
 [INFO] Model type: 2 (triple core)
-[INFO] Compiler version: 3.2-patch1 117f5fd4
-[INFO] Using provider: AxEngineExecutionProvider
-[INFO] Model type: 2 (triple core)
-[INFO] Compiler version: 3.2-patch1 117f5fd4
-Load models take 2322.563409805298ms
-Preprocess wav take 6971.68493270874ms
-Run encoder take 211.52877807617188ms
-Run decoder_main take 79.00094985961914ms
-First token: 17556
-Run decoder_loop take 101.91774368286133ms
-Iter 0 Token: 20844
-Run decoder_loop take 60.30416488647461ms
-Iter 1 Token: 7781
-Run decoder_loop take 60.22000312805176ms
-Iter 2 Token: 20204
-Run decoder_loop take 60.23716926574707ms
-Iter 3 Token: 28455
-Run decoder_loop take 60.214996337890625ms
-Iter 4 Token: 31962
-Run decoder_loop take 60.17565727233887ms
-Iter 5 Token: 6336
-Run decoder_loop take 60.94002723693848ms
-Iter 6 Token: 254
-Run decoder_loop take 60.71639060974121ms
-Iter 7 Token: 2930
-Run decoder_loop take 60.225725173950195ms
-Iter 8 Token: 236
-Run decoder_loop take 60.167789459228516ms
-Iter 9 Token: 36135
-Run decoder_loop take 60.29987335205078ms
-Iter 10 Token: 15868
-Run decoder_loop take 61.163902282714844ms
-Iter 11 Token: 252
-Run decoder_loop take 60.273170471191406ms
-Iter 12 Token: 1546
-Run decoder_loop take 60.23144721984863ms
-Iter 13 Token: 46514
-Run decoder_loop take 60.31966209411621ms
-Iter 14 Token: 50257
-Result: 甚至出现交易几乎停滞的情况
+[INFO] Compiler version: 5.0 76f70fdc
+ASR result:
+擅职出现交易几乎停止的情况
+RTF: 0.11406774537746188
+
 ```
 
 运行参数说明:
@@ -152,6 +139,21 @@ Result: 甚至出现交易几乎停滞的情况
 | --language/-l | 识别语言 | zh |
 
 
+##### 服务端
+
+```
+(whisper) root@ax650:/mnt/data/Github/whisper.axera/python# python whisper_svr.py
+[INFO] Available providers: ['AxEngineExecutionProvider']
+Server started at http://0.0.0.0:8000
+
+```
+
+测试服务端
+```
+python test_svr.py
+```
+
+
 <h3 id="CPP">CPP</h3>
 
 #### 运行
@@ -160,50 +162,35 @@ Result: 甚至出现交易几乎停滞的情况
 
 ```
 cd cpp
-./whisper -w ../demo.wav
+./whisper_cli -w ../demo.wav -t tiny
 ```
 
 
 
 ```
 cd cpp
-./whisper --model_type small --model_path ../models -w ../demo.wav
+./whisper_cli --model_type small -w ../demo.wav
 ```
 
 输出结果
 
 ```
-root@ax650:/mnt/qtang/whisper.axera/cpp# ./install/whisper --wav ../demo.wav --model_type small --model_path ../models/ --language zh
-wav_file: ../demo.wav
-model_path: ../models/
-model_type: small
+(whisper) root@ax650:/mnt/data/HF/Whisper/cpp/ax650# ./whisper_cli -w ../../demo.wav -t tiny
+wav_file: ../../demo.wav
+model_path: ../../models-ax650
+model_type: tiny
 language: zh
-Encoder run take 188.30 ms
-First token: 17556 take 81.88ms
-Next Token: 20844 take 29.64ms
-Next Token: 7781 take 29.70ms
-Next Token: 20204 take 29.64ms
-Next Token: 28455 take 29.65ms
-Next Token: 31962 take 29.61ms
-Next Token: 6336 take 29.67ms
-Next Token: 254 take 29.63ms
-Next Token: 2930 take 29.61ms
-Next Token: 236 take 29.56ms
-Next Token: 36135 take 29.64ms
-Next Token: 15868 take 29.71ms
-Next Token: 252 take 29.51ms
-Next Token: 1546 take 29.63ms
-Next Token: 46514 take 29.51ms
-Next Token: 50257 take 29.69ms
-All take 801.13 ms
-Result: 甚至出现交易几乎停滞的情况
+Init whisper success, take 0.3540seconds
+Result: 甚至出现交易几乎停止的情况
+RTF: 0.0968
+
 ```
 
 ### 服务端
 
 ```
-cd cpp
-./whisper_srv --model_type tiny --model_path ../models-ax650 --language zh --port 8080
+cd cpp/ax650
+./whisper_srv --model_type tiny --language zh --port 8080
 ```
 
 ### 客户端
README_EN.md ADDED
@@ -0,0 +1,261 @@
+ # whisper.axera
+
+ <div align="center">
+ <a href="README_EN.md">English</a> | <a href="README.md">中文</a>
+ </div>
+
+ OpenAI Whisper on the Axera Platform
+
+ ## Overview
+
+ This project provides an optimized implementation of OpenAI's Whisper speech recognition model for Axera AI processors (AX650N/AX630C). It supports both C++ and Python interfaces for efficient on-device speech-to-text conversion.
+
+ ## Features
+
+ - **Dual Language Support**: Both C++ and Python APIs are available
+ - **Multiple Model Sizes**: Support for the tiny, base, small, medium, and turbo model variants
+ - **Multi-language Recognition**: Tested with English, Chinese, Japanese, Korean, and Malay
+ - **Optimized Performance**: Specially optimized for Axera NPU acceleration
+ - **Easy Deployment**: Pre-built packages and cross-compilation support
+
+ ## Update
+
+ - 2026/01/14: Cleaner model architecture: a single encoder and decoder replace the previous decoder_main and decoder_loop. Models exported from Hugging Face are now supported.
+
+ ## Supported Platforms
+
+ - ✅ AX650N
+ - ✅ AX630C
+
+ ## Pre-trained Models
+
+ Download pre-compiled models from:
+ - [Baidu Cloud](https://pan.baidu.com/s/1tOHVMZCin0A68T5HmKRJyg?pwd=axyz)
+ - [Huggingface](https://huggingface.co/AXERA-TECH/Whisper)
+
+ For custom model conversion, please refer to the [Model Conversion Guide](./model_convert/README_EN.md).
+
+ ## Model Conversion
+
+ Currently supported model sizes:
+ - tiny
+ - base
+ - small
+ - medium
+ - turbo
+
+ Tested languages:
+ - English
+ - Chinese
+ - Japanese
+ - Korean
+ - Malay
+
+ For other languages or custom model sizes, please refer to the [Model Conversion Guide](./model_convert/README_EN.md).
+
+ ## Deployment on Target Devices
+
+ ### Prerequisites
+ - AX650N/AX630C devices with Ubuntu 22.04 pre-installed
+ - Internet connection for `apt install` and `pip install`
+ - Verified hardware platforms:
+   - [MaixIV M4nDock (AX650N)](https://wiki.sipeed.com/hardware/zh/maixIV/m4ndock/m4ndock.html)
+   - [M.2 Accelerator Card (AX650N)](https://axcl-docs.readthedocs.io/zh-cn/latest/doc_guide_hardware.html)
+   - [Axera Pi 2 (AX630C)](https://axera-pi-2-docs-cn.readthedocs.io/zh-cn/latest/index.html)
+   - [Module-LLM (AX630C)](https://docs.m5stack.com/zh_CN/module/Module-LLM)
+   - [LLM630 Compute Kit (AX630C)](https://docs.m5stack.com/zh_CN/core/LLM630%20Compute%20Kit)
+
+ ## Programming Language Support
+
+ ### Python
+
+ Tested with Python 3.12. We recommend using [Miniconda](https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-aarch64.sh) for environment management.
+
+ #### Installation
+
+ ```bash
+ cd python
+ pip3 install -r requirements.txt
+ ```
+
+ #### pyaxengine
+
+ Install the NPU Python API from: https://github.com/AXERA-TECH/pyaxengine
+
+ #### Usage
+
+ ##### Command Line Interface
+
+ ```
+ cd python
+ (whisper) root@ax650:/mnt/data/HF/Whisper/python# python whisper_cli.py -w ../demo.wav -t tiny
+ [INFO] Available providers: ['AxEngineExecutionProvider']
+ {'wav': '../demo.wav', 'model_type': 'tiny', 'model_path': '../models-ax650', 'language': 'zh', 'task': 'transcribe'}
+ [INFO] Using provider: AxEngineExecutionProvider
+ [INFO] Chip type: ChipType.MC50
+ [INFO] VNPU type: VNPUType.DISABLED
+ [INFO] Engine version: 2.12.0s
+ [INFO] Model type: 2 (triple core)
+ [INFO] Compiler version: 5.0 76f70fdc
+ [INFO] Using provider: AxEngineExecutionProvider
+ [INFO] Model type: 2 (triple core)
+ [INFO] Compiler version: 5.0 76f70fdc
+ ASR result:
+ 擅职出现交易几乎停止的情况
+ RTF: 0.10313174677896837
+ ```
+
+ Command line arguments:
+ | Argument | Description | Default |
+ | --- | --- | --- |
+ | --wav/-w | Input audio file | - |
+ | --model_type/-t | Model type: tiny/base/small/medium/turbo | - |
+ | --model_path/-p | Model directory | ../models |
+ | --language/-l | Recognition language | zh |
+
+ ##### Server Mode
+
+ ```
+ (whisper) root@ax650:/mnt/data/HF/Whisper/python# python whisper_svr.py
+ [INFO] Available providers: ['AxEngineExecutionProvider']
+ Server started at http://0.0.0.0:8000
+ ```
+
+ Test the server:
+ ```
+ python test_svr.py
+ ```
+
+ <h3 id="CPP">CPP</h3>
+
+ #### Usage on Target Device
+
+ ```
+ cd cpp/ax650
+ ./whisper_cli -w ../demo.wav -t tiny
+ ```
+
+ ```
+ cd cpp/ax650
+ ./whisper_cli --model_type small -w ../demo.wav
+ ```
+
+ Example output:
+
+ ```
+ (whisper) root@ax650:/mnt/data/HF/Whisper/cpp/ax650# ./whisper_cli -w ../../demo.wav -t tiny
+ wav_file: ../../demo.wav
+ model_path: ../../models-ax650
+ model_type: tiny
+ language: zh
+ Init whisper success, take 0.3540seconds
+ Result: 甚至出现交易几乎停止的情况
+ RTF: 0.0968
+ ```
+
+ ### Server Mode
+
+ ```
+ cd cpp/ax650
+ (whisper) root@ax650:/mnt/data/HF/Whisper/cpp/ax650# ./whisper_svr -t tiny
+ port: 8080
+ model_path: ../../models-ax650
+ model_type: tiny
+ language: zh
+ [I][ main][ 60]: Initializing server...
+ [I][ main][ 65]: Init server success
+ [I][ start][ 32]: Start server at port 8080, POST binary stream to IP:8080/asr
+ ```
+
+ ### Client test using curl
+
+ ```
+ ffmpeg -i demo.wav -f f32le -c:a pcm_f32le - 2>/dev/null | \
+ curl -X POST 10.126.33.192:8080/asr \
+ -H "Content-Type: application/octet-stream" \
+ --data-binary @-
+ ```
+
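The `/asr` endpoint consumes a raw stream of 16 kHz mono float32 PCM samples, which is exactly what the `ffmpeg -f f32le` pipe above produces. As a hedged illustration (standard library only; the helper name `wav_to_f32le` is ours, not part of this repo), the same payload can be built from a 16-bit WAV in Python:

```python
import struct
import wave


def wav_to_f32le(wav_path: str) -> bytes:
    """Convert a 16-bit mono WAV file to raw little-endian float32 PCM in [-1.0, 1.0]."""
    with wave.open(wav_path, "rb") as w:
        assert w.getsampwidth() == 2 and w.getnchannels() == 1, "expect 16-bit mono input"
        frames = w.readframes(w.getnframes())
    # Each sample is a little-endian int16; scale by 32768 to land in [-1.0, 1.0)
    samples = struct.unpack("<%dh" % (len(frames) // 2), frames)
    return struct.pack("<%df" % len(samples), *(s / 32768.0 for s in samples))
```

The resulting bytes can then be POSTed to `IP:8080/asr` with any HTTP client, mirroring the curl invocation above.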
+ ## Performance Benchmarks
+
+ ### Latency
+
+ RTF: Real-Time Factor (processing time divided by audio duration; lower is better)
+
+ CPP:
+
+ | Models | AX650N | AX630C |
+ | ------------- | ------ | ------ |
+ | Whisper-Tiny | 0.08 | |
+ | Whisper-Base | 0.11 | 0.35 |
+ | Whisper-Small | 0.24 | |
+ | Whisper-Turbo | 0.48 | |
+
+ Python:
+
+ | Models | AX650N | AX630C |
+ | ------------- | ------ | ------ |
+ | Whisper-Tiny | 0.12 | |
+ | Whisper-Base | 0.16 | 0.35 |
+ | Whisper-Small | 0.50 | |
+ | Whisper-Turbo | 0.60 | |
+
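The RTF figures in these tables are the standard ratio of wall-clock processing time to audio duration, so a value below 1.0 means faster-than-real-time transcription. A minimal sketch of the computation (the function name and the timing numbers are ours, for illustration only):

```python
def real_time_factor(processing_seconds: float, audio_seconds: float) -> float:
    """RTF = wall-clock processing time / audio duration; < 1.0 means faster than real time."""
    if audio_seconds <= 0:
        raise ValueError("audio duration must be positive")
    return processing_seconds / audio_seconds


# Hypothetical example: a 5.0 s clip transcribed in 0.5 s
print(real_time_factor(0.5, 5.0))  # prints 0.1
```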
+ ### Word Error Rate (tested on the AIShell dataset)
+
+ | Models | AX650N | AX630C |
+ | ------------- | ------ | ------ |
+ | Whisper-Tiny | 0.24 | |
+ | Whisper-Base | 0.18 | |
+ | Whisper-Small | 0.11 | |
+ | Whisper-Turbo | 0.06 | |
+
+ To reproduce the WER test results:
+
+ Download the dataset:
+ ```
+ cd model_convert
+ bash download_dataset.sh
+ ```
+
+ Run the test script:
+ ```
+ cd python
+ conda activate whisper
+ python test_wer.py -d aishell --gt_path ../model_convert/datasets/ground_truth.txt --model_type tiny
+ ```
+
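For reference, word error rate is the Levenshtein edit distance between reference and hypothesis divided by the reference length; for Chinese ASR it is typically computed over characters. The sketch below is a generic scorer of this kind, not the repo's `test_wer.py`:

```python
def wer(reference: list, hypothesis: list) -> float:
    """Word (or character) error rate: Levenshtein edit distance / reference length."""
    m, n = len(reference), len(hypothesis)
    # d[i][j] = edit distance between reference[:i] and hypothesis[:j]
    d = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(m + 1):
        d[i][0] = i
    for j in range(n + 1):
        d[0][j] = j
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            cost = 0 if reference[i - 1] == hypothesis[j - 1] else 1
            d[i][j] = min(d[i - 1][j] + 1,      # deletion
                          d[i][j - 1] + 1,      # insertion
                          d[i - 1][j - 1] + cost)  # substitution / match
    return d[m][n] / m


# Character-level scoring, as is common for Chinese ASR:
# one substitution (滞 → 止) over 13 reference characters ≈ 0.077
print(wer(list("甚至出现交易几乎停滞的情况"), list("甚至出现交易几乎停止的情况")))
```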
+ ### Memory Usage
+
+ * CMM stands for the physical memory used by Axera modules such as VDEC (video decoder), VENC (video encoder), the NPU, etc.
+
+ Python:
+
+ | Models | CMM (MB) | OS (MB) |
+ | ------------- | ------ | ------ |
+ | Whisper-Tiny | 332 | 512 |
+ | Whisper-Base | 533 | 644 |
+ | Whisper-Small | 1106 | 906 |
+ | Whisper-Turbo | 2065 | 2084 |
+
+ C++:
+
+ | Models | CMM (MB) | OS (MB) |
+ | ------------- | ------ | ------ |
+ | Whisper-Tiny | 332 | 31 |
+ | Whisper-Base | 533 | 54 |
+ | Whisper-Small | 1106 | 146 |
+ | Whisper-Turbo | 2065 | 86 |
+
+ ## Technical Discussion
+
+ - GitHub issues
+ - Tencent QQ Group: 139953715
cpp/ax630c/TSCharacters.ocd2 ADDED
Binary file (46.1 kB)
 
cpp/ax630c/TSPhrases.ocd2 ADDED
Binary file (9.78 kB)
 
cpp/ax630c/include/ax_whisper_api.h ADDED
@@ -0,0 +1,107 @@
+ /**
+  * @file ax_whisper_api.h
+  * @brief AX Whisper API header - C-compatible interface for the Whisper ASR system
+  * @note This header provides a C interface to the Whisper speech recognition system
+  */
+
+ #ifndef _AX_WHISPER_API_H_
+ #define _AX_WHISPER_API_H_
+
+ #ifdef __cplusplus
+ extern "C" {
+ #endif
+
+ #define AX_WHISPER_API __attribute__((visibility("default")))
+
+ /**
+  * @brief Opaque handle type for a Whisper ASR context
+  *
+  * This handle encapsulates all internal state of the Whisper ASR system.
+  * The actual implementation is hidden from C callers to maintain ABI stability.
+  */
+ typedef void* AX_WHISPER_HANDLE;
+
+ /**
+  * @brief Initialize the Whisper ASR system with a specific configuration
+  *
+  * Creates and initializes a new Whisper ASR context with the specified
+  * model type, model path, and language. This function loads the appropriate
+  * models, configures the recognizer, and prepares it for speech recognition.
+  *
+  * @param model_type Type of Whisper model to use (e.g., "tiny", "base", "small", "medium", "large")
+  *                   or a custom model identifier
+  * @param model_path Directory path where the model files are stored.
+  *                   Model files are expected in the following layout:
+  *                   - {model_path}/{model_type}/{model_type}-encoder.axmodel
+  *                   - {model_path}/{model_type}/{model_type}-decoder.axmodel
+  *                   - {model_path}/{model_type}/{model_type}-tokens.txt
+  *                   - {model_path}/{model_type}/{model_type}_config.json
+  * @param language Language code for recognition (e.g., "en", "zh", "ja", "ko").
+  *                 Use "auto" for automatic language detection if supported.
+  *
+  * @return AX_WHISPER_HANDLE Opaque handle to the initialized Whisper context,
+  *                           or NULL if initialization fails
+  *
+  * @note The caller is responsible for calling AX_WHISPER_Uninit() to free
+  *       resources when the handle is no longer needed.
+  * @note If the language is not supported by the model, the function may fall back
+  *       to a default language or return NULL.
+  * @code
+  * // Initialize English recognition with the base model
+  * AX_WHISPER_HANDLE handle = AX_WHISPER_Init("base", "../models-ax650", "en");
+  * @endcode
+  */
+ AX_WHISPER_API AX_WHISPER_HANDLE AX_WHISPER_Init(const char* model_type, const char* model_path, const char* language);
+
+ /**
+  * @brief Deinitialize and release Whisper ASR resources
+  *
+  * Cleans up all resources associated with the Whisper context, including
+  * unloading models, freeing memory, and releasing hardware resources.
+  *
+  * @param handle Whisper context handle obtained from AX_WHISPER_Init()
+  *
+  * @warning After calling this function, the handle becomes invalid and
+  *          must not be used in any subsequent API calls.
+  */
+ AX_WHISPER_API void AX_WHISPER_Uninit(AX_WHISPER_HANDLE handle);
+
+ /**
+  * @brief Perform speech recognition on a WAV file and return a dynamically allocated string
+  *
+  * @param handle Whisper context handle
+  * @param wav_file Path to the input WAV audio file (16 kHz, float32 PCM)
+  * @param result Pointer to receive the allocated result string
+  *
+  * @return int Status code (0 = success, <0 = error)
+  *
+  * @note The returned string is allocated with malloc() and must be freed
+  *       by the caller using free() when no longer needed.
+  */
+ AX_WHISPER_API int AX_WHISPER_RunFile(AX_WHISPER_HANDLE handle,
+                                       const char* wav_file,
+                                       char** result);
+
+ /**
+  * @brief Perform speech recognition on raw PCM data and return a dynamically allocated string
+  *
+  * @param handle Whisper context handle
+  * @param pcm_data 16 kHz mono float32 PCM data, in the range -1.0 to 1.0
+  * @param num_samples Number of samples in pcm_data
+  * @param result Pointer to receive the allocated result string
+  *
+  * @return int Status code (0 = success, <0 = error)
+  *
+  * @note The returned string is allocated with malloc() and must be freed
+  *       by the caller using free() when no longer needed.
+  */
+ AX_WHISPER_API int AX_WHISPER_RunPCM(AX_WHISPER_HANDLE handle,
+                                      float* pcm_data,
+                                      int num_samples,
+                                      char** result);
+
+ #ifdef __cplusplus
+ }
+ #endif
+
+ #endif // _AX_WHISPER_API_H_
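The docstring of `AX_WHISPER_Init` pins down the on-disk model layout. As an illustrative aid (Python, not part of this repo; the helper names are ours), those expected paths can be assembled and checked before handing them to the C API:

```python
from pathlib import Path


def expected_model_files(model_path: str, model_type: str) -> list:
    """Paths AX_WHISPER_Init expects, per the header documentation above."""
    base = Path(model_path) / model_type
    return [
        base / f"{model_type}-encoder.axmodel",
        base / f"{model_type}-decoder.axmodel",
        base / f"{model_type}-tokens.txt",
        base / f"{model_type}_config.json",
    ]


def missing_model_files(model_path: str, model_type: str) -> list:
    """Return the subset of expected files that are absent on disk."""
    return [p for p in expected_model_files(model_path, model_type) if not p.is_file()]
```

A caller might run `missing_model_files("../models-ax650", "tiny")` first and only invoke `AX_WHISPER_Init` when the list is empty.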
cpp/ax630c/lib/cmake/ax_whisper/ax_whisper-config-release.cmake ADDED
@@ -0,0 +1,19 @@
+ #----------------------------------------------------------------
+ # Generated CMake target import file for configuration "Release".
+ #----------------------------------------------------------------
+
+ # Commands may need to know the format version.
+ set(CMAKE_IMPORT_FILE_VERSION 1)
+
+ # Import target "ax::ax_whisper" for configuration "Release"
+ set_property(TARGET ax::ax_whisper APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE)
+ set_target_properties(ax::ax_whisper PROPERTIES
+   IMPORTED_LOCATION_RELEASE "${_IMPORT_PREFIX}/lib/libax_whisper.so"
+   IMPORTED_SONAME_RELEASE "libax_whisper.so"
+   )
+
+ list(APPEND _IMPORT_CHECK_TARGETS ax::ax_whisper )
+ list(APPEND _IMPORT_CHECK_FILES_FOR_ax::ax_whisper "${_IMPORT_PREFIX}/lib/libax_whisper.so" )
+
+ # Commands beyond this point should not need to know the version.
+ set(CMAKE_IMPORT_FILE_VERSION)
cpp/ax630c/lib/cmake/ax_whisper/ax_whisper-config.cmake ADDED
@@ -0,0 +1,94 @@
+ # Generated by CMake
+
+ if("${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION}" LESS 2.6)
+    message(FATAL_ERROR "CMake >= 2.6.0 required")
+ endif()
+ cmake_policy(PUSH)
+ cmake_policy(VERSION 2.6...3.20)
+ #----------------------------------------------------------------
+ # Generated CMake target import file.
+ #----------------------------------------------------------------
+
+ # Commands may need to know the format version.
+ set(CMAKE_IMPORT_FILE_VERSION 1)
+
+ # Protect against multiple inclusion, which would fail when already imported targets are added once more.
+ set(_targetsDefined)
+ set(_targetsNotDefined)
+ set(_expectedTargets)
+ foreach(_expectedTarget ax::ax_whisper)
+   list(APPEND _expectedTargets ${_expectedTarget})
+   if(NOT TARGET ${_expectedTarget})
+     list(APPEND _targetsNotDefined ${_expectedTarget})
+   endif()
+   if(TARGET ${_expectedTarget})
+     list(APPEND _targetsDefined ${_expectedTarget})
+   endif()
+ endforeach()
+ if("${_targetsDefined}" STREQUAL "${_expectedTargets}")
+   unset(_targetsDefined)
+   unset(_targetsNotDefined)
+   unset(_expectedTargets)
+   set(CMAKE_IMPORT_FILE_VERSION)
+   cmake_policy(POP)
+   return()
+ endif()
+ if(NOT "${_targetsDefined}" STREQUAL "")
+   message(FATAL_ERROR "Some (but not all) targets in this export set were already defined.\nTargets Defined: ${_targetsDefined}\nTargets not yet defined: ${_targetsNotDefined}\n")
+ endif()
+ unset(_targetsDefined)
+ unset(_targetsNotDefined)
+ unset(_expectedTargets)
+
+
+ # Compute the installation prefix relative to this file.
+ get_filename_component(_IMPORT_PREFIX "${CMAKE_CURRENT_LIST_FILE}" PATH)
+ get_filename_component(_IMPORT_PREFIX "${_IMPORT_PREFIX}" PATH)
+ get_filename_component(_IMPORT_PREFIX "${_IMPORT_PREFIX}" PATH)
+ get_filename_component(_IMPORT_PREFIX "${_IMPORT_PREFIX}" PATH)
+ if(_IMPORT_PREFIX STREQUAL "/")
+   set(_IMPORT_PREFIX "")
+ endif()
+
+ # Create imported target ax::ax_whisper
+ add_library(ax::ax_whisper SHARED IMPORTED)
+
+ set_target_properties(ax::ax_whisper PROPERTIES
+   INTERFACE_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include"
+ )
+
+ # Load information for each installed configuration.
+ get_filename_component(_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH)
+ file(GLOB CONFIG_FILES "${_DIR}/ax_whisper-config-*.cmake")
+ foreach(f ${CONFIG_FILES})
+   include(${f})
+ endforeach()
+
+ # Cleanup temporary variables.
+ set(_IMPORT_PREFIX)
+
+ # Loop over all imported files and verify that they actually exist
+ foreach(target ${_IMPORT_CHECK_TARGETS} )
+   foreach(file ${_IMPORT_CHECK_FILES_FOR_${target}} )
+     if(NOT EXISTS "${file}" )
+       message(FATAL_ERROR "The imported target \"${target}\" references the file
+    \"${file}\"
+ but this file does not exist. Possible reasons include:
+ * The file was deleted, renamed, or moved to another location.
+ * An install or uninstall procedure did not complete successfully.
+ * The installation package was faulty and contained
+    \"${CMAKE_CURRENT_LIST_FILE}\"
+ but not all the files it references.
+ ")
+     endif()
+   endforeach()
+   unset(_IMPORT_CHECK_FILES_FOR_${target})
+ endforeach()
+ unset(_IMPORT_CHECK_TARGETS)
+
+ # This file does not depend on other imported targets which have
+ # been exported from the same project but in a separate export set.
+
+ # Commands beyond this point should not need to know the version.
+ set(CMAKE_IMPORT_FILE_VERSION)
+ cmake_policy(POP)
cpp/{TSCharacters.ocd2 → ax630c/lib/libax_whisper.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:85291e0173e972bbca58c848fb90b3bb41c79674cb61a75645e01bd884ad5927
-size 46126
+oid sha256:469ffe5522d67f92b9b7d5390dc17740cbef3cd3b8b484542a8e1de44c11ad5a
+size 623784
cpp/ax630c/t2s.json ADDED
@@ -0,0 +1,22 @@
+ {
+   "name": "Traditional Chinese to Simplified Chinese",
+   "segmentation": {
+     "type": "mmseg",
+     "dict": {
+       "type": "ocd2",
+       "file": "TSPhrases.ocd2"
+     }
+   },
+   "conversion_chain": [{
+     "dict": {
+       "type": "group",
+       "dicts": [{
+         "type": "ocd2",
+         "file": "TSPhrases.ocd2"
+       }, {
+         "type": "ocd2",
+         "file": "TSCharacters.ocd2"
+       }]
+     }
+   }]
+ }
cpp/ax630c/whisper_cli ADDED
Binary file (93.8 kB)
 
cpp/{TSPhrases.ocd2 → ax630c/whisper_svr} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:eea69e525e01b8475a1b1ad45f78f25e5aa78986305f185ef6f85e11f5325387
-size 9782
+oid sha256:b2af4115e31d25a1c85f39666aba3918403449ca64e60d11f5af7e49c7456cd7
+size 531688
cpp/ax650/TSCharacters.ocd2 ADDED
Binary file (46.1 kB)
 
cpp/ax650/TSPhrases.ocd2 ADDED
Binary file (9.78 kB)
 
cpp/ax650/include/ax_whisper_api.h ADDED
@@ -0,0 +1,107 @@
1
+ /**
2
+ * @file ax_whisper_api.h
3
+ * @brief AX Whisper API header - C-compatible interface for Whisper ASR system
4
+ * @note This header provides a C interface to the Whisper speech recognition system
5
+ */
6
+
7
+ #ifndef _AX_WHISPER_API_H_
8
+ #define _AX_WHISPER_API_H_
9
+
10
+ #ifdef __cplusplus
11
+ extern "C" {
12
+ #endif
13
+
14
+ #define AX_WHISPER_API __attribute__((visibility("default")))
15
+
16
+ /**
17
+ * @brief Opaque handle type for Whisper ASR context
18
+ *
19
+ * This handle encapsulates all internal state of the Whisper ASR system.
20
+ * The actual implementation is hidden from C callers to maintain ABI stability.
21
+ */
22
+ typedef void* AX_WHISPER_HANDLE;
23
+
24
+ /**
25
+ * @brief Initialize the Whisper ASR system with specific configuration
26
+ *
27
+ * Creates and initializes a new Whisper ASR context with the specified
28
+ * model type, model path, and language. This function loads the appropriate
29
+ * models, configures the recognizer, and prepares it for speech recognition.
30
+ *
31
+ * @param model_type Type of Whisper model to use (e.g., "tiny", "base", "small", "medium", "large")
32
+ * or custom model identifier
33
+ * @param model_path Directory path where model files are stored
34
+ * Model files are expected to be in the format:
35
+ * - {model_path}/{model_type}/{model_type}-encoder.axmodel
36
+ * - {model_path}/{model_type}/{model_type}-decoder.axmodel
37
+ * - {model_path}/{model_type}/{model_type}-tokens.txt
38
+ * - {model_path}/{model_type}/{model_type}_config.json
39
+ * @param language Language code for recognition (e.g., "en", "zh", "ja", "ko")
40
+ * Use "auto" for automatic language detection if supported
41
+ *
42
+ * @return AX_WHISPER_HANDLE Opaque handle to the initialized Whisper context,
43
+ * or NULL if initialization fails
44
+ *
45
+ * @note The caller is responsible for calling AX_WHISPER_Uninit() to free
46
+ * resources when the handle is no longer needed.
47
+ * @note If language is not supported by the model, the function may fall back
48
+ * to a default language or return NULL.
49
+ * @example
50
+ * // Initialize English recognition with base model
+ * AX_WHISPER_HANDLE handle = AX_WHISPER_Init("base", "../models-ax650", "en");
+ *
+ */
+AX_WHISPER_API AX_WHISPER_HANDLE AX_WHISPER_Init(const char* model_type, const char* model_path, const char* language);
+
+/**
+ * @brief Deinitialize and release Whisper ASR resources
+ *
+ * Cleans up all resources associated with the Whisper context, including
+ * unloading models, freeing memory, and releasing hardware resources.
+ *
+ * @param handle Whisper context handle obtained from AX_WHISPER_Init()
+ *
+ * @warning After calling this function, the handle becomes invalid and
+ *          should not be used in any subsequent API calls.
+ */
+AX_WHISPER_API void AX_WHISPER_Uninit(AX_WHISPER_HANDLE handle);
+
+/**
+ * @brief Perform speech recognition on a WAV file and return a dynamically allocated string
+ *
+ * @param handle   Whisper context handle
+ * @param wav_file Path to the input 16 kHz pcmf32 WAV audio file
+ * @param result   Pointer to receive the allocated result string
+ *
+ * @return int Status code (0 = success, <0 = error)
+ *
+ * @note The returned string is allocated with malloc() and must be freed
+ *       by the caller using free() when no longer needed.
+ */
+AX_WHISPER_API int AX_WHISPER_RunFile(AX_WHISPER_HANDLE handle,
+                                      const char* wav_file,
+                                      char** result);
+
+/**
+ * @brief Perform speech recognition on raw PCM and return a dynamically allocated string
+ *
+ * @param handle      Whisper context handle
+ * @param pcm_data    16 kHz mono PCM float32 data, range -1.0 to 1.0
+ * @param num_samples Number of samples in the PCM data
+ * @param result      Pointer to receive the allocated result string
+ *
+ * @return int Status code (0 = success, <0 = error)
+ *
+ * @note The returned string is allocated with malloc() and must be freed
+ *       by the caller using free() when no longer needed.
+ */
+AX_WHISPER_API int AX_WHISPER_RunPCM(AX_WHISPER_HANDLE handle,
+                                     float* pcm_data,
+                                     int num_samples,
+                                     char** result);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _AX_WHISPER_API_H_
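The header above defines an init/run/free/uninit lifecycle. For quick experiments from Python, the shared library can be driven through ctypes; this is a sketch against the prototypes above, not code from the repo, and `load_whisper`/`transcribe_file` plus the model paths are illustrative assumptions:

```python
import ctypes


def load_whisper(lib_path: str) -> ctypes.CDLL:
    """Bind the C API declared in ax_whisper_api.h. Raises OSError if missing."""
    lib = ctypes.CDLL(lib_path)
    lib.AX_WHISPER_Init.restype = ctypes.c_void_p
    lib.AX_WHISPER_Init.argtypes = [ctypes.c_char_p, ctypes.c_char_p, ctypes.c_char_p]
    lib.AX_WHISPER_RunFile.restype = ctypes.c_int
    lib.AX_WHISPER_RunFile.argtypes = [
        ctypes.c_void_p,
        ctypes.c_char_p,
        ctypes.POINTER(ctypes.c_char_p),
    ]
    lib.AX_WHISPER_Uninit.restype = None
    lib.AX_WHISPER_Uninit.argtypes = [ctypes.c_void_p]
    return lib


def transcribe_file(lib: ctypes.CDLL, wav: str) -> str:
    # model type / path / language are placeholders from the header's example
    handle = lib.AX_WHISPER_Init(b"small", b"../models-ax650", b"zh")
    result = ctypes.c_char_p()
    ret = lib.AX_WHISPER_RunFile(handle, wav.encode(), ctypes.byref(result))
    text = result.value.decode() if ret == 0 and result.value else ""
    # the header says the string is malloc()'d; a real binding should free()
    # it via libc before the pointer goes out of scope
    lib.AX_WHISPER_Uninit(handle)
    return text
```

On device the library path would be e.g. `cpp/ax650/lib/libax_whisper.so`; off device, `load_whisper` fails cleanly with an OSError.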
cpp/ax650/lib/cmake/ax_whisper/ax_whisper-config-release.cmake ADDED
@@ -0,0 +1,19 @@
+#----------------------------------------------------------------
+# Generated CMake target import file for configuration "Release".
+#----------------------------------------------------------------
+
+# Commands may need to know the format version.
+set(CMAKE_IMPORT_FILE_VERSION 1)
+
+# Import target "ax::ax_whisper" for configuration "Release"
+set_property(TARGET ax::ax_whisper APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE)
+set_target_properties(ax::ax_whisper PROPERTIES
+  IMPORTED_LOCATION_RELEASE "${_IMPORT_PREFIX}/lib/libax_whisper.so"
+  IMPORTED_SONAME_RELEASE "libax_whisper.so"
+  )
+
+list(APPEND _IMPORT_CHECK_TARGETS ax::ax_whisper )
+list(APPEND _IMPORT_CHECK_FILES_FOR_ax::ax_whisper "${_IMPORT_PREFIX}/lib/libax_whisper.so" )
+
+# Commands beyond this point should not need to know the version.
+set(CMAKE_IMPORT_FILE_VERSION)
cpp/ax650/lib/cmake/ax_whisper/ax_whisper-config.cmake ADDED
@@ -0,0 +1,94 @@
+# Generated by CMake
+
+if("${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION}" LESS 2.6)
+   message(FATAL_ERROR "CMake >= 2.6.0 required")
+endif()
+cmake_policy(PUSH)
+cmake_policy(VERSION 2.6...3.20)
+#----------------------------------------------------------------
+# Generated CMake target import file.
+#----------------------------------------------------------------
+
+# Commands may need to know the format version.
+set(CMAKE_IMPORT_FILE_VERSION 1)
+
+# Protect against multiple inclusion, which would fail when already imported targets are added once more.
+set(_targetsDefined)
+set(_targetsNotDefined)
+set(_expectedTargets)
+foreach(_expectedTarget ax::ax_whisper)
+  list(APPEND _expectedTargets ${_expectedTarget})
+  if(NOT TARGET ${_expectedTarget})
+    list(APPEND _targetsNotDefined ${_expectedTarget})
+  endif()
+  if(TARGET ${_expectedTarget})
+    list(APPEND _targetsDefined ${_expectedTarget})
+  endif()
+endforeach()
+if("${_targetsDefined}" STREQUAL "${_expectedTargets}")
+  unset(_targetsDefined)
+  unset(_targetsNotDefined)
+  unset(_expectedTargets)
+  set(CMAKE_IMPORT_FILE_VERSION)
+  cmake_policy(POP)
+  return()
+endif()
+if(NOT "${_targetsDefined}" STREQUAL "")
+  message(FATAL_ERROR "Some (but not all) targets in this export set were already defined.\nTargets Defined: ${_targetsDefined}\nTargets not yet defined: ${_targetsNotDefined}\n")
+endif()
+unset(_targetsDefined)
+unset(_targetsNotDefined)
+unset(_expectedTargets)
+
+
+# Compute the installation prefix relative to this file.
+get_filename_component(_IMPORT_PREFIX "${CMAKE_CURRENT_LIST_FILE}" PATH)
+get_filename_component(_IMPORT_PREFIX "${_IMPORT_PREFIX}" PATH)
+get_filename_component(_IMPORT_PREFIX "${_IMPORT_PREFIX}" PATH)
+get_filename_component(_IMPORT_PREFIX "${_IMPORT_PREFIX}" PATH)
+if(_IMPORT_PREFIX STREQUAL "/")
+  set(_IMPORT_PREFIX "")
+endif()
+
+# Create imported target ax::ax_whisper
+add_library(ax::ax_whisper SHARED IMPORTED)
+
+set_target_properties(ax::ax_whisper PROPERTIES
+  INTERFACE_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include"
+)
+
+# Load information for each installed configuration.
+get_filename_component(_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH)
+file(GLOB CONFIG_FILES "${_DIR}/ax_whisper-config-*.cmake")
+foreach(f ${CONFIG_FILES})
+  include(${f})
+endforeach()
+
+# Cleanup temporary variables.
+set(_IMPORT_PREFIX)
+
+# Loop over all imported files and verify that they actually exist
+foreach(target ${_IMPORT_CHECK_TARGETS} )
+  foreach(file ${_IMPORT_CHECK_FILES_FOR_${target}} )
+    if(NOT EXISTS "${file}" )
+      message(FATAL_ERROR "The imported target \"${target}\" references the file
+   \"${file}\"
+but this file does not exist. Possible reasons include:
+* The file was deleted, renamed, or moved to another location.
+* An install or uninstall procedure did not complete successfully.
+* The installation package was faulty and contained
+   \"${CMAKE_CURRENT_LIST_FILE}\"
+but not all the files it references.
+")
+    endif()
+  endforeach()
+  unset(_IMPORT_CHECK_FILES_FOR_${target})
+endforeach()
+unset(_IMPORT_CHECK_TARGETS)
+
+# This file does not depend on other imported targets which have
+# been exported from the same project but in a separate export set.
+
+# Commands beyond this point should not need to know the version.
+set(CMAKE_IMPORT_FILE_VERSION)
+cmake_policy(POP)
cpp/{t2s.json → ax650/lib/libax_whisper.so} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b818534194f27c2d95f01001edb0a5ec49b9050119892cb30a0504bb202cc07c
-size 406
+oid sha256:a2aafbad2ea23d93c226fdb3d7d22ccbad44813638c78222787c8de9f85ec358
+size 624072
cpp/ax650/t2s.json ADDED
@@ -0,0 +1,22 @@
+{
+  "name": "Traditional Chinese to Simplified Chinese",
+  "segmentation": {
+    "type": "mmseg",
+    "dict": {
+      "type": "ocd2",
+      "file": "TSPhrases.ocd2"
+    }
+  },
+  "conversion_chain": [{
+    "dict": {
+      "type": "group",
+      "dicts": [{
+        "type": "ocd2",
+        "file": "TSPhrases.ocd2"
+      }, {
+        "type": "ocd2",
+        "file": "TSCharacters.ocd2"
+      }]
+    }
+  }]
+}
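This t2s.json configures an OpenCC-style Traditional-to-Simplified pipeline: input is segmented by maximum match, and the conversion chain consults the phrase table (TSPhrases.ocd2) before falling back to per-character mapping (TSCharacters.ocd2). A toy illustration of that lookup order, with made-up dictionary entries standing in for the real .ocd2 tables:

```python
# toy stand-ins for TSPhrases.ocd2 / TSCharacters.ocd2
PHRASES = {"電腦": "电脑"}                       # phrase table, tried first
CHARS = {"電": "电", "腦": "脑", "車": "车"}      # per-character fallback


def t2s(text: str) -> str:
    """Greedy longest-match conversion: phrases first, then single characters."""
    out, i = [], 0
    max_len = max(map(len, PHRASES), default=1)
    while i < len(text):
        for n in range(min(max_len, len(text) - i), 0, -1):
            seg = text[i : i + n]
            if seg in PHRASES:          # phrase dictionary wins
                out.append(PHRASES[seg])
                i += n
                break
        else:                           # no phrase matched: convert one char
            out.append(CHARS.get(text[i], text[i]))
            i += 1
    return "".join(out)
```

The real converter is table-driven and far larger, but the phrase-before-character precedence is exactly what the `conversion_chain` group in the JSON encodes.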
cpp/ax650/whisper_cli ADDED
Binary file (93.8 kB)
 
cpp/{whisper_aarch64 → ax650/whisper_svr} RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:70a165bd3b25a07c17fd45ed84dd04231f20dff7614ce4576ac090a51cc64513
-size 584440
+oid sha256:400a719b950cf61863179204d509c4f785b75184af2415bca9ed04a0198bf363
+size 531688
cpp/whisper_axcl_aarch64 DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:47ad92295eecfcd230f420acbb5ecd86211c53735aa216d935b22e17d740cd09
-size 1251248
cpp/whisper_axcl_x86 DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:f59c233fb5c7d935422b650b5cd968e4b40654d368c671e6551c8ea7dcc78cee
-size 1212056
cpp/whisper_srv DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:27f1a07763df0f8fc2a94cf32551daae4082111668ea5e938b93bdd6a76e3d29
-size 1006728
python/assets/multilingual.tiktoken DELETED
The diff for this file is too large to render.
 
python/languages.py DELETED
@@ -1,102 +0,0 @@
-WHISPER_LANGUAGES = {
-    "en": "english",
-    "zh": "chinese",
-    "de": "german",
-    "es": "spanish",
-    "ru": "russian",
-    "ko": "korean",
-    "fr": "french",
-    "ja": "japanese",
-    "pt": "portuguese",
-    "tr": "turkish",
-    "pl": "polish",
-    "ca": "catalan",
-    "nl": "dutch",
-    "ar": "arabic",
-    "sv": "swedish",
-    "it": "italian",
-    "id": "indonesian",
-    "hi": "hindi",
-    "fi": "finnish",
-    "vi": "vietnamese",
-    "he": "hebrew",
-    "uk": "ukrainian",
-    "el": "greek",
-    "ms": "malay",
-    "cs": "czech",
-    "ro": "romanian",
-    "da": "danish",
-    "hu": "hungarian",
-    "ta": "tamil",
-    "no": "norwegian",
-    "th": "thai",
-    "ur": "urdu",
-    "hr": "croatian",
-    "bg": "bulgarian",
-    "lt": "lithuanian",
-    "la": "latin",
-    "mi": "maori",
-    "ml": "malayalam",
-    "cy": "welsh",
-    "sk": "slovak",
-    "te": "telugu",
-    "fa": "persian",
-    "lv": "latvian",
-    "bn": "bengali",
-    "sr": "serbian",
-    "az": "azerbaijani",
-    "sl": "slovenian",
-    "kn": "kannada",
-    "et": "estonian",
-    "mk": "macedonian",
-    "br": "breton",
-    "eu": "basque",
-    "is": "icelandic",
-    "hy": "armenian",
-    "ne": "nepali",
-    "mn": "mongolian",
-    "bs": "bosnian",
-    "kk": "kazakh",
-    "sq": "albanian",
-    "sw": "swahili",
-    "gl": "galician",
-    "mr": "marathi",
-    "pa": "punjabi",
-    "si": "sinhala",
-    "km": "khmer",
-    "sn": "shona",
-    "yo": "yoruba",
-    "so": "somali",
-    "af": "afrikaans",
-    "oc": "occitan",
-    "ka": "georgian",
-    "be": "belarusian",
-    "tg": "tajik",
-    "sd": "sindhi",
-    "gu": "gujarati",
-    "am": "amharic",
-    "yi": "yiddish",
-    "lo": "lao",
-    "uz": "uzbek",
-    "fo": "faroese",
-    "ht": "haitian creole",
-    "ps": "pashto",
-    "tk": "turkmen",
-    "nn": "nynorsk",
-    "mt": "maltese",
-    "sa": "sanskrit",
-    "lb": "luxembourgish",
-    "my": "myanmar",
-    "bo": "tibetan",
-    "tl": "tagalog",
-    "mg": "malagasy",
-    "as": "assamese",
-    "tt": "tatar",
-    "haw": "hawaiian",
-    "ln": "lingala",
-    "ha": "hausa",
-    "ba": "bashkir",
-    "jw": "javanese",
-    "su": "sundanese",
-    "yue": "cantonese",
-}
python/main.py DELETED
@@ -1,74 +0,0 @@
-import argparse
-import os
-from whisper import Whisper
-import time
-
-
-def get_args():
-    parser = argparse.ArgumentParser(
-        prog="whisper", description="Run Whisper on input audio file"
-    )
-    parser.add_argument("--wav", "-w", type=str, required=True, help="Input audio file")
-    parser.add_argument(
-        "--model_type",
-        "-t",
-        type=str,
-        choices=["tiny", "base", "small", "large", "large-v3", "turbo"],
-        required=True,
-        help="model type, only support tiny, base and small currently",
-    )
-    parser.add_argument(
-        "--model_path",
-        "-p",
-        type=str,
-        required=False,
-        default="../models-ax650",
-        help="model path for *.axmodel, tokens.txt, positional_embedding.bin",
-    )
-    parser.add_argument(
-        "--language",
-        "-l",
-        type=str,
-        required=False,
-        default="zh",
-        help="Target language, support en, zh, ja, and others. See languages.py for more options.",
-    )
-    parser.add_argument(
-        "--task",
-        type=str,
-        required=False,
-        choices=["translate", "transcribe"],
-        default="transcribe",
-    )
-    parser.add_argument(
-        "--print_rtf", action="store_true", help="Print Real-Time Factor"
-    )
-    return parser.parse_args()
-
-
-def main():
-    args = get_args()
-    print(vars(args))
-
-    # Check wav existence
-    wav_path = args.wav
-    assert os.path.exists(wav_path), f"{wav_path} NOT exist"
-
-    model = Whisper(args.model_type, args.model_path, args.language, args.task)
-
-    print("ASR result:")
-    start = time.time()
-    print(model.run(wav_path))
-    end = time.time()
-
-    if args.print_rtf:
-        import librosa
-
-        samples, sr = librosa.load(wav_path, sr=16000)
-        duration = len(samples) / sr
-        process_time = end - start
-        print(f"RTF: {process_time / duration}")
-
-
-if __name__ == "__main__":
-    main()
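The deleted main.py computed an RTF (Real-Time Factor): wall-clock processing time divided by audio duration, where values below 1.0 mean faster-than-real-time transcription. The same arithmetic, stripped of the librosa dependency:

```python
def real_time_factor(process_time_s: float, n_samples: int, sample_rate: int = 16000) -> float:
    """RTF = wall-clock processing time / audio duration in seconds."""
    duration = n_samples / sample_rate
    return process_time_s / duration


# e.g. 2.0 s of processing for 5 s of 16 kHz audio -> RTF 0.4
rtf = real_time_factor(2.0, 80000)
```

The Python runner measured `process_time_s` with `time.time()` around `model.run()` and took `n_samples` from the loaded 16 kHz waveform.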
python/test_svr.py ADDED
@@ -0,0 +1,46 @@
+import requests
+
+
+def transcribe_audio(
+    server_url: str,
+    wav_path: str,
+    model_type: str = "tiny",
+    model_path: str = "../models-ax650",
+    language: str = "zh",
+    task: str = "transcribe",
+):
+    url = f"{server_url.rstrip('/')}/asr"
+
+    files = {
+        "wav": open(wav_path, "rb"),
+    }
+
+    data = {
+        "model_type": model_type,
+        "model_path": model_path,
+        "language": language,
+        "task": task,
+    }
+
+    print(f"Sending request to: {url}")
+
+    response = requests.post(url, files=files, data=data)
+    if response.status_code != 200:
+        print("❌ Error:", response.text)
+        return None
+
+    result = response.json()
+    print("Server response:")
+    print(result)
+
+    return result
+
+
+if __name__ == "__main__":
+    # server address
+    SERVER = "http://127.0.0.1:8000"
+
+    # local wav file path
+    WAV = "../demo.wav"
+
+    transcribe_audio(SERVER, WAV)
python/test_wer.py CHANGED
@@ -2,14 +2,19 @@ import argparse
 import os
 import logging
 import re
-from whisper import Whisper
+import pandas as pd
+from typing import Tuple
+import numpy as np
+import soundfile as sf
+import zhconv
+import librosa
 
 
-def setup_logging():
+def setup_logging(filename):
     """Configure logging to output to both the console and a file"""
     # directory containing this script
     script_dir = os.path.dirname(os.path.abspath(__file__))
-    log_file = os.path.join(script_dir, "test_wer.log")
+    log_file = os.path.join(script_dir, f"{filename}.log")
 
     # log format
     log_format = "%(asctime)s - %(levelname)s - %(message)s"
@@ -24,7 +29,7 @@ def setup_logging():
     logger.removeHandler(handler)
 
     # create the file handler
-    file_handler = logging.FileHandler(log_file, mode="a", encoding="utf-8")
+    file_handler = logging.FileHandler(log_file, mode="w", encoding="utf-8")
     file_handler.setLevel(logging.INFO)
     file_formatter = logging.Formatter(log_format, date_format)
     file_handler.setFormatter(file_formatter)
@@ -42,6 +47,61 @@ def setup_logging():
     return logger
 
 
+def load_audio(filename: str) -> Tuple[np.ndarray, int]:
+    data, sample_rate = sf.read(
+        filename,
+        always_2d=True,
+        dtype="float32",
+    )
+    data = data[:, 0]  # use only the first channel
+    if sample_rate != 16000:
+        data = librosa.resample(data, orig_sr=sample_rate, target_sr=16000)
+        sample_rate = 16000
+    samples = np.ascontiguousarray(data)
+    return samples, sample_rate
+
+
+def compute_feat(filename: str, n_mels: int = 80):
+    audio, sample_rate = load_audio(filename)
+    if sample_rate != 16000:
+        audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=16000)
+        sample_rate = 16000
+
+    mel = librosa.feature.melspectrogram(
+        y=audio,
+        sr=sample_rate,
+        n_fft=480,
+        hop_length=160,
+        window="hann",
+        center=True,
+        pad_mode="reflect",
+        power=2.0,
+        n_mels=n_mels,
+    )
+
+    log_spec = np.log10(np.maximum(mel, 1e-10))
+    log_spec = np.maximum(log_spec, log_spec.max() - 8.0)
+    mel = (log_spec + 4.0) / 4.0
+
+    target = 3000
+    if mel.shape[1] > target:
+        # -50 so that there are some zero tail paddings.
+        mel = mel[:, :target]
+        mel[:, -50:] = 0
+
+    # We don't need to pad it to 30 seconds now!
+    if mel.shape[1] < target:
+        mel = np.concatenate(
+            (
+                mel,
+                np.zeros((n_mels, target - mel.shape[1]), dtype=np.float32),
+            ),
+            axis=-1,
+        )
+
+    return mel[np.newaxis, ...]
+
+
 class AIShellDataset:
     def __init__(self, gt_path: str):
         """
@@ -149,6 +209,56 @@ class CommonVoiceDataset:
         return len(self.data)
 
 
+class CustomDataset:
+    """Custom dataset parser"""
+
+    def __init__(self, label_path: str):
+        """
+        Initialize the dataset
+        """
+
+        self.label_path = label_path
+        self.dataset_dir = os.path.dirname(label_path)
+
+        # check that the required files exist
+        assert os.path.exists(label_path), f"File not found: {label_path}"
+
+        # load the csv
+        self.data = []
+        df = pd.read_csv(label_path, sep="\t")
+        for i, row in df.iterrows():
+            audio_path = os.path.join(
+                self.dataset_dir, row["SPEAKER_ID"], row["UTTRANS_ID"]
+            )
+            gt = row["TRANSCRIPTION"]
+            self.data.append({"audio_path": audio_path, "gt": gt})
+
+        # use logging instead of print
+        logger = logging.getLogger()
+        logger.info(f"Loaded {len(self.data)} samples")
+
+    def __iter__(self):
+        """Return an iterator"""
+        self.index = 0
+        return self
+
+    def __next__(self):
+        """Return the next item"""
+        if self.index >= len(self.data):
+            raise StopIteration
+
+        item = self.data[self.index]
+        audio_path = item["audio_path"]
+        ground_truth = item["gt"]
+
+        self.index += 1
+        return audio_path, ground_truth
+
+    def __len__(self):
+        """Return the dataset size"""
+        return len(self.data)
+
+
 def get_args():
     parser = argparse.ArgumentParser(prog="whisper", description="Test WER on dataset")
     parser.add_argument(
@@ -156,7 +266,7 @@ def get_args():
         "-d",
         type=str,
         required=True,
-        choices=["aishell", "common_voice"],
+        choices=["aishell", "common_voice", "custom"],
         help="Test dataset",
     )
     parser.add_argument(
@@ -173,7 +283,7 @@ def get_args():
         "--model_type",
         "-t",
         type=str,
-        choices=["tiny", "base", "small", "large", "large-v3", "turbo"],
+        choices=["tiny", "base", "small", "medium", "large", "large-v3", "turbo"],
         required=True,
         help="model type, only support tiny, base and small currently",
     )
@@ -182,8 +292,11 @@ def get_args():
         "-p",
         type=str,
         required=False,
-        default="../models/models-ax650",
-        help="model path for *.axmodel, tokens.txt, positional_embedding.bin",
+        default="../models-ax650",
+        help="model path for *.axmodel, tokens.txt",
+    )
+    parser.add_argument(
+        "--repo_id", type=str, default=None, help="repo id from huggingface"
     )
     parser.add_argument(
         "--language",
@@ -193,17 +306,16 @@ def get_args():
         default="zh",
         help="Target language, support en, zh, ja, and others. See languages.py for more options.",
     )
+    parser.add_argument(
+        "--backend", type=str, default="ax", choices=["ax", "torch", "onnx"]
+    )
+    parser.add_argument("--log_name", type=str, default="test_wer")
     return parser.parse_args()
 
 
 def print_args(args):
     logger = logging.getLogger()
-    logger.info(f"dataset: {args.dataset}")
-    logger.info(f"gt_path: {args.gt_path}")
-    logger.info(f"max_num: {args.max_num}")
-    logger.info(f"model_type: {args.model_type}")
-    logger.info(f"model_path: {args.model_path}")
-    logger.info(f"language: {args.language}")
+    logger.info(vars(args))
 
 
 def min_distance(word1: str, word2: str) -> int:
@@ -247,10 +359,10 @@ def remove_punctuation(text):
 
 
 def main():
-    # set up logging
-    logger = setup_logging()
-
     args = get_args()
+
+    # set up logging
+    logger = setup_logging(args.log_name)
     print_args(args)
 
    dataset_type = args.dataset.lower()
@@ -258,26 +370,88 @@ def main():
         dataset = AIShellDataset(args.gt_path)
     elif dataset_type == "common_voice":
         dataset = CommonVoiceDataset(args.gt_path)
+    elif dataset_type == "custom":
+        dataset = CustomDataset(args.gt_path)
     else:
         raise ValueError(f"Unknown dataset type {dataset_type}")
 
     max_num = args.max_num
 
     # Load model
-    model = Whisper(args.model_type, args.model_path, args.language, "transcribe")
+    use_hf_model = False
+    tokenizer = None
+    task = "transcribe"
+
+    if args.backend == "ax":
+        from whisper_ax import Whisper
+
+        model = Whisper(args.model_type, args.model_path, args.language, task)
+    elif args.backend == "torch":
+        if args.repo_id is not None:
+            use_hf_model = True
+
+            from transformers import WhisperForConditionalGeneration
+            import torch
+
+            model = WhisperForConditionalGeneration.from_pretrained(
+                args.repo_id,
+                dtype=torch.float32,
+            ).cpu()
+        else:
+            import whisper
+
+            model = whisper.load_model(args.model_type).cpu()
+
+        tokenizer = whisper.tokenizer.get_tokenizer(multilingual=True)
+    elif args.backend == "onnx":
+        import onnxruntime as ort
+        from ..model_convert.generate_data import OnnxModel
+
+        encoder_path = os.path.join(
+            args.model_path, f"{args.model_type}/{args.model_type}-encoder.onnx"
+        )
+        decoder_path = os.path.join(
+            args.model_path, f"{args.model_type}/{args.model_type}-decoder.onnx"
+        )
+        model = OnnxModel(encoder_path, decoder_path)
 
     # Iterate over dataset
     references = []
     hyp = []
     all_character_error_num = 0
     all_character_num = 0
-    wer_file = open("wer.txt", "w")
     max_data_num = max_num if max_num > 0 else len(dataset)
     for n, (audio_path, reference) in enumerate(dataset):
-        hypothesis = model.run(audio_path)
+        if args.backend == "ax":
+            hypothesis = model.run(audio_path)
+        elif args.backend == "torch":
+            if use_hf_model:
+                with torch.no_grad():
+                    feature = compute_feat(audio_path, model.config.num_mel_bins)
+                    r = model.generate(
+                        torch.from_numpy(feature),
+                        output_scores=True,
+                        return_dict_in_generate=True,
+                        return_timestamps=False,
+                        language=args.language,
+                        task="transcribe",
+                    )
+
+                tokens = r["sequences"][0][4:-1]
+                hypothesis = "".join(tokenizer.decode(tokens)).strip()
+            else:
+                result = model.transcribe(
+                    audio_path, fp16=False, language=args.language
+                )
+                hypothesis = result["text"]
+                if args.language == "zh":
+                    hypothesis = zhconv.convert(hypothesis, "zh-hans")
+
+        elif args.backend == "onnx":
+            hypothesis = model.run(audio_path, args.language, task)
 
-        hypothesis = remove_punctuation(hypothesis)
-        reference = remove_punctuation(reference)
+        hypothesis = remove_punctuation(hypothesis).lower()
+        reference = remove_punctuation(reference).lower()
 
         character_error_num = min_distance(reference, hypothesis)
         character_num = len(reference)
@@ -290,7 +464,6 @@ def main():
         references.append(reference)
 
         line_content = f"({n+1}/{max_data_num}) {os.path.basename(audio_path)} gt: {reference} predict: {hypothesis} WER: {character_error_rate}%"
-        wer_file.write(line_content + "\n")
         logger.info(line_content)
 
         if n + 1 >= max_data_num:
@@ -299,8 +472,6 @@ def main():
     total_character_error_rate = all_character_error_num / all_character_num * 100
 
     logger.info(f"Total WER: {total_character_error_rate}%")
-    wer_file.write(f"Total WER: {total_character_error_rate}%")
-    wer_file.close()
 
 
 if __name__ == "__main__":
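test_wer.py scores each hypothesis with `min_distance` (character-level edit distance) and reports CER as errors over reference length. A self-contained sketch of that metric; the rolling-array DP here is a standard Levenshtein implementation, not necessarily line-for-line what the repo uses:

```python
def min_distance(ref: str, hyp: str) -> int:
    """Levenshtein edit distance via dynamic programming (one rolling row)."""
    m, n = len(ref), len(hyp)
    dp = list(range(n + 1))  # distance from "" to each prefix of hyp
    for i in range(1, m + 1):
        prev, dp[0] = dp[0], i  # prev holds dp_old[j-1]
        for j in range(1, n + 1):
            cur = dp[j]
            if ref[i - 1] == hyp[j - 1]:
                dp[j] = prev  # characters match: no extra edit
            else:
                # substitution, deletion, insertion
                dp[j] = 1 + min(prev, dp[j], dp[j - 1])
            prev = cur
    return dp[n]


def cer(ref: str, hyp: str) -> float:
    """Character error rate in percent, as logged per utterance above."""
    return min_distance(ref, hyp) / len(ref) * 100
```

The total WER in the script is the same ratio computed over the summed error and character counts of the whole dataset rather than per utterance.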
python/{whisper.py → whisper_ax.py} RENAMED
@@ -2,11 +2,11 @@ import axengine as axe
2
  import numpy as np
3
  import librosa
4
  import os
5
- from typing import Union
6
- from whisper_tokenizer import *
7
  import json
8
- from dataclasses import dataclass
9
  import zhconv
 
10
 
11
 
12
  @dataclass
@@ -34,11 +34,9 @@ class WhisperConfig:
34
 
35
  class Whisper:
36
  def __init__(self, model_type: str, model_path: str, language: str, task: str):
37
- assert task in ["translate", "transcribe"]
38
-
39
  self.language = language
40
  self.task = task
41
- self.encoder, self.decoder, self.tokenizer, model_config = self.load_model(
42
  model_type, model_path, language, task
43
  )
44
  self.config = self.load_config(model_config)
@@ -73,16 +71,20 @@ class Whisper:
73
  model_config["all_language_codes"] = [
74
  i for i in model_config["all_language_codes"].split(",")
75
  ]
76
- tokenizer = get_tokenizer(
77
- model_config["is_multilingual"],
78
- num_languages=len(model_config["all_language_codes"]),
79
- language=language,
80
- task=task,
81
- )
82
 
83
  self.id2token = self.load_tokens(required_files[3])
 
 
 
 
 
 
 
 
 
 
84
 
85
- return encoder, decoder, tokenizer, model_config
86
 
87
  def load_config(self, model_config):
88
  config = WhisperConfig
@@ -109,6 +111,7 @@ class Whisper:
109
  task_token = (
110
  config.transcribe if self.task == "transcribe" else config.translate
111
  )
 
112
  config.sot_sequence = np.array(
113
  [config.sot, lang_token, task_token, config.no_timestamps], dtype=np.int32
114
  )
@@ -124,9 +127,14 @@ class Whisper:
124
  return tokens
125
 
126
  def load_audio(self, audio: str):
127
- data, sample_rate = librosa.load(audio, sr=self.config.sample_rate)
128
- samples = np.ascontiguousarray(data)
129
- return samples, sample_rate
 
 
 
 
 
130
 
131
  def compute_feature(self, audio: np.ndarray):
132
  mel = librosa.feature.melspectrogram(
@@ -189,19 +197,27 @@ class Whisper:
189
  return out
190
 
191
  def get_self_cache(self) -> List[np.ndarray]:
192
- self_cache = []
193
  batch_size = 1
194
- for i in range(self.config.n_text_layer):
195
- k = np.zeros(
196
- (batch_size, self.config.n_text_ctx, self.config.n_text_state),
197
- dtype=np.float32,
198
- )
199
- v = np.zeros(
200
- (batch_size, self.config.n_text_ctx, self.config.n_text_state),
201
- dtype=np.float32,
202
- )
203
- self_cache.extend([k, v])
204
- return self_cache
 
 
 
 
 
 
 
 
 
205
 
206
  def causal_mask_1d(self, n: int, L: int):
207
  """
@@ -214,47 +230,46 @@ class Whisper:
214
  mask[:n] = 0
215
  return mask
216
 
217
- def run(self, audio: Union[str, np.ndarray]) -> str:
218
- if isinstance(audio, str):
219
- audio, sample_rate = self.load_audio(audio)
220
-
221
- mel = self.compute_feature(audio)
222
-
223
- cross_kv = self.run_encoder(mel)
224
 
225
- self_kv = self.get_self_cache()
226
 
227
  offset = np.array([0], dtype=np.int32)
228
  for t in self.config.sot_sequence:
229
  token = np.array([[t]], dtype=np.int32) # sot
230
  mask = self.causal_mask_1d(offset.item(), self.config.n_text_ctx)
231
 
232
- out = self.run_decoder([token] + self_kv + cross_kv + [offset, mask])
 
 
233
 
234
- for i in range(1, len(out)):
235
- self_kv[i - 1][:, offset.item() : offset.item() + 1, :] = out[i]
236
 
237
  offset += 1
238
 
239
- idx = out[0][0, 0].argmax()
240
 
241
  eot = self.config.eot
242
 
243
  ans = []
244
 
245
- while idx != eot and offset.item() < 100:
246
  ans.append(idx)
247
  token = np.array([[idx]], dtype=np.int32)
248
 
249
  mask = self.causal_mask_1d(offset.item(), self.config.n_text_ctx)
250
 
251
- out = self.run_decoder([token] + self_kv + cross_kv + [offset, mask])
 
 
252
 
253
- for i in range(1, len(out)):
254
- self_kv[i - 1][:, offset.item() : offset.item() + 1, :] = out[i]
255
 
256
  offset += 1
257
- idx = out[0][0, 0].argmax()
258
 
259
  # print(ans)
260
 
@@ -273,3 +288,19 @@ class Whisper:
273
  return text
274
 
275
  return text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import numpy as np
3
  import librosa
4
  import os
5
+ from typing import Union, List
 
6
  import json
7
+ from dataclasses import dataclass, field
8
  import zhconv
9
+ import base64
10
 
11
 
12
  @dataclass
 
34
 
35
  class Whisper:
36
  def __init__(self, model_type: str, model_path: str, language: str, task: str):
 
 
37
  self.language = language
38
  self.task = task
39
+ self.encoder, self.decoder, model_config = self.load_model(
40
  model_type, model_path, language, task
41
  )
42
  self.config = self.load_config(model_config)
 
71
  model_config["all_language_codes"] = [
72
  i for i in model_config["all_language_codes"].split(",")
73
  ]
 
 
 
 
 
 
74
 
75
  self.id2token = self.load_tokens(required_files[3])
76
+ self.lang2token = {
77
+ k: v
78
+ for k, v in zip(
79
+ model_config["all_language_codes"], model_config["all_language_tokens"]
80
+ )
81
+ }
82
+ self.task2token = {
83
+ "transcribe": model_config["transcribe"],
84
+ "translate": model_config["translate"],
85
+ }
86
 
87
+ return encoder, decoder, model_config
88
 
89
  def load_config(self, model_config):
90
  config = WhisperConfig
 
111
  task_token = (
112
  config.transcribe if self.task == "transcribe" else config.translate
113
  )
114
+
115
  config.sot_sequence = np.array(
116
  [config.sot, lang_token, task_token, config.no_timestamps], dtype=np.int32
117
  )
 
127
  return tokens
128
 
129
  def load_audio(self, audio: str):
130
+ samples, sample_rate = librosa.load(audio, sr=self.config.sample_rate)
131
+ if sample_rate != self.config.sample_rate:
132
+ samples = librosa.resample(
133
+ samples, orig_sr=sample_rate, target_sr=self.config.sample_rate
134
+ )
135
+
136
+ samples = np.ascontiguousarray(samples)
137
+ return samples, self.config.sample_rate
138
 
139
  def compute_feature(self, audio: np.ndarray):
140
  mel = librosa.feature.melspectrogram(
 
          return out

      def get_self_cache(self) -> List[np.ndarray]:
          batch_size = 1
+
+         self_k = np.zeros(
+             (
+                 self.config.n_text_layer,
+                 batch_size,
+                 self.config.n_text_ctx,
+                 self.config.n_text_state,
+             ),
+             dtype=np.float32,
+         )
+         self_v = np.zeros(
+             (
+                 self.config.n_text_layer,
+                 batch_size,
+                 self.config.n_text_ctx,
+                 self.config.n_text_state,
+             ),
+             dtype=np.float32,
+         )
+         return self_k, self_v

      def causal_mask_1d(self, n: int, L: int):
          """

          mask[:n] = 0
          return mask

+     def run_mel(self, mel):
+         cross_k, cross_v = self.run_encoder(mel)
+
+         self_k, self_v = self.get_self_cache()

          offset = np.array([0], dtype=np.int32)
          for t in self.config.sot_sequence:
              token = np.array([[t]], dtype=np.int32)  # sot
              mask = self.causal_mask_1d(offset.item(), self.config.n_text_ctx)

+             logits, this_self_k, this_self_v = self.run_decoder(
+                 [token] + [self_k, self_v] + [cross_k, cross_v] + [offset, mask]
+             )

+             self_k[:, :, offset.item() : offset.item() + 1, :] = this_self_k
+             self_v[:, :, offset.item() : offset.item() + 1, :] = this_self_v

              offset += 1

+         idx = logits[0, 0].argmax()

          eot = self.config.eot

          ans = []

+         while idx != eot and offset.item() < self.config.n_text_ctx:
              ans.append(idx)
              token = np.array([[idx]], dtype=np.int32)

              mask = self.causal_mask_1d(offset.item(), self.config.n_text_ctx)

+             logits, this_self_k, this_self_v = self.run_decoder(
+                 [token] + [self_k, self_v] + [cross_k, cross_v] + [offset, mask]
+             )

+             self_k[:, :, offset.item() : offset.item() + 1, :] = this_self_k
+             self_v[:, :, offset.item() : offset.item() + 1, :] = this_self_v

              offset += 1
+             idx = logits[0, 0].argmax()

          # print(ans)

              return text

          return text
+
+     def run(
+         self, audio: Union[str, np.ndarray], language: str = None, task: str = None
+     ) -> str:
+         if isinstance(audio, str):
+             audio, sample_rate = self.load_audio(audio)
+
+         mel = self.compute_feature(audio)
+
+         if language is not None and self.language != language:
+             self.config.sot_sequence[1] = self.lang2token[language]
+
+         if task is not None and self.task != task:
+             self.config.sot_sequence[2] = self.task2token[task]
+
+         return self.run_mel(mel)
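The decode loop in `run_mel` above is a greedy, one-token-per-step decoder: each step produces a single-position self-attention K/V slice that is written back into the preallocated cache at the current `offset`. A minimal numpy sketch of that cache update — the tiny dimensions here are illustrative stand-ins, not real `WhisperConfig` values:

```python
import numpy as np

# Illustrative dimensions; real values come from the model config.
n_text_layer, batch, n_text_ctx, n_text_state = 2, 1, 8, 4

# Preallocated self-attention cache, as in get_self_cache().
self_k = np.zeros((n_text_layer, batch, n_text_ctx, n_text_state), dtype=np.float32)

# One decode step yields a single-position slice per layer.
offset = 3
this_self_k = np.ones((n_text_layer, batch, 1, n_text_state), dtype=np.float32)

# Write it into the cache at the current position, as run_mel() does.
self_k[:, :, offset : offset + 1, :] = this_self_k

assert self_k[:, :, offset, :].all()            # position 3 is now filled
assert not self_k[:, :, offset + 1 :, :].any()  # later positions remain empty
```

The slice-assign keeps the cache a fixed `n_text_ctx`-length buffer, so decoding never reallocates; the causal mask from `causal_mask_1d` then limits attention to positions below `offset`.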
python/whisper_cli.py CHANGED
@@ -1,46 +1,70 @@
- import requests
-
-
- def transcribe_audio(
-     server_url: str,
-     wav_path: str,
-     model_type: str = "tiny",
-     model_path: str = "../models/models-ax650",
-     language: str = "zh",
-     task: str = "transcribe",
- ):
-     url = f"{server_url.rstrip('/')}/asr"
-
-     files = {
-         "wav": open(wav_path, "rb"),
-     }
-
-     data = {
-         "model_type": model_type,
-         "model_path": model_path,
-         "language": language,
-         "task": task,
-     }
-
-     print(f"Sending request to: {url}")
-
-     response = requests.post(url, files=files, data=data)
-     if response.status_code != 200:
-         print("❌ Error:", response.text)
-         return None
-
-     result = response.json()
-     print("Server response:")
-     print(result)
-
-     return result
-
-
- if __name__ == "__main__":
-     # Your server address
-     SERVER = "http://127.0.0.1:8000"
-
-     # Local wav file path
-     WAV = "../demo.wav"
-
-     transcribe_audio(SERVER, WAV)
 
 
+ import argparse
+ import os
+ from whisper_ax import Whisper
+ import time
+
+
+ def get_args():
+     parser = argparse.ArgumentParser(
+         prog="whisper", description="Run Whisper on input audio file"
+     )
+     parser.add_argument("--wav", "-w", type=str, required=True, help="Input audio file")
+     parser.add_argument(
+         "--model_type",
+         "-t",
+         type=str,
+         choices=["tiny", "base", "small", "large", "large-v3", "turbo"],
+         required=True,
+         help="Model type (tiny, base, small, large, large-v3, turbo)",
+     )
+     parser.add_argument(
+         "--model_path",
+         "-p",
+         type=str,
+         required=False,
+         default="../models-ax650",
+         help="Model path for *.axmodel, tokens.txt",
+     )
+     parser.add_argument(
+         "--language",
+         "-l",
+         type=str,
+         required=False,
+         default="zh",
+         help="Target language, e.g. en, zh, ja. See languages.py for more options.",
+     )
+     parser.add_argument(
+         "--task",
+         type=str,
+         required=False,
+         choices=["translate", "transcribe"],
+         default="transcribe",
+     )
+     return parser.parse_args()
+
+
+ def main():
+     args = get_args()
+     print(vars(args))
+
+     # Check wav existence
+     wav_path = args.wav
+     assert os.path.exists(wav_path), f"{wav_path} does not exist"
+
+     model = Whisper(args.model_type, args.model_path, args.language, args.task)
+
+     print("ASR result:")
+     start = time.time()
+     print(model.run(wav_path))
+     end = time.time()
+
+     import librosa
+
+     samples, sr = librosa.load(wav_path, sr=16000)
+     duration = len(samples) / sr
+     process_time = end - start
+     print(f"RTF: {process_time / duration}")
+
+
+ if __name__ == "__main__":
+     main()
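The HTTP client deleted above still documents the request shape that `whisper_svr.py` serves. A minimal sketch that only assembles the request, with the `/asr` endpoint and form fields taken from the removed client — treat these field names as assumptions, since the server contract may drift:

```python
def build_asr_request(
    server_url: str,
    wav_path: str,
    model_type: str = "tiny",
    model_path: str = "../models/models-ax650",
    language: str = "zh",
    task: str = "transcribe",
):
    """Assemble the URL and form fields the removed client sent to /asr."""
    url = f"{server_url.rstrip('/')}/asr"
    # File part: the caller opens the file only when actually posting.
    files = {"wav": wav_path}
    data = {
        "model_type": model_type,
        "model_path": model_path,
        "language": language,
        "task": task,
    }
    return url, files, data


url, files, data = build_asr_request("http://127.0.0.1:8000/", "../demo.wav")
print(url)  # http://127.0.0.1:8000/asr
```

Posting then mirrors the deleted code: `requests.post(url, files={"wav": open(wav_path, "rb")}, data=data)`.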
python/whisper_svr.py CHANGED
@@ -5,7 +5,7 @@ import tempfile
  from http.server import BaseHTTPRequestHandler, HTTPServer
  from urllib.parse import parse_qs

- from whisper import Whisper
+ from whisper_ax import Whisper
  import cgi

python/whisper_tokenizer.py DELETED
@@ -1,395 +0,0 @@
- import base64
- import os
- import string
- from dataclasses import dataclass, field
- from functools import cached_property, lru_cache
- from typing import Dict, List, Optional, Tuple
-
- import tiktoken
-
- LANGUAGES = {
-     "en": "english",
-     "zh": "chinese",
-     "de": "german",
-     "es": "spanish",
-     "ru": "russian",
-     "ko": "korean",
-     "fr": "french",
-     "ja": "japanese",
-     "pt": "portuguese",
-     "tr": "turkish",
-     "pl": "polish",
-     "ca": "catalan",
-     "nl": "dutch",
-     "ar": "arabic",
-     "sv": "swedish",
-     "it": "italian",
-     "id": "indonesian",
-     "hi": "hindi",
-     "fi": "finnish",
-     "vi": "vietnamese",
-     "he": "hebrew",
-     "uk": "ukrainian",
-     "el": "greek",
-     "ms": "malay",
-     "cs": "czech",
-     "ro": "romanian",
-     "da": "danish",
-     "hu": "hungarian",
-     "ta": "tamil",
-     "no": "norwegian",
-     "th": "thai",
-     "ur": "urdu",
-     "hr": "croatian",
-     "bg": "bulgarian",
-     "lt": "lithuanian",
-     "la": "latin",
-     "mi": "maori",
-     "ml": "malayalam",
-     "cy": "welsh",
-     "sk": "slovak",
-     "te": "telugu",
-     "fa": "persian",
-     "lv": "latvian",
-     "bn": "bengali",
-     "sr": "serbian",
-     "az": "azerbaijani",
-     "sl": "slovenian",
-     "kn": "kannada",
-     "et": "estonian",
-     "mk": "macedonian",
-     "br": "breton",
-     "eu": "basque",
-     "is": "icelandic",
-     "hy": "armenian",
-     "ne": "nepali",
-     "mn": "mongolian",
-     "bs": "bosnian",
-     "kk": "kazakh",
-     "sq": "albanian",
-     "sw": "swahili",
-     "gl": "galician",
-     "mr": "marathi",
-     "pa": "punjabi",
-     "si": "sinhala",
-     "km": "khmer",
-     "sn": "shona",
-     "yo": "yoruba",
-     "so": "somali",
-     "af": "afrikaans",
-     "oc": "occitan",
-     "ka": "georgian",
-     "be": "belarusian",
-     "tg": "tajik",
-     "sd": "sindhi",
-     "gu": "gujarati",
-     "am": "amharic",
-     "yi": "yiddish",
-     "lo": "lao",
-     "uz": "uzbek",
-     "fo": "faroese",
-     "ht": "haitian creole",
-     "ps": "pashto",
-     "tk": "turkmen",
-     "nn": "nynorsk",
-     "mt": "maltese",
-     "sa": "sanskrit",
-     "lb": "luxembourgish",
-     "my": "myanmar",
-     "bo": "tibetan",
-     "tl": "tagalog",
-     "mg": "malagasy",
-     "as": "assamese",
-     "tt": "tatar",
-     "haw": "hawaiian",
-     "ln": "lingala",
-     "ha": "hausa",
-     "ba": "bashkir",
-     "jw": "javanese",
-     "su": "sundanese",
-     "yue": "cantonese",
- }
-
- # language code lookup by name, with a few language aliases
- TO_LANGUAGE_CODE = {
-     **{language: code for code, language in LANGUAGES.items()},
-     "burmese": "my",
-     "valencian": "ca",
-     "flemish": "nl",
-     "haitian": "ht",
-     "letzeburgesch": "lb",
-     "pushto": "ps",
-     "panjabi": "pa",
-     "moldavian": "ro",
-     "moldovan": "ro",
-     "sinhalese": "si",
-     "castilian": "es",
-     "mandarin": "zh",
- }
-
-
- @dataclass
- class Tokenizer:
-     """A thin wrapper around `tiktoken` providing quick access to special tokens"""
-
-     encoding: tiktoken.Encoding
-     num_languages: int
-     language: Optional[str] = None
-     task: Optional[str] = None
-     sot_sequence: Tuple[int] = ()
-     special_tokens: Dict[str, int] = field(default_factory=dict)
-
-     def __post_init__(self):
-         for special in self.encoding.special_tokens_set:
-             special_token = self.encoding.encode_single_token(special)
-             self.special_tokens[special] = special_token
-
-         sot: int = self.special_tokens["<|startoftranscript|>"]
-         translate: int = self.special_tokens["<|translate|>"]
-         transcribe: int = self.special_tokens["<|transcribe|>"]
-
-         langs = tuple(LANGUAGES.keys())[: self.num_languages]
-         sot_sequence = [sot]
-         if self.language is not None:
-             sot_sequence.append(sot + 1 + langs.index(self.language))
-         if self.task is not None:
-             task_token: int = transcribe if self.task == "transcribe" else translate
-             sot_sequence.append(task_token)
-
-         self.sot_sequence = tuple(sot_sequence)
-
-     def encode(self, text, **kwargs):
-         return self.encoding.encode(text, **kwargs)
-
-     def decode(self, token_ids: List[int], **kwargs) -> str:
-         token_ids = [t for t in token_ids if t < self.timestamp_begin]
-         return self.encoding.decode(token_ids, **kwargs)
-
-     def decode_with_timestamps(self, token_ids: List[int], **kwargs) -> str:
-         """
-         Timestamp tokens are above other special tokens' id range and are ignored by `decode()`.
-         This method decodes given tokens with timestamps tokens annotated, e.g. "<|1.08|>".
-         """
-         return self.encoding.decode(token_ids, **kwargs)
-
-     @cached_property
-     def eot(self) -> int:
-         return self.encoding.eot_token
-
-     @cached_property
-     def transcribe(self) -> int:
-         return self.special_tokens["<|transcribe|>"]
-
-     @cached_property
-     def translate(self) -> int:
-         return self.special_tokens["<|translate|>"]
-
-     @cached_property
-     def sot(self) -> int:
-         return self.special_tokens["<|startoftranscript|>"]
-
-     @cached_property
-     def sot_lm(self) -> int:
-         return self.special_tokens["<|startoflm|>"]
-
-     @cached_property
-     def sot_prev(self) -> int:
-         return self.special_tokens["<|startofprev|>"]
-
-     @cached_property
-     def no_speech(self) -> int:
-         return self.special_tokens["<|nospeech|>"]
-
-     @cached_property
-     def no_timestamps(self) -> int:
-         return self.special_tokens["<|notimestamps|>"]
-
-     @cached_property
-     def timestamp_begin(self) -> int:
-         return self.special_tokens["<|0.00|>"]
-
-     @cached_property
-     def language_token(self) -> int:
-         """Returns the token id corresponding to the value of the `language` field"""
-         if self.language is None:
-             raise ValueError("This tokenizer does not have language token configured")
-
-         return self.to_language_token(self.language)
-
-     def to_language_token(self, language):
-         if token := self.special_tokens.get(f"<|{language}|>", None):
-             return token
-
-         raise KeyError(f"Language {language} not found in tokenizer.")
-
-     @cached_property
-     def all_language_tokens(self) -> Tuple[int]:
-         result = []
-         for token, token_id in self.special_tokens.items():
-             if token.strip("<|>") in LANGUAGES:
-                 result.append(token_id)
-         return tuple(result)[: self.num_languages]
-
-     @cached_property
-     def all_language_codes(self) -> Tuple[str]:
-         return tuple(self.decode([_l]).strip("<|>") for _l in self.all_language_tokens)
-
-     @cached_property
-     def sot_sequence_including_notimestamps(self) -> Tuple[int]:
-         return tuple(list(self.sot_sequence) + [self.no_timestamps])
-
-     @cached_property
-     def non_speech_tokens(self) -> Tuple[int]:
-         """
-         Returns the list of tokens to suppress in order to avoid any speaker tags or non-speech
-         annotations, to prevent sampling texts that are not actually spoken in the audio, e.g.
-
-         - ♪♪♪
-         - ( SPEAKING FOREIGN LANGUAGE )
-         - [DAVID] Hey there,
-
-         keeping basic punctuations like commas, periods, question marks, exclamation points, etc.
-         """
-         symbols = list('"#()*+/:;<=>@[\\]^_`{|}~「」『』')
-         symbols += (
-             "<< >> <<< >>> -- --- -( -[ (' (\" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪".split()
-         )
-
-         # symbols that may be a single token or multiple tokens depending on the tokenizer.
-         # In case they're multiple tokens, suppress the first token, which is safe because:
-         # These are between U+2640 and U+267F miscellaneous symbols that are okay to suppress
-         # in generations, and in the 3-byte UTF-8 representation they share the first two bytes.
-         miscellaneous = set("♩♪♫♬♭♮♯")
-         assert all(0x2640 <= ord(c) <= 0x267F for c in miscellaneous)
-
-         # allow hyphens "-" and single quotes "'" between words, but not at the beginning of a word
-         result = {self.encoding.encode(" -")[0], self.encoding.encode(" '")[0]}
-         for symbol in symbols + list(miscellaneous):
-             for tokens in [
-                 self.encoding.encode(symbol),
-                 self.encoding.encode(" " + symbol),
-             ]:
-                 if len(tokens) == 1 or symbol in miscellaneous:
-                     result.add(tokens[0])
-
-         return tuple(sorted(result))
-
-     def split_to_word_tokens(self, tokens: List[int]):
-         if self.language in {"zh", "ja", "th", "lo", "my", "yue"}:
-             # These languages don't typically use spaces, so it is difficult to split words
-             # without morpheme analysis. Here, we instead split words at any
-             # position where the tokens are decoded as valid unicode points
-             return self.split_tokens_on_unicode(tokens)
-
-         return self.split_tokens_on_spaces(tokens)
-
-     def split_tokens_on_unicode(self, tokens: List[int]):
-         decoded_full = self.decode_with_timestamps(tokens)
-         replacement_char = "\ufffd"
-
-         words = []
-         word_tokens = []
-         current_tokens = []
-         unicode_offset = 0
-
-         for token in tokens:
-             current_tokens.append(token)
-             decoded = self.decode_with_timestamps(current_tokens)
-
-             if (
-                 replacement_char not in decoded
-                 or decoded_full[unicode_offset + decoded.index(replacement_char)]
-                 == replacement_char
-             ):
-                 words.append(decoded)
-                 word_tokens.append(current_tokens)
-                 current_tokens = []
-                 unicode_offset += len(decoded)
-
-         return words, word_tokens
-
-     def split_tokens_on_spaces(self, tokens: List[int]):
-         subwords, subword_tokens_list = self.split_tokens_on_unicode(tokens)
-         words = []
-         word_tokens = []
-
-         for subword, subword_tokens in zip(subwords, subword_tokens_list):
-             special = subword_tokens[0] >= self.eot
-             with_space = subword.startswith(" ")
-             punctuation = subword.strip() in string.punctuation
-             if special or with_space or punctuation or len(words) == 0:
-                 words.append(subword)
-                 word_tokens.append(subword_tokens)
-             else:
-                 words[-1] = words[-1] + subword
-                 word_tokens[-1].extend(subword_tokens)
-
-         return words, word_tokens
-
-
- @lru_cache(maxsize=None)
- def get_encoding(name: str = "gpt2", num_languages: int = 99):
-     vocab_path = os.path.join(os.path.dirname(__file__), "assets", f"{name}.tiktoken")
-     ranks = {
-         base64.b64decode(token): int(rank)
-         for token, rank in (line.split() for line in open(vocab_path) if line)
-     }
-     n_vocab = len(ranks)
-     special_tokens = {}
-
-     specials = [
-         "<|endoftext|>",
-         "<|startoftranscript|>",
-         *[f"<|{lang}|>" for lang in list(LANGUAGES.keys())[:num_languages]],
-         "<|translate|>",
-         "<|transcribe|>",
-         "<|startoflm|>",
-         "<|startofprev|>",
-         "<|nospeech|>",
-         "<|notimestamps|>",
-         *[f"<|{i * 0.02:.2f}|>" for i in range(1501)],
-     ]
-
-     for token in specials:
-         special_tokens[token] = n_vocab
-         n_vocab += 1
-
-     return tiktoken.Encoding(
-         name=os.path.basename(vocab_path),
-         explicit_n_vocab=n_vocab,
-         pat_str=r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
-         mergeable_ranks=ranks,
-         special_tokens=special_tokens,
-     )
-
-
- @lru_cache(maxsize=None)
- def get_tokenizer(
-     multilingual: bool,
-     *,
-     num_languages: int = 99,
-     language: Optional[str] = None,
-     task: Optional[str] = None,  # Literal["transcribe", "translate", None]
- ) -> Tokenizer:
-     if language is not None:
-         language = language.lower()
-         if language not in LANGUAGES:
-             if language in TO_LANGUAGE_CODE:
-                 language = TO_LANGUAGE_CODE[language]
-             else:
-                 raise ValueError(f"Unsupported language: {language}")
-
-     if multilingual:
-         encoding_name = "multilingual"
-         language = language or "en"
-         task = task or "transcribe"
-     else:
-         encoding_name = "gpt2"
-         language = None
-         task = None
-
-     encoding = get_encoding(name=encoding_name, num_languages=num_languages)
-
-     return Tokenizer(
-         encoding=encoding, num_languages=num_languages, language=language, task=task
-     )
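This tokenizer module could be deleted because the ids it derived at runtime now ship inside the exported model config, which `whisper_ax.py` reads in `load_config`. A sketch of the equivalent SOT-sequence assembly, mirroring the ordering `Tokenizer.__post_init__` built (sot, language, task) plus the `<|notimestamps|>` id the new code appends — the numeric ids below are hypothetical placeholders for values read from the config:

```python
import numpy as np

# Hypothetical ids standing in for values read from the model config JSON.
config = {
    "sot": 50258,
    "lang2token": {"zh": 50260, "en": 50259},
    "transcribe": 50359,
    "translate": 50358,
    "no_timestamps": 50363,
}


def make_sot_sequence(language: str, task: str) -> np.ndarray:
    """Mirror of the deleted __post_init__ ordering: sot, language, task, no_timestamps."""
    task_token = config["transcribe"] if task == "transcribe" else config["translate"]
    return np.array(
        [config["sot"], config["lang2token"][language], task_token, config["no_timestamps"]],
        dtype=np.int32,
    )


print(make_sot_sequence("zh", "transcribe"))  # [50258 50260 50359 50363]
```

With the sequence precomputed this way, the runtime no longer needs tiktoken or the vocabulary assets at inference time — only the `tokens.txt` id-to-string table.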