niobures committed on
Commit
3ad8d62
·
verified ·
1 Parent(s): 9c84217

Qwen3-ForcedAligner

Browse files
Files changed (47) hide show
  1. .gitattributes +5 -0
  2. Qwen3-ForcedAligner-0.6B/.gitattributes +35 -0
  3. Qwen3-ForcedAligner-0.6B/README.md +1393 -0
  4. Qwen3-ForcedAligner-0.6B/chat_template.json +1 -0
  5. Qwen3-ForcedAligner-0.6B/config.json +205 -0
  6. Qwen3-ForcedAligner-0.6B/generation_config.json +6 -0
  7. Qwen3-ForcedAligner-0.6B/issues.txt +46 -0
  8. Qwen3-ForcedAligner-0.6B/languages.txt +30 -0
  9. Qwen3-ForcedAligner-0.6B/merges.txt +0 -0
  10. Qwen3-ForcedAligner-0.6B/model.safetensors +3 -0
  11. Qwen3-ForcedAligner-0.6B/preprocessor_config.json +14 -0
  12. Qwen3-ForcedAligner-0.6B/source.txt +1 -0
  13. Qwen3-ForcedAligner-0.6B/tokenizer_config.json +557 -0
  14. Qwen3-ForcedAligner-0.6B/vocab.json +0 -0
  15. chatllm_quantized_qwen3/.gitattributes +93 -0
  16. chatllm_quantized_qwen3/README.md +13 -0
  17. chatllm_quantized_qwen3/mai-ui-2b.bin +3 -0
  18. chatllm_quantized_qwen3/qwen3-asr-0.6b.bin +3 -0
  19. chatllm_quantized_qwen3/qwen3-asr-1.7b.bin +3 -0
  20. chatllm_quantized_qwen3/qwen3-focedaligner-0.6b-f16.bin +3 -0
  21. chatllm_quantized_qwen3/qwen3-focedaligner-0.6b.bin +3 -0
  22. chatllm_quantized_qwen3/source.txt +1 -0
  23. qwen3-asr-0.6b-f16/.gitattributes +36 -0
  24. qwen3-asr-0.6b-f16/README.md +80 -0
  25. qwen3-asr-0.6b-f16/languages.txt +30 -0
  26. qwen3-asr-0.6b-f16/qwen3-asr-0.6b-f16.gguf +3 -0
  27. qwen3-asr-0.6b-f16/source.txt +1 -0
  28. qwen3-forced-aligner-0.6b-f16/.gitattributes +36 -0
  29. qwen3-forced-aligner-0.6b-f16/README.md +81 -0
  30. qwen3-forced-aligner-0.6b-f16/languages.txt +30 -0
  31. qwen3-forced-aligner-0.6b-f16/qwen3-forced-aligner-0.6b-f16.gguf +3 -0
  32. qwen3-forced-aligner-0.6b-f16/source.txt +1 -0
  33. qwen3-forced-aligner-0.6b-q4-k-m/.gitattributes +36 -0
  34. qwen3-forced-aligner-0.6b-q4-k-m/README.md +81 -0
  35. qwen3-forced-aligner-0.6b-q4-k-m/languages.txt +30 -0
  36. qwen3-forced-aligner-0.6b-q4-k-m/qwen3-forced-aligner-0.6b-q4_k_m.gguf +3 -0
  37. qwen3-forced-aligner-0.6b-q4-k-m/source.txt +1 -0
  38. qwen3-forced-aligner-0.6b-q5-k-m/.gitattributes +36 -0
  39. qwen3-forced-aligner-0.6b-q5-k-m/README.md +81 -0
  40. qwen3-forced-aligner-0.6b-q5-k-m/languages.txt +30 -0
  41. qwen3-forced-aligner-0.6b-q5-k-m/qwen3-forced-aligner-0.6b-q5_k_m.gguf +3 -0
  42. qwen3-forced-aligner-0.6b-q5-k-m/source.txt +1 -0
  43. qwen3-forced-aligner-0.6b-q8-0/.gitattributes +36 -0
  44. qwen3-forced-aligner-0.6b-q8-0/README.md +81 -0
  45. qwen3-forced-aligner-0.6b-q8-0/languages.txt +30 -0
  46. qwen3-forced-aligner-0.6b-q8-0/qwen3-forced-aligner-0.6b-q8_0.gguf +3 -0
  47. qwen3-forced-aligner-0.6b-q8-0/source.txt +1 -0
.gitattributes CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ qwen3-asr-0.6b-f16/qwen3-asr-0.6b-f16.gguf filter=lfs diff=lfs merge=lfs -text
37
+ qwen3-forced-aligner-0.6b-f16/qwen3-forced-aligner-0.6b-f16.gguf filter=lfs diff=lfs merge=lfs -text
38
+ qwen3-forced-aligner-0.6b-q4-k-m/qwen3-forced-aligner-0.6b-q4_k_m.gguf filter=lfs diff=lfs merge=lfs -text
39
+ qwen3-forced-aligner-0.6b-q5-k-m/qwen3-forced-aligner-0.6b-q5_k_m.gguf filter=lfs diff=lfs merge=lfs -text
40
+ qwen3-forced-aligner-0.6b-q8-0/qwen3-forced-aligner-0.6b-q8_0.gguf filter=lfs diff=lfs merge=lfs -text
Qwen3-ForcedAligner-0.6B/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
Qwen3-ForcedAligner-0.6B/README.md ADDED
@@ -0,0 +1,1393 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ pipeline_tag: automatic-speech-recognition
4
+ ---
5
+
6
+ # Qwen3-ASR
7
+
8
+ ## Overview
9
+
10
+ ### Introduction
11
+
12
+ <p align="center">
13
+ <img src="https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR-Repo/qwen3_asr_introduction.png" width="90%"/>
14
+ </p>
15
+
16
+ The Qwen3-ASR family includes Qwen3-ASR-1.7B and Qwen3-ASR-0.6B, which support language identification and ASR for 52 languages and dialects. Both leverage large-scale speech training data and the strong audio understanding capability of their foundation model, Qwen3-Omni. Experiments show that the 1.7B version achieves state-of-the-art performance among open-source ASR models and is competitive with the strongest proprietary commercial APIs. Here are the main features:
17
+
18
+ * **All-in-one**: Qwen3-ASR-1.7B and Qwen3-ASR-0.6B support language identification and speech recognition for 30 languages and 22 Chinese dialects, as well as English accents from multiple countries and regions.
19
+
20
+ * **Excellent and Fast**: The Qwen3-ASR family of ASR models maintains high-quality and robust recognition under complex acoustic environments and challenging text patterns. Qwen3-ASR-1.7B achieves strong performance on both open-source and internal benchmarks, while the 0.6B version strikes an accuracy–efficiency trade-off, reaching 2000× throughput at a concurrency of 128. Both provide unified streaming / offline inference with a single model and support transcribing long audio.
21
+
22
+ * **Novel and strong forced alignment Solution**: We introduce Qwen3-ForcedAligner-0.6B, which supports timestamp prediction for arbitrary units within up to 5 minutes of speech in 11 languages. Evaluations show its timestamp accuracy surpasses E2E based forced-alignment models.
23
+
24
+ * **Comprehensive inference toolkit**: In addition to open-sourcing the architectures and weights of the Qwen3-ASR series, we also release a powerful, full-featured inference framework that supports vLLM-based batch inference, asynchronous serving, streaming inference, timestamp prediction, and more.
25
+
26
+ ### Model Architecture
27
+
28
+ <p align="center">
29
+ <img src="https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR-Repo/overview.jpg" width="100%"/>
30
+ </p>
31
+
32
+
33
+ ### Released Models Description and Download
34
+
35
+ Below is an introduction and download information for the Qwen3-ASR models. Please select and download the model that fits your needs.
36
+
37
+ | Model | Supported Languages | Supported Dialects | Inference Mode | Audio Types |
38
+ |---|---|---|---|---|
39
+ | Qwen3-ASR-1.7B & Qwen3-ASR-0.6B | Chinese (zh), English (en), Cantonese (yue), Arabic (ar), German (de), French (fr), Spanish (es), Portuguese (pt), Indonesian (id), Italian (it), Korean (ko), Russian (ru), Thai (th), Vietnamese (vi), Japanese (ja), Turkish (tr), Hindi (hi), Malay (ms), Dutch (nl), Swedish (sv), Danish (da), Finnish (fi), Polish (pl), Czech (cs), Filipino (fil), Persian (fa), Greek (el), Hungarian (hu), Macedonian (mk), Romanian (ro) | Anhui, Dongbei, Fujian, Gansu, Guizhou, Hebei, Henan, Hubei, Hunan, Jiangxi, Ningxia, Shandong, Shaanxi, Shanxi, Sichuan, Tianjin, Yunnan, Zhejiang, Cantonese (Hong Kong accent), Cantonese (Guangdong accent), Wu language, Minnan language. | Offline / Streaming | Speech, Singing Voice, Songs with BGM |
40
+ | Qwen3-ForcedAligner-0.6B | Chinese, English, Cantonese, French, German, Italian, Japanese, Korean, Portuguese, Russian, Spanish | -- | NAR | Speech |
41
+
42
+ During model loading in the `qwen-asr` package or vLLM, model weights will be downloaded automatically based on the model name. However, if your runtime environment does not allow downloading weights during execution, you can use the following commands to manually download the model weights to a local directory:
43
+
44
+ ```bash
45
+ # Download through ModelScope (recommended for users in Mainland China)
46
+ pip install -U modelscope
47
+ modelscope download --model Qwen/Qwen3-ASR-1.7B --local_dir ./Qwen3-ASR-1.7B
48
+ modelscope download --model Qwen/Qwen3-ASR-0.6B --local_dir ./Qwen3-ASR-0.6B
49
+ modelscope download --model Qwen/Qwen3-ForcedAligner-0.6B --local_dir ./Qwen3-ForcedAligner-0.6B
50
+ # Download through Hugging Face
51
+ pip install -U "huggingface_hub[cli]"
52
+ huggingface-cli download Qwen/Qwen3-ASR-1.7B --local-dir ./Qwen3-ASR-1.7B
53
+ huggingface-cli download Qwen/Qwen3-ASR-0.6B --local-dir ./Qwen3-ASR-0.6B
54
+ huggingface-cli download Qwen/Qwen3-ForcedAligner-0.6B --local-dir ./Qwen3-ForcedAligner-0.6B
55
+ ```
56
+
57
+
58
+ ## Quickstart
59
+
60
+ ### Environment Setup
61
+
62
+ The easiest way to use Qwen3-ASR is to install the `qwen-asr` Python package from PyPI. This will pull in the required runtime dependencies and allow you to load any released Qwen3-ASR model. If you’d like to simplify environment setup further, you can also use our official [Docker image](#docker). The `qwen-asr` package provides two backends: the transformers backend and the vLLM backend. For usage instructions for different backends, please refer to [Python Package Usage](#python-package-usage). We recommend using a **fresh, isolated environment** to avoid dependency conflicts with existing packages. You can create a clean Python 3.12 environment like this:
63
+
64
+ ```bash
65
+ conda create -n qwen3-asr python=3.12 -y
66
+ conda activate qwen3-asr
67
+ ```
68
+
69
+ Run the following command to get the minimal installation with transformers-backend support:
70
+
71
+ ```bash
72
+ pip install -U qwen-asr
73
+ ```
74
+
75
+ To enable the vLLM backend for faster inference and streaming support, run:
76
+
77
+ ```bash
78
+ pip install -U qwen-asr[vllm]
79
+ ```
80
+
81
+ If you want to develop or modify the code locally, install from source in editable mode:
82
+
83
+ ```bash
84
+ git clone https://github.com/QwenLM/Qwen3-ASR.git
85
+ cd Qwen3-ASR
86
+ pip install -e .
87
+ # support vLLM backend
88
+ # pip install -e ".[vllm]"
89
+ ```
90
+
91
+ Additionally, we recommend using FlashAttention 2 to reduce GPU memory usage and accelerate inference speed, especially for long inputs and large batch sizes.
92
+
93
+ ```bash
94
+ pip install -U flash-attn --no-build-isolation
95
+ ```
96
+
97
+ If your machine has less than 96GB of RAM and lots of CPU cores, run:
98
+
99
+ ```bash
100
+ MAX_JOBS=4 pip install -U flash-attn --no-build-isolation
101
+ ```
102
+
103
+ Also, you should have hardware that is compatible with FlashAttention 2. Read more about it in the official documentation of the [FlashAttention repository](https://github.com/Dao-AILab/flash-attention). FlashAttention 2 can only be used when a model is loaded in `torch.float16` or `torch.bfloat16`.
104
+
105
+ ### Python Package Usage
106
+
107
+ #### Quick Inference
108
+
109
+ The `qwen-asr` package provides two backends: **transformers backend** and **vLLM backend**. You can pass audio inputs as a local path, a URL, base64 data, or a `(np.ndarray, sr)` tuple, and run batch inference. To quickly try Qwen3-ASR, you can use `Qwen3ASRModel.from_pretrained(...)` for the transformers backend with the following code:
110
+
111
+ ```python
112
+ import torch
113
+ from qwen_asr import Qwen3ASRModel
114
+
115
+ model = Qwen3ASRModel.from_pretrained(
116
+ "Qwen/Qwen3-ASR-1.7B",
117
+ dtype=torch.bfloat16,
118
+ device_map="cuda:0",
119
+ # attn_implementation="flash_attention_2",
120
+ max_inference_batch_size=32, # Batch size limit for inference. -1 means unlimited. Smaller values can help avoid OOM.
121
+ max_new_tokens=256, # Maximum number of tokens to generate. Set a larger value for long audio input.
122
+ )
123
+
124
+ results = model.transcribe(
125
+ audio="https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR-Repo/asr_en.wav",
126
+ language=None, # set "English" to force the language
127
+ )
128
+
129
+ print(results[0].language)
130
+ print(results[0].text)
131
+ ```
132
+
133
+ If you want to return timestamps, pass `forced_aligner` and its init kwargs. Here is an example of batch inference with timestamps output:
134
+
135
+ ```python
136
+ import torch
137
+ from qwen_asr import Qwen3ASRModel
138
+
139
+ model = Qwen3ASRModel.from_pretrained(
140
+ "Qwen/Qwen3-ASR-1.7B",
141
+ dtype=torch.bfloat16,
142
+ device_map="cuda:0",
143
+ # attn_implementation="flash_attention_2",
144
+ max_inference_batch_size=32, # Batch size limit for inference. -1 means unlimited. Smaller values can help avoid OOM.
145
+ max_new_tokens=256, # Maximum number of tokens to generate. Set a larger value for long audio input.
146
+ forced_aligner="Qwen/Qwen3-ForcedAligner-0.6B",
147
+ forced_aligner_kwargs=dict(
148
+ dtype=torch.bfloat16,
149
+ device_map="cuda:0",
150
+ # attn_implementation="flash_attention_2",
151
+ ),
152
+ )
153
+
154
+ results = model.transcribe(
155
+ audio=[
156
+ "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR-Repo/asr_zh.wav",
157
+ "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR-Repo/asr_en.wav",
158
+ ],
159
+ language=["Chinese", "English"], # can also be set to None for automatic language detection
160
+ return_time_stamps=True,
161
+ )
162
+
163
+ for r in results:
164
+ print(r.language, r.text, r.time_stamps[0])
165
+ ```
166
+
167
+ For more detailed usage examples, please refer to the [example code](https://github.com/QwenLM/Qwen3-ASR/blob/main/examples/example_qwen3_asr_transformers.py) for the transformers backend.
168
+
169
+ #### vLLM Backend
170
+
171
+ If you want the fastest inference speed with Qwen3-ASR, we strongly recommend using the vLLM backend by initializing the model with `Qwen3ASRModel.LLM(...)`. Example code is provided below. Note that you must install it via `pip install -U qwen-asr[vllm]`. If you want the model to output timestamps, it’s best to install FlashAttention via `pip install -U flash-attn --no-build-isolation` to speed up inference for the forced aligner model. Remember to wrap your code under `if __name__ == '__main__':` to avoid the `spawn` error described in [vLLM Troubleshooting](https://docs.vllm.ai/en/latest/usage/troubleshooting/#python-multiprocessing).
172
+
173
+ ```python
174
+ import torch
175
+ from qwen_asr import Qwen3ASRModel
176
+
177
+ if __name__ == '__main__':
178
+ model = Qwen3ASRModel.LLM(
179
+ model="Qwen/Qwen3-ASR-1.7B",
180
+ gpu_memory_utilization=0.7,
181
+ max_inference_batch_size=128, # Batch size limit for inference. -1 means unlimited. Smaller values can help avoid OOM.
182
+ max_new_tokens=4096, # Maximum number of tokens to generate. Set a larger value for long audio input.
183
+ forced_aligner="Qwen/Qwen3-ForcedAligner-0.6B",
184
+ forced_aligner_kwargs=dict(
185
+ dtype=torch.bfloat16,
186
+ device_map="cuda:0",
187
+ # attn_implementation="flash_attention_2",
188
+ ),
189
+ )
190
+
191
+ results = model.transcribe(
192
+ audio=[
193
+ "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR-Repo/asr_zh.wav",
194
+ "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR-Repo/asr_en.wav",
195
+ ],
196
+ language=["Chinese", "English"], # can also be set to None for automatic language detection
197
+ return_time_stamps=True,
198
+ )
199
+
200
+ for r in results:
201
+ print(r.language, r.text, r.time_stamps[0])
202
+ ```
203
+
204
+ For more detailed usage examples, please refer to the [example code](https://github.com/QwenLM/Qwen3-ASR/blob/main/examples/example_qwen3_asr_vllm.py) for the vLLM backend. In addition, you can start a vLLM server via the `qwen-asr-serve` command, which is a wrapper around `vllm serve`. You can pass any arguments supported by `vllm serve`, for example:
205
+
206
+ ```bash
207
+ qwen-asr-serve Qwen/Qwen3-ASR-1.7B --gpu-memory-utilization 0.8 --host 0.0.0.0 --port 8000
208
+ ```
209
+
210
+ And send requests to the server via:
211
+
212
+ ```python
213
+ import requests
214
+
215
+ url = "http://localhost:8000/v1/chat/completions"
216
+ headers = {"Content-Type": "application/json"}
217
+
218
+ data = {
219
+ "messages": [
220
+ {
221
+ "role": "user",
222
+ "content": [
223
+ {
224
+ "type": "audio_url",
225
+ "audio_url": {
226
+ "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR-Repo/asr_en.wav"
227
+ },
228
+ }
229
+ ],
230
+ }
231
+ ]
232
+ }
233
+
234
+ response = requests.post(url, headers=headers, json=data, timeout=300)
235
+ response.raise_for_status()
236
+ content = response.json()['choices'][0]['message']['content']
237
+ print(content)
238
+
239
+ # parse ASR output if you want
240
+ from qwen_asr import parse_asr_output
241
+ language, text = parse_asr_output(content)
242
+ print(language)
243
+ print(text)
244
+ ```
245
+
246
+ #### Streaming Inference
247
+
248
+ Qwen3-ASR fully supports streaming inference. Currently, streaming inference is only available with the vLLM backend. Note that streaming inference does not support batch inference or returning timestamps. Please refer to the [example code](https://github.com/QwenLM/Qwen3-ASR/blob/main/examples/example_qwen3_asr_vllm_streaming.py) for details. You can also launch a streaming web demo through the [guide](#streaming-demo) to experience Qwen3-ASR’s streaming transcription capabilities.
249
+
250
+ #### ForcedAligner Usage
251
+
252
+ `Qwen3-ForcedAligner-0.6B` can align text–speech pairs and return word or character level timestamps. Here is an example of using the forced aligner directly:
253
+
254
+ ```python
255
+ import torch
256
+ from qwen_asr import Qwen3ForcedAligner
257
+
258
+ model = Qwen3ForcedAligner.from_pretrained(
259
+ "Qwen/Qwen3-ForcedAligner-0.6B",
260
+ dtype=torch.bfloat16,
261
+ device_map="cuda:0",
262
+ # attn_implementation="flash_attention_2",
263
+ )
264
+
265
+ results = model.align(
266
+ audio="https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR-Repo/asr_zh.wav",
267
+ text="甚至出现交易几乎停滞的情况。",
268
+ language="Chinese",
269
+ )
270
+
271
+ print(results[0])
272
+ print(results[0][0].text, results[0][0].start_time, results[0][0].end_time)
273
+ ```
274
+
275
+ In addition, the forced aligner supports local paths / URLs / base64 data / `(np.ndarray, sr)` inputs and batch inference. Please refer to the [example code](https://github.com/QwenLM/Qwen3-ASR/blob/main/examples/example_qwen3_forced_aligner.py) for details.
276
+
277
+ ### DashScope API Usage
278
+
279
+ To further explore Qwen3-ASR, we encourage you to try our DashScope API for a faster and more efficient experience. For detailed API information and documentation, please refer to the following:
280
+
281
+ | API Description | API Documentation (Mainland China) | API Documentation (International) |
282
+ |------------------|-----------------------------------|------------------------------------|
283
+ | Real-time API for Qwen3-ASR. | [https://help.aliyun.com/zh/model-studio/qwen-real-time-speech-recognition](https://help.aliyun.com/zh/model-studio/qwen-real-time-speech-recognition) | [https://www.alibabacloud.com/help/en/model-studio/qwen-real-time-speech-recognition](https://www.alibabacloud.com/help/en/model-studio/qwen-real-time-speech-recognition) |
284
+ | FileTrans API for Qwen3-ASR. | [https://help.aliyun.com/zh/model-studio/qwen-speech-recognition](https://help.aliyun.com/zh/model-studio/qwen-speech-recognition) | [https://www.alibabacloud.com/help/en/model-studio/qwen-speech-recognition](https://www.alibabacloud.com/help/en/model-studio/qwen-speech-recognition) |
285
+
286
+
287
+ ## Launch Local Web UI Demo
288
+
289
+ ### Gradio Demo
290
+
291
+ To launch the Qwen3-ASR web UI gradio demo, install the `qwen-asr` package and run `qwen-asr-demo`. Use the command below for help:
292
+
293
+ ```bash
294
+ qwen-asr-demo --help
295
+ ```
296
+
297
+ To launch the demo, you can use the following commands:
298
+
299
+ ```bash
300
+ # Transformers backend
301
+ qwen-asr-demo \
302
+ --asr-checkpoint Qwen/Qwen3-ASR-1.7B \
303
+ --backend transformers \
304
+ --cuda-visible-devices 0 \
305
+ --ip 0.0.0.0 --port 8000
306
+
307
+ # Transformers backend + Forced Aligner (enable timestamps)
308
+ qwen-asr-demo \
309
+ --asr-checkpoint Qwen/Qwen3-ASR-1.7B \
310
+ --aligner-checkpoint Qwen/Qwen3-ForcedAligner-0.6B \
311
+ --backend transformers \
312
+ --cuda-visible-devices 0 \
313
+ --backend-kwargs '{"device_map":"cuda:0","dtype":"bfloat16","max_inference_batch_size":8,"max_new_tokens":256}' \
314
+ --aligner-kwargs '{"device_map":"cuda:0","dtype":"bfloat16"}' \
315
+ --ip 0.0.0.0 --port 8000
316
+
317
+ # vLLM backend + Forced Aligner (enable timestamps)
318
+ qwen-asr-demo \
319
+ --asr-checkpoint Qwen/Qwen3-ASR-1.7B \
320
+ --aligner-checkpoint Qwen/Qwen3-ForcedAligner-0.6B \
321
+ --backend vllm \
322
+ --cuda-visible-devices 0 \
323
+ --backend-kwargs '{"gpu_memory_utilization":0.7,"max_inference_batch_size":8,"max_new_tokens":2048}' \
324
+ --aligner-kwargs '{"device_map":"cuda:0","dtype":"bfloat16"}' \
325
+ --ip 0.0.0.0 --port 8000
326
+ ```
327
+
328
+ Then open `http://<your-ip>:8000`, or access it via port forwarding in tools like VS Code.
329
+
330
+ #### Backend Notes
331
+
332
+ This demo supports two backends: transformers and vLLM. All backend-specific initialization parameters should be passed via `--backend-kwargs` as a JSON dict. If not provided, the demo will use sensible defaults.
333
+
334
+ ```bash
335
+ # Example: override transformers init args without flash attention
336
+ --backend-kwargs '{"device_map":"cuda:0","dtype":"bfloat16"}'
337
+
338
+ # Example: override vLLM init args with 65% GPU memory
339
+ --backend-kwargs '{"gpu_memory_utilization":0.65}'
340
+ ```
341
+
342
+ #### CUDA Device Notes
343
+
344
+ Because vLLM does not follow `cuda:0` style device selection, this demo selects GPUs by setting `CUDA_VISIBLE_DEVICES` via `--cuda-visible-devices`.
345
+
346
+ ```bash
347
+ # Use GPU 0
348
+ --cuda-visible-devices 0
349
+
350
+ # Use GPU 1
351
+ --cuda-visible-devices 1
352
+ ```
353
+
354
+ #### Timestamps Notes
355
+
356
+ Timestamps are only available when `--aligner-checkpoint` is provided. If you launch the demo without a forced aligner, the timestamps UI will be hidden automatically.
357
+
358
+ ```bash
359
+ # No forced aligner
360
+ qwen-asr-demo --asr-checkpoint Qwen/Qwen3-ASR-1.7B
361
+
362
+ # With forced aligner
363
+ qwen-asr-demo \
364
+ --asr-checkpoint Qwen/Qwen3-ASR-1.7B \
365
+ --aligner-checkpoint Qwen/Qwen3-ForcedAligner-0.6B
366
+ ```
367
+
368
+ #### HTTPS Notes
369
+
370
+ To avoid browser microphone permission issues after deploying the server, it is recommended/required to run the gradio service over HTTPS (especially when accessed remotely or behind modern browsers/gateways). Use `--ssl-certfile` and `--ssl-keyfile` to enable HTTPS. First, generate a private key and a self-signed certificate (valid for 365 days):
371
+
372
+ ```bash
373
+ openssl req -x509 -newkey rsa:2048 \
374
+ -keyout key.pem -out cert.pem \
375
+ -days 365 -nodes \
376
+ -subj "/CN=localhost"
377
+ ```
378
+
379
+ Then run the demo with HTTPS:
380
+
381
+ ```bash
382
+ qwen-asr-demo \
383
+ --asr-checkpoint Qwen/Qwen3-ASR-1.7B \
384
+ --backend transformers \
385
+ --cuda-visible-devices 0 \
386
+ --ip 0.0.0.0 --port 8000 \
387
+ --ssl-certfile cert.pem \
388
+ --ssl-keyfile key.pem \
389
+ --no-ssl-verify
390
+ ```
391
+
392
+ Then open `https://<your-ip>:8000` to use it. If your browser shows a warning, that’s expected for self-signed certificates. For production, use a real certificate.
393
+
394
+ ### Streaming Demo
395
+
396
+ To experience Qwen3-ASR’s streaming transcription capability in a web UI, we provide a minimal Flask-based streaming demo. The demo captures microphone audio in the browser, resamples it to 16,000 Hz, and continuously pushes PCM chunks to the model. Run the demo with the following command:
397
+
398
+ ```bash
399
+ qwen-asr-demo-streaming \
400
+ --asr-model-path Qwen/Qwen3-ASR-1.7B \
401
+ --host 0.0.0.0 \
402
+ --port 8000 \
403
+ --gpu-memory-utilization 0.9
404
+ ```
405
+
406
+ Then open `http://<your-ip>:8000`, or access it via port forwarding in tools like VS Code.
407
+
408
+ ## Deployment with vLLM
409
+
410
+ vLLM officially provides day-0 model support for Qwen3-ASR for efficient inference.
411
+
412
+ ### Installation
413
+ You can run Qwen3-ASR with vLLM nightly wheel or docker image. To install the nightly version of vLLM, we recommend using `uv` as the environment manager
414
+ ```bash
415
+ uv venv
416
+ source .venv/bin/activate
417
+ uv pip install -U vllm --pre \
418
+ --extra-index-url https://wheels.vllm.ai/nightly/cu129 \
419
+ --extra-index-url https://download.pytorch.org/whl/cu129 \
420
+ --index-strategy unsafe-best-match
421
+ uv pip install "vllm[audio]" # For additional audio dependencies
422
+ ```
423
+
424
+ ### Online Serving
425
+ You can easily deploy Qwen3-ASR with vLLM by running the following command
426
+ ```bash
427
+ vllm serve Qwen/Qwen3-ASR-1.7B
428
+ ```
429
+ After the model server is successfully deployed, you can interact with it in multiple ways.
430
+
431
+ #### Using OpenAI SDK
432
+ ```python
433
+ import base64
434
+ import httpx
435
+ from openai import OpenAI
436
+
437
+ # Initialize client
438
+ client = OpenAI(
439
+ base_url="http://localhost:8000/v1",
440
+ api_key="EMPTY"
441
+ )
442
+
443
+ # Create multimodal chat completion request
444
+ response = client.chat.completions.create(
445
+ model="Qwen/Qwen3-ASR-1.7B",
446
+ messages=[
447
+ {
448
+ "role": "user",
449
+ "content": [
450
+ {
451
+ "type": "audio_url",
452
+ "audio_url": {
453
+ "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR-Repo/asr_en.wav"
454
+ }
455
+ }
456
+ ]
457
+ }
458
+ ],
459
+ )
460
+
461
+ print(response.choices[0].message.content)
462
+ ```
463
+ This model is also supported on vLLM with OpenAI transcription API.
464
+ ```python
465
+ import httpx
466
+ from openai import OpenAI
467
+
468
+ # Initialize client
469
+ client = OpenAI(
470
+ base_url="http://localhost:8000/v1",
471
+ api_key="EMPTY"
472
+ )
473
+ audio_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR-Repo/asr_en.wav"
474
+ audio_file = httpx.get(audio_url).content
475
+
476
+ transcription = client.audio.transcriptions.create(
477
+ model="Qwen/Qwen3-ASR-1.7B",
478
+ file=audio_file,
479
+ )
480
+
481
+ print(transcription.text)
482
+ ```
483
+
484
+ #### Using cURL
485
+ ```bash
486
+ curl http://localhost:8000/v1/chat/completions \
487
+ -H "Content-Type: application/json" \
488
+ -d '{
489
+ "messages": [
490
+ {"role": "user", "content": [
491
+ {"type": "audio_url", "audio_url": {"url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR-Repo/asr_en.wav"}}
492
+ ]}
493
+ ]
494
+ }'
495
+ ```
496
+
497
+ ### Offline Inference
498
+ See the following example on using vLLM to run offline inference with Qwen3-ASR
499
+ ```python
500
+ from vllm import LLM, SamplingParams
501
+ from vllm.assets.audio import AudioAsset
502
+ import base64
503
+ import requests
504
+
505
+ # Initialize the LLM
506
+ llm = LLM(
507
+ model="Qwen/Qwen3-ASR-1.7B"
508
+ )
509
+
510
+ # Load audio
511
+ audio_asset = AudioAsset("winning_call")
512
+
513
+ # Create conversation with audio content
514
+ conversation = [
515
+ {
516
+ "role": "user",
517
+ "content": [
518
+ {
519
+ "type": "audio_url",
520
+ "audio_url": {"url": audio_asset.url}
521
+ }
522
+ ]
523
+ }
524
+ ]
525
+
526
+ sampling_params = SamplingParams(temperature=0.01, max_tokens=256)
527
+
528
+ # Run inference using .chat()
529
+ outputs = llm.chat(conversation, sampling_params=sampling_params)
530
+ print(outputs[0].outputs[0].text)
531
+ ```
532
+
533
+
534
+ ## Docker
535
+
536
+ To make it easier to use our `qwen-asr` Python package, we provide a pre-built Docker image: [qwenllm/qwen3-asr](https://hub.docker.com/r/qwenllm/qwen3-asr). You only need to install the GPU driver and download the model files to run the code. Please follow the [NVIDIA Container Toolkit installation guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) to ensure Docker can access your GPU. If you are in Mainland China and have trouble reaching Docker Hub, you may use a registry mirror to accelerate image pulls.
537
+
538
+ First, pull the image and start a container:
539
+
540
+ ```bash
541
+ LOCAL_WORKDIR=/path/to/your/workspace
542
+ HOST_PORT=8000
543
+ CONTAINER_PORT=80
544
+ docker run --gpus all --name qwen3-asr \
545
+ -v /var/run/docker.sock:/var/run/docker.sock -p $HOST_PORT:$CONTAINER_PORT \
546
+ --mount type=bind,source=$LOCAL_WORKDIR,target=/data/shared/Qwen3-ASR \
547
+ --shm-size=4gb \
548
+ -it qwenllm/qwen3-asr:latest
549
+ ```
550
+
551
+ After running the command, you will enter the container’s bash shell. Your local workspace (**replace** `/path/to/your/workspace` **with the actual path**) will be mounted inside the container at `/data/shared/Qwen3-ASR`. Port `8000` on the host is mapped to port `80` in the container, so you can access services running in the container via `http://<host-ip>:8000`. Note that services inside the container must bind to `0.0.0.0` (not `127.0.0.1`) for port forwarding to work.
552
+
553
+ If you exit the container, you can start it again and re-enter it with:
554
+
555
+ ```bash
556
+ docker start qwen3-asr
557
+ docker exec -it qwen3-asr bash
558
+ ```
559
+
560
+ To remove the container completely, run:
561
+
562
+ ```bash
563
+ docker rm -f qwen3-asr
564
+ ```
565
+
566
+
567
+ ## Evaluation
568
+
569
+ During evaluation, we ran inference for all models with `dtype=torch.bfloat16` and set `max_new_tokens=1024` using vLLM. Greedy search was used for all decoding, and none of the tests specified a language parameter. The detailed evaluation results are shown below.
570
+
571
+ <details>
572
+ <summary>ASR Benchmarks on Public Datasets (WER ↓)</summary>
573
+
574
+ <table>
575
+ <thead>
576
+ <tr>
577
+ <th colspan="2" style="text-align: left;"></th>
578
+ <th style="text-align: center;">GPT-4o<br>-Transcribe</th>
579
+ <th style="text-align: center;">Gemini-2.5<br>-Pro</th>
580
+ <th style="text-align: center;">Doubao-ASR</th>
581
+ <th style="text-align: center;">Whisper<br>-large-v3</th>
582
+ <th style="text-align: center;">Fun-ASR<br>-MLT-Nano</th>
583
+ <th style="text-align: center;">Qwen3-ASR<br>-0.6B</th>
584
+ <th style="text-align: center;">Qwen3-ASR<br>-1.7B</th>
585
+ </tr>
586
+ </thead>
587
+ <tbody>
588
+ <tr>
589
+ <td colspan="9" style="text-align: left; font-style: italic; border-top: 1px solid #ddd; border-bottom: 1px solid #ddd;">English (en)</td>
590
+ </tr>
591
+ <tr>
592
+ <td colspan="2" style="text-align: left;">Librispeech<br>clean | other</td>
593
+ <td style="text-align: center;"><strong>1.39</strong> | 3.75</td>
594
+ <td style="text-align: center;">2.89 | 3.56</td>
595
+ <td style="text-align: center;">2.78 | 5.70</td>
596
+ <td style="text-align: center;">1.51 | 3.97</td>
597
+ <td style="text-align: center;">1.68 | 4.03</td>
598
+ <td style="text-align: center;">2.11 | 4.55</td>
599
+ <td style="text-align: center;">1.63 | <strong>3.38</strong></td>
600
+ </tr>
601
+ <tr>
602
+ <td colspan="2" style="text-align: left;">GigaSpeech</td>
603
+ <td style="text-align: center;">25.50</td>
604
+ <td style="text-align: center;">9.37</td>
605
+ <td style="text-align: center;">9.55</td>
606
+ <td style="text-align: center;">9.76</td>
607
+ <td style="text-align: center;">-</td>
608
+ <td style="text-align: center;">8.88</td>
609
+ <td style="text-align: center;"><strong>8.45</strong></td>
610
+ </tr>
611
+ <tr>
612
+ <td colspan="2" style="text-align: left;">CV-en</td>
613
+ <td style="text-align: center;">9.08</td>
614
+ <td style="text-align: center;">14.49</td>
615
+ <td style="text-align: center;">13.78</td>
616
+ <td style="text-align: center;">9.90</td>
617
+ <td style="text-align: center;">9.90</td>
618
+ <td style="text-align: center;">9.92</td>
619
+ <td style="text-align: center;"><strong>7.39</strong></td>
620
+ </tr>
621
+ <tr>
622
+ <td colspan="2" style="text-align: left;">Fleurs-en</td>
623
+ <td style="text-align: center;"><strong>2.40</strong></td>
624
+ <td style="text-align: center;">2.94</td>
625
+ <td style="text-align: center;">6.31</td>
626
+ <td style="text-align: center;">4.08</td>
627
+ <td style="text-align: center;">5.49</td>
628
+ <td style="text-align: center;">4.39</td>
629
+ <td style="text-align: center;">3.35</td>
630
+ </tr>
631
+ <tr>
632
+ <td colspan="2" style="text-align: left;">MLS-en</td>
633
+ <td style="text-align: center;">5.12</td>
634
+ <td style="text-align: center;"><strong>3.68</strong></td>
635
+ <td style="text-align: center;">7.09</td>
636
+ <td style="text-align: center;">4.87</td>
637
+ <td style="text-align: center;">-</td>
638
+ <td style="text-align: center;">6.00</td>
639
+ <td style="text-align: center;">4.58</td>
640
+ </tr>
641
+ <tr>
642
+ <td colspan="2" style="text-align: left;">Tedlium</td>
643
+ <td style="text-align: center;">7.69</td>
644
+ <td style="text-align: center;">6.15</td>
645
+ <td style="text-align: center;">4.91</td>
646
+ <td style="text-align: center;">6.84</td>
647
+ <td style="text-align: center;">-</td>
648
+ <td style="text-align: center;"><strong>3.85</strong></td>
649
+ <td style="text-align: center;"><strong>4.50</strong></td>
650
+ </tr>
651
+ <tr>
652
+ <td colspan="2" style="text-align: left;">VoxPopuli</td>
653
+ <td style="text-align: center;">10.29</td>
654
+ <td style="text-align: center;">11.36</td>
655
+ <td style="text-align: center;">12.12</td>
656
+ <td style="text-align: center;">12.05</td>
657
+ <td style="text-align: center;">-</td>
658
+ <td style="text-align: center;"><strong>9.96</strong></td>
659
+ <td style="text-align: center;"><strong>9.15</strong></td>
660
+ </tr>
661
+ <tr>
662
+ <td colspan="9" style="text-align: left; font-style: italic; border-top: 1px solid #ddd; border-bottom: 1px solid #ddd;">Chinese (zh)</td>
663
+ </tr>
664
+ <tr>
665
+ <td colspan="2" style="text-align: left;">WenetSpeech<br>net | meeting</td>
666
+ <td style="text-align: center;">15.30 | 32.27</td>
667
+ <td style="text-align: center;">14.43 | 13.47</td>
668
+ <td style="text-align: center;">N/A</td>
669
+ <td style="text-align: center;">9.86 | 19.11</td>
670
+ <td style="text-align: center;">6.35 | -</td>
671
+ <td style="text-align: center;">5.97 | 6.88</td>
672
+ <td style="text-align: center;"><strong>4.97</strong> | <strong>5.88</strong></td>
673
+ </tr>
674
+ <tr>
675
+ <td colspan="2" style="text-align: left;">AISHELL-2-test</td>
676
+ <td style="text-align: center;">4.24</td>
677
+ <td style="text-align: center;">11.62</td>
678
+ <td style="text-align: center;">2.85</td>
679
+ <td style="text-align: center;">5.06</td>
680
+ <td style="text-align: center;">-</td>
681
+ <td style="text-align: center;">3.15</td>
682
+ <td style="text-align: center;"><strong>2.71</strong></td>
683
+ </tr>
684
+ <tr>
685
+ <td colspan="2" style="text-align: left;">SpeechIO</td>
686
+ <td style="text-align: center;">12.86</td>
687
+ <td style="text-align: center;">5.30</td>
688
+ <td style="text-align: center;">2.93</td>
689
+ <td style="text-align: center;">7.56</td>
690
+ <td style="text-align: center;">-</td>
691
+ <td style="text-align: center;">3.44</td>
692
+ <td style="text-align: center;"><strong>2.88</strong></td>
693
+ </tr>
694
+ <tr>
695
+ <td colspan="2" style="text-align: left;">Fleurs-zh</td>
696
+ <td style="text-align: center;">2.44</td>
697
+ <td style="text-align: center;">2.71</td>
698
+ <td style="text-align: center;">2.69</td>
699
+ <td style="text-align: center;">4.09</td>
700
+ <td style="text-align: center;">3.51</td>
701
+ <td style="text-align: center;">2.88</td>
702
+ <td style="text-align: center;"><strong>2.41</strong></td>
703
+ </tr>
704
+ <tr>
705
+ <td colspan="2" style="text-align: left;">CV-zh</td>
706
+ <td style="text-align: center;">6.32</td>
707
+ <td style="text-align: center;">7.70</td>
708
+ <td style="text-align: center;">5.95</td>
709
+ <td style="text-align: center;">12.91</td>
710
+ <td style="text-align: center;">6.20</td>
711
+ <td style="text-align: center;">6.89</td>
712
+ <td style="text-align: center;"><strong>5.35</strong></td>
713
+ </tr>
714
+ <tr>
715
+ <td colspan="9" style="text-align: left; font-style: italic; border-top: 1px solid #ddd; border-bottom: 1px solid #ddd;">Chinese Dialect</td>
716
+ </tr>
717
+ <tr>
718
+ <td colspan="2" style="text-align: left;">KeSpeech</td>
719
+ <td style="text-align: center;">26.87</td>
720
+ <td style="text-align: center;">24.71</td>
721
+ <td style="text-align: center;">5.27</td>
722
+ <td style="text-align: center;">28.79</td>
723
+ <td style="text-align: center;">-</td>
724
+ <td style="text-align: center;">7.08</td>
725
+ <td style="text-align: center;"><strong>5.10</strong></td>
726
+ </tr>
727
+ <tr>
728
+ <td colspan="2" style="text-align: left;">Fleurs-yue</td>
729
+ <td style="text-align: center;">4.98</td>
730
+ <td style="text-align: center;">9.43</td>
731
+ <td style="text-align: center;">4.98</td>
732
+ <td style="text-align: center;">9.18</td>
733
+ <td style="text-align: center;">-</td>
734
+ <td style="text-align: center;">5.79</td>
735
+ <td style="text-align: center;"><strong>3.98</strong></td>
736
+ </tr>
737
+ <tr>
738
+ <td colspan="2" style="text-align: left;">CV-yue</td>
739
+ <td style="text-align: center;">11.36</td>
740
+ <td style="text-align: center;">18.76</td>
741
+ <td style="text-align: center;">13.20</td>
742
+ <td style="text-align: center;">16.23</td>
743
+ <td style="text-align: center;">-</td>
744
+ <td style="text-align: center;">9.50</td>
745
+ <td style="text-align: center;"><strong>7.57</strong></td>
746
+ </tr>
747
+ <tr>
748
+ <td colspan="2" style="text-align: left;">CV-zh-tw</td>
749
+ <td style="text-align: center;">6.32</td>
750
+ <td style="text-align: center;">7.31</td>
751
+ <td style="text-align: center;">4.06</td>
752
+ <td style="text-align: center;">7.84</td>
753
+ <td style="text-align: center;">-</td>
754
+ <td style="text-align: center;">5.59</td>
755
+ <td style="text-align: center;"><strong>3.77</strong></td>
756
+ </tr>
757
+ <tr>
758
+ <td colspan="2" style="text-align: left;">WenetSpeech-Yue<br>short | long</td>
759
+ <td style="text-align: center;">15.62 | 25.29</td>
760
+ <td style="text-align: center;">25.19 | 11.23</td>
761
+ <td style="text-align: center;">9.74 | 11.40</td>
762
+ <td style="text-align: center;">32.26 | 46.64</td>
763
+ <td style="text-align: center;">- | -</td>
764
+ <td style="text-align: center;">7.54 | 9.92</td>
765
+ <td style="text-align: center;"><strong>5.82</strong> | <strong>8.85</strong></td>
766
+ </tr>
767
+ <tr>
768
+ <td colspan="2" style="text-align: left;">WenetSpeech-Chuan<br>easy | hard</td>
769
+ <td style="text-align: center;">34.81 | 53.98</td>
770
+ <td style="text-align: center;">43.79 | 67.30</td>
771
+ <td style="text-align: center;"><strong>11.40</strong> | <strong>20.20</strong></td>
772
+ <td style="text-align: center;">14.35 | 26.80</td>
773
+ <td style="text-align: center;">- | -</td>
774
+ <td style="text-align: center;">13.92 | 24.45</td>
775
+ <td style="text-align: center;">11.99 | 21.63</td>
776
+ </tr>
777
+ </tbody>
778
+ </table>
779
+
780
+ </details>
781
+
782
+ <details>
783
+ <summary>ASR Benchmarks on Internal Datasets (WER ↓)</summary>
784
+
785
+ <table>
786
+ <thead>
787
+ <tr>
788
+ <th style="text-align: left;"></th>
789
+ <th style="text-align: center;">GPT-4o<br>-Transcribe</th>
790
+ <th style="text-align: center;">Gemini-2.5<br>-Pro</th>
791
+ <th style="text-align: center;">Doubao-ASR</th>
792
+ <th style="text-align: center;">Whisper<br>-large-v3</th>
793
+ <th style="text-align: center;">Fun-ASR<br>-MLT-Nano</th>
794
+ <th style="text-align: center;">Qwen3-ASR<br>-0.6B</th>
795
+ <th style="text-align: center;">Qwen3-ASR<br>-1.7B</th>
796
+ </tr>
797
+ </thead>
798
+ <tbody>
799
+ <tr>
800
+ <td colspan="8" style="text-align: left; font-style: italic; border-top: 1px solid #ddd; border-bottom: 1px solid #ddd;">Accented English</td>
801
+ </tr>
802
+ <tr>
803
+ <td style="text-align: left;">Dialog-Accented English</td>
804
+ <td style="text-align: center;">28.56</td>
805
+ <td style="text-align: center;">23.85</td>
806
+ <td style="text-align: center;">20.41</td>
807
+ <td style="text-align: center;">21.30</td>
808
+ <td style="text-align: center;">19.96</td>
809
+ <td style="text-align: center;"><strong>16.62</strong></td>
810
+ <td style="text-align: center;"><strong>16.07</strong></td>
811
+ </tr>
812
+ <tr>
813
+ <td colspan="8" style="text-align: left; font-style: italic; border-top: 1px solid #ddd; border-bottom: 1px solid #ddd;">Chinese Mandarin</td>
814
+ </tr>
815
+ <tr>
816
+ <td style="text-align: left;">Elders&Kids</td>
817
+ <td style="text-align: center;">14.27</td>
818
+ <td style="text-align: center;">36.93</td>
819
+ <td style="text-align: center;">4.17</td>
820
+ <td style="text-align: center;">10.61</td>
821
+ <td style="text-align: center;">4.54</td>
822
+ <td style="text-align: center;">4.48</td>
823
+ <td style="text-align: center;"><strong>3.81</strong></td>
824
+ </tr>
825
+ <tr>
826
+ <td style="text-align: left;">ExtremeNoise</td>
827
+ <td style="text-align: center;">36.11</td>
828
+ <td style="text-align: center;">29.06</td>
829
+ <td style="text-align: center;">17.04</td>
830
+ <td style="text-align: center;">63.17</td>
831
+ <td style="text-align: center;">36.55</td>
832
+ <td style="text-align: center;">17.88</td>
833
+ <td style="text-align: center;"><strong>16.17</strong></td>
834
+ </tr>
835
+ <tr>
836
+ <td style="text-align: left;">TongueTwister</td>
837
+ <td style="text-align: center;">20.87</td>
838
+ <td style="text-align: center;">4.97</td>
839
+ <td style="text-align: center;">3.47</td>
840
+ <td style="text-align: center;">16.63</td>
841
+ <td style="text-align: center;">9.02</td>
842
+ <td style="text-align: center;">4.06</td>
843
+ <td style="text-align: center;"><strong>2.44</strong></td>
844
+ </tr>
845
+ <tr>
846
+ <td style="text-align: left;">Dialog-Mandarin</td>
847
+ <td style="text-align: center;">20.73</td>
848
+ <td style="text-align: center;">12.50</td>
849
+ <td style="text-align: center;">6.61</td>
850
+ <td style="text-align: center;">14.01</td>
851
+ <td style="text-align: center;">7.32</td>
852
+ <td style="text-align: center;">7.06</td>
853
+ <td style="text-align: center;"><strong>6.54</strong></td>
854
+ </tr>
855
+ <tr>
856
+ <td colspan="8" style="text-align: left; font-style: italic; border-top: 1px solid #ddd; border-bottom: 1px solid #ddd;">Chinese Dialect</td>
857
+ </tr>
858
+ <tr>
859
+ <td style="text-align: left;">Dialog-Cantonese</td>
860
+ <td style="text-align: center;">16.05</td>
861
+ <td style="text-align: center;">14.98</td>
862
+ <td style="text-align: center;">7.56</td>
863
+ <td style="text-align: center;">31.04</td>
864
+ <td style="text-align: center;">5.85</td>
865
+ <td style="text-align: center;"><strong>4.80</strong></td>
866
+ <td style="text-align: center;"><strong>4.12</strong></td>
867
+ </tr>
868
+ <tr>
869
+ <td style="text-align: left;">Dialog-Chinese Dialects</td>
870
+ <td style="text-align: center;">45.37</td>
871
+ <td style="text-align: center;">47.70</td>
872
+ <td style="text-align: center;">19.85</td>
873
+ <td style="text-align: center;">44.55</td>
874
+ <td style="text-align: center;">19.41</td>
875
+ <td style="text-align: center;"><strong>18.24</strong></td>
876
+ <td style="text-align: center;"><strong>15.94</strong></td>
877
+ </tr>
878
+ </tbody>
879
+ </table>
880
+ <p><strong>Dialect coverage:</strong> Results for <em>Dialog-Accented English</em> are averaged over 16 accents, and results for <em>Dialog-Chinese Dialects</em> are averaged over 22 Chinese dialects.</p>
881
+
882
+ </details>
883
+
884
+ <details>
885
+ <summary>Multilingual ASR Benchmarks (WER ↓)</summary>
886
+
887
+ <table>
888
+ <thead>
889
+ <tr>
890
+ <th style="text-align: left;"></th>
891
+ <th style="text-align: center;">GLM-ASR<br>-Nano-2512</th>
892
+ <th style="text-align: center;">Whisper<br>-large-v3</th>
893
+ <th style="text-align: center;">Fun-ASR<br>-MLT-Nano</th>
894
+ <th style="text-align: center;">Qwen3-ASR<br>-0.6B</th>
895
+ <th style="text-align: center;">Qwen3-ASR<br>-1.7B</th>
896
+ </tr>
897
+ </thead>
898
+ <tbody>
899
+ <tr>
900
+ <td colspan="6" style="text-align: left; font-style: italic; border-top: 1px solid #ddd; border-bottom: 1px solid #ddd;">Open-sourced Benchmarks</td>
901
+ </tr>
902
+ <tr>
903
+ <td style="text-align: left;">MLS</td>
904
+ <td style="text-align: center;">13.32</td>
905
+ <td style="text-align: center;">8.62</td>
906
+ <td style="text-align: center;">28.70</td>
907
+ <td style="text-align: center;">13.19</td>
908
+ <td style="text-align: center;"><strong>8.55</strong></td>
909
+ </tr>
910
+ <tr>
911
+ <td style="text-align: left;">CommonVoice</td>
912
+ <td style="text-align: center;">19.40</td>
913
+ <td style="text-align: center;">10.77</td>
914
+ <td style="text-align: center;">17.25</td>
915
+ <td style="text-align: center;">12.75</td>
916
+ <td style="text-align: center;"><strong>9.18</strong></td>
917
+ </tr>
918
+ <tr>
919
+ <td style="text-align: left;">MLC-SLM</td>
920
+ <td style="text-align: center;">34.93</td>
921
+ <td style="text-align: center;">15.68</td>
922
+ <td style="text-align: center;">29.94</td>
923
+ <td style="text-align: center;">15.84</td>
924
+ <td style="text-align: center;"><strong>12.74</strong></td>
925
+ </tr>
926
+ <tr>
927
+ <td style="text-align: left;">Fleurs</td>
928
+ <td style="text-align: center;">16.08</td>
929
+ <td style="text-align: center;">5.27</td>
930
+ <td style="text-align: center;">10.03</td>
931
+ <td style="text-align: center;">7.57</td>
932
+ <td style="text-align: center;"><strong>4.90</strong></td>
933
+ </tr>
934
+ <tr>
935
+ <td style="text-align: left;">Fleurs<sup>†</sup></td>
936
+ <td style="text-align: center;">20.05</td>
937
+ <td style="text-align: center;">6.85</td>
938
+ <td style="text-align: center;">31.89</td>
939
+ <td style="text-align: center;">10.37</td>
940
+ <td style="text-align: center;"><strong>6.62</strong></td>
941
+ </tr>
942
+ <tr>
943
+ <td style="text-align: left;">Fleurs<sup>††</sup></td>
944
+ <td style="text-align: center;">24.83</td>
945
+ <td style="text-align: center;"><strong>8.16</strong></td>
946
+ <td style="text-align: center;">47.84</td>
947
+ <td style="text-align: center;">21.80</td>
948
+ <td style="text-align: center;">12.60</td>
949
+ </tr>
950
+ <tr>
951
+ <td colspan="6" style="text-align: left; font-style: italic; border-top: 1px solid #ddd; border-bottom: 1px solid #ddd;">Qwen-ASR Internal Benchmarks</td>
952
+ </tr>
953
+ <tr>
954
+ <td style="text-align: left;">News-Multilingual</td>
955
+ <td style="text-align: center;">49.40</td>
956
+ <td style="text-align: center;">14.80</td>
957
+ <td style="text-align: center;">65.07</td>
958
+ <td style="text-align: center;">17.39</td>
959
+ <td style="text-align: center;"><strong>12.80</strong></td>
960
+ </tr>
961
+ </tbody>
962
+ </table>
963
+ <p><strong>Language coverage:</strong> <em>MLS</em> includes 8 languages: {da, de, en, es, fr, it, pl, pt}.<br><em>CommonVoice</em> includes 13 languages: {en, zh, yue, zh_TW, ar, de, es, fr, it, ja, ko, pt, ru}.<br><em>MLC-SLM</em> includes 11 languages: {en, fr, de, it, pt, es, ja, ko, ru, th, vi}.<br><em>Fleurs</em> includes 12 languages: {en, zh, yue, ar, de, es, fr, it, ja, ko, pt, ru }.<br><em>Fleurs<sup>†</sup></em> includes 8 additional languages beyond Fleurs: {hi, id, ms, nl, pl, th, tr, vi}.<br><em>Fleurs<sup>††</sup></em> includes 10 additional languages beyond Fleurs<sup>†</sup>: {cs, da, el, fa, fi, fil, hu, mk, ro, sv}.<br><em>News-Multilingual</em> includes 15 languages: {ar, de, es, fr, hi, id, it, ja, ko, nl, pl, pt, ru, th, vi}.</p>
964
+
965
+ </details>
966
+
967
+ <details>
968
+ <summary>Language Identification Accuracy (%) ↑</summary>
969
+
970
+ <table>
971
+ <thead>
972
+ <tr>
973
+ <th style="text-align: left;"></th>
974
+ <th style="text-align: center;">Whisper-large-v3</th>
975
+ <th style="text-align: center;">Qwen3-ASR-0.6B</th>
976
+ <th style="text-align: center;">Qwen3-ASR-1.7B</th>
977
+ </tr>
978
+ </thead>
979
+ <tbody>
980
+ <tr>
981
+ <td style="text-align: left;">MLS</td>
982
+ <td style="text-align: center;"><strong>99.9</strong></td>
983
+ <td style="text-align: center;">99.3</td>
984
+ <td style="text-align: center;"><strong>99.9</strong></td>
985
+ </tr>
986
+ <tr>
987
+ <td style="text-align: left;">CommonVoice</td>
988
+ <td style="text-align: center;">92.7</td>
989
+ <td style="text-align: center;"><strong>98.2</strong></td>
990
+ <td style="text-align: center;"><strong>98.7</strong></td>
991
+ </tr>
992
+ <tr>
993
+ <td style="text-align: left;">MLC-SLM</td>
994
+ <td style="text-align: center;">89.2</td>
995
+ <td style="text-align: center;"><strong>92.7</strong></td>
996
+ <td style="text-align: center;"><strong>94.1</strong></td>
997
+ </tr>
998
+ <tr>
999
+ <td style="text-align: left;">Fleurs</td>
1000
+ <td style="text-align: center;">94.6</td>
1001
+ <td style="text-align: center;"><strong>97.1</strong></td>
1002
+ <td style="text-align: center;"><strong>98.7</strong></td>
1003
+ </tr>
1004
+ <tr style="border-top: 1px solid #ddd;">
1005
+ <td style="text-align: left;"><em>Avg.</em></td>
1006
+ <td style="text-align: center;">94.1</td>
1007
+ <td style="text-align: center;"><strong>96.8</strong></td>
1008
+ <td style="text-align: center;"><strong>97.9</strong></td>
1009
+ </tr>
1010
+ </tbody>
1011
+ </table>
1012
+ <p><strong>Language coverage:</strong> The language sets follow Multilingual ASR Benchmarks. Here, Fleurs corresponds to Fleurs<sup>††</sup> in Multilingual ASR Benchmarks and covers 30 languages.</p>
1013
+
1014
+ </details>
1015
+
1016
+ <details>
1017
+ <summary>Singing Voice & Song Transcription (WER ↓)</summary>
1018
+
1019
+ <table>
1020
+ <thead>
1021
+ <tr>
1022
+ <th style="text-align: left;"></th>
1023
+ <th style="text-align: center;">GPT-4o<br>-Transcribe</th>
1024
+ <th style="text-align: center;">Gemini-2.5<br>-Pro</th>
1025
+ <th style="text-align: center;">Doubao-ASR<br>-1.0</th>
1026
+ <th style="text-align: center;">Whisper<br>-large-v3</th>
1027
+ <th style="text-align: center;">Fun-ASR-MLT<br>-Nano</th>
1028
+ <th style="text-align: center;">Qwen3-ASR<br>-1.7B</th>
1029
+ </tr>
1030
+ </thead>
1031
+ <tbody>
1032
+ <tr>
1033
+ <td colspan="7" style="text-align: left; font-style: italic; border-top: 1px solid #ddd; border-bottom: 1px solid #ddd;">Singing</td>
1034
+ </tr>
1035
+ <tr>
1036
+ <td style="text-align: left;">M4Singer</td>
1037
+ <td style="text-align: center;">16.77</td>
1038
+ <td style="text-align: center;">20.88</td>
1039
+ <td style="text-align: center;">7.88</td>
1040
+ <td style="text-align: center;">13.58</td>
1041
+ <td style="text-align: center;">7.29</td>
1042
+ <td style="text-align: center;"><strong>5.98</strong></td>
1043
+ </tr>
1044
+ <tr>
1045
+ <td style="text-align: left;">MIR-1k-vocal</td>
1046
+ <td style="text-align: center;">11.87</td>
1047
+ <td style="text-align: center;">9.85</td>
1048
+ <td style="text-align: center;">6.56</td>
1049
+ <td style="text-align: center;">11.71</td>
1050
+ <td style="text-align: center;">8.17</td>
1051
+ <td style="text-align: center;"><strong>6.25</strong></td>
1052
+ </tr>
1053
+ <tr>
1054
+ <td style="text-align: left;">Opencpop</td>
1055
+ <td style="text-align: center;">7.93</td>
1056
+ <td style="text-align: center;">6.49</td>
1057
+ <td style="text-align: center;">3.80</td>
1058
+ <td style="text-align: center;">9.52</td>
1059
+ <td style="text-align: center;"><strong>2.98</strong></td>
1060
+ <td style="text-align: center;">3.08</td>
1061
+ </tr>
1062
+ <tr>
1063
+ <td style="text-align: left;">Popcs</td>
1064
+ <td style="text-align: center;">32.84</td>
1065
+ <td style="text-align: center;">15.13</td>
1066
+ <td style="text-align: center;">8.97</td>
1067
+ <td style="text-align: center;">13.77</td>
1068
+ <td style="text-align: center;">9.42</td>
1069
+ <td style="text-align: center;"><strong>8.52</strong></td>
1070
+ </tr>
1071
+ <tr>
1072
+ <td colspan="7" style="text-align: left; font-style: italic; border-top: 1px solid #ddd; border-bottom: 1px solid #ddd;">Songs with BGM</td>
1073
+ </tr>
1074
+ <tr>
1075
+ <td style="text-align: left;">EntireSongs-en</td>
1076
+ <td style="text-align: center;">30.71</td>
1077
+ <td style="text-align: center;"><strong>12.18</strong></td>
1078
+ <td style="text-align: center;">33.51</td>
1079
+ <td style="text-align: center;">N/A</td>
1080
+ <td style="text-align: center;">N/A</td>
1081
+ <td style="text-align: center;">14.60</td>
1082
+ </tr>
1083
+ <tr>
1084
+ <td style="text-align: left;">EntireSongs-zh</td>
1085
+ <td style="text-align: center;">34.86</td>
1086
+ <td style="text-align: center;">18.68</td>
1087
+ <td style="text-align: center;">23.99</td>
1088
+ <td style="text-align: center;">N/A</td>
1089
+ <td style="text-align: center;">N/A</td>
1090
+ <td style="text-align: center;"><strong>13.91</strong></td>
1091
+ </tr>
1092
+ </tbody>
1093
+ </table>
1094
+
1095
+ </details>
1096
+
1097
+ <details>
1098
+ <summary>ASR Inference Mode Performance (WER ↓)</summary>
1099
+
1100
+ <table>
1101
+ <thead>
1102
+ <tr>
1103
+ <th style="text-align: left;">Model</th>
1104
+ <th style="text-align: left;">Infer. Mode</th>
1105
+ <th style="text-align: center;">Librispeech</th>
1106
+ <th style="text-align: center;">Fleurs-en</th>
1107
+ <th style="text-align: center;">Fleurs-zh</th>
1108
+ <th style="text-align: center;">Avg.</th>
1109
+ </tr>
1110
+ </thead>
1111
+ <tbody>
1112
+ <tr>
1113
+ <td rowspan="2" style="text-align: left; vertical-align: middle;">Qwen3-ASR-1.7B</td>
1114
+ <td style="text-align: left;">Offline</td>
1115
+ <td style="text-align: center;">1.63 | 3.38</td>
1116
+ <td style="text-align: center;">3.35</td>
1117
+ <td style="text-align: center;">2.41</td>
1118
+ <td style="text-align: center;">2.69</td>
1119
+ </tr>
1120
+ <tr>
1121
+ <td style="text-align: left;">Streaming</td>
1122
+ <td style="text-align: center;">1.95 | 4.51</td>
1123
+ <td style="text-align: center;">4.02</td>
1124
+ <td style="text-align: center;">2.84</td>
1125
+ <td style="text-align: center;">3.33</td>
1126
+ </tr>
1127
+ <tr style="border-top: 1px solid #ddd;">
1128
+ <td rowspan="2" style="text-align: left; vertical-align: middle;">Qwen3-ASR-0.6B</td>
1129
+ <td style="text-align: left;">Offline</td>
1130
+ <td style="text-align: center;">2.11 | 4.55</td>
1131
+ <td style="text-align: center;">4.39</td>
1132
+ <td style="text-align: center;">2.88</td>
1133
+ <td style="text-align: center;">3.48</td>
1134
+ </tr>
1135
+ <tr>
1136
+ <td style="text-align: left;">Streaming</td>
1137
+ <td style="text-align: center;">2.54 | 6.27</td>
1138
+ <td style="text-align: center;">5.38</td>
1139
+ <td style="text-align: center;">3.40</td>
1140
+ <td style="text-align: center;">4.40</td>
1141
+ </tr>
1142
+ </tbody>
1143
+ </table>
1144
+
1145
+ </details>
1146
+
1147
+ <details>
1148
+ <summary>Forced Alignment Benchmarks (AAS ms ↓)</summary>
1149
+
1150
+ <table>
1151
+ <thead>
1152
+ <tr>
1153
+ <th style="text-align: left;"></th>
1154
+ <th style="text-align: center;">Monotonic-Aligner</th>
1155
+ <th style="text-align: center;">NFA</th>
1156
+ <th style="text-align: center;">WhisperX</th>
1157
+ <th style="text-align: center;">Qwen3-ForcedAligner-0.6B</th>
1158
+ </tr>
1159
+ </thead>
1160
+ <tbody>
1161
+ <tr>
1162
+ <td colspan="5" style="text-align: left; font-style: italic; border-top: 1px solid #ddd; border-bottom: 1px solid #ddd;">MFA-Labeled Raw</td>
1163
+ </tr>
1164
+ <tr>
1165
+ <td style="text-align: left;">Chinese</td>
1166
+ <td style="text-align: center;">161.1</td>
1167
+ <td style="text-align: center;">109.8</td>
1168
+ <td style="text-align: center;">-</td>
1169
+ <td style="text-align: center;"><strong>33.1</strong></td>
1170
+ </tr>
1171
+ <tr>
1172
+ <td style="text-align: left;">English</td>
1173
+ <td style="text-align: center;">-</td>
1174
+ <td style="text-align: center;">107.5</td>
1175
+ <td style="text-align: center;">92.1</td>
1176
+ <td style="text-align: center;"><strong>37.5</strong></td>
1177
+ </tr>
1178
+ <tr>
1179
+ <td style="text-align: left;">French</td>
1180
+ <td style="text-align: center;">-</td>
1181
+ <td style="text-align: center;">100.7</td>
1182
+ <td style="text-align: center;">145.3</td>
1183
+ <td style="text-align: center;"><strong>41.7</strong></td>
1184
+ </tr>
1185
+ <tr>
1186
+ <td style="text-align: left;">German</td>
1187
+ <td style="text-align: center;">-</td>
1188
+ <td style="text-align: center;">122.7</td>
1189
+ <td style="text-align: center;">165.1</td>
1190
+ <td style="text-align: center;"><strong>46.5</strong></td>
1191
+ </tr>
1192
+ <tr>
1193
+ <td style="text-align: left;">Italian</td>
1194
+ <td style="text-align: center;">-</td>
1195
+ <td style="text-align: center;">142.7</td>
1196
+ <td style="text-align: center;">155.5</td>
1197
+ <td style="text-align: center;"><strong>75.5</strong></td>
1198
+ </tr>
1199
+ <tr>
1200
+ <td style="text-align: left;">Japanese</td>
1201
+ <td style="text-align: center;">-</td>
1202
+ <td style="text-align: center;">-</td>
1203
+ <td style="text-align: center;">-</td>
1204
+ <td style="text-align: center;"><strong>42.2</strong></td>
1205
+ </tr>
1206
+ <tr>
1207
+ <td style="text-align: left;">Korean</td>
1208
+ <td style="text-align: center;">-</td>
1209
+ <td style="text-align: center;">-</td>
1210
+ <td style="text-align: center;">-</td>
1211
+ <td style="text-align: center;"><strong>37.2</strong></td>
1212
+ </tr>
1213
+ <tr>
1214
+ <td style="text-align: left;">Portuguese</td>
1215
+ <td style="text-align: center;">-</td>
1216
+ <td style="text-align: center;">-</td>
1217
+ <td style="text-align: center;">-</td>
1218
+ <td style="text-align: center;"><strong>38.4</strong></td>
1219
+ </tr>
1220
+ <tr>
1221
+ <td style="text-align: left;">Russian</td>
1222
+ <td style="text-align: center;">-</td>
1223
+ <td style="text-align: center;">200.7</td>
1224
+ <td style="text-align: center;">-</td>
1225
+ <td style="text-align: center;"><strong>40.2</strong></td>
1226
+ </tr>
1227
+ <tr>
1228
+ <td style="text-align: left;">Spanish</td>
1229
+ <td style="text-align: center;">-</td>
1230
+ <td style="text-align: center;">124.7</td>
1231
+ <td style="text-align: center;">108.0</td>
1232
+ <td style="text-align: center;"><strong>36.8</strong></td>
1233
+ </tr>
1234
+ <tr>
1235
+ <td style="text-align: left;"><em>Avg.</em></td>
1236
+ <td style="text-align: center;">161.1</td>
1237
+ <td style="text-align: center;">129.8</td>
1238
+ <td style="text-align: center;">133.2</td>
1239
+ <td style="text-align: center;"><strong>42.9</strong></td>
1240
+ </tr>
1241
+ <tr>
1242
+ <td colspan="5" style="text-align: left; font-style: italic; border-top: 1px solid #ddd; border-bottom: 1px solid #ddd;">MFA-Labeled Concat-300s</td>
1243
+ </tr>
1244
+ <tr>
1245
+ <td style="text-align: left;">Chinese</td>
1246
+ <td style="text-align: center;">1742.4</td>
1247
+ <td style="text-align: center;">235.0</td>
1248
+ <td style="text-align: center;">-</td>
1249
+ <td style="text-align: center;"><strong>36.5</strong></td>
1250
+ </tr>
1251
+ <tr>
1252
+ <td style="text-align: left;">English</td>
1253
+ <td style="text-align: center;">-</td>
1254
+ <td style="text-align: center;">226.7</td>
1255
+ <td style="text-align: center;">227.2</td>
1256
+ <td style="text-align: center;"><strong>58.6</strong></td>
1257
+ </tr>
1258
+ <tr>
1259
+ <td style="text-align: left;">French</td>
1260
+ <td style="text-align: center;">-</td>
1261
+ <td style="text-align: center;">230.6</td>
1262
+ <td style="text-align: center;">2052.2</td>
1263
+ <td style="text-align: center;"><strong>53.4</strong></td>
1264
+ </tr>
1265
+ <tr>
1266
+ <td style="text-align: left;">German</td>
1267
+ <td style="text-align: center;">-</td>
1268
+ <td style="text-align: center;">220.3</td>
1269
+ <td style="text-align: center;">993.4</td>
1270
+ <td style="text-align: center;"><strong>62.4</strong></td>
1271
+ </tr>
1272
+ <tr>
1273
+ <td style="text-align: left;">Italian</td>
1274
+ <td style="text-align: center;">-</td>
1275
+ <td style="text-align: center;">290.5</td>
1276
+ <td style="text-align: center;">5719.4</td>
1277
+ <td style="text-align: center;"><strong>81.6</strong></td>
1278
+ </tr>
1279
+ <tr>
1280
+ <td style="text-align: left;">Japanese</td>
1281
+ <td style="text-align: center;">-</td>
1282
+ <td style="text-align: center;">-</td>
1283
+ <td style="text-align: center;">-</td>
1284
+ <td style="text-align: center;"><strong>81.3</strong></td>
1285
+ </tr>
1286
+ <tr>
1287
+ <td style="text-align: left;">Korean</td>
1288
+ <td style="text-align: center;">-</td>
1289
+ <td style="text-align: center;">-</td>
1290
+ <td style="text-align: center;">-</td>
1291
+ <td style="text-align: center;"><strong>42.2</strong></td>
1292
+ </tr>
1293
+ <tr>
1294
+ <td style="text-align: left;">Portuguese</td>
1295
+ <td style="text-align: center;">-</td>
1296
+ <td style="text-align: center;">-</td>
1297
+ <td style="text-align: center;">-</td>
1298
+ <td style="text-align: center;"><strong>50.0</strong></td>
1299
+ </tr>
1300
+ <tr>
1301
+ <td style="text-align: left;">Russian</td>
1302
+ <td style="text-align: center;">-</td>
1303
+ <td style="text-align: center;">283.3</td>
1304
+ <td style="text-align: center;">-</td>
1305
+ <td style="text-align: center;"><strong>43.0</strong></td>
1306
+ </tr>
1307
+ <tr>
1308
+ <td style="text-align: left;">Spanish</td>
1309
+ <td style="text-align: center;">-</td>
1310
+ <td style="text-align: center;">240.2</td>
1311
+ <td style="text-align: center;">4549.9</td>
1312
+ <td style="text-align: center;"><strong>39.6</strong></td>
1313
+ </tr>
1314
+ <tr>
1315
+ <td style="text-align: left;">Cross-lingual</td>
1316
+ <td style="text-align: center;">-</td>
1317
+ <td style="text-align: center;">-</td>
1318
+ <td style="text-align: center;">-</td>
1319
+ <td style="text-align: center;"><strong>34.2</strong></td>
1320
+ </tr>
1321
+ <tr>
1322
+ <td style="text-align: left;"><em>Avg.</em></td>
1323
+ <td style="text-align: center;">1742.4</td>
1324
+ <td style="text-align: center;">246.7</td>
1325
+ <td style="text-align: center;">2708.4</td>
1326
+ <td style="text-align: center;"><strong>52.9</strong></td>
1327
+ </tr>
1328
+ <tr>
1329
+ <td colspan="5" style="text-align: left; font-style: italic; border-top: 1px solid #ddd; border-bottom: 1px solid #ddd;">Human-Labeled</td>
1330
+ </tr>
1331
+ <tr>
1332
+ <td style="text-align: left;">Raw</td>
1333
+ <td style="text-align: center;">49.9</td>
1334
+ <td style="text-align: center;">88.6</td>
1335
+ <td style="text-align: center;">-</td>
1336
+ <td style="text-align: center;"><strong>27.8</strong></td>
1337
+ </tr>
1338
+ <tr>
1339
+ <td style="text-align: left;">Raw-Noisy</td>
1340
+ <td style="text-align: center;">53.3</td>
1341
+ <td style="text-align: center;">89.5</td>
1342
+ <td style="text-align: center;">-</td>
1343
+ <td style="text-align: center;"><strong>41.8</strong></td>
1344
+ </tr>
1345
+ <tr>
1346
+ <td style="text-align: left;">Concat-60s</td>
1347
+ <td style="text-align: center;">51.1</td>
1348
+ <td style="text-align: center;">86.7</td>
1349
+ <td style="text-align: center;">-</td>
1350
+ <td style="text-align: center;"><strong>25.3</strong></td>
1351
+ </tr>
1352
+ <tr>
1353
+ <td style="text-align: left;">Concat-300s</td>
1354
+ <td style="text-align: center;">410.8</td>
1355
+ <td style="text-align: center;">140.0</td>
1356
+ <td style="text-align: center;">-</td>
1357
+ <td style="text-align: center;"><strong>24.8</strong></td>
1358
+ </tr>
1359
+ <tr>
1360
+ <td style="text-align: left;">Concat-Cross-lingual</td>
1361
+ <td style="text-align: center;">-</td>
1362
+ <td style="text-align: center;">-</td>
1363
+ <td style="text-align: center;">-</td>
1364
+ <td style="text-align: center;"><strong>42.5</strong></td>
1365
+ </tr>
1366
+ <tr>
1367
+ <td style="text-align: left;"><em>Avg.</em></td>
1368
+ <td style="text-align: center;">141.3</td>
1369
+ <td style="text-align: center;">101.2</td>
1370
+ <td style="text-align: center;">-</td>
1371
+ <td style="text-align: center;"><strong>32.4</strong></td>
1372
+ </tr>
1373
+ </tbody>
1374
+ </table>
1375
+
1376
+ </details>
1377
+
1378
+
1379
+ ## Citation
1380
+
1381
+ If you find our paper and code useful in your research, please consider giving a star :star: and citation :pencil: :)
1382
+
1383
+ ```BibTeX
1384
+ @article{Qwen3-ASR,
1385
+ title={Qwen3-ASR Technical Report},
1386
+ author={Xian Shi and Xiong Wang and Zhifang Guo and Yongqi Wang and Pei Zhang and Xinyu Zhang and Zishan Guo and Hongkun Hao and Yu Xi and Baosong Yang and Jin Xu and Jingren Zhou and Junyang Lin},
1387
+ journal={arXiv preprint arXiv:2601.21337},
1388
+ year={2026}
1389
+ }
1390
+ ```
1391
+
1392
+
1393
+ <br>
Qwen3-ForcedAligner-0.6B/chat_template.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"chat_template": "{%- set ns = namespace(system_text=\"\") -%}\n{%- for m in messages -%}\n {%- if m.role == 'system' -%}\n {%- if m.content is string -%}\n {%- set ns.system_text = ns.system_text + m.content -%}\n {%- else -%}\n {%- for c in m.content -%}\n {%- if c.type == 'text' and (c.text is defined) -%}\n {%- set ns.system_text = ns.system_text + c.text -%}\n {%- endif -%}\n {%- endfor -%}\n {%- endif -%}\n {%- endif -%}\n{%- endfor -%}\n\n{%- set ns2 = namespace(audio_tokens=\"\") -%}\n{%- for m in messages -%}\n {%- if m.content is not string -%}\n {%- for c in m.content -%}\n {%- if c.type == 'audio' or ('audio' in c) or ('audio_url' in c) -%}\n {%- set ns2.audio_tokens = ns2.audio_tokens + \"<|audio_start|><|audio_pad|><|audio_end|>\" -%}\n {%- endif -%}\n {%- endfor -%}\n {%- endif -%}\n{%- endfor -%}\n\n{{- '<|im_start|>system\\n' + (ns.system_text if ns.system_text is string else '') + '<|im_end|>\\n' -}}\n{{- '<|im_start|>user\\n' + ns2.audio_tokens + '<|im_end|>\\n' -}}\n{%- if add_generation_prompt -%}\n{{- '<|im_start|>assistant\\n' -}}\n{%- endif -%}"}
Qwen3-ForcedAligner-0.6B/config.json ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3ASRForConditionalGeneration"
4
+ ],
5
+ "timestamp_token_id": 151705,
6
+ "timestamp_segment_time": 80,
7
+ "model_type": "qwen3_asr",
8
+ "support_languages": [
9
+ "Chinese",
10
+ "Cantonese",
11
+ "English",
12
+ "German",
13
+ "Spanish",
14
+ "French",
15
+ "Italian",
16
+ "Portuguese",
17
+ "Russian",
18
+ "Korean",
19
+ "Japanese"
20
+ ],
21
+ "thinker_config": {
22
+ "model_type": "qwen3_forced_aligner",
23
+ "classify_num": 5000,
24
+ "architectures": [
25
+ "Qwen3ASRForConditionalGeneration"
26
+ ],
27
+ "audio_config": {
28
+ "_name_or_path": "",
29
+ "activation_dropout": 0,
30
+ "activation_function": "gelu",
31
+ "add_cross_attention": false,
32
+ "architectures": null,
33
+ "attention_dropout": 0,
34
+ "bad_words_ids": null,
35
+ "begin_suppress_tokens": null,
36
+ "bos_token_id": null,
37
+ "chunk_size_feed_forward": 0,
38
+ "conv_chunksize": 500,
39
+ "cross_attention_hidden_size": null,
40
+ "d_model": 1024,
41
+ "decoder_start_token_id": null,
42
+ "diversity_penalty": 0.0,
43
+ "do_sample": false,
44
+ "downsample_hidden_size": 480,
45
+ "dropout": 0,
46
+ "dtype": null,
47
+ "early_stopping": false,
48
+ "encoder_attention_heads": 16,
49
+ "encoder_ffn_dim": 4096,
50
+ "encoder_layers": 24,
51
+ "encoder_no_repeat_ngram_size": 0,
52
+ "eos_token_id": null,
53
+ "exponential_decay_length_penalty": null,
54
+ "finetuning_task": null,
55
+ "forced_bos_token_id": null,
56
+ "forced_eos_token_id": null,
57
+ "id2label": {
58
+ "0": "LABEL_0",
59
+ "1": "LABEL_1"
60
+ },
61
+ "initializer_range": 0.02,
62
+ "is_decoder": false,
63
+ "is_encoder_decoder": false,
64
+ "label2id": {
65
+ "LABEL_0": 0,
66
+ "LABEL_1": 1
67
+ },
68
+ "length_penalty": 1.0,
69
+ "max_length": 20,
70
+ "max_source_positions": 1500,
71
+ "min_length": 0,
72
+ "model_type": "qwen3_asr_audio_encoder",
73
+ "n_window": 50,
74
+ "n_window_infer": 800,
75
+ "no_repeat_ngram_size": 0,
76
+ "num_beam_groups": 1,
77
+ "num_beams": 1,
78
+ "num_hidden_layers": 24,
79
+ "num_mel_bins": 128,
80
+ "num_return_sequences": 1,
81
+ "output_attentions": false,
82
+ "output_dim": 1024,
83
+ "output_hidden_states": false,
84
+ "output_scores": false,
85
+ "pad_token_id": null,
86
+ "prefix": null,
87
+ "problem_type": null,
88
+ "pruned_heads": {},
89
+ "remove_invalid_values": false,
90
+ "repetition_penalty": 1.0,
91
+ "return_dict": true,
92
+ "return_dict_in_generate": false,
93
+ "scale_embedding": false,
94
+ "sep_token_id": null,
95
+ "suppress_tokens": null,
96
+ "task_specific_params": null,
97
+ "temperature": 1.0,
98
+ "tf_legacy_loss": false,
99
+ "tie_encoder_decoder": false,
100
+ "tie_word_embeddings": false,
101
+ "tokenizer_class": null,
102
+ "top_k": 50,
103
+ "top_p": 1.0,
104
+ "torchscript": false,
105
+ "typical_p": 1.0,
106
+ "use_bfloat16": false
107
+ },
108
+ "audio_end_token_id": 151670,
109
+ "audio_start_token_id": 151669,
110
+ "audio_token_id": 151676,
111
+ "dtype": "bfloat16",
112
+ "initializer_range": 0.02,
113
+ "text_config": {
114
+ "_name_or_path": "",
115
+ "add_cross_attention": false,
116
+ "architectures": null,
117
+ "attention_bias": false,
118
+ "attention_dropout": 0.0,
119
+ "bad_words_ids": null,
120
+ "begin_suppress_tokens": null,
121
+ "bos_token_id": null,
122
+ "chunk_size_feed_forward": 0,
123
+ "cross_attention_hidden_size": null,
124
+ "decoder_start_token_id": null,
125
+ "diversity_penalty": 0.0,
126
+ "do_sample": false,
127
+ "dtype": null,
128
+ "early_stopping": false,
129
+ "encoder_no_repeat_ngram_size": 0,
130
+ "eos_token_id": null,
131
+ "exponential_decay_length_penalty": null,
132
+ "finetuning_task": null,
133
+ "forced_bos_token_id": null,
134
+ "forced_eos_token_id": null,
135
+ "head_dim": 128,
136
+ "hidden_act": "silu",
137
+ "hidden_size": 1024,
138
+ "id2label": {
139
+ "0": "LABEL_0",
140
+ "1": "LABEL_1"
141
+ },
142
+ "initializer_range": 0.02,
143
+ "intermediate_size": 3072,
144
+ "is_decoder": false,
145
+ "is_encoder_decoder": false,
146
+ "label2id": {
147
+ "LABEL_0": 0,
148
+ "LABEL_1": 1
149
+ },
150
+ "length_penalty": 1.0,
151
+ "max_length": 20,
152
+ "max_position_embeddings": 8192,
153
+ "min_length": 0,
154
+ "model_type": "qwen3",
155
+ "no_repeat_ngram_size": 0,
156
+ "num_attention_heads": 16,
157
+ "num_beam_groups": 1,
158
+ "num_beams": 1,
159
+ "num_hidden_layers": 28,
160
+ "num_key_value_heads": 8,
161
+ "num_return_sequences": 1,
162
+ "output_attentions": false,
163
+ "output_hidden_states": false,
164
+ "output_scores": false,
165
+ "pad_token_id": null,
166
+ "prefix": null,
167
+ "problem_type": null,
168
+ "pruned_heads": {},
169
+ "remove_invalid_values": false,
170
+ "repetition_penalty": 1.0,
171
+ "return_dict": true,
172
+ "return_dict_in_generate": false,
173
+ "rms_norm_eps": 1e-06,
174
+ "rope_scaling": {
175
+ "interleaved": true,
176
+ "mrope_interleaved": true,
177
+ "mrope_section": [
178
+ 24,
179
+ 20,
180
+ 20
181
+ ],
182
+ "rope_type": "default",
183
+ "type": "default"
184
+ },
185
+ "rope_theta": 1000000,
186
+ "sep_token_id": null,
187
+ "suppress_tokens": null,
188
+ "task_specific_params": null,
189
+ "temperature": 1.0,
190
+ "tf_legacy_loss": false,
191
+ "tie_encoder_decoder": false,
192
+ "tie_word_embeddings": false,
193
+ "tokenizer_class": null,
194
+ "top_k": 50,
195
+ "top_p": 1.0,
196
+ "torchscript": false,
197
+ "typical_p": 1.0,
198
+ "use_bfloat16": false,
199
+ "use_cache": true,
200
+ "vocab_size": 152064
201
+ }
202
+ },
203
+ "transformers_version": "4.57.6"
204
+ }
205
+
Qwen3-ForcedAligner-0.6B/generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "eos_token_id": [151643,151645],
4
+ "pad_token_id": 151643,
5
+ "do_sample": false
6
+ }
Qwen3-ForcedAligner-0.6B/issues.txt ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ----------------------------------------------------------------------
2
+ #3 chatllm.cpp gets support of this model
3
+ ----------------------------------------------------------------------
4
+
5
+ [J22] Feb 1, 2026
6
+
7
+ chatllm.cpp supports Qwen3-ASR & Qwen3-ForcedAligner models.
8
+
9
+ main.exe -m .../qwen3-focedaligner-0.6b.bin --multimedia_file_tags {{ }} -i --set delimiter "|"
10
+ ________ __ __ __ __ ___
11
+ / ____/ /_ ____ _/ /_/ / / / / |/ /_________ ____
12
+ / / / __ \/ __ `/ __/ / / / / /|_/ // ___/ __ \/ __ \
13
+ / /___/ / / / /_/ / /_/ /___/ /___/ / / // /__/ /_/ / /_/ /
14
+ \____/_/ /_/\__,_/\__/_____/_____/_/ /_(_)___/ .___/ .___/
15
+ You are served by Qwen3-ForcedAligner, /_/ /_/
16
+ with 601300992 (0.6B) parameters.
17
+
18
+ You > {{audio:...\Downloads\asr_zh.wav}}甚至|出现交易|几乎停滞|的情况。
19
+ A.I. > 0
20
+ 00:00:00,400 --> 00:00:00,960
21
+ 甚至
22
+
23
+ 1
24
+ 00:00:00,960 --> 00:00:02,000
25
+ 出现交易
26
+
27
+ 2
28
+ 00:00:02,000 --> 00:00:02,879
29
+ 几乎停滞
30
+
31
+ 3
32
+ 00:00:02,879 --> 00:00:03,680
33
+ 的情况。
34
+
35
+ [lbarasc] Feb 10, 2026
36
+
37
+ i want to use this on windows, i downloaded the latest chatllm and a mp3 speech of Obama for example, but i don't have any text in output !
38
+ please submit links to download chatllm, qwen3-focedaligner-0.6b.bin and the .wav.
39
+ can i forgot something ?
40
+ thank you
41
+
42
+ [J22] Feb 11, 2026
43
+
44
+ @lbarasc, you can find quantized models here: https://modelscope.cn/models/judd2024/chatllm_quantized_qwen3/files
45
+
46
+ Or, you can quantize them on your own by using convert.py.
Qwen3-ForcedAligner-0.6B/languages.txt ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Chinese + 22 Chinese dialects
2
+ English
3
+ Yue Chinese
4
+ Arabic
5
+ German
6
+ French
7
+ Spanish
8
+ Portuguese
9
+ Indonesian
10
+ Italian
11
+ Korean
12
+ Russian
13
+ Thai
14
+ Vietnamese
15
+ Japanese
16
+ Turkish
17
+ Hindi
18
+ Malay
19
+ Dutch
20
+ Swedish
21
+ Danish
22
+ Finnish
23
+ Polish
24
+ Czech
25
+ Filipino
26
+ Persian
27
+ Greek
28
+ Hungarian
29
+ Macedonian
30
+ Romanian
Qwen3-ForcedAligner-0.6B/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
Qwen3-ForcedAligner-0.6B/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47831d0e82f96b20e9034dba01a075ee06436654719f6a68289e49f1b65ce0e7
3
+ size 1835544544
Qwen3-ForcedAligner-0.6B/preprocessor_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "chunk_length": 30,
3
+ "dither": 0.0,
4
+ "feature_extractor_type": "WhisperFeatureExtractor",
5
+ "feature_size": 128,
6
+ "hop_length": 160,
7
+ "n_fft": 400,
8
+ "n_samples": 480000,
9
+ "nb_max_frames": 3000,
10
+ "padding_side": "right",
11
+ "padding_value": 0.0,
12
+ "processor_class": "Qwen3ASRProcessor",
13
+ "return_attention_mask": true
14
+ }
Qwen3-ForcedAligner-0.6B/source.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://huggingface.co/Qwen/Qwen3-ForcedAligner-0.6B
Qwen3-ForcedAligner-0.6B/tokenizer_config.json ADDED
@@ -0,0 +1,557 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ },
213
+ "151669": {
214
+ "content": "<|audio_start|>",
215
+ "lstrip": false,
216
+ "normalized": false,
217
+ "rstrip": false,
218
+ "single_word": false,
219
+ "special": true
220
+ },
221
+ "151670": {
222
+ "content": "<|audio_end|>",
223
+ "lstrip": false,
224
+ "normalized": false,
225
+ "rstrip": false,
226
+ "single_word": false,
227
+ "special": true
228
+ },
229
+ "151671": {
230
+ "content": "<tts_pad>",
231
+ "lstrip": false,
232
+ "normalized": false,
233
+ "rstrip": false,
234
+ "single_word": false,
235
+ "special": true
236
+ },
237
+ "151672": {
238
+ "content": "<tts_text_bos>",
239
+ "lstrip": false,
240
+ "normalized": false,
241
+ "rstrip": false,
242
+ "single_word": false,
243
+ "special": true
244
+ },
245
+ "151673": {
246
+ "content": "<tts_text_eod>",
247
+ "lstrip": false,
248
+ "normalized": false,
249
+ "rstrip": false,
250
+ "single_word": false,
251
+ "special": true
252
+ },
253
+ "151674": {
254
+ "content": "<tts_text_bos_single>",
255
+ "lstrip": false,
256
+ "normalized": false,
257
+ "rstrip": false,
258
+ "single_word": false,
259
+ "special": true
260
+ },
261
+ "151675": {
262
+ "content": "<non_speech>",
263
+ "lstrip": false,
264
+ "normalized": false,
265
+ "rstrip": false,
266
+ "single_word": false,
267
+ "special": false
268
+ },
269
+ "151676": {
270
+ "content": "<|audio_pad|>",
271
+ "lstrip": false,
272
+ "normalized": false,
273
+ "rstrip": false,
274
+ "single_word": false,
275
+ "special": true
276
+ },
277
+ "151677": {
278
+ "content": "<blank1>",
279
+ "lstrip": false,
280
+ "normalized": false,
281
+ "rstrip": false,
282
+ "single_word": false,
283
+ "special": true
284
+ },
285
+ "151678": {
286
+ "content": "<blank2>",
287
+ "lstrip": false,
288
+ "normalized": false,
289
+ "rstrip": false,
290
+ "single_word": false,
291
+ "special": true
292
+ },
293
+ "151679": {
294
+ "content": "<blank3>",
295
+ "lstrip": false,
296
+ "normalized": false,
297
+ "rstrip": false,
298
+ "single_word": false,
299
+ "special": true
300
+ },
301
+ "151680": {
302
+ "content": "<blank4>",
303
+ "lstrip": false,
304
+ "normalized": false,
305
+ "rstrip": false,
306
+ "single_word": false,
307
+ "special": true
308
+ },
309
+ "151681": {
310
+ "content": "<blank5>",
311
+ "lstrip": false,
312
+ "normalized": false,
313
+ "rstrip": false,
314
+ "single_word": false,
315
+ "special": true
316
+ },
317
+ "151682": {
318
+ "content": "<blank6>",
319
+ "lstrip": false,
320
+ "normalized": false,
321
+ "rstrip": false,
322
+ "single_word": false,
323
+ "special": true
324
+ },
325
+ "151683": {
326
+ "content": "<blank7>",
327
+ "lstrip": false,
328
+ "normalized": false,
329
+ "rstrip": false,
330
+ "single_word": false,
331
+ "special": true
332
+ },
333
+ "151684": {
334
+ "content": "<blank8>",
335
+ "lstrip": false,
336
+ "normalized": false,
337
+ "rstrip": false,
338
+ "single_word": false,
339
+ "special": true
340
+ },
341
+ "151685": {
342
+ "content": "<blank9>",
343
+ "lstrip": false,
344
+ "normalized": false,
345
+ "rstrip": false,
346
+ "single_word": false,
347
+ "special": true
348
+ },
349
+ "151686": {
350
+ "content": "<blank10>",
351
+ "lstrip": false,
352
+ "normalized": false,
353
+ "rstrip": false,
354
+ "single_word": false,
355
+ "special": true
356
+ },
357
+ "151687": {
358
+ "content": "<blank11>",
359
+ "lstrip": false,
360
+ "normalized": false,
361
+ "rstrip": false,
362
+ "single_word": false,
363
+ "special": true
364
+ },
365
+ "151688": {
366
+ "content": "<blank12>",
367
+ "lstrip": false,
368
+ "normalized": false,
369
+ "rstrip": false,
370
+ "single_word": false,
371
+ "special": true
372
+ },
373
+ "151689": {
374
+ "content": "<blank13>",
375
+ "lstrip": false,
376
+ "normalized": false,
377
+ "rstrip": false,
378
+ "single_word": false,
379
+ "special": true
380
+ },
381
+ "151690": {
382
+ "content": "<blank14>",
383
+ "lstrip": false,
384
+ "normalized": false,
385
+ "rstrip": false,
386
+ "single_word": false,
387
+ "special": true
388
+ },
389
+ "151691": {
390
+ "content": "<blank15>",
391
+ "lstrip": false,
392
+ "normalized": false,
393
+ "rstrip": false,
394
+ "single_word": false,
395
+ "special": true
396
+ },
397
+ "151692": {
398
+ "content": "<blank16>",
399
+ "lstrip": false,
400
+ "normalized": false,
401
+ "rstrip": false,
402
+ "single_word": false,
403
+ "special": true
404
+ },
405
+ "151693": {
406
+ "content": "<blank17>",
407
+ "lstrip": false,
408
+ "normalized": false,
409
+ "rstrip": false,
410
+ "single_word": false,
411
+ "special": true
412
+ },
413
+ "151694": {
414
+ "content": "<blank18>",
415
+ "lstrip": false,
416
+ "normalized": false,
417
+ "rstrip": false,
418
+ "single_word": false,
419
+ "special": true
420
+ },
421
+ "151695": {
422
+ "content": "<blank19>",
423
+ "lstrip": false,
424
+ "normalized": false,
425
+ "rstrip": false,
426
+ "single_word": false,
427
+ "special": true
428
+ },
429
+ "151696": {
430
+ "content": "<blank20>",
431
+ "lstrip": false,
432
+ "normalized": false,
433
+ "rstrip": false,
434
+ "single_word": false,
435
+ "special": true
436
+ },
437
+ "151697": {
438
+ "content": "<blank21>",
439
+ "lstrip": false,
440
+ "normalized": false,
441
+ "rstrip": false,
442
+ "single_word": false,
443
+ "special": true
444
+ },
445
+ "151698": {
446
+ "content": "<blank22>",
447
+ "lstrip": false,
448
+ "normalized": false,
449
+ "rstrip": false,
450
+ "single_word": false,
451
+ "special": true
452
+ },
453
+ "151699": {
454
+ "content": "<blank23>",
455
+ "lstrip": false,
456
+ "normalized": false,
457
+ "rstrip": false,
458
+ "single_word": false,
459
+ "special": true
460
+ },
461
+ "151700": {
462
+ "content": "<blank24>",
463
+ "lstrip": false,
464
+ "normalized": false,
465
+ "rstrip": false,
466
+ "single_word": false,
467
+ "special": true
468
+ },
469
+ "151701": {
470
+ "content": "<blank25>",
471
+ "lstrip": false,
472
+ "normalized": false,
473
+ "rstrip": false,
474
+ "single_word": false,
475
+ "special": true
476
+ },
477
+ "151702": {
478
+ "content": "<blank26>",
479
+ "lstrip": false,
480
+ "normalized": false,
481
+ "rstrip": false,
482
+ "single_word": false,
483
+ "special": true
484
+ },
485
+ "151703": {
486
+ "content": "<blank27>",
487
+ "lstrip": false,
488
+ "normalized": false,
489
+ "rstrip": false,
490
+ "single_word": false,
491
+ "special": true
492
+ },
493
+ "151704": {
494
+ "content": "<asr_text>",
495
+ "lstrip": false,
496
+ "normalized": false,
497
+ "rstrip": false,
498
+ "single_word": false,
499
+ "special": false
500
+ },
501
+ "151705": {
502
+ "content": "<timestamp>",
503
+ "lstrip": false,
504
+ "normalized": false,
505
+ "rstrip": false,
506
+ "single_word": false,
507
+ "special": false
508
+ }
509
+ },
510
+ "additional_special_tokens": [
511
+ "<|im_start|>",
512
+ "<|im_end|>",
513
+ "<|object_ref_start|>",
514
+ "<|object_ref_end|>",
515
+ "<|box_start|>",
516
+ "<|box_end|>",
517
+ "<|quad_start|>",
518
+ "<|quad_end|>",
519
+ "<|vision_start|>",
520
+ "<|vision_end|>",
521
+ "<|vision_pad|>",
522
+ "<|image_pad|>",
523
+ "<|video_pad|>",
524
+ "<|audio_start|>",
525
+ "<|audio_end|>",
526
+ "<tts_pad>",
527
+ "<tts_text_bos>",
528
+ "<tts_text_bos_single>",
529
+ "<|audio_pad|>"
530
+ ],
531
+ "audio_bos_token": "<|audio_start|>",
532
+ "audio_eos_token": "<|audio_end|>",
533
+ "audio_token": "<|audio_pad|>",
534
+ "bos_token": null,
535
+ "clean_up_tokenization_spaces": false,
536
+ "eos_token": "<|im_end|>",
537
+ "errors": "replace",
538
+ "extra_special_tokens": {
539
+ "audio_bos_token": "<|audio_start|>",
540
+ "audio_eos_token": "<|audio_end|>",
541
+ "audio_token": "<|audio_pad|>",
542
+ "image_token": "<|image_pad|>",
543
+ "video_token": "<|video_pad|>",
544
+ "vision_bos_token": "<|vision_start|>",
545
+ "vision_eos_token": "<|vision_end|>"
546
+ },
547
+ "image_token": "<|image_pad|>",
548
+ "model_max_length": 131072,
549
+ "pad_token": "<|endoftext|>",
550
+ "processor_class": "Qwen3ASRProcessor",
551
+ "split_special_tokens": false,
552
+ "tokenizer_class": "Qwen2Tokenizer",
553
+ "unk_token": null,
554
+ "video_token": "<|video_pad|>",
555
+ "vision_bos_token": "<|vision_start|>",
556
+ "vision_eos_token": "<|vision_end|>"
557
+ }
Qwen3-ForcedAligner-0.6B/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
chatllm_quantized_qwen3/.gitattributes ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+
4
+
5
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.model filter=lfs diff=lfs merge=lfs -text
12
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
13
+ *.onnx filter=lfs diff=lfs merge=lfs -text
14
+ *.ot filter=lfs diff=lfs merge=lfs -text
15
+ *.parquet filter=lfs diff=lfs merge=lfs -text
16
+ *.pb filter=lfs diff=lfs merge=lfs -text
17
+ *.pt filter=lfs diff=lfs merge=lfs -text
18
+ *.pth filter=lfs diff=lfs merge=lfs -text
19
+ *.rar filter=lfs diff=lfs merge=lfs -text
20
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
21
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
22
+ *.tflite filter=lfs diff=lfs merge=lfs -text
23
+ *.tgz filter=lfs diff=lfs merge=lfs -text
24
+ *.xz filter=lfs diff=lfs merge=lfs -text
25
+ *.zip filter=lfs diff=lfs merge=lfs -text
26
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
27
+ *.tfevents* filter=lfs diff=lfs merge=lfs -text
28
+ *.db* filter=lfs diff=lfs merge=lfs -text
29
+ *.ark* filter=lfs diff=lfs merge=lfs -text
30
+ **/*ckpt*data* filter=lfs diff=lfs merge=lfs -text
31
+ **/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text
32
+ **/*ckpt*.index filter=lfs diff=lfs merge=lfs -text
33
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
34
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
35
+ *.gguf* filter=lfs diff=lfs merge=lfs -text
36
+ *.ggml filter=lfs diff=lfs merge=lfs -text
37
+ *.llamafile* filter=lfs diff=lfs merge=lfs -text
38
+ *.pt2 filter=lfs diff=lfs merge=lfs -text
39
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
40
+ *.npy filter=lfs diff=lfs merge=lfs -text
41
+ *.npz filter=lfs diff=lfs merge=lfs -text
42
+ *.pickle filter=lfs diff=lfs merge=lfs -text
43
+ *.pkl filter=lfs diff=lfs merge=lfs -text
44
+ *.tar filter=lfs diff=lfs merge=lfs -text
45
+ *.wasm filter=lfs diff=lfs merge=lfs -text
46
+ *.zst filter=lfs diff=lfs merge=lfs -text
47
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
48
+
49
+ qwen3-vl-4b-it.bin filter=lfs diff=lfs merge=lfs -text
50
+
51
+ qwen3-vl-2b-it.bin filter=lfs diff=lfs merge=lfs -text
52
+
53
+ qwen3-vl-a3b-it.bin filter=lfs diff=lfs merge=lfs -text
54
+
55
+ qwen3-vl-a3b-it-q4_1.bin filter=lfs diff=lfs merge=lfs -text
56
+
57
+ mai-ui-2b.bin filter=lfs diff=lfs merge=lfs -text
58
+
59
+ qwen3-vl-a3b-thinking-q4_1.bin filter=lfs diff=lfs merge=lfs -text
60
+
61
+ qwen3-vl-a3b-thinking.bin filter=lfs diff=lfs merge=lfs -text
62
+
63
+ qwen3-vl-emb-2b.bin filter=lfs diff=lfs merge=lfs -text
64
+
65
+ qwen3-vl-reranker-2b.bin filter=lfs diff=lfs merge=lfs -text
66
+
67
+ qwen3-vl-8b-it.bin filter=lfs diff=lfs merge=lfs -text
68
+
69
+ qwen3-asr-0.6b.bin filter=lfs diff=lfs merge=lfs -text
70
+
71
+ qwen3-asr-1.7b.bin filter=lfs diff=lfs merge=lfs -text
72
+
73
+ qwen3-focedaligner-0.6b.bin filter=lfs diff=lfs merge=lfs -text
74
+
75
+ qwen3-focedaligner-0.6b-f16.bin filter=lfs diff=lfs merge=lfs -text
76
+
77
+ qwen3-tts-12hz-0.6b-base.bin filter=lfs diff=lfs merge=lfs -text
78
+
79
+ qwen3-tts-12hz-0.6b-customvoice.bin filter=lfs diff=lfs merge=lfs -text
80
+
81
+ qwen3-tts-12hz-1.7b-base.bin filter=lfs diff=lfs merge=lfs -text
82
+
83
+ qwen3-tts-12hz-1.7b-customvoice.bin filter=lfs diff=lfs merge=lfs -text
84
+
85
+ qwen3-tts-12hz-1.7b-voicedesign.bin filter=lfs diff=lfs merge=lfs -text
86
+
87
+ qwen3-tts-12hz-0.6b-base-f16.bin filter=lfs diff=lfs merge=lfs -text
88
+
89
+ qwen3-tts-12hz-0.6b-customvoice-f16.bin filter=lfs diff=lfs merge=lfs -text
90
+
91
+ qwen3-tts-12hz-1.7b-base-f16.bin filter=lfs diff=lfs merge=lfs -text
92
+
93
+ qwen3-tts-12hz-1.7b-customvoice-f16.bin filter=lfs diff=lfs merge=lfs -text
chatllm_quantized_qwen3/README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ frameworks:
3
+ - other
4
+ license: other
5
+ tasks:
6
+ - text-generation
7
+ ---
8
+
9
+ 这里包含一系列预先量化的 LLM,可供 [ChatLLM.cpp](https://github.com/foldl/chatllm.cpp) 直接使用。
10
+
11
+ **注意**:这里出于研究、学习之目的提供量化模型,每种模型的使用请遵循相应的用户协议。
12
+
13
+
chatllm_quantized_qwen3/mai-ui-2b.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1c2b94e99f3abc43613f91dc0ffbc80aab880abe4e2e5b8ce45f103ea3ef81b
3
+ size 2267737776
chatllm_quantized_qwen3/qwen3-asr-0.6b.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:56af60844e85e812641c4c9370af669fdfe4888ca41f93272d9db769efca9afe
3
+ size 1005656864
chatllm_quantized_qwen3/qwen3-asr-1.7b.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6400749342f9ed5ce2f5fabdd0332441ff118a81d6b9d195d462b668d0d017a
3
+ size 2505570400
chatllm_quantized_qwen3/qwen3-focedaligner-0.6b-f16.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:272a24687e3f92cf60daa5a332991176e08feef1bd79b0626e626d49fe23f604
3
+ size 1840552064
chatllm_quantized_qwen3/qwen3-focedaligner-0.6b.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bfccb1316a7bc890660fc5d1c8bbcc6477b05fb0ca13417aa65a1443f85089d5
3
+ size 984439424
chatllm_quantized_qwen3/source.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://modelscope.cn/models/judd2024/chatllm_quantized_qwen3
qwen3-asr-0.6b-f16/.gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ qwen3-asr-0.6b-f16.gguf filter=lfs diff=lfs merge=lfs -text
qwen3-asr-0.6b-f16/README.md ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
3
+ language:
4
+ - zh
5
+ - en
6
+ - yue
7
+ - ar
8
+ - de
9
+ - fr
10
+ - es
11
+ - pt
12
+ - id
13
+ - it
14
+ - ko
15
+ - ru
16
+ - th
17
+ - vi
18
+ - ja
19
+ - tr
20
+ - hi
21
+ - ms
22
+ - nl
23
+ - sv
24
+ - da
25
+ - fi
26
+ - pl
27
+ - cs
28
+ - fil
29
+ - fa
30
+ - el
31
+ - hu
32
+ - mk
33
+ - ro
34
+ tags:
35
+ - audio
36
+ - speech
37
+ - automatic-speech-recognition
38
+ license: apache-2.0
39
+ ---
40
+
42
+
43
+ # OVOS - Qwen3 ASR 0.6B F16 (GGUF)
44
+
45
+ This model is a quantized gguf-format export of [Qwen/Qwen3-ASR-0.6B](https://huggingface.co/Qwen/Qwen3-ASR-0.6B)
46
+ for ease of use in edge devices and CPU-based inference environments.
47
+ The original model is transformed into gguf with F16 tensors by the script [convert_hf_to_gguf.py](https://github.com/femelo/qwen3-asr.cpp/blob/main/scripts/convert_hf_to_gguf.py)
48
+ and then further quantized, if needed, using the tool [quantize](https://github.com/femelo/qwen3-asr.cpp/blob/main/src/quantize.cpp) from the same repo.
49
+
50
+ # Requirements
51
+
52
+ The requirements can be installed as
53
+
54
+ ```bash
55
+ $ pip install git+https://github.com/femelo/py-qwen3-asr-cpp
56
+ ```
57
+
58
+ # Usage
59
+
60
+ ```python
61
+ from py_qwen3_asr_cpp.model import Qwen3ASRModel
62
+
63
+ # Initialize the model (it handles downloading from this repo)
64
+ model = Qwen3ASRModel(
65
+ asr_model="qwen3-asr-0.6b-f16",
66
+ n_threads=4
67
+ )
68
+
69
+ # Transcribe from file
70
+ result = model.transcribe("audio.mp3")
71
+ print(f"Detected Language: {result.language}")
72
+ print(f"Transcription: {result.text}")
73
+ ```
74
+
75
+ Refer to [https://github.com/femelo/py-qwen3-asr-cpp](https://github.com/femelo/py-qwen3-asr-cpp) for more details.
76
+
77
+ # Licensing
78
+
79
+ The license is derived from the original model: Apache 2.0. For more details, please refer to [Qwen/Qwen3-ASR-0.6B](https://huggingface.co/Qwen/Qwen3-ASR-0.6B).
80
+
qwen3-asr-0.6b-f16/languages.txt ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Chinese + 22 Chinese dialects
2
+ English
3
+ Yue Chinese
4
+ Arabic
5
+ German
6
+ French
7
+ Spanish
8
+ Portuguese
9
+ Indonesian
10
+ Italian
11
+ Korean
12
+ Russian
13
+ Thai
14
+ Vietnamese
15
+ Japanese
16
+ Turkish
17
+ Hindi
18
+ Malay
19
+ Dutch
20
+ Swedish
21
+ Danish
22
+ Finnish
23
+ Polish
24
+ Czech
25
+ Filipino
26
+ Persian
27
+ Greek
28
+ Hungarian
29
+ Macedonian
30
+ Romanian
qwen3-asr-0.6b-f16/qwen3-asr-0.6b-f16.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8fbc2a779f45002e912fb716b031287659bf6d9036dd78447a3cabc06fe2f43
3
+ size 1882543424
qwen3-asr-0.6b-f16/source.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://huggingface.co/OpenVoiceOS/qwen3-asr-0.6b-f16
qwen3-forced-aligner-0.6b-f16/.gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ qwen3-forced-aligner-0.6b-f16.gguf filter=lfs diff=lfs merge=lfs -text
qwen3-forced-aligner-0.6b-f16/README.md ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
3
+ language:
4
+ - zh
5
+ - en
6
+ - yue
7
+ - ar
8
+ - de
9
+ - fr
10
+ - es
11
+ - pt
12
+ - id
13
+ - it
14
+ - ko
15
+ - ru
16
+ - th
17
+ - vi
18
+ - ja
19
+ - tr
20
+ - hi
21
+ - ms
22
+ - nl
23
+ - sv
24
+ - da
25
+ - fi
26
+ - pl
27
+ - cs
28
+ - fil
29
+ - fa
30
+ - el
31
+ - hu
32
+ - mk
33
+ - ro
34
+ tags:
35
+ - audio
36
+ - speech
37
+ - automatic-speech-recognition
38
+ license: apache-2.0
39
+ ---
40
+
42
+
43
+ # OVOS - Qwen3 Forced Aligner 0.6B F16 (GGUF)
44
+
45
+ This model is a quantized gguf-format export of [Qwen/Qwen3-ForcedAligner-0.6B](https://huggingface.co/Qwen/Qwen3-ForcedAligner-0.6B)
46
+ for ease of use in edge devices and CPU-based inference environments.
47
+ The original model is transformed into gguf with F16 tensors by the script [convert_hf_to_gguf.py](https://github.com/femelo/qwen3-asr.cpp/blob/main/scripts/convert_hf_to_gguf.py)
48
+ and then further quantized, if needed, using the tool [quantize](https://github.com/femelo/qwen3-asr.cpp/blob/main/src/quantize.cpp) from the same repo.
49
+
50
+ # Requirements
51
+
52
+ The requirements can be installed as
53
+
54
+ ```bash
55
+ $ pip install git+https://github.com/femelo/py-qwen3-asr-cpp
56
+ ```
57
+
58
+ # Usage
59
+
60
+ ```python
61
+ from py_qwen3_asr_cpp.model import Qwen3ASRModel
62
+
63
+ # Initialize the model (it handles downloading from this repo)
64
+ model = Qwen3ASRModel(
65
+ asr_model="qwen3-asr-0.6b-f16",
66
+ align_model="qwen3-forced-aligner-0.6b-f16",
67
+ n_threads=4
68
+ )
69
+
70
+ # Transcribe from file
71
+ result, alignment = model.transcribe_and_align("audio.mp3")
72
+ print(f"Detected Language: {result.language}")
73
+ print(f"Transcription: {result.text}")
74
+ ```
75
+
76
+ Refer to [https://github.com/femelo/py-qwen3-asr-cpp](https://github.com/femelo/py-qwen3-asr-cpp) for more details.
77
+
78
+ # Licensing
79
+
80
+ The license is derived from the original model: Apache 2.0. For more details, please refer to [Qwen/Qwen3-ForcedAligner-0.6B](https://huggingface.co/Qwen/Qwen3-ForcedAligner-0.6B).
81
+
qwen3-forced-aligner-0.6b-f16/languages.txt ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Chinese + 22 Chinese dialects
2
+ English
3
+ Yue Chinese
4
+ Arabic
5
+ German
6
+ French
7
+ Spanish
8
+ Portuguese
9
+ Indonesian
10
+ Italian
11
+ Korean
12
+ Russian
13
+ Thai
14
+ Vietnamese
15
+ Japanese
16
+ Turkish
17
+ Hindi
18
+ Malay
19
+ Dutch
20
+ Swedish
21
+ Danish
22
+ Finnish
23
+ Polish
24
+ Czech
25
+ Filipino
26
+ Persian
27
+ Greek
28
+ Hungarian
29
+ Macedonian
30
+ Romanian
qwen3-forced-aligner-0.6b-f16/qwen3-forced-aligner-0.6b-f16.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a70b52245d6554260bba88c17a6ff090ac60476506e4374eb4ce015ab156abab
3
+ size 1842214208
qwen3-forced-aligner-0.6b-f16/source.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://huggingface.co/OpenVoiceOS/qwen3-forced-aligner-0.6b-f16
qwen3-forced-aligner-0.6b-q4-k-m/.gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ qwen3-forced-aligner-0.6b-q4_k_m.gguf filter=lfs diff=lfs merge=lfs -text
qwen3-forced-aligner-0.6b-q4-k-m/README.md ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
3
+ language:
4
+ - zh
5
+ - en
6
+ - yue
7
+ - ar
8
+ - de
9
+ - fr
10
+ - es
11
+ - pt
12
+ - id
13
+ - it
14
+ - ko
15
+ - ru
16
+ - th
17
+ - vi
18
+ - ja
19
+ - tr
20
+ - hi
21
+ - ms
22
+ - nl
23
+ - sv
24
+ - da
25
+ - fi
26
+ - pl
27
+ - cs
28
+ - fil
29
+ - fa
30
+ - el
31
+ - hu
32
+ - mk
33
+ - ro
34
+ tags:
35
+ - audio
36
+ - speech
37
+ - automatic-speech-recognition
38
+ license: apache-2.0
39
+ ---
40
+
42
+
43
+ # OVOS - Qwen3 Forced Aligner 0.6B Q4_K_M (GGUF)
44
+
45
+ This model is a quantized gguf-format export of [Qwen/Qwen3-ForcedAligner-0.6B](https://huggingface.co/Qwen/Qwen3-ForcedAligner-0.6B)
46
+ for ease of use in edge devices and CPU-based inference environments.
47
+ The original model is transformed into gguf with F16 tensors by the script [convert_hf_to_gguf.py](https://github.com/femelo/qwen3-asr.cpp/blob/main/scripts/convert_hf_to_gguf.py)
48
+ and then further quantized, if needed, using the tool [quantize](https://github.com/femelo/qwen3-asr.cpp/blob/main/src/quantize.cpp) from the same repo.
49
+
50
+ # Requirements
51
+
52
+ The requirements can be installed as
53
+
54
+ ```bash
55
+ $ pip install git+https://github.com/femelo/py-qwen3-asr-cpp
56
+ ```
57
+
58
+ # Usage
59
+
60
+ ```python
61
+ from py_qwen3_asr_cpp.model import Qwen3ASRModel
62
+
63
+ # Initialize the model (it handles downloading from this repo)
64
+ model = Qwen3ASRModel(
65
+ asr_model="qwen3-asr-0.6b-q4-k-m",
66
+ align_model="qwen3-forced-aligner-0.6b-q4-k-m",
67
+ n_threads=4
68
+ )
69
+
70
+ # Transcribe from file
71
+ result, alignment = model.transcribe_and_align("audio.mp3")
72
+ print(f"Detected Language: {result.language}")
73
+ print(f"Transcription: {result.text}")
74
+ ```
75
+
76
+ Refer to [https://github.com/femelo/py-qwen3-asr-cpp](https://github.com/femelo/py-qwen3-asr-cpp) for more details.
77
+
78
+ # Licensing
79
+
80
+ The license is derived from the original model: Apache 2.0. For more details, please refer to [Qwen/Qwen3-ForcedAligner-0.6B](https://huggingface.co/Qwen/Qwen3-ForcedAligner-0.6B).
81
+
qwen3-forced-aligner-0.6b-q4-k-m/languages.txt ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Chinese + 22 Chinese dialects
2
+ English
3
+ Yue Chinese
4
+ Arabic
5
+ German
6
+ French
7
+ Spanish
8
+ Portuguese
9
+ Indonesian
10
+ Italian
11
+ Korean
12
+ Russian
13
+ Thai
14
+ Vietnamese
15
+ Japanese
16
+ Turkish
17
+ Hindi
18
+ Malay
19
+ Dutch
20
+ Swedish
21
+ Danish
22
+ Finnish
23
+ Polish
24
+ Czech
25
+ Filipino
26
+ Persian
27
+ Greek
28
+ Hungarian
29
+ Macedonian
30
+ Romanian
qwen3-forced-aligner-0.6b-q4-k-m/qwen3-forced-aligner-0.6b-q4_k_m.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:542687c8ddb39f9f6510dd7db99697f70ed1148cd3cd7ddbae22097cce73dd6e
3
+ size 615667968
qwen3-forced-aligner-0.6b-q4-k-m/source.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://huggingface.co/OpenVoiceOS/qwen3-forced-aligner-0.6b-q4-k-m
qwen3-forced-aligner-0.6b-q5-k-m/.gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ qwen3-forced-aligner-0.6b-q5_k_m.gguf filter=lfs diff=lfs merge=lfs -text
qwen3-forced-aligner-0.6b-q5-k-m/README.md ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
3
+ language:
4
+ - zh
5
+ - en
6
+ - yue
7
+ - ar
8
+ - de
9
+ - fr
10
+ - es
11
+ - pt
12
+ - id
13
+ - it
14
+ - ko
15
+ - ru
16
+ - th
17
+ - vi
18
+ - ja
19
+ - tr
20
+ - hi
21
+ - ms
22
+ - nl
23
+ - sv
24
+ - da
25
+ - fi
26
+ - pl
27
+ - cs
28
+ - fil
29
+ - fa
30
+ - el
31
+ - hu
32
+ - mk
33
+ - ro
34
+ tags:
35
+ - audio
36
+ - speech
37
+ - automatic-speech-recognition
38
+ license: apache-2.0
39
+ ---
40
+
42
+
43
+ # OVOS - Qwen3 Forced Aligner 0.6B Q5_K_M (GGUF)
44
+
45
+ This model is a quantized gguf-format export of [Qwen/Qwen3-ForcedAligner-0.6B](https://huggingface.co/Qwen/Qwen3-ForcedAligner-0.6B)
46
+ for ease of use in edge devices and CPU-based inference environments.
47
+ The original model is transformed into gguf with F16 tensors by the script [convert_hf_to_gguf.py](https://github.com/femelo/qwen3-asr.cpp/blob/main/scripts/convert_hf_to_gguf.py)
48
+ and then further quantized, if needed, using the tool [quantize](https://github.com/femelo/qwen3-asr.cpp/blob/main/src/quantize.cpp) from the same repo.
49
+
50
+ # Requirements
51
+
52
+ The requirements can be installed as
53
+
54
+ ```bash
55
+ $ pip install git+https://github.com/femelo/py-qwen3-asr-cpp
56
+ ```
57
+
58
+ # Usage
59
+
60
+ ```python
61
+ from py_qwen3_asr_cpp.model import Qwen3ASRModel
62
+
63
+ # Initialize the model (it handles downloading from this repo)
64
+ model = Qwen3ASRModel(
65
+ asr_model="qwen3-asr-0.6b-q5-k-m",
66
+ align_model="qwen3-forced-aligner-0.6b-q5-k-m",
67
+ n_threads=4
68
+ )
69
+
70
+ # Transcribe from file
71
+ result, alignment = model.transcribe_and_align("audio.mp3")
72
+ print(f"Detected Language: {result.language}")
73
+ print(f"Transcription: {result.text}")
74
+ ```
75
+
76
+ Refer to [https://github.com/femelo/py-qwen3-asr-cpp](https://github.com/femelo/py-qwen3-asr-cpp) for more details.
77
+
78
+ # Licensing
79
+
80
+ The license is derived from the original model: Apache 2.0. For more details, please refer to [Qwen/Qwen3-ForcedAligner-0.6B](https://huggingface.co/Qwen/Qwen3-ForcedAligner-0.6B).
81
+
qwen3-forced-aligner-0.6b-q5-k-m/languages.txt ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Chinese + 22 Chinese dialects
2
+ English
3
+ Yue Chinese
4
+ Arabic
5
+ German
6
+ French
7
+ Spanish
8
+ Portuguese
9
+ Indonesian
10
+ Italian
11
+ Korean
12
+ Russian
13
+ Thai
14
+ Vietnamese
15
+ Japanese
16
+ Turkish
17
+ Hindi
18
+ Malay
19
+ Dutch
20
+ Swedish
21
+ Danish
22
+ Finnish
23
+ Polish
24
+ Czech
25
+ Filipino
26
+ Persian
27
+ Greek
28
+ Hungarian
29
+ Macedonian
30
+ Romanian
qwen3-forced-aligner-0.6b-q5-k-m/qwen3-forced-aligner-0.6b-q5_k_m.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c40ef91407268fa1a07ffb3df32d4f3bc2eb01c694fa533ba06b27279f0dbbed
3
+ size 710352128
qwen3-forced-aligner-0.6b-q5-k-m/source.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://huggingface.co/OpenVoiceOS/qwen3-forced-aligner-0.6b-q5-k-m
qwen3-forced-aligner-0.6b-q8-0/.gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ qwen3-forced-aligner-0.6b-q8_0.gguf filter=lfs diff=lfs merge=lfs -text
qwen3-forced-aligner-0.6b-q8-0/README.md ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
3
+ language:
4
+ - zh
5
+ - en
6
+ - yue
7
+ - ar
8
+ - de
9
+ - fr
10
+ - es
11
+ - pt
12
+ - id
13
+ - it
14
+ - ko
15
+ - ru
16
+ - th
17
+ - vi
18
+ - ja
19
+ - tr
20
+ - hi
21
+ - ms
22
+ - nl
23
+ - sv
24
+ - da
25
+ - fi
26
+ - pl
27
+ - cs
28
+ - fil
29
+ - fa
30
+ - el
31
+ - hu
32
+ - mk
33
+ - ro
34
+ tags:
35
+ - audio
36
+ - speech
37
+ - automatic-speech-recognition
38
+ license: apache-2.0
39
+ ---
40
+
42
+
43
+ # OVOS - Qwen3 Forced Aligner 0.6B Q8_0 (GGUF)
44
+
45
+ This model is a quantized gguf-format export of [Qwen/Qwen3-ForcedAligner-0.6B](https://huggingface.co/Qwen/Qwen3-ForcedAligner-0.6B)
46
+ for ease of use in edge devices and CPU-based inference environments.
47
+ The original model is transformed into gguf with F16 tensors by the script [convert_hf_to_gguf.py](https://github.com/femelo/qwen3-asr.cpp/blob/main/scripts/convert_hf_to_gguf.py)
48
+ and then further quantized, if needed, using the tool [quantize](https://github.com/femelo/qwen3-asr.cpp/blob/main/src/quantize.cpp) from the same repo.
49
+
50
+ # Requirements
51
+
52
+ The requirements can be installed as
53
+
54
+ ```bash
55
+ $ pip install git+https://github.com/femelo/py-qwen3-asr-cpp
56
+ ```
57
+
58
+ # Usage
59
+
60
+ ```python
61
+ from py_qwen3_asr_cpp.model import Qwen3ASRModel
62
+
63
+ # Initialize the model (it handles downloading from this repo)
64
+ model = Qwen3ASRModel(
65
+ asr_model="qwen3-asr-0.6b-q8-0",
66
+ align_model="qwen3-forced-aligner-0.6b-q8-0",
67
+ n_threads=4
68
+ )
69
+
70
+ # Transcribe from file
71
+ result, alignment = model.transcribe_and_align("audio.mp3")
72
+ print(f"Detected Language: {result.language}")
73
+ print(f"Transcription: {result.text}")
74
+ ```
75
+
76
+ Refer to [https://github.com/femelo/py-qwen3-asr-cpp](https://github.com/femelo/py-qwen3-asr-cpp) for more details.
77
+
78
+ # Licensing
79
+
80
+ The license is derived from the original model: Apache 2.0. For more details, please refer to [Qwen/Qwen3-ForcedAligner-0.6B](https://huggingface.co/Qwen/Qwen3-ForcedAligner-0.6B).
81
+
qwen3-forced-aligner-0.6b-q8-0/languages.txt ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Chinese + 22 Chinese dialects
2
+ English
3
+ Yue Chinese
4
+ Arabic
5
+ German
6
+ French
7
+ Spanish
8
+ Portuguese
9
+ Indonesian
10
+ Italian
11
+ Korean
12
+ Russian
13
+ Thai
14
+ Vietnamese
15
+ Japanese
16
+ Turkish
17
+ Hindi
18
+ Malay
19
+ Dutch
20
+ Swedish
21
+ Danish
22
+ Finnish
23
+ Polish
24
+ Czech
25
+ Filipino
26
+ Persian
27
+ Greek
28
+ Hungarian
29
+ Macedonian
30
+ Romanian
qwen3-forced-aligner-0.6b-q8-0/qwen3-forced-aligner-0.6b-q8_0.gguf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5de69a8cfc49c95a6520f50f2f15cfce9af35bc4723a10c56fc64e51dc966b3a
3
+ size 994404608
qwen3-forced-aligner-0.6b-q8-0/source.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://huggingface.co/OpenVoiceOS/qwen3-forced-aligner-0.6b-q8-0