wli1995 commited on
Commit
a1cdf44
·
verified ·
1 Parent(s): e437b51

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +8 -0
  2. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/model.embed_tokens.weight.bfloat16.bin +3 -0
  3. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l0_together.axmodel +3 -0
  4. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l10_together.axmodel +3 -0
  5. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l11_together.axmodel +3 -0
  6. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l12_together.axmodel +3 -0
  7. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l13_together.axmodel +3 -0
  8. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l14_together.axmodel +3 -0
  9. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l15_together.axmodel +3 -0
  10. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l16_together.axmodel +3 -0
  11. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l17_together.axmodel +3 -0
  12. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l18_together.axmodel +3 -0
  13. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l19_together.axmodel +3 -0
  14. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l1_together.axmodel +3 -0
  15. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l20_together.axmodel +3 -0
  16. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l21_together.axmodel +3 -0
  17. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l22_together.axmodel +3 -0
  18. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l23_together.axmodel +3 -0
  19. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l24_together.axmodel +3 -0
  20. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l25_together.axmodel +3 -0
  21. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l26_together.axmodel +3 -0
  22. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l27_together.axmodel +3 -0
  23. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l28_together.axmodel +3 -0
  24. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l29_together.axmodel +3 -0
  25. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l2_together.axmodel +3 -0
  26. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l30_together.axmodel +3 -0
  27. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l31_together.axmodel +3 -0
  28. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l32_together.axmodel +3 -0
  29. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l33_together.axmodel +3 -0
  30. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l34_together.axmodel +3 -0
  31. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l35_together.axmodel +3 -0
  32. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l3_together.axmodel +3 -0
  33. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l4_together.axmodel +3 -0
  34. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l5_together.axmodel +3 -0
  35. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l6_together.axmodel +3 -0
  36. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l7_together.axmodel +3 -0
  37. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l8_together.axmodel +3 -0
  38. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l9_together.axmodel +3 -0
  39. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_post.axmodel +3 -0
  40. README.md +245 -3
  41. config.json +0 -0
  42. main_api_ax650 +3 -0
  43. main_api_axcl_aarch64 +3 -0
  44. main_api_axcl_x86 +3 -0
  45. main_ax650 +3 -0
  46. main_axcl_aarch64 +3 -0
  47. main_axcl_x86 +3 -0
  48. post_config.json +14 -0
  49. qwen2.5_tokenizer/merges.txt +0 -0
  50. qwen2.5_tokenizer/tokenizer.json +0 -0
.gitattributes CHANGED
@@ -32,4 +32,12 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
32
  *.xz filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
 
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
32
  *.xz filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *.axmodel filter=lfs diff=lfs merge=lfs -text
36
  *tfevents* filter=lfs diff=lfs merge=lfs -text
37
+ main_ax650 filter=lfs diff=lfs merge=lfs -text
38
+ main_axcl_aarch64 filter=lfs diff=lfs merge=lfs -text
39
+ main_axcl_x86 filter=lfs diff=lfs merge=lfs -text
40
+ qwen3_tokenizer/tokenizer.json filter=lfs diff=lfs merge=lfs -text
41
+ main_api_ax650 filter=lfs diff=lfs merge=lfs -text
42
+ main_api_axcl_aarch64 filter=lfs diff=lfs merge=lfs -text
43
+ main_api_axcl_x86 filter=lfs diff=lfs merge=lfs -text
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/model.embed_tokens.weight.bfloat16.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eabe5625fc0575bf517c424041e9701c0fd521889e0f547c8522d2aa20e8c0f8
3
+ size 777912320
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l0_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6706bd04faa7987ac84fcedd44487c5dd0fd3e01b3c354a72d2158412cbe572
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l10_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8bc94bbed29a443bd92b07e6da311758e25fcd5024aa72d148a0535c6b8b78c8
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l11_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f823ae3bcbd505ee94403814877d46b1c9cd1cc20e4d9971d718c1f12f6436de
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l12_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aede41d1d2855c5f5e48a1f1ac73948a0e0611c6a791321e51258625e0e298d4
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l13_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:474f529221fc4d145fa5430ce075dd79bebe38d909a0a9ea5d012eb141aac967
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l14_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1c57875f9474d918c04fdb0dab4ab532a511494b0f4d3ed6c104a0ad07acef4
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l15_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3bdf22e8f1c85695ffb8d62f9c365357136f1e3f03050d14a17d947774f735a
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l16_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5890dae7fc86f8ceed0649616acab5b5d407a0b71939d4c4da1c9f9c5149948f
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l17_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3cc5a36b246584335aa3bef4bf72bd0cf480c26710c5879a7b6dd0542be0ee8d
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l18_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:31b56b453671afdd2ea10444371ac5e8d992dbcae38077e3653b96fdbb143e29
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l19_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0dd27c2bccbecd4198e812062b9a323df28d8d362be9e5d28cc4398f3cad08d5
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l1_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:851f6535c5a09b1e40ae95c8f39ff654d5af0a5dab72286164e80a192e377d75
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l20_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ccafe960507254b793eafed1e5253bc924bd91b1ac7ce7b1c4ee22583729b86a
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l21_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:750e3a04abb85e7a59c4ac2736f154083e92a92b6d0a2e0ac115e3847364d722
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l22_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cab38992c778edc588994cf9398c6ade0e15460f3dc44b591b7f602af0df45fe
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l23_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f82d042f14e24f73837d252c1f2eb68666cc63e41c552b31b9651e503a1bd9a1
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l24_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:43dc45f7579e0cf2272a283ae1370a8a2442a91b13f85e94ffe1a3789e653f0f
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l25_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c663ff33b1d455f73f3a4a1ca0b2278f7fd731a980e1072f9f4e5b7be002a5e9
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l26_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c99edb9aabc14cb053e269be031746801011cc7b2c15a1f7385d4229f5b8bbfa
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l27_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b15ecab8b8471ed7fbe170dd93ddec9584c591ba6bea6e7224ec1e76abbbd106
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l28_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2495c57afe2dc507a13076947138b2b4f8876082fd9808a99ca00f01ec8b968c
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l29_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:20480961c16012330bfa504c1a4e55553cffd15607516816b0e4c2103ec1ebc5
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l2_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e5a89ea1587149fac2bb1b1adf66c82433f068bac661690d0c2633240ab11757
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l30_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e26950f6520856fa7a7b15179acde9d7b7ac7290c4f95cfd1acf71ee37aaf640
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l31_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:78acc56d21871cd026b2d584d1e61431d45de4363ae29f5576887719f782069c
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l32_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52cedd1013577cab86a53258baaa62eeae99b2552264894f322eab41b66a7133
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l33_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6689879470e8d25a508f120f77c8c5ee657ca934c05024a8f593e94389c50b1b
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l34_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98f7b2ac543f85c9380c97f9805b5d572646d1466b8f40b1e916a5675fe6b7eb
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l35_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92da75168b8ea67f92df5d87f9d5e98d7ae33f3ecc27dd1d76053ef7319d368e
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l3_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a797f620572903f60bc64a6ecac74e795ce51ec9f4091fa80c3250d0c397de84
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l4_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb84b2848fe814b7a4e8113f5f47edeb7cdadb3904c9f14deaa08c5a91359372
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l5_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a41dd48f4a331e0046f3bc11d94aa35a1ca5ea2bc3e8f04ec6389011b6d4647
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l6_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ab2e9c6cc960225aaf8a19d2d16fc920d4c048b379c542ec1a4af1b8ea644dd
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l7_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:511e855a8de9857fb0962437f9520bbaab42e559d249cd93d22f1ee826f8f5c5
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l8_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:261747bb3e06f351084c9754f443da736917ab23e22a113c9baeb3312d759136
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l9_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:357cafbab182da8770c5ba00acae9d76499e2bb114594cf6c116eec8d983b411
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_post.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7508737ba0506611ead05d48adba0aeae10fd81679b62d221362dd1d8ac520f
3
+ size 424152083
README.md CHANGED
@@ -1,3 +1,245 @@
1
- ---
2
- license: bsd-3-clause
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ language:
4
+ - en
5
+ base_model:
6
+ - Qwen/Qwen3-4B
7
+ pipeline_tag: text-generation
8
+ tags:
9
+ - Qwen
10
+ - Qwen3
11
+ - Int8
12
+ ---
13
+
14
+
15
+ # Qwen3-4B-Int8
16
+
17
+ This version of Qwen3-4B-Int8 has been converted to run on the Axera NPU using **w8a16** quantization.
18
+
19
+ This model has been optimized with the following LoRA:
20
+
21
+ Compatible with Pulsar2 version: 4.2 (not released yet)
22
+
23
+ ## Convert tools links:
24
+
25
+ For those who are interested in model conversion, you can try to export the axmodel yourself from the original repo:
26
+ https://huggingface.co/Qwen/Qwen3-4B
27
+
28
+ [Pulsar2 Link, How to Convert LLM from Huggingface to axmodel](https://pulsar2-docs.readthedocs.io/en/latest/appendix/build_llm.html)
29
+
30
+ [AXera NPU LLM Runtime](https://github.com/AXERA-TECH/ax-llm)
31
+
32
+
33
+ ## Support Platform
34
+
35
+ - AX650
36
+ - [M4N-Dock(爱芯派Pro)](https://wiki.sipeed.com/hardware/zh/maixIV/m4ndock/m4ndock.html)
37
+ - [M.2 Accelerator card](https://axcl-docs.readthedocs.io/zh-cn/latest/doc_guide_hardware.html)
38
+
39
+ |Chips|w8a16|w4a16|
40
+ |--|--|--|
41
+ |AX650| 4.5 tokens/sec|TBD|
42
+
43
+ ## How to use
44
+
45
+ Download all files from this repository to the device
46
+
47
+ ```
48
+ root@ax650:/mnt/qtang/llm-test/qwen3-4b# tree -L 1
49
+ .
50
+ |-- config.json
51
+ |-- main_ax650
52
+ |-- main_axcl_aarch64
53
+ |-- main_axcl_x86
54
+ |-- post_config.json
55
+ |-- qwen2.5_tokenizer
56
+ |-- qwen3-4b-ax650
57
+ |-- qwen3_tokenizer
58
+ |-- qwen3_tokenizer_uid.py
59
+ |-- run_qwen3_4b_int8_ctx_ax650.sh
60
+ |-- run_qwen3_4b_int8_ctx_axcl_aarch64.sh
61
+ `-- run_qwen3_4b_int8_ctx_axcl_x86.sh
62
+
63
+ 3 directories, 9 files
64
+ root@ax650:/mnt/qtang/llm-test/qwen3-4b#
65
+
66
+ ```
67
+
68
+ #### Start the Tokenizer service
69
+
70
+ Install requirement
71
+
72
+ ```
73
+ pip install transformers jinja2
74
+ ```
75
+
76
+ ```
77
+ root@ax650:/mnt/qtang/llm-test/qwen3-4b# python3 qwen3_tokenizer_uid.py
78
+ None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
79
+ Server running at http://0.0.0.0:12345
80
+ ```
81
+
82
+ #### Inference with AX650 Host, such as M4N-Dock(爱芯派Pro) or AX650N DEMO Board
83
+
84
+ Open another terminal and run `run_qwen3_4b_int8_ctx_ax650.sh`
85
+
86
+ ```
87
+ root@ax650:/mnt/qtang/llm-test/qwen3-4b# ./run_qwen3_4b_int8_ctx_ax650.sh
88
+ [I][ Init][ 110]: LLM init start
89
+ [I][ Init][ 34]: connect http://127.0.0.1:12345 ok
90
+ [I][ Init][ 57]: uid: 6e90ff82-b9c9-42dc-8f61-081203389166
91
+ bos_id: -1, eos_id: 151645
92
+ 2% | █ | 1 / 39 [3.95s<153.89s, 0.25 count/s] tokenizer init ok
93
+ [I][ Init][ 26]: LLaMaEmbedSelector use mmap
94
+ 100% | ████████████████████████████████ | 39 / 39 [48.03s<48.03s, 0.81 count/s] init post axmodel ok,remain_cmm(5621 MB)
95
+ [I][ Init][ 188]: max_token_len : 2559
96
+ [I][ Init][ 193]: kv_cache_size : 1024, kv_cache_num: 2559
97
+ [I][ Init][ 201]: prefill_token_num : 128
98
+ [I][ Init][ 205]: grp: 1, prefill_max_token_num : 1
99
+ [I][ Init][ 205]: grp: 2, prefill_max_token_num : 256
100
+ [I][ Init][ 205]: grp: 3, prefill_max_token_num : 512
101
+ [I][ Init][ 205]: grp: 4, prefill_max_token_num : 1024
102
+ [I][ Init][ 205]: grp: 5, prefill_max_token_num : 1536
103
+ [I][ Init][ 205]: grp: 6, prefill_max_token_num : 2048
104
+ [I][ Init][ 209]: prefill_max_token_num : 2048
105
+ [I][ load_config][ 282]: load config:
106
+ {
107
+ "enable_repetition_penalty": false,
108
+ "enable_temperature": false,
109
+ "enable_top_k_sampling": true,
110
+ "enable_top_p_sampling": false,
111
+ "penalty_window": 20,
112
+ "repetition_penalty": 1.2,
113
+ "temperature": 0.9,
114
+ "top_k": 1,
115
+ "top_p": 0.8
116
+ }
117
+
118
+ [I][ Init][ 218]: LLM init ok
119
+ Type "q" to exit, Ctrl+c to stop current running
120
+ [I][ GenerateKVCachePrefill][ 270]: input token num : 21, prefill_split_num : 1 prefill_grpid : 2
121
+ [I][ GenerateKVCachePrefill][ 307]: input_num_token:21
122
+ [I][ main][ 230]: precompute_len: 21
123
+ [I][ main][ 231]: system_prompt: You are Qwen, created by Alibaba Cloud. You are a helpful assistant.
124
+ prompt >> 1+3=?
125
+ [I][ SetKVCache][ 530]: prefill_grpid:2 kv_cache_num:256 precompute_len:21 input_num_token:16
126
+ [I][ SetKVCache][ 533]: current prefill_max_token_num:1920
127
+ [I][ Run][ 659]: input token num : 16, prefill_split_num : 1
128
+ [I][ Run][ 685]: input_num_token:16
129
+ [I][ Run][ 808]: ttft: 1169.05 ms
130
+ <think>
131
+
132
+ </think>
133
+
134
+ 1 + 3 = 4
135
+
136
+ [N][ Run][ 922]: hit eos,avg 4.22 token/s
137
+
138
+ [I][ GetKVCache][ 499]: precompute_len:48, remaining:2000
139
+ prompt >> who are you?
140
+ [I][ SetKVCache][ 530]: prefill_grpid:2 kv_cache_num:256 precompute_len:48 input_num_token:16
141
+ [I][ SetKVCache][ 533]: current prefill_max_token_num:1920
142
+ [I][ Run][ 659]: input token num : 16, prefill_split_num : 1
143
+ [I][ Run][ 685]: input_num_token:16
144
+ [I][ Run][ 808]: ttft: 1168.56 ms
145
+ <think>
146
+
147
+ </think>
148
+
149
+ I am Qwen, a large-scale language model developed by Alibaba Cloud. I can answer questions, create content,
150
+ and help with a variety of tasks. How can I assist you today?
151
+
152
+ [N][ Run][ 922]: hit eos,avg 4.22 token/s
153
+
154
+ [I][ GetKVCache][ 499]: precompute_len:106, remaining:1942
155
+ prompt >> q
156
+ root@ax650:/mnt/qtang/llm-test/qwen3-4b#
157
+ ```
158
+
159
+ #### Inference with M.2 Accelerator card
160
+
161
+ [What is an M.2 Accelerator card?](https://axcl-docs.readthedocs.io/zh-cn/latest/doc_guide_hardware.html) — this demo is shown running on a Raspberry Pi 5.
162
+
163
+ ```
164
+ (base) axera@raspberrypi:~/samples/qwen3-4b $ ./run_qwen3_4b_int8_ctx_axcl_aarch64.sh
165
+ [I][ Init][ 136]: LLM init start
166
+ [I][ Init][ 34]: connect http://127.0.0.1:12345 ok
167
+ [I][ Init][ 57]: uid: a5b1e427-0cdf-4da6-b3a7-f5e0517da0bb
168
+ bos_id: -1, eos_id: 151645
169
+ 2% | █ | 1 / 39 [0.99s<38.45s, 1.01 count/s] tokenizer init ok
170
+ [I][ Init][ 45]: LLaMaEmbedSelector use mmap
171
+ 5% | ██ | 2 / 39 [0.99s<19.23s, 2.03 count/s] embed_selector init ok
172
+ [I][ run][ 30]: AXCLWorker start with devid 0
173
+ 100% | ████████████████████████████████ | 39 / 39 [133.16s<133.16s, 0.29 count/s] init post axmodel ok,remain_cmm(691 MB)(1096 MB)000000000
174
+ [I][ Init][ 237]: max_token_len : 2559
175
+ [I][ Init][ 240]: kv_cache_size : 1024, kv_cache_num: 2559
176
+ [I][ Init][ 248]: prefill_token_num : 128
177
+ [I][ Init][ 252]: grp: 1, prefill_max_token_num : 1
178
+ [I][ Init][ 252]: grp: 2, prefill_max_token_num : 256
179
+ [I][ Init][ 252]: grp: 3, prefill_max_token_num : 512
180
+ [I][ Init][ 252]: grp: 4, prefill_max_token_num : 1024
181
+ [I][ Init][ 252]: grp: 5, prefill_max_token_num : 1536
182
+ [I][ Init][ 252]: grp: 6, prefill_max_token_num : 2048
183
+ [I][ Init][ 256]: prefill_max_token_num : 2048
184
+ ________________________
185
+ | ID| remain cmm(MB)|
186
+ ========================
187
+ | 0| 691|
188
+ ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
189
+ [I][ load_config][ 282]: load config:
190
+ {
191
+ "enable_repetition_penalty": false,
192
+ "enable_temperature": false,
193
+ "enable_top_k_sampling": true,
194
+ "enable_top_p_sampling": false,
195
+ "penalty_window": 20,
196
+ "repetition_penalty": 1.2,
197
+ "temperature": 0.9,
198
+ "top_k": 1,
199
+ "top_p": 0.8
200
+ }
201
+
202
+ [I][ Init][ 279]: LLM init ok
203
+ Type "q" to exit, Ctrl+c to stop current running
204
+ [I][ GenerateKVCachePrefill][ 335]: input token num : 21, prefill_split_num : 1 prefill_grpid : 2
205
+ [I][ GenerateKVCachePrefill][ 372]: input_num_token:21
206
+ [I][ main][ 236]: precompute_len: 21
207
+ [I][ main][ 237]: system_prompt: You are Qwen, created by Alibaba Cloud. You are a helpful assistant.
208
+ prompt >> who are you
209
+ [I][ SetKVCache][ 628]: prefill_grpid:2 kv_cache_num:256 precompute_len:21 input_num_token:27
210
+ [I][ SetKVCache][ 631]: current prefill_max_token_num:1920
211
+ [I][ Run][ 869]: input token num : 27, prefill_split_num : 1
212
+ [I][ Run][ 901]: input_num_token:27
213
+ [I][ Run][1030]: ttft: 1339.01 ms
214
+ <think>
215
+
216
+ </think>
217
+
218
+ I am Qwen, a large-scale language model developed by Alibaba Cloud. I can answer questions,
219
+ create content, and help with a variety of tasks. What can I assist you with?
220
+
221
+ [N][ Run][1182]: hit eos,avg 3.65 token/s
222
+
223
+ [I][ GetKVCache][ 597]: precompute_len:90, remaining:1958
224
+ prompt >> q
225
+ [I][ run][ 80]: AXCLWorker exit with devid 0
226
+ (base) axera@raspberrypi:~/samples/qwen3-4b $
227
+ (base) axera@raspberrypi:~ $ axcl-smi
228
+ +------------------------------------------------------------------------------------------------+
229
+ | AXCL-SMI V3.4.0_20250423020139 Driver V3.4.0_20250423020139 |
230
+ +-----------------------------------------+--------------+---------------------------------------+
231
+ | Card Name Firmware | Bus-Id | Memory-Usage |
232
+ | Fan Temp Pwr:Usage/Cap | CPU NPU | CMM-Usage |
233
+ |=========================================+==============+=======================================|
234
+ | 0 AX650N V3.4.0 | 0000:01:00.0 | 193 MiB / 945 MiB |
235
+ | -- 37C -- / -- | 2% 0% | 6348 MiB / 7040 MiB |
236
+ +-----------------------------------------+--------------+---------------------------------------+
237
+
238
+ +------------------------------------------------------------------------------------------------+
239
+ | Processes: |
240
+ | Card PID Process Name NPU Memory Usage |
241
+ |================================================================================================|
242
+ | 0 84643 /home/axera/samples/qwen3-4b/main_axcl_aarch64 4894032 KiB |
243
+ +------------------------------------------------------------------------------------------------+
244
+ (base) axera@raspberrypi:~ $
245
+ ```
config.json ADDED
File without changes
main_api_ax650 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b917bd4b79e702c0cc9b416cc5c681144563a5e3182e8cf485ae74e20ba79a32
3
+ size 1134952
main_api_axcl_aarch64 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3558444d93ce7459db247421128aca6ba3fdbde5932eff6aea66653fa7370cdf
3
+ size 1816560
main_api_axcl_x86 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d1628c4b204088badd37a3a29a36a5d981c124bb0f74280bc2553836d64153d9
3
+ size 1913024
main_ax650 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ddeb85a2fff13f2e009e1d9ac7ea9bbddbede20eab12e48a399bf6bf3fdaf7af
3
+ size 1023280
main_axcl_aarch64 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f9f1a1ca329b47f70840e8b6d104ce8248a82326aa2402bccb31144590a8fb2
3
+ size 1725008
main_axcl_x86 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8b200b6dac4a7019abb8f13e229cca5096cd1f70a5faf0a554b50b00f0b7e41
3
+ size 1844336
post_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "enable_temperature" : false,
3
+ "temperature" : 0.9,
4
+
5
+ "enable_repetition_penalty" : false,
6
+ "repetition_penalty" : 1.2,
7
+ "penalty_window" : 20,
8
+
9
+ "enable_top_p_sampling" : false,
10
+ "top_p" : 0.8,
11
+
12
+ "enable_top_k_sampling" : true,
13
+ "top_k" : 1
14
+ }
qwen2.5_tokenizer/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
qwen2.5_tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff