wli1995 commited on
Commit
a1cdf44
·
verified ·
1 Parent(s): e437b51

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +8 -0
  2. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/model.embed_tokens.weight.bfloat16.bin +3 -0
  3. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l0_together.axmodel +3 -0
  4. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l10_together.axmodel +3 -0
  5. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l11_together.axmodel +3 -0
  6. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l12_together.axmodel +3 -0
  7. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l13_together.axmodel +3 -0
  8. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l14_together.axmodel +3 -0
  9. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l15_together.axmodel +3 -0
  10. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l16_together.axmodel +3 -0
  11. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l17_together.axmodel +3 -0
  12. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l18_together.axmodel +3 -0
  13. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l19_together.axmodel +3 -0
  14. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l1_together.axmodel +3 -0
  15. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l20_together.axmodel +3 -0
  16. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l21_together.axmodel +3 -0
  17. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l22_together.axmodel +3 -0
  18. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l23_together.axmodel +3 -0
  19. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l24_together.axmodel +3 -0
  20. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l25_together.axmodel +3 -0
  21. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l26_together.axmodel +3 -0
  22. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l27_together.axmodel +3 -0
  23. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l28_together.axmodel +3 -0
  24. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l29_together.axmodel +3 -0
  25. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l2_together.axmodel +3 -0
  26. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l30_together.axmodel +3 -0
  27. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l31_together.axmodel +3 -0
  28. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l32_together.axmodel +3 -0
  29. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l33_together.axmodel +3 -0
  30. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l34_together.axmodel +3 -0
  31. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l35_together.axmodel +3 -0
  32. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l3_together.axmodel +3 -0
  33. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l4_together.axmodel +3 -0
  34. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l5_together.axmodel +3 -0
  35. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l6_together.axmodel +3 -0
  36. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l7_together.axmodel +3 -0
  37. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l8_together.axmodel +3 -0
  38. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l9_together.axmodel +3 -0
  39. Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_post.axmodel +3 -0
  40. README.md +245 -3
  41. config.json +0 -0
  42. main_api_ax650 +3 -0
  43. main_api_axcl_aarch64 +3 -0
  44. main_api_axcl_x86 +3 -0
  45. main_ax650 +3 -0
  46. main_axcl_aarch64 +3 -0
  47. main_axcl_x86 +3 -0
  48. post_config.json +14 -0
  49. qwen2.5_tokenizer/merges.txt +0 -0
  50. qwen2.5_tokenizer/tokenizer.json +0 -0
.gitattributes CHANGED
@@ -32,4 +32,12 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
32
  *.xz filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
 
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
32
  *.xz filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *.axmodel filter=lfs diff=lfs merge=lfs -text
36
  *tfevents* filter=lfs diff=lfs merge=lfs -text
37
+ main_ax650 filter=lfs diff=lfs merge=lfs -text
38
+ main_axcl_aarch64 filter=lfs diff=lfs merge=lfs -text
39
+ main_axcl_x86 filter=lfs diff=lfs merge=lfs -text
40
+ qwen3_tokenizer/tokenizer.json filter=lfs diff=lfs merge=lfs -text
41
+ main_api_ax650 filter=lfs diff=lfs merge=lfs -text
42
+ main_api_axcl_aarch64 filter=lfs diff=lfs merge=lfs -text
43
+ main_api_axcl_x86 filter=lfs diff=lfs merge=lfs -text
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/model.embed_tokens.weight.bfloat16.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eabe5625fc0575bf517c424041e9701c0fd521889e0f547c8522d2aa20e8c0f8
3
+ size 777912320
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l0_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6706bd04faa7987ac84fcedd44487c5dd0fd3e01b3c354a72d2158412cbe572
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l10_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8bc94bbed29a443bd92b07e6da311758e25fcd5024aa72d148a0535c6b8b78c8
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l11_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f823ae3bcbd505ee94403814877d46b1c9cd1cc20e4d9971d718c1f12f6436de
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l12_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aede41d1d2855c5f5e48a1f1ac73948a0e0611c6a791321e51258625e0e298d4
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l13_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:474f529221fc4d145fa5430ce075dd79bebe38d909a0a9ea5d012eb141aac967
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l14_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1c57875f9474d918c04fdb0dab4ab532a511494b0f4d3ed6c104a0ad07acef4
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l15_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3bdf22e8f1c85695ffb8d62f9c365357136f1e3f03050d14a17d947774f735a
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l16_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5890dae7fc86f8ceed0649616acab5b5d407a0b71939d4c4da1c9f9c5149948f
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l17_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3cc5a36b246584335aa3bef4bf72bd0cf480c26710c5879a7b6dd0542be0ee8d
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l18_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:31b56b453671afdd2ea10444371ac5e8d992dbcae38077e3653b96fdbb143e29
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l19_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0dd27c2bccbecd4198e812062b9a323df28d8d362be9e5d28cc4398f3cad08d5
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l1_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:851f6535c5a09b1e40ae95c8f39ff654d5af0a5dab72286164e80a192e377d75
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l20_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ccafe960507254b793eafed1e5253bc924bd91b1ac7ce7b1c4ee22583729b86a
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l21_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:750e3a04abb85e7a59c4ac2736f154083e92a92b6d0a2e0ac115e3847364d722
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l22_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cab38992c778edc588994cf9398c6ade0e15460f3dc44b591b7f602af0df45fe
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l23_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f82d042f14e24f73837d252c1f2eb68666cc63e41c552b31b9651e503a1bd9a1
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l24_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:43dc45f7579e0cf2272a283ae1370a8a2442a91b13f85e94ffe1a3789e653f0f
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l25_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c663ff33b1d455f73f3a4a1ca0b2278f7fd731a980e1072f9f4e5b7be002a5e9
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l26_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c99edb9aabc14cb053e269be031746801011cc7b2c15a1f7385d4229f5b8bbfa
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l27_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b15ecab8b8471ed7fbe170dd93ddec9584c591ba6bea6e7224ec1e76abbbd106
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l28_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2495c57afe2dc507a13076947138b2b4f8876082fd9808a99ca00f01ec8b968c
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l29_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:20480961c16012330bfa504c1a4e55553cffd15607516816b0e4c2103ec1ebc5
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l2_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e5a89ea1587149fac2bb1b1adf66c82433f068bac661690d0c2633240ab11757
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l30_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e26950f6520856fa7a7b15179acde9d7b7ac7290c4f95cfd1acf71ee37aaf640
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l31_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:78acc56d21871cd026b2d584d1e61431d45de4363ae29f5576887719f782069c
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l32_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52cedd1013577cab86a53258baaa62eeae99b2552264894f322eab41b66a7133
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l33_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6689879470e8d25a508f120f77c8c5ee657ca934c05024a8f593e94389c50b1b
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l34_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98f7b2ac543f85c9380c97f9805b5d572646d1466b8f40b1e916a5675fe6b7eb
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l35_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92da75168b8ea67f92df5d87f9d5e98d7ae33f3ecc27dd1d76053ef7319d368e
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l3_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a797f620572903f60bc64a6ecac74e795ce51ec9f4091fa80c3250d0c397de84
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l4_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb84b2848fe814b7a4e8113f5f47edeb7cdadb3904c9f14deaa08c5a91359372
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l5_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a41dd48f4a331e0046f3bc11d94aa35a1ca5ea2bc3e8f04ec6389011b6d4647
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l6_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ab2e9c6cc960225aaf8a19d2d16fc920d4c048b379c542ec1a4af1b8ea644dd
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l7_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:511e855a8de9857fb0962437f9520bbaab42e559d249cd93d22f1ee826f8f5c5
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l8_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:261747bb3e06f351084c9754f443da736917ab23e22a113c9baeb3312d759136
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_p256_l9_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:357cafbab182da8770c5ba00acae9d76499e2bb114594cf6c116eec8d983b411
3
+ size 130950985
Qwen3-4B-Instruct-2507-GPTQ-Int8-context-4k-prefill-3584/qwen3_post.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7508737ba0506611ead05d48adba0aeae10fd81679b62d221362dd1d8ac520f
3
+ size 424152083
README.md CHANGED
@@ -1,3 +1,245 @@
1
- ---
2
- license: bsd-3-clause
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ language:
4
+ - en
5
+ base_model:
6
+ - Qwen/Qwen3-4B
7
+ pipeline_tag: text-generation
8
+ tags:
9
+ - Qwen
10
+ - Qwen3
11
+ - Int8
12
+ ---
13
+
14
+
15
+ # Qwen3-4B-Int8
16
+
17
+ This version of Qwen3-4B-Int8 has been converted to run on the Axera NPU using **w8a16** quantization.
18
+
19
+ This model has been optimized with the following LoRA:
20
+
21
+ Compatible with Pulsar2 version: 4.2 (not released yet)
22
+
23
+ ## Convert tools links:
24
+
25
+ For those who are interested in model conversion, you can try to export the axmodel yourself from the original repo:
26
+ https://huggingface.co/Qwen/Qwen3-4B
27
+
28
+ [Pulsar2 Link, How to Convert LLM from Huggingface to axmodel](https://pulsar2-docs.readthedocs.io/en/latest/appendix/build_llm.html)
29
+
30
+ [AXera NPU LLM Runtime](https://github.com/AXERA-TECH/ax-llm)
31
+
32
+
33
+ ## Support Platform
34
+
35
+ - AX650
36
+ - [M4N-Dock(爱芯派Pro)](https://wiki.sipeed.com/hardware/zh/maixIV/m4ndock/m4ndock.html)
37
+ - [M.2 Accelerator card](https://axcl-docs.readthedocs.io/zh-cn/latest/doc_guide_hardware.html)
38
+
39
+ |Chips|w8a16|w4a16|
40
+ |--|--|--|
41
+ |AX650| 4.5 tokens/sec|TBD|
42
+
43
+ ## How to use
44
+
45
+ Download all files from this repository to the device
46
+
47
+ ```
48
+ root@ax650:/mnt/qtang/llm-test/qwen3-4b# tree -L 1
49
+ .
50
+ |-- config.json
51
+ |-- main_ax650
52
+ |-- main_axcl_aarch64
53
+ |-- main_axcl_x86
54
+ |-- post_config.json
55
+ |-- qwen2.5_tokenizer
56
+ |-- qwen3-4b-ax650
57
+ |-- qwen3_tokenizer
58
+ |-- qwen3_tokenizer_uid.py
59
+ |-- run_qwen3_4b_int8_ctx_ax650.sh
60
+ |-- run_qwen3_4b_int8_ctx_axcl_aarch64.sh
61
+ `-- run_qwen3_4b_int8_ctx_axcl_x86.sh
62
+
63
+ 3 directories, 9 files
64
+ root@ax650:/mnt/qtang/llm-test/qwen3-4b#
65
+
66
+ ```
67
+
68
+ #### Start the Tokenizer service
69
+
70
+ Install requirement
71
+
72
+ ```
73
+ pip install transformers jinja2
74
+ ```
75
+
76
+ ```
77
+ root@ax650:/mnt/qtang/llm-test/qwen3-4b# python3 qwen3_tokenizer_uid.py
78
+ None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
79
+ Server running at http://0.0.0.0:12345
80
+ ```
81
+
82
+ #### Inference with AX650 Host, such as M4N-Dock(爱芯派Pro) or AX650N DEMO Board
83
+
84
+ Open another terminal and run `run_qwen3_4b_int8_ctx_ax650.sh`
85
+
86
+ ```
87
+ root@ax650:/mnt/qtang/llm-test/qwen3-4b# ./run_qwen3_4b_int8_ctx_ax650.sh
88
+ [I][ Init][ 110]: LLM init start
89
+ [I][ Init][ 34]: connect http://127.0.0.1:12345 ok
90
+ [I][ Init][ 57]: uid: 6e90ff82-b9c9-42dc-8f61-081203389166
91
+ bos_id: -1, eos_id: 151645
92
+ 2% | █ | 1 / 39 [3.95s<153.89s, 0.25 count/s] tokenizer init ok
93
+ [I][ Init][ 26]: LLaMaEmbedSelector use mmap
94
+ 100% | ████████████████████████████████ | 39 / 39 [48.03s<48.03s, 0.81 count/s] init post axmodel ok,remain_cmm(5621 MB)
95
+ [I][ Init][ 188]: max_token_len : 2559
96
+ [I][ Init][ 193]: kv_cache_size : 1024, kv_cache_num: 2559
97
+ [I][ Init][ 201]: prefill_token_num : 128
98
+ [I][ Init][ 205]: grp: 1, prefill_max_token_num : 1
99
+ [I][ Init][ 205]: grp: 2, prefill_max_token_num : 256
100
+ [I][ Init][ 205]: grp: 3, prefill_max_token_num : 512
101
+ [I][ Init][ 205]: grp: 4, prefill_max_token_num : 1024
102
+ [I][ Init][ 205]: grp: 5, prefill_max_token_num : 1536
103
+ [I][ Init][ 205]: grp: 6, prefill_max_token_num : 2048
104
+ [I][ Init][ 209]: prefill_max_token_num : 2048
105
+ [I][ load_config][ 282]: load config:
106
+ {
107
+ "enable_repetition_penalty": false,
108
+ "enable_temperature": false,
109
+ "enable_top_k_sampling": true,
110
+ "enable_top_p_sampling": false,
111
+ "penalty_window": 20,
112
+ "repetition_penalty": 1.2,
113
+ "temperature": 0.9,
114
+ "top_k": 1,
115
+ "top_p": 0.8
116
+ }
117
+
118
+ [I][ Init][ 218]: LLM init ok
119
+ Type "q" to exit, Ctrl+c to stop current running
120
+ [I][ GenerateKVCachePrefill][ 270]: input token num : 21, prefill_split_num : 1 prefill_grpid : 2
121
+ [I][ GenerateKVCachePrefill][ 307]: input_num_token:21
122
+ [I][ main][ 230]: precompute_len: 21
123
+ [I][ main][ 231]: system_prompt: You are Qwen, created by Alibaba Cloud. You are a helpful assistant.
124
+ prompt >> 1+3=?
125
+ [I][ SetKVCache][ 530]: prefill_grpid:2 kv_cache_num:256 precompute_len:21 input_num_token:16
126
+ [I][ SetKVCache][ 533]: current prefill_max_token_num:1920
127
+ [I][ Run][ 659]: input token num : 16, prefill_split_num : 1
128
+ [I][ Run][ 685]: input_num_token:16
129
+ [I][ Run][ 808]: ttft: 1169.05 ms
130
+ <think>
131
+
132
+ </think>
133
+
134
+ 1 + 3 = 4
135
+
136
+ [N][ Run][ 922]: hit eos,avg 4.22 token/s
137
+
138
+ [I][ GetKVCache][ 499]: precompute_len:48, remaining:2000
139
+ prompt >> who are you?
140
+ [I][ SetKVCache][ 530]: prefill_grpid:2 kv_cache_num:256 precompute_len:48 input_num_token:16
141
+ [I][ SetKVCache][ 533]: current prefill_max_token_num:1920
142
+ [I][ Run][ 659]: input token num : 16, prefill_split_num : 1
143
+ [I][ Run][ 685]: input_num_token:16
144
+ [I][ Run][ 808]: ttft: 1168.56 ms
145
+ <think>
146
+
147
+ </think>
148
+
149
+ I am Qwen, a large-scale language model developed by Alibaba Cloud. I can answer questions, create content,
150
+ and help with a variety of tasks. How can I assist you today?
151
+
152
+ [N][ Run][ 922]: hit eos,avg 4.22 token/s
153
+
154
+ [I][ GetKVCache][ 499]: precompute_len:106, remaining:1942
155
+ prompt >> q
156
+ root@ax650:/mnt/qtang/llm-test/qwen3-4b#
157
+ ```
158
+
159
+ #### Inference with M.2 Accelerator card
160
+
161
+ [What is an M.2 Accelerator card?](https://axcl-docs.readthedocs.io/zh-cn/latest/doc_guide_hardware.html) — this demo is shown running on a Raspberry Pi 5.
162
+
163
+ ```
164
+ (base) axera@raspberrypi:~/samples/qwen3-4b $ ./run_qwen3_4b_int8_ctx_axcl_aarch64.sh
165
+ [I][ Init][ 136]: LLM init start
166
+ [I][ Init][ 34]: connect http://127.0.0.1:12345 ok
167
+ [I][ Init][ 57]: uid: a5b1e427-0cdf-4da6-b3a7-f5e0517da0bb
168
+ bos_id: -1, eos_id: 151645
169
+ 2% | █ | 1 / 39 [0.99s<38.45s, 1.01 count/s] tokenizer init ok
170
+ [I][ Init][ 45]: LLaMaEmbedSelector use mmap
171
+ 5% | ██ | 2 / 39 [0.99s<19.23s, 2.03 count/s] embed_selector init ok
172
+ [I][ run][ 30]: AXCLWorker start with devid 0
173
+ 100% | ████████████████████████████████ | 39 / 39 [133.16s<133.16s, 0.29 count/s] init post axmodel ok,remain_cmm(691 MB)(1096 MB)000000000
174
+ [I][ Init][ 237]: max_token_len : 2559
175
+ [I][ Init][ 240]: kv_cache_size : 1024, kv_cache_num: 2559
176
+ [I][ Init][ 248]: prefill_token_num : 128
177
+ [I][ Init][ 252]: grp: 1, prefill_max_token_num : 1
178
+ [I][ Init][ 252]: grp: 2, prefill_max_token_num : 256
179
+ [I][ Init][ 252]: grp: 3, prefill_max_token_num : 512
180
+ [I][ Init][ 252]: grp: 4, prefill_max_token_num : 1024
181
+ [I][ Init][ 252]: grp: 5, prefill_max_token_num : 1536
182
+ [I][ Init][ 252]: grp: 6, prefill_max_token_num : 2048
183
+ [I][ Init][ 256]: prefill_max_token_num : 2048
184
+ ________________________
185
+ | ID| remain cmm(MB)|
186
+ ========================
187
+ | 0| 691|
188
+ ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
189
+ [I][ load_config][ 282]: load config:
190
+ {
191
+ "enable_repetition_penalty": false,
192
+ "enable_temperature": false,
193
+ "enable_top_k_sampling": true,
194
+ "enable_top_p_sampling": false,
195
+ "penalty_window": 20,
196
+ "repetition_penalty": 1.2,
197
+ "temperature": 0.9,
198
+ "top_k": 1,
199
+ "top_p": 0.8
200
+ }
201
+
202
+ [I][ Init][ 279]: LLM init ok
203
+ Type "q" to exit, Ctrl+c to stop current running
204
+ [I][ GenerateKVCachePrefill][ 335]: input token num : 21, prefill_split_num : 1 prefill_grpid : 2
205
+ [I][ GenerateKVCachePrefill][ 372]: input_num_token:21
206
+ [I][ main][ 236]: precompute_len: 21
207
+ [I][ main][ 237]: system_prompt: You are Qwen, created by Alibaba Cloud. You are a helpful assistant.
208
+ prompt >> who are you
209
+ [I][ SetKVCache][ 628]: prefill_grpid:2 kv_cache_num:256 precompute_len:21 input_num_token:27
210
+ [I][ SetKVCache][ 631]: current prefill_max_token_num:1920
211
+ [I][ Run][ 869]: input token num : 27, prefill_split_num : 1
212
+ [I][ Run][ 901]: input_num_token:27
213
+ [I][ Run][1030]: ttft: 1339.01 ms
214
+ <think>
215
+
216
+ </think>
217
+
218
+ I am Qwen, a large-scale language model developed by Alibaba Cloud. I can answer questions,
219
+ create content, and help with a variety of tasks. What can I assist you with?
220
+
221
+ [N][ Run][1182]: hit eos,avg 3.65 token/s
222
+
223
+ [I][ GetKVCache][ 597]: precompute_len:90, remaining:1958
224
+ prompt >> q
225
+ [I][ run][ 80]: AXCLWorker exit with devid 0
226
+ (base) axera@raspberrypi:~/samples/qwen3-4b $
227
+ (base) axera@raspberrypi:~ $ axcl-smi
228
+ +------------------------------------------------------------------------------------------------+
229
+ | AXCL-SMI V3.4.0_20250423020139 Driver V3.4.0_20250423020139 |
230
+ +-----------------------------------------+--------------+---------------------------------------+
231
+ | Card Name Firmware | Bus-Id | Memory-Usage |
232
+ | Fan Temp Pwr:Usage/Cap | CPU NPU | CMM-Usage |
233
+ |=========================================+==============+=======================================|
234
+ | 0 AX650N V3.4.0 | 0000:01:00.0 | 193 MiB / 945 MiB |
235
+ | -- 37C -- / -- | 2% 0% | 6348 MiB / 7040 MiB |
236
+ +-----------------------------------------+--------------+---------------------------------------+
237
+
238
+ +------------------------------------------------------------------------------------------------+
239
+ | Processes: |
240
+ | Card PID Process Name NPU Memory Usage |
241
+ |================================================================================================|
242
+ | 0 84643 /home/axera/samples/qwen3-4b/main_axcl_aarch64 4894032 KiB |
243
+ +------------------------------------------------------------------------------------------------+
244
+ (base) axera@raspberrypi:~ $
245
+ ```
config.json ADDED
File without changes
main_api_ax650 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b917bd4b79e702c0cc9b416cc5c681144563a5e3182e8cf485ae74e20ba79a32
3
+ size 1134952
main_api_axcl_aarch64 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3558444d93ce7459db247421128aca6ba3fdbde5932eff6aea66653fa7370cdf
3
+ size 1816560
main_api_axcl_x86 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d1628c4b204088badd37a3a29a36a5d981c124bb0f74280bc2553836d64153d9
3
+ size 1913024
main_ax650 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ddeb85a2fff13f2e009e1d9ac7ea9bbddbede20eab12e48a399bf6bf3fdaf7af
3
+ size 1023280
main_axcl_aarch64 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f9f1a1ca329b47f70840e8b6d104ce8248a82326aa2402bccb31144590a8fb2
3
+ size 1725008
main_axcl_x86 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8b200b6dac4a7019abb8f13e229cca5096cd1f70a5faf0a554b50b00f0b7e41
3
+ size 1844336
post_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "enable_temperature" : false,
3
+ "temperature" : 0.9,
4
+
5
+ "enable_repetition_penalty" : false,
6
+ "repetition_penalty" : 1.2,
7
+ "penalty_window" : 20,
8
+
9
+ "enable_top_p_sampling" : false,
10
+ "top_p" : 0.8,
11
+
12
+ "enable_top_k_sampling" : true,
13
+ "top_k" : 1
14
+ }
qwen2.5_tokenizer/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
qwen2.5_tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff