first commit

Files changed (17) hide show

.gitattributes +11 -0
README.md +241 -3
config.json +0 -0
image/ssd_car.jpg +3 -0
image/ssd_horse.jpg +3 -0
main_ax650 +3 -0
qwen2.5_tokenizer.txt +0 -0
run_qwen2_5_vl_image.sh +22 -0
run_qwen2_5_vl_video.sh +22 -0
video/frame_0000.jpg +3 -0
video/frame_0008.jpg +3 -0
video/frame_0016.jpg +3 -0
video/frame_0024.jpg +3 -0
video/frame_0032.jpg +3 -0
video/frame_0040.jpg +3 -0
video/frame_0048.jpg +3 -0
video/frame_0056.jpg +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,14 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+main_ax650 filter=lfs diff=lfs merge=lfs -text
+video/frame_0000.jpg filter=lfs diff=lfs merge=lfs -text
+video/frame_0008.jpg filter=lfs diff=lfs merge=lfs -text
+video/frame_0016.jpg filter=lfs diff=lfs merge=lfs -text
+video/frame_0024.jpg filter=lfs diff=lfs merge=lfs -text
+video/frame_0032.jpg filter=lfs diff=lfs merge=lfs -text
+video/frame_0040.jpg filter=lfs diff=lfs merge=lfs -text
+video/frame_0048.jpg filter=lfs diff=lfs merge=lfs -text
+video/frame_0056.jpg filter=lfs diff=lfs merge=lfs -text
+image/ssd_car.jpg filter=lfs diff=lfs merge=lfs -text
+image/ssd_horse.jpg filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -1,3 +1,241 @@
----
-license: mit
----

+---
+license: mit
+language:
+- en
+- zh
+base_model:
+- hfl/Qwen2.5-VL-3B-Instruct-GPTQ-Int4
+pipeline_tag: image-text-to-text
+library_name: transformers
+tags:
+- Qwen2.5-VL
+- Qwen2.5-VL-3B-Instruct
+- Int4
+- VLM
+---
+# Qwen2.5-VL-3B-Instruct
+This version of Qwen2.5-VL-3B-Instruct-GPTQ-Int4 has been converted to run on the Axera NPU using **w4a16** quantization.
+This model has been optimized with the following LoRA:
+Compatible with Pulsar2 version: 3.4
+## Convert tools links:
+For those interested in model conversion, you can try exporting the axmodel from the original repo:
+https://huggingface.co/hfl/Qwen2.5-VL-3B-Instruct-GPTQ-Int4
+[Pulsar2 Link, How to Convert LLM from Huggingface to axmodel](https://pulsar2-docs.readthedocs.io/en/latest/appendix/build_llm.html)
+[AXera NPU HOST LLM Runtime](https://github.com/AXERA-TECH/Qwen2.5-VL-3B-Instruct.axera)
+## Supported Platforms
+- AX650
+  - AX650N DEMO Board
+  - [M4N-Dock(爱芯派Pro)](https://wiki.sipeed.com/hardware/zh/maixIV/m4ndock/m4ndock.html)
+  - [M.2 Accelerator card](https://axcl-docs.readthedocs.io/zh-cn/latest/doc_guide_hardware.html)
+**Image Process**
+|Chips| input size | image num | image encoder | ttft(320 tokens) | w4a16 | DDR | Flash |
+|--|--|--|--|--|--|--|--|
+|AX650|  | 1 |  ms |  ms |  tokens/sec|  GiB |   GiB  |
+**Video Process**
+|Chips| input size | image num | image encoder |ttft(512 tokens) | w4a16 | DDR | Flash |
+|--|--|--|--|--|--|--|--|
+|AX650|  | 8  |  ms |  ms |  tokens/sec|  GiB |   GiB  |
+The DDR capacity refers to the CMM memory that needs to be consumed. Ensure that the CMM memory allocation on the development board is greater than this value.
+## How to use
+Download all files from this repository to the device
+**If you are using an AX650 board**
+### Demo Run
+#### Image understanding demo
+- input text
+```
+描述下图片
+```
+- input image
+![](./image/ssd_car.jpg)
+```
+root@ax650:/mnt/qtang/llm-test/qwen2.5-vl-3b# ./run_qwen2_5_vl_image.sh
+[I][                            Init][ 129]: LLM init start
+bos_id: -1, eos_id: 151645
+  2% | █                                 |   1 /  40 [0.01s<0.24s, 166.67 count/s] tokenizer init ok
+[I][                            Init][  26]: LLaMaEmbedSelector use mmap
+100% | ████████████████████████████████ |  40 /  40 [38.23s<38.23s, 1.05 count/s] init vpm axmodel ok,remain_cmm(7600 MB)
+[I][                            Init][ 277]: max_token_len : 1023
+[I][                            Init][ 282]: kv_cache_size : 256, kv_cache_num: 1023
+[I][                            Init][ 290]: prefill_token_num : 320
+[I][                            Init][ 292]: vpm_height : 1024,vpm_width : 392
+[I][                            Init][ 301]: LLM init ok
+Type "q" to exit, Ctrl+c to stop current running
+prompt >> who are you?
+image >>
+[I][                             Run][ 638]: ttft: 2854.47 ms
+I am a large language model created by Alibaba Cloud. I am called Qwen.
+[N][                             Run][ 779]: hit eos,avg 6.05 token/s
+prompt >> 描述下图片
+image >> image/ssd_car.jpg
+[I][                          Encode][ 416]: image encode time : 795.614014 ms, size : 524288
+[I][                             Run][ 638]: ttft: 2856.88 ms
+这张图片展示了一条繁忙的城市街道。前景中，一名女子站在人行道上，她穿着黑色外套，面带微笑。她旁边是一辆红色的双层巴士，巴士上有一个广告，
+上面写着“THINGS GET MORE EXITING WHEN YOU SAY ‘YES’”。巴士的车牌号是“L15”。巴士旁边停着一辆黑色的小型货车。背景中可以看到一些商店和行人，
+街道两旁的建筑物是现代的玻璃幕墙建筑。整体氛围显得繁忙而充满活力。
+[N][                             Run][ 779]: hit eos,avg 5.96 token/s
+```
+#### Video understanding demo
+Please pre-process the video into individual frames and resize each frame to a 308x308 image.
+```
+root@ax650:/mnt/qtang/llm-test/qwen2.5-vl-3b# ./run_qwen2_5_vl_video.sh
+[I][                            Init][ 129]: LLM init start
+bos_id: -1, eos_id: 151645
+  2% | █                                 |   1 /  40 [0.00s<0.12s, 333.33 count/s] tokenizer init ok
+[I][                            Init][  26]: LLaMaEmbedSelector use mmap
+100% | ████████████████████████████████ |  40 /  40 [40.05s<40.05s, 1.00 count/s] init vpm axmodel ok,remain_cmm(7680 MB)
+[I][                            Init][ 277]: max_token_len : 1023
+[I][                            Init][ 282]: kv_cache_size : 256, kv_cache_num: 1023
+[I][                            Init][ 290]: prefill_token_num : 512
+[I][                            Init][ 292]: vpm_height : 484,vpm_width : 392
+[I][                            Init][ 301]: LLM init ok
+Type "q" to exit, Ctrl+c to stop current running
+prompt >> 描述下视频
+image >> video
+video/frame_0000.jpg
+video/frame_0008.jpg
+video/frame_0016.jpg
+video/frame_0024.jpg
+video/frame_0032.jpg
+video/frame_0040.jpg
+video/frame_0048.jpg
+video/frame_0056.jpg
+[I][                          Encode][ 416]: image encode time : 1487.557007 ms, size : 991232
+[I][                             Run][ 638]: ttft: 5488.29 ms
+视频展示了两只松鼠在户外的场景。背景是模糊的山脉和蓝天，前景中有松鼠在互动。松鼠的毛色主要是棕色和白色，它们的爪子是橙色的。松鼠似乎在互相玩耍或争抢，它们的爪子和嘴巴都伸向对方。整个场景显得非常自然和生动。
+```
+#### Inference with M.2 Accelerator card
+[What is the M.2 Accelerator card?](https://axcl-docs.readthedocs.io/zh-cn/latest/doc_guide_hardware.html) This demo is shown running on a Raspberry Pi 5.
+#### Image understand demo
+- input text
+```
+描述这张图片
+```
+- input image
+![](./image/ssd_car.jpg)
+```
+(base) axera@raspberrypi:~/lhj/Qwen2.5-VL-3B-Instruct $ bash run_qwen2_5_vl_image_axcl_aarch64.sh
+[I][                            Init][ 162]: LLM init start
+[I][                            Init][  34]: connect http://127.0.0.1:12345 ok
+[I][                            Init][ 267]: IMAGE_CONTEXT_TOKEN: 151655, IMAGE_START_TOKEN: 151652
+[I][                            Init][ 328]: image encoder output float32
+[I][                            Init][ 340]: max_token_len : 1023
+[I][                            Init][ 343]: kv_cache_size : 256, kv_cache_num: 1023
+[I][                            Init][ 351]: prefill_token_num : 128
+[I][                            Init][ 355]: grp: 1, prefill_max_token_num : 1
+[I][                            Init][ 355]: grp: 2, prefill_max_token_num : 128
+[I][                            Init][ 355]: grp: 3, prefill_max_token_num : 256
+[I][                            Init][ 355]: grp: 4, prefill_max_token_num : 384
+[I][                            Init][ 355]: grp: 5, prefill_max_token_num : 512
+[I][                            Init][ 359]: prefill_max_token_num : 512
+________________________
+|    ID| remain cmm(MB)|
+========================
+|     0|           2286|
+¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
+[E][                     load_config][ 278]: config file(post_config.json) open failed
+[W][                            Init][ 452]: load postprocess config(post_config.json) failed
+[I][                            Init][ 456]: LLM init ok
+Type "q" to exit, Ctrl+c to stop current running
+prompt >> 描述这张图片
+image >> image/ssd_car.jpg
+[I][                          Encode][ 539]: image encode time : 772.851990 ms, size : 524288
+[I][                             Run][ 625]: input token num : 280, prefill_split_num : 3
+[I][                             Run][ 659]: input_num_token:128
+[I][                             Run][ 659]: input_num_token:128
+[I][                             Run][ 659]: input_num_token:24
+[I][                             Run][ 796]: ttft: 2067.18 ms
+这张图片展示了一条繁忙的城市街道。前景中，一名女子站在人行道上，穿着黑色外套，面带微笑。她旁边是一辆红色的双层巴士，巴士上有一个广告，上面写着“THINGS GET MORE EXITING WHEN YOU SAY ‘YES’ VirginMoney.co.uk”。巴士的车牌号是“L15”。巴士旁边停着一辆黑色的面包车。背景中可以看到一些商店和行人，街道两旁有路灯和商店的招牌。整体环境显得非常繁忙和现代。
+[N][                             Run][ 949]: hit eos,avg 4.12 token/s
+```
+#### Video understand demo
+Please pre-process the video into individual frames and resize each frame to a 308x308 image.
+```
+(base) axera@raspberrypi:~/lhj/Qwen2.5-VL-3B-Instruct $ bash run_qwen2_5_vl_video_axcl_aarch64.sh
+[I][                            Init][ 162]: LLM init start
+[I][                            Init][  34]: connect http://127.0.0.1:12345 ok
+[I][                            Init][ 267]: IMAGE_CONTEXT_TOKEN: 151656, IMAGE_START_TOKEN: 151652
+[I][                            Init][ 328]: image encoder output float32
+[I][                            Init][ 340]: max_token_len : 1023
+[I][                            Init][ 343]: kv_cache_size : 256, kv_cache_num: 1023
+[I][                            Init][ 351]: prefill_token_num : 128
+[I][                            Init][ 355]: grp: 1, prefill_max_token_num : 1
+[I][                            Init][ 355]: grp: 2, prefill_max_token_num : 128
+[I][                            Init][ 355]: grp: 3, prefill_max_token_num : 256
+[I][                            Init][ 355]: grp: 4, prefill_max_token_num : 384
+[I][                            Init][ 355]: grp: 5, prefill_max_token_num : 512
+[I][                            Init][ 359]: prefill_max_token_num : 512
+________________________
+|    ID| remain cmm(MB)|
+========================
+|     0|           2464|
+¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
+[E][                     load_config][ 278]: config file(post_config.json) open failed
+[W][                            Init][ 452]: load postprocess config(post_config.json) failed
+[I][                            Init][ 456]: LLM init ok
+Type "q" to exit, Ctrl+c to stop current running
+prompt >> 描述这个视频的内容
+image >> video
+video/frame_0000.jpg
+video/frame_0008.jpg
+video/frame_0016.jpg
+video/frame_0024.jpg
+video/frame_0032.jpg
+video/frame_0040.jpg
+video/frame_0048.jpg
+video/frame_0056.jpg
+[I][                          Encode][ 539]: image encode time : 1481.107056 ms, size : 991232
+[I][                             Run][ 625]: input token num : 509, prefill_split_num : 4
+[I][                             Run][ 659]: input_num_token:128
+[I][                             Run][ 659]: input_num_token:128
+[I][                             Run][ 659]: input_num_token:128
+[I][                             Run][ 659]: input_num_token:125
+[I][                             Run][ 796]: ttft: 3049.59 ms
+视频展示了两只松鼠在户外的场景。背景是模糊的山脉和蓝天，前景中有松鼠在互动。松鼠的毛色是棕色和灰色的混合，它们的爪子是橙色的。松鼠似乎在互相玩耍或争抢，它们的爪子和嘴巴都伸向对方。整个场景显得非常自然和生动。
+[N][                             Run][ 949]: hit eos,avg 4.15 token/s
+```

config.json ADDED Viewed

File without changes

image/ssd_car.jpg ADDED Viewed

Git LFS Details

SHA256: 92d459a39a9eef03956257cf9fec84114d9e5df8fb9c0662fb257488cdd4f365
Pointer size: 130 Bytes
Size of remote file: 50.5 kB

image/ssd_horse.jpg ADDED Viewed

Git LFS Details

SHA256: ed22f6b4c8c33e50e391e089ede14e8fa9402c623b09dbcf010e804770698fbb
Pointer size: 131 Bytes
Size of remote file: 123 kB

main_ax650 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9f6cfcf0021a15a3baea0513e0eb6b17bdfbe08928de0e74619e6684a13a1493
+size 6808392

qwen2.5_tokenizer.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

run_qwen2_5_vl_image.sh ADDED Viewed

	@@ -0,0 +1,22 @@

+AXMODEL_DIR=./Qwen2.5-VL-3B-Instruct-AX650-chunk_prefill_512
+./main_ax650 \
+--template_filename_axmodel "${AXMODEL_DIR}/qwen2_5_vl_p128_l%d_together.axmodel" \
+--axmodel_num 36 \
+--filename_image_encoder_axmodedl "${AXMODEL_DIR}/Qwen2.5-VL-3B-Instruct_vision_nchw448.axmodel" \
+--bos 0 --eos 0 \
+--dynamic_load_axmodel_layer 0 \
+--use_mmap_load_embed 1 \
+--filename_tokenizer_model "qwen2.5_tokenizer.txt" \
+--filename_post_axmodel "${AXMODEL_DIR}/qwen2_5_vl_post.axmodel" \
+--use_topk 0 \
+--filename_tokens_embed "${AXMODEL_DIR}/model.embed_tokens.weight.bfloat16.bin" \
+--tokens_embed_num 151936 \
+--tokens_embed_size 2048 \
+--live_print 1 \
+--continue 1 \
+--video 0 \
+--img_width 448 \
+--img_height 448 \
+--vision_start_token_id 151652 \
+--post_config_path post_config.json

run_qwen2_5_vl_video.sh ADDED Viewed

	@@ -0,0 +1,22 @@

+AXMODEL_DIR=./Qwen2.5-VL-3B-Instruct-AX650-chunk_prefill_512
+./main_ax650 \
+--template_filename_axmodel "${AXMODEL_DIR}/qwen2_5_vl_p128_l%d_together.axmodel" \
+--axmodel_num 36 \
+--filename_image_encoder_axmodedl "${AXMODEL_DIR}/Qwen2.5-VL-3B-Instruct_vision_nhwc.axmodel" \
+--bos 0 --eos 0 \
+--dynamic_load_axmodel_layer 0 \
+--use_mmap_load_embed 1 \
+--filename_tokenizer_model "qwen2.5_tokenizer.txt" \
+--filename_post_axmodel "${AXMODEL_DIR}/qwen2_5_vl_post.axmodel" \
+--use_topk 0 \
+--filename_tokens_embed "${AXMODEL_DIR}/model.embed_tokens.weight.bfloat16.bin" \
+--tokens_embed_num 151936 \
+--tokens_embed_size 2048 \
+--live_print 1 \
+--continue 1 \
+--video 1 \
+--img_width 308 \
+--img_height 308 \
+--vision_start_token_id 151652 \
+--post_config_path post_config.json