lihongjie commited on Dec 10, 2025

Commit

d0bd47b

1 Parent(s): d082af0

first commit

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +46 -0
README.md +182 -3
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l0_together.axmodel +3 -0
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l10_together.axmodel +3 -0
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l11_together.axmodel +3 -0
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l12_together.axmodel +3 -0
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l13_together.axmodel +3 -0
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l14_together.axmodel +3 -0
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l15_together.axmodel +3 -0
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l16_together.axmodel +3 -0
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l17_together.axmodel +3 -0
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l18_together.axmodel +3 -0
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l19_together.axmodel +3 -0
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l1_together.axmodel +3 -0
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l20_together.axmodel +3 -0
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l21_together.axmodel +3 -0
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l22_together.axmodel +3 -0
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l23_together.axmodel +3 -0
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l24_together.axmodel +3 -0
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l25_together.axmodel +3 -0
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l26_together.axmodel +3 -0
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l27_together.axmodel +3 -0
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l28_together.axmodel +3 -0
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l29_together.axmodel +3 -0
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l2_together.axmodel +3 -0
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l30_together.axmodel +3 -0
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l31_together.axmodel +3 -0
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l3_together.axmodel +3 -0
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l4_together.axmodel +3 -0
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l5_together.axmodel +3 -0
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l6_together.axmodel +3 -0
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l7_together.axmodel +3 -0
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l8_together.axmodel +3 -0
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l9_together.axmodel +3 -0
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_post.axmodel +3 -0
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/model.embed_tokens.weight.bfloat16.bin +3 -0
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/vision_model_1x3x512x512_NHwC_U8.axmodel +3 -0
axera_logo.png +3 -0
gradio_demo.py +261 -0
main_api_ax650 +3 -0
main_ax650 +3 -0
openai_cli.py +70 -0
run_api_ax650.sh +19 -0
run_ax650.sh +26 -0
smolvlm2_tokenizer.txt +0 -0
video/frame_0000.jpg +3 -0
video/frame_0008.jpg +3 -0
video/frame_0016.jpg +3 -0
video/frame_0024.jpg +3 -0
video/frame_0032.jpg +3 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,49 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l29_together.axmodel filter=lfs diff=lfs merge=lfs -text
+SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/vision_model_1x3x512x512_NHwC_U8.axmodel filter=lfs diff=lfs merge=lfs -text
+video/frame_0000.jpg filter=lfs diff=lfs merge=lfs -text
+video/frame_0040.jpg filter=lfs diff=lfs merge=lfs -text
+SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l13_together.axmodel filter=lfs diff=lfs merge=lfs -text
+SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l14_together.axmodel filter=lfs diff=lfs merge=lfs -text
+SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l7_together.axmodel filter=lfs diff=lfs merge=lfs -text
+SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l20_together.axmodel filter=lfs diff=lfs merge=lfs -text
+SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l28_together.axmodel filter=lfs diff=lfs merge=lfs -text
+SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l5_together.axmodel filter=lfs diff=lfs merge=lfs -text
+SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_post.axmodel filter=lfs diff=lfs merge=lfs -text
+video/frame_0024.jpg filter=lfs diff=lfs merge=lfs -text
+SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l10_together.axmodel filter=lfs diff=lfs merge=lfs -text
+SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l11_together.axmodel filter=lfs diff=lfs merge=lfs -text
+SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l16_together.axmodel filter=lfs diff=lfs merge=lfs -text
+SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l31_together.axmodel filter=lfs diff=lfs merge=lfs -text
+SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l3_together.axmodel filter=lfs diff=lfs merge=lfs -text
+SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l6_together.axmodel filter=lfs diff=lfs merge=lfs -text
+SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l0_together.axmodel filter=lfs diff=lfs merge=lfs -text
+SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l17_together.axmodel filter=lfs diff=lfs merge=lfs -text
+SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l26_together.axmodel filter=lfs diff=lfs merge=lfs -text
+SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l23_together.axmodel filter=lfs diff=lfs merge=lfs -text
+SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l4_together.axmodel filter=lfs diff=lfs merge=lfs -text
+SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l9_together.axmodel filter=lfs diff=lfs merge=lfs -text
+SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/model.embed_tokens.weight.bfloat16.bin filter=lfs diff=lfs merge=lfs -text
+SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l15_together.axmodel filter=lfs diff=lfs merge=lfs -text
+SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l19_together.axmodel filter=lfs diff=lfs merge=lfs -text
+SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l22_together.axmodel filter=lfs diff=lfs merge=lfs -text
+video/frame_0032.jpg filter=lfs diff=lfs merge=lfs -text
+main_api_ax650 filter=lfs diff=lfs merge=lfs -text
+main_ax650 filter=lfs diff=lfs merge=lfs -text
+SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l24_together.axmodel filter=lfs diff=lfs merge=lfs -text
+video/frame_0008.jpg filter=lfs diff=lfs merge=lfs -text
+video/frame_0016.jpg filter=lfs diff=lfs merge=lfs -text
+axera_logo.png filter=lfs diff=lfs merge=lfs -text
+SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l18_together.axmodel filter=lfs diff=lfs merge=lfs -text
+SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l21_together.axmodel filter=lfs diff=lfs merge=lfs -text
+SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l2_together.axmodel filter=lfs diff=lfs merge=lfs -text
+SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l27_together.axmodel filter=lfs diff=lfs merge=lfs -text
+SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l30_together.axmodel filter=lfs diff=lfs merge=lfs -text
+SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l8_together.axmodel filter=lfs diff=lfs merge=lfs -text
+video/frame_0048.jpg filter=lfs diff=lfs merge=lfs -text
+video/frame_0056.jpg filter=lfs diff=lfs merge=lfs -text
+SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l12_together.axmodel filter=lfs diff=lfs merge=lfs -text
+SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l1_together.axmodel filter=lfs diff=lfs merge=lfs -text
+SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l25_together.axmodel filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -1,3 +1,182 @@
----
-license: mit
----

+---
+license: mit
+language:
+- en
+- zh
+base_model:
+- SmolVLM2-500M-Video-Instruct
+pipeline_tag: image-text-to-text
+library_name: transformers
+tags:
+- Int8
+- VLM
+---
+# SmolVLM2-500M-Video-Instruct
+This version of SmolVLM2-500M-Video-Instruct has been converted to run on the Axera NPU using **w8a16** quantization.
+Compatible with Pulsar2 version: 5.0
+## Convert tools links:
+For those who are interested in model conversion, you can try to export the axmodel from the original repo:
+- https://huggingface.co/HuggingFaceTB/SmolVLM2-500M-Video-Instruct
+[Pulsar2 Link, How to Convert LLM from Huggingface to axmodel](https://pulsar2-docs.readthedocs.io/en/latest/appendix/build_llm.html)
+## Support Platform
+- AX650
+  - AX650N DEMO Board
+  - [M4N-Dock(爱芯派Pro)](https://wiki.sipeed.com/hardware/zh/maixIV/m4ndock/m4ndock.html)
+  - [M.2 Accelerator card](https://axcl-docs.readthedocs.io/zh-cn/latest/doc_guide_hardware.html)
+**Image Process**
+|Chips| input size | image num | image encoder | ttft(168 tokens) | w8a16 | CMM | Flash |
+|--|--|--|--|--|--|--|--|
+|AX650| 512*512 | 1 | 516 ms | 510 ms | 35.23 tokens/sec| 773 MB | 813MB |
+**Video Process**
+|Chips| input size | image num | image encoder |ttft(600 tokens) | w8a16 | CMM | Flash |
+|--|--|--|--|--|--|--|--|
+|AX650| 512*512 | 8  | 1052 ms | 1523 ms | 35.32 tokens/sec| 773 MB | 813MB |
+The DDR capacity refers to the CMM memory that needs to be consumed. Ensure that the CMM memory allocation on the development board is greater than this value.
+## How to use
+Download all files from this repository to the device
+**If you are using an AX650 board**
+### Demo Run
+#### Image understanding demo
+Set the `video` parameter in run_ax650.sh to 0.
+- input text
+```
+describe this image
+```
+- input image
+![](./video/frame_0000.jpg)
+```
+root@ax650 ~/SmolVLM2-500M-Video-Instruct_Ax650 # run_ax650.sh
+prompt >> describe this image
+image >> video/frame_0000.jpg
+read image
+[I][                     EncodeImage][ 409]: pixel_values size 5
+[I][                     EncodeImage][ 437]: image encode time : 516.138977 ms, size : 5
+[I][                          Encode][ 488]: img_embed.size :5, is video:0, num_media_tokens:64, real num of image:
+[I][                          Encode][ 498]: input_ids size:344
+[I][                          Encode][ 508]: offset 5
+[I][                          Encode][ 508]: offset 71
+[I][                          Encode][ 508]: offset 138
+[I][                          Encode][ 508]: offset 204
+[I][                          Encode][ 508]: offset 271
+[I][                          Encode][ 530]: img_embed.size:5, 36864
+[I][                          Encode][ 546]: out_embed size:198144
+[I][                          Encode][ 547]: input_ids size 344
+[I][                          Encode][ 549]: position_ids size:344
+[I][                             Run][ 568]: input token num : 344, prefill_split_num : 3
+[I][                             Run][ 602]: input_num_token:128
+[I][                             Run][ 602]: input_num_token:128
+[I][                             Run][ 602]: input_num_token:88
+[I][                             Run][ 791]: ttft: 271.32 ms
+ In the image, there are two animals, one on the left and the other on the right, both of which are bears. The bear on the left is standing on all fours, its body oriented towards the right side of the image. It has a black and white coat with a blue patch on its chest. The bear on the right is standing on all fours, its body oriented towards the left side of the image. It has a brown and white coat with a blue patch on its chest. Both bears are standing on a rocky terrain, with a mountainous background in the background. The sky in the background is a gradient of orange and yellow, suggesting a sunny day.
+[N][                             Run][ 918]: hit eos,avg 76.61 token/s
+```
+#### Video understanding demo
+Set the `video` parameter in run_ax650.sh to 1.
+- input text
+```
+描述这个视频
+```
+- input video
+./video
+```
+root@ax650 ~/SmolVLM2-500M-Video-Instruct_Ax650 # run_ax650.sh
+prompt >> describe this video
+video >> video
+video/frame_0000.jpg
+video/frame_0008.jpg
+video/frame_0016.jpg
+video/frame_0024.jpg
+video/frame_0032.jpg
+video/frame_0040.jpg
+video/frame_0048.jpg
+video/frame_0056.jpg
+[I][                     EncodeImage][ 409]: pixel_values size 8
+[I][                     EncodeImage][ 437]: image encode time : 834.026978 ms, size : 8
+[I][                          Encode][ 488]: img_embed.size :8, is video:1, num_media_tokens:64, real num of image:
+[I][                          Encode][ 498]: input_ids size:656
+[I][                          Encode][ 508]: offset 43
+[I][                          Encode][ 508]: offset 120
+[I][                          Encode][ 508]: offset 197
+[I][                          Encode][ 508]: offset 274
+[I][                          Encode][ 508]: offset 351
+[I][                          Encode][ 508]: offset 428
+[I][                          Encode][ 508]: offset 505
+[I][                          Encode][ 508]: offset 582
+[I][                          Encode][ 530]: img_embed.size:8, 36864
+[I][                          Encode][ 546]: out_embed size:377856
+[I][                          Encode][ 547]: input_ids size 656
+[I][                          Encode][ 549]: position_ids size:656
+[I][                             Run][ 568]: input token num : 656, prefill_split_num : 6
+[I][                             Run][ 602]: input_num_token:128
+[I][                             Run][ 602]: input_num_token:128
+[I][                             Run][ 602]: input_num_token:128
+[I][                             Run][ 602]: input_num_token:128
+[I][                             Run][ 602]: input_num_token:128
+[I][                             Run][ 602]: input_num_token:16
+[I][                             Run][ 791]: ttft: 827.08 ms
+ The video depicts two Siberian foxes in a rocky terrain, engaged in a playful interaction. The fox on the left is standing on its hind legs, while the one on the right is lying down. They are both looking at each other, possibly in a playful or affectionate manner. The background is a natural landscape with a mountainous terrain, suggesting a location where these foxes might be found. The video does not provide any specific actions or movements of the foxes, but the interaction between them is captured in a way that suggests a playful or affectionate moment.
+[N][                             Run][ 918]: hit eos,avg 75.46 token/s
+```
+### Gradio demo
+#### start openai style api server
+```shell
+./run_api_ax650.sh
+```
+#### start gradio demo
+If the API server is not running on the same machine, please modify the API URL in the Gradio web UI.
+```shell
+python gradio_demo.py
+```
+![image](https://cdn-uploads.huggingface.co/production/uploads/64b7837c17570fdff9b906b9/Og9fPNi0chg768gicse7M.png)
+### HTTP demo
+#### start openai style api server
+```shell
+./run_api_ax650.sh
+```
+#### run http demo
+```
+python3 openai_cli.py
+```

SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l0_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4368582f7a6b252ff052d34c3add8d4cb0518934eccbd0ca5345ab7672b3e9cd
+size 12234691

SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l10_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6975bbfb2551099ed26253ef54a2ed1e22570f60c2a0768568a34a002b9fc2f9
+size 12234691

SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l11_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:01d1e822daec2902a9eeccf929079b674fa59b0d35d0ba079d96fa7a78717658
+size 12234691

SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l12_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:261b127b372daf1bc9b2b7af19fedaf61325f39ab2ddafe6a3b76db3ef3f3104
+size 12234691

SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l13_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3359120628124f6e94368e5579bc767925b435dc60caa17afa76a2fce6fe7eac
+size 12234691

SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l14_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:37a0284e572c602c5014c6b56ae3b3a12215f568fd606381088cf257a0139a42
+size 12234691

SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l15_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:74a2c79b377fdf3e659a92f3254d34b5d96ed42016b42b18ee100ae73302b856
+size 12234691

SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l16_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fff6baa15e8fc65af5f5faf350e6668774caf6ea4ef770787eb74893061055c1
+size 12234691

SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l17_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:95072933679eddbb2e33c2d4bb117b3ef96cd4e6241248171dae918be87e180d
+size 12234691

SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l18_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8ef7858dd612e9285b8e0ee9c1bb09255a4e0a6ea550d955022f8b7bc1ff3015
+size 12234691

SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l19_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8179fc75698ed3b0825cc575b85636b8e9f7d357bf04ba7646a57efabd2f1982
+size 12234691

SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l1_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:975c15c763af80cd64b47e62cef16db1dc25b4e4a3b7f4f5505b08b8366bb4ba
+size 12234691

SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l20_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:39ee5e1641f6332c315d5af11c2b4cdbdb288f72deb67f752008ff3ca148354c
+size 12234691

SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l21_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:da484595aa75613f93b9c1dbec579557bd3fff8422643ff17c5353e8ecb828c0
+size 12234691

SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l22_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ef87a5693552635d27076e4da0ee7b3d7137b68ed10f9a1ad38ddfd2ddeaeeb3
+size 12234691

SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l23_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:34cda76ab2481ca9cf92ad90e06cc871224cf4138833d66ebe9f5094c930c5e7
+size 12234691

SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l24_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9f502e157e88956c2ba1401c669d815126ecd655b5d9904f2ce67d5734893149
+size 12234691

SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l25_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:26c3f92d25311c7ca0cbc184596c8a2cf87910499a7c02c4ca4a8364bb4d9c15
+size 12234691

SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l26_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e7ae1d96a36b5bcb4137cbf7c9b29afc736603c5fbe75f80b13561799d40dfcd
+size 12234691

SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l27_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d85de855b64a3651a4351de684e199065dbf394b80b84977fc3eeb85de14a627
+size 12234691

SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l28_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1889c4bbe4f9212bcce9c93e7fd53ea7662acd866107025475b4e418b6a009c3
+size 12234691

SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l29_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:09ce1afbde43c5fbf9647876017336d40c96f1d2350b88d7a9b67d68d9088f7f
+size 12234691

SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l2_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:08adf88e73dc01edecb78b6d66163361227dad7d51565a9ba3531c26faea4429
+size 12234691

SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l30_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:afc323915789b6c2c7a2c517100e0fa5b19bb73af4a1e7614c877c6896617a94
+size 12234691

SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l31_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:550dcede45169f6512a0fe265802441ca2e8036f2e829619b3243a5cc8d936b7
+size 12234691

SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l3_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6aa6f61e880811aa7e27b6be48b57d72a29e751badecb3da4b505c30507b56b4
+size 12234691

SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l4_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:19bdda3fed4a07326e7b32f2967bfb0f5a86b38455f2aa637fdd94dcfcdcf02c
+size 12234691

SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l5_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0233f08a89a7ee07d6121d1dd1c5226a9e0e62d8a050da5167d8e6c2ec273a0f
+size 12234691

SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l6_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:98446428c35db49a4218b6252101f2d7b1b10ec8f5d2ef893b107b5999d509cf
+size 12234691

SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l7_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e8951a9bfc22b346da8736230da01afa294d5a35af9993b335d20fb785f10a7e
+size 12234691

SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l8_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:44171cfe42b7cc640b8987466c116926876b003cf4341b560062c2b9a3b8af0f
+size 12234691

SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l9_together.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:146d1b6a5bc3d37000c42a26e2df0503ebe6de6feda769c093fa43e9d643af86
+size 12234691

SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_post.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1f48bac7ef886adc346c8e1d3757e76c9441ad1e7a1d9863ef47d7a4a2b6a4e3
+size 51580701

SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/model.embed_tokens.weight.bfloat16.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f221975e25aa56b36684ff886153f88569d8289ee9758ec80cf50c0105bb1a4a
+size 94617600

SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/vision_model_1x3x512x512_NHwC_U8.axmodel ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f4fbb70b1409a6beb113bba499cbfb3c9722548e0caf9ae5741961e991e4ec82
+size 114557007

axera_logo.png ADDED Viewed

Git LFS Details

SHA256: 6f3729509adf9e0c8baffcda3d7c1228f7d6bcd74374fc592c2995a3c1a3dfc1
Pointer size: 131 Bytes
Size of remote file: 157 kB

gradio_demo.py ADDED Viewed

	@@ -0,0 +1,261 @@

+# gradio_chat_single_turn.py
+import re
+import subprocess
+import gradio as gr
+import base64, cv2, os, tempfile
+from openai import OpenAI
+import requests
+def get_all_local_ips():
+    result = subprocess.run(['ip', 'a'], capture_output=True, text=True)
+    output = result.stdout
+    # 匹配所有IPv4
+    ips = re.findall(r'inet (\d+\.\d+\.\d+\.\d+)', output)
+    # 过滤掉回环地址
+    real_ips = [ip for ip in ips if not ip.startswith('127.')]
+    return real_ips
+# ---------- Helpers ----------
+def img_to_data_url_from_cvframe(frame):
+    import base64, cv2
+    ok, buf = cv2.imencode(".jpg", frame, [int(cv2.IMWRITE_JPEG_QUALITY), 85])
+    b64 = base64.b64encode(buf).decode("ascii")
+    return f"data:image/jpeg;base64,{b64}"
+def img_to_data_url_from_path(img_path: str) -> str:
+    import cv2, base64
+    img = cv2.imread(img_path)
+    return img_to_data_url_from_cvframe(img)
+def video_to_data_urls(video_path: str, frame_stride: int = 30, max_frames: int = 8):
+    import cv2, base64
+    cap = cv2.VideoCapture(video_path)
+    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    if total / frame_stride > max_frames:
+        frame_stride = int(total/max_frames)
+    urls = []
+    idx = 0
+    first_preview = None
+    while len(urls) < max_frames and idx < total:
+        cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
+        ret, frame = cap.read()
+        if not ret:
+            break
+        ok, buf = cv2.imencode(".jpg", frame, [int(cv2.IMWRITE_JPEG_QUALITY), 85])
+        if not ok:
+            break
+        b64 = base64.b64encode(buf).decode("ascii")
+        data_url = f"data:image/jpeg;base64,{b64}"
+        urls.append(data_url)
+        if first_preview is None:
+            first_preview = data_url
+        idx += frame_stride
+    cap.release()
+    return urls, first_preview
+def save_preview_image_from_data_url(data_url: str) -> str:
+    # 仅用于在 Chatbot 里显示缩略图
+    comma = data_url.find(",")
+    if comma == -1:
+        return ""
+    b64 = data_url[comma+1:]
+    raw = base64.b64decode(b64)
+    fd, tmp_path = tempfile.mkstemp(suffix=".jpg", prefix="preview_")
+    os.close(fd)
+    with open(tmp_path, "wb") as f:
+        f.write(raw)
+    return tmp_path
+def build_messages(prompt: str, image_path: str | None, video_path: str | None,
+                   prefer_video: bool, frame_stride: int, max_frames: int):
+    content = []
+    if prompt and prompt.strip():
+        content.append({"type": "text", "text": prompt.strip()})
+    if video_path and os.path.exists(video_path) and prefer_video:
+        urls, first_preview = video_to_data_urls(video_path, frame_stride=frame_stride, max_frames=max_frames)
+        content.append({"type": "image_url", "is_video":True, "image_url": urls})
+        media_desc = f"（视频抽帧：{len(urls)} 帧，步长 {frame_stride}）"
+        return {"role": "user", "content": content}, first_preview, media_desc
+    if image_path and os.path.exists(image_path):
+        u = img_to_data_url_from_path(image_path)
+        content.append({"type": "image_url", "image_url": u})
+        media_desc = "（已附带图片）"
+        return {"role": "user", "content": content}, u, media_desc
+    if video_path and os.path.exists(video_path):
+        urls, first_preview = video_to_data_urls(video_path, frame_stride=frame_stride, max_frames=max_frames)
+        content.append({"type": "image_url", "is_video":True, "image_url": urls})
+        media_desc = f"（视频抽帧：{len(urls)} 帧，步长 {frame_stride}）"
+        return {"role": "user", "content": content}, first_preview, media_desc
+    return {"role": "user", "content": content if content else [{"type": "text", "text": prompt or ""}]}, None, ""
+# ---------- Gradio callback (single-turn, stream) ----------
+def run_single_turn(prompt, image_file, video_file, prefer_video, frame_stride, max_frames,
+                    base_url, model, api_key, chatbot_state):
+    """
+    单轮：每次发送都会重置聊天历史，只显示本轮的 user/assistant 两个气泡。
+    """
+    try:
+        # 清空历史（单轮），构造用户气泡
+        chatbot_state = []
+        # 准备文件路径
+        image_path = image_file if isinstance(image_file, str) else (image_file.name if image_file else None)
+        video_path = video_file if isinstance(video_file, str) else (video_file.name if video_file else None)
+        # 构造 messages 和预览
+        messages, preview_data_url, media_desc = build_messages(
+            prompt=prompt or "",
+            image_path=image_path,
+            video_path=video_path,
+            prefer_video=bool(prefer_video),
+            frame_stride=int(frame_stride),
+            max_frames=int(max_frames),
+        )
+        # 组装用户气泡（Markdown）：文本 + 预览图/视频说明
+        user_md = (prompt or "").strip()
+        if media_desc:
+            user_md = (user_md + "\n\n" if user_md else "") + f"> {media_desc}"
+        if preview_data_url:
+            # user_md = (user_md + "\n\n" if user_md else "") + f"![preview]({preview_path})"
+            user_md = (user_md + "\n\n" if user_md else "") + f"![preview]({preview_data_url})"
+        chatbot_state.append((user_md or "(空提示)", ""))  # assistant 先空字符串，等待流式填充
+        yield chatbot_state  # 先把用户气泡渲染出来
+        # 调后端（流式）
+        client = OpenAI(api_key=api_key or "not-needed", base_url=base_url.strip())
+        stream = client.chat.completions.create(
+            model=model.strip(),
+            messages=messages,
+            stream=True,
+        )
+        bot_chunks = []
+        # 先补一个空 assistant 气泡
+        if len(chatbot_state) == 1:
+            chatbot_state[0] = (chatbot_state[0][0], "")
+            yield chatbot_state
+        # 逐 chunk 更新 assistant 气泡（Markdown）
+        for ev in stream:
+            delta = getattr(ev.choices[0], "delta", None)
+            if delta and getattr(delta, "content", None):
+                bot_chunks.append(delta.content)
+                chatbot_state[-1] = (chatbot_state[-1][0], "".join(bot_chunks))
+                yield chatbot_state
+        # 结束再确保收尾
+        chatbot_state[-1] = (chatbot_state[-1][0], "".join(bot_chunks) if bot_chunks else "(empty response)")
+        yield chatbot_state
+    except Exception as e:
+        chatbot_state.append((
+            chatbot_state[-1][0] if chatbot_state else "(request)",
+            f"**Error:** {e}"
+        ))
+        yield chatbot_state
+# ---------- Gradio UI ----------
+with gr.Blocks(css="""
+    #chat,
+    #chat * {
+        font-size: 18px !important;
+        line-height: 1.6 !important;
+    }
+    #chat .message,
+    #chat [data-testid="bot"],
+    #chat [data-testid="user"] {
+        font-size: 18px !important;
+    }
+""",title="AXERA Qwen3 VL") as demo:
+    axera_logo = img_to_data_url_from_path("./axera_logo.png")
+    gr.Markdown(
+        f"""
+        <div style="display: flex; align-items: center; gap: 10px;">
+            <img src="{axera_logo}" alt="axera_logo" style="height: 60px;">
+        </div>
+        """
+    )
+    chatbot = gr.Chatbot(
+        label="对话",
+        bubble_full_width=False,
+        height=500,
+        avatar_images=(None, None),  # 可替换头像
+        latex_delimiters=[{"left": "$$", "right": "$$", "display": True},
+                          {"left": "$", "right": "$", "display": False}],
+        show_copy_button=True,
+        render_markdown=True,
+        elem_id="chat"
+    )
+    with gr.Row():
+        with gr.Column(scale=2):
+            prompt = gr.Textbox(label="Prompt", placeholder="输入你的提示语", lines=2)
+            with gr.Row():
+                send_btn = gr.Button("发送 ▶️", variant="primary")
+                clear_btn = gr.Button("清空")
+                stop_btn = gr.Button("停止 ■", variant="stop")
+            with gr.Row():
+                image = gr.Image(type="filepath", label="上传图片（可选）")
+                video = gr.Video(label="上传视频（可选）")
+        with gr.Column(scale=1):
+            base_url = gr.Textbox(value="http://localhost:8000/v1", label="Base URL")
+            model = gr.Textbox(value="AXERA-TECH/SmolVLM2-500M-Video-Instruct", label="Model")
+            api_key = gr.Textbox(value="not-needed", label="API Key", type="password")
+            with gr.Row():
+                prefer_video = gr.Checkbox(True, label="如果有视频，优先使用视频抽帧")
+                frame_stride = gr.Slider(1, 90, value=30, step=1, label="视频抽帧间隔")
+                max_frames = gr.Slider(1, 8, value=8, step=1, label="最多抽帧数")
+    # 单轮对话需要一个 state 来承载当前这轮的气泡
+    state = gr.State([])
+    send_btn.click(
+        fn=run_single_turn,
+        inputs=[prompt, image, video, prefer_video, frame_stride, max_frames, base_url, model, api_key, state],
+        outputs=chatbot,
+        show_progress=True,
+        queue=True,
+    )
+    def stop_stream():
+        url = "http://localhost:8000/v1/stop"
+        response = requests.get(url)
+        if response.status_code == 200:
+            print("Stream stopped successfully")
+        else:
+            print(f"Failed to stop stream: {response.status_code} - {response.text}")
+    stop_btn.click(
+        fn=stop_stream,
+        outputs=chatbot,
+        show_progress=True,
+        queue=True,
+    )
+    def clear_all():
+        return [], "", None, None, True, 30, 8
+    clear_btn.click(clear_all, None, [chatbot, prompt, image, video, prefer_video, frame_stride, max_frames])
+if __name__ == "__main__":
+    ips = get_all_local_ips()
+    for ip in ips:
+        print(f"* Running on local URL:  http://{ip}:7860")
+    ip = "0.0.0.0"
+    demo.launch(server_name=ip, server_port=7860)

main_api_ax650 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3ccb96231d5f0f6767b11d1c3174b1564c232bf3d7834bc0918189f06b18f17c
+size 6931024

main_ax650 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2424fb3c239998edf2d04d852deb4d738ec1d6b82e681a59dd8e2b6ed4204e96
+size 6803824

openai_cli.py ADDED Viewed

	@@ -0,0 +1,70 @@

+import base64
+import glob
+from openai import OpenAI
+import cv2
+BASE_URL = "http://localhost:8000/v1"
+def img_to_data_url(img_path: str):
+    img = cv2.imread(img_path)
+    if img is None:
+        raise FileNotFoundError(f"Cannot read image: {img_path}")
+    ok, buf = cv2.imencode(".jpg", img)
+    if not ok:
+        raise RuntimeError("cv2.imencode failed")
+    b64 = base64.b64encode(buf).decode("ascii")
+    return f"data:image/jpeg;base64,{b64}"
+def test(openai_messages):
+    client = OpenAI(api_key="not-needed", base_url=BASE_URL)
+    stream = client.chat.completions.create(
+        model="AXERA-TECH/SmolVLM2-500M-Video-Instruct",
+        messages=openai_messages,
+        stream=True,
+    )
+    out_chunks = []
+    for ev in stream:
+        delta = ev.choices[0].delta
+        if delta and delta.content:
+            out_chunks.append(delta.content)
+            print(delta.content, end="", flush=True)
+    print()
+    assistant_text = "".join(out_chunks).strip()
+def test_image():
+    image_data = img_to_data_url("video/frame_0000.jpg")
+    openai_messages = {
+        "role": "user",
+        "content": [
+            {"type": "text", "text": "Describe this image"},
+            {"type": "image_url", "image_url": image_data},
+        ],
+    }
+    test(openai_messages)
+def test_video():
+    image_list = glob.glob("video/*.jpg")
+    image_list.sort()
+    image_data_list = [img_to_data_url(img) for img in image_list]
+    openai_messages = {
+        "role": "user",
+        "content": [
+            {"type": "text", "text": "Describe this video"},
+            {"type": "image_url", "is_video":True, "image_url": image_data_list},
+        ],
+    }
+    test(openai_messages)
+print("Test image")
+test_image()
+print("Test video")
+test_video()

run_api_ax650.sh ADDED Viewed

	@@ -0,0 +1,19 @@

+# SmolVLM2-500M
+AXMODEL_DIR=SmolVLM2-500M-Video-Instruct_Ax650/
+VIT=SmolVLM2-500M-Video-Instruct_Ax650/vision_model_1x3x512x512_NHwC_U8.axmodel
+LAYER_NUM=32
+EMBED_SIZE=960
+./main_api_ax650 \
+--template_filename_axmodel "${AXMODEL_DIR}/llama_p128_l%d_together.axmodel" \
+--axmodel_num $LAYER_NUM \
+--filename_image_encoder_axmodedl $VIT \
+--use_mmap_load_embed 1 \
+--filename_tokenizer_model smolvlm2_tokenizer.txt \
+--filename_post_axmodel "${AXMODEL_DIR}/llama_post.axmodel" \
+--filename_tokens_embed "${AXMODEL_DIR}/model.embed_tokens.weight.bfloat16.bin" \
+--tokens_embed_num 49280 \
+--tokens_embed_size $EMBED_SIZE \
+--post_config_path post_config1.json

run_ax650.sh ADDED Viewed

	@@ -0,0 +1,26 @@

+# SmolVLM2-500M
+AXMODEL_DIR=SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/
+VIT=SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/vision_model_1x3x512x512_NHwC_U8.axmodel
+LAYER_NUM=32
+EMBED_SIZE=960
+./main_ax650 \
+--template_filename_axmodel "${AXMODEL_DIR}/llama_p128_l%d_together.axmodel" \
+--axmodel_num $LAYER_NUM \
+--filename_image_encoder_axmodedl $VIT \
+--bos 0 --eos 0 \
+--dynamic_load_axmodel_layer 0 \
+--use_mmap_load_embed 1 \
+--filename_tokenizer_model smolvlm2_tokenizer.txt \
+--filename_post_axmodel "${AXMODEL_DIR}/llama_post.axmodel" \
+--filename_tokens_embed "${AXMODEL_DIR}/model.embed_tokens.weight.bfloat16.bin" \
+--tokens_embed_num 49280 \
+--tokens_embed_size $EMBED_SIZE \
+--live_print 1 \
+--continue 1 \
+--video 0 \
+--post_config_path post_config1.json
+# --video 0 表示图像理解； --video 1 表示适配理解
+# 传图像路径时可以传一个图像路径（单张图像理解）或者，多张图像所在的目录（多张图像理解）
+# 视频理解要传一个视频帧所在的目录路径，只支持一个视频