lihongjie commited on
Commit
d0bd47b
·
1 Parent(s): d082af0

first commit

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +46 -0
  2. README.md +182 -3
  3. SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l0_together.axmodel +3 -0
  4. SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l10_together.axmodel +3 -0
  5. SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l11_together.axmodel +3 -0
  6. SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l12_together.axmodel +3 -0
  7. SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l13_together.axmodel +3 -0
  8. SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l14_together.axmodel +3 -0
  9. SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l15_together.axmodel +3 -0
  10. SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l16_together.axmodel +3 -0
  11. SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l17_together.axmodel +3 -0
  12. SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l18_together.axmodel +3 -0
  13. SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l19_together.axmodel +3 -0
  14. SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l1_together.axmodel +3 -0
  15. SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l20_together.axmodel +3 -0
  16. SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l21_together.axmodel +3 -0
  17. SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l22_together.axmodel +3 -0
  18. SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l23_together.axmodel +3 -0
  19. SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l24_together.axmodel +3 -0
  20. SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l25_together.axmodel +3 -0
  21. SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l26_together.axmodel +3 -0
  22. SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l27_together.axmodel +3 -0
  23. SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l28_together.axmodel +3 -0
  24. SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l29_together.axmodel +3 -0
  25. SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l2_together.axmodel +3 -0
  26. SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l30_together.axmodel +3 -0
  27. SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l31_together.axmodel +3 -0
  28. SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l3_together.axmodel +3 -0
  29. SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l4_together.axmodel +3 -0
  30. SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l5_together.axmodel +3 -0
  31. SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l6_together.axmodel +3 -0
  32. SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l7_together.axmodel +3 -0
  33. SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l8_together.axmodel +3 -0
  34. SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l9_together.axmodel +3 -0
  35. SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_post.axmodel +3 -0
  36. SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/model.embed_tokens.weight.bfloat16.bin +3 -0
  37. SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/vision_model_1x3x512x512_NHwC_U8.axmodel +3 -0
  38. axera_logo.png +3 -0
  39. gradio_demo.py +261 -0
  40. main_api_ax650 +3 -0
  41. main_ax650 +3 -0
  42. openai_cli.py +70 -0
  43. run_api_ax650.sh +19 -0
  44. run_ax650.sh +26 -0
  45. smolvlm2_tokenizer.txt +0 -0
  46. video/frame_0000.jpg +3 -0
  47. video/frame_0008.jpg +3 -0
  48. video/frame_0016.jpg +3 -0
  49. video/frame_0024.jpg +3 -0
  50. video/frame_0032.jpg +3 -0
.gitattributes CHANGED
@@ -33,3 +33,49 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l29_together.axmodel filter=lfs diff=lfs merge=lfs -text
37
+ SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/vision_model_1x3x512x512_NHwC_U8.axmodel filter=lfs diff=lfs merge=lfs -text
38
+ video/frame_0000.jpg filter=lfs diff=lfs merge=lfs -text
39
+ video/frame_0040.jpg filter=lfs diff=lfs merge=lfs -text
40
+ SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l13_together.axmodel filter=lfs diff=lfs merge=lfs -text
41
+ SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l14_together.axmodel filter=lfs diff=lfs merge=lfs -text
42
+ SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l7_together.axmodel filter=lfs diff=lfs merge=lfs -text
43
+ SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l20_together.axmodel filter=lfs diff=lfs merge=lfs -text
44
+ SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l28_together.axmodel filter=lfs diff=lfs merge=lfs -text
45
+ SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l5_together.axmodel filter=lfs diff=lfs merge=lfs -text
46
+ SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_post.axmodel filter=lfs diff=lfs merge=lfs -text
47
+ video/frame_0024.jpg filter=lfs diff=lfs merge=lfs -text
48
+ SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l10_together.axmodel filter=lfs diff=lfs merge=lfs -text
49
+ SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l11_together.axmodel filter=lfs diff=lfs merge=lfs -text
50
+ SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l16_together.axmodel filter=lfs diff=lfs merge=lfs -text
51
+ SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l31_together.axmodel filter=lfs diff=lfs merge=lfs -text
52
+ SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l3_together.axmodel filter=lfs diff=lfs merge=lfs -text
53
+ SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l6_together.axmodel filter=lfs diff=lfs merge=lfs -text
54
+ SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l0_together.axmodel filter=lfs diff=lfs merge=lfs -text
55
+ SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l17_together.axmodel filter=lfs diff=lfs merge=lfs -text
56
+ SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l26_together.axmodel filter=lfs diff=lfs merge=lfs -text
57
+ SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l23_together.axmodel filter=lfs diff=lfs merge=lfs -text
58
+ SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l4_together.axmodel filter=lfs diff=lfs merge=lfs -text
59
+ SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l9_together.axmodel filter=lfs diff=lfs merge=lfs -text
60
+ SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/model.embed_tokens.weight.bfloat16.bin filter=lfs diff=lfs merge=lfs -text
61
+ SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l15_together.axmodel filter=lfs diff=lfs merge=lfs -text
62
+ SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l19_together.axmodel filter=lfs diff=lfs merge=lfs -text
63
+ SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l22_together.axmodel filter=lfs diff=lfs merge=lfs -text
64
+ video/frame_0032.jpg filter=lfs diff=lfs merge=lfs -text
65
+ main_api_ax650 filter=lfs diff=lfs merge=lfs -text
66
+ main_ax650 filter=lfs diff=lfs merge=lfs -text
67
+ SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l24_together.axmodel filter=lfs diff=lfs merge=lfs -text
68
+ video/frame_0008.jpg filter=lfs diff=lfs merge=lfs -text
69
+ video/frame_0016.jpg filter=lfs diff=lfs merge=lfs -text
70
+ axera_logo.png filter=lfs diff=lfs merge=lfs -text
71
+ SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l18_together.axmodel filter=lfs diff=lfs merge=lfs -text
72
+ SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l21_together.axmodel filter=lfs diff=lfs merge=lfs -text
73
+ SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l2_together.axmodel filter=lfs diff=lfs merge=lfs -text
74
+ SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l27_together.axmodel filter=lfs diff=lfs merge=lfs -text
75
+ SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l30_together.axmodel filter=lfs diff=lfs merge=lfs -text
76
+ SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l8_together.axmodel filter=lfs diff=lfs merge=lfs -text
77
+ video/frame_0048.jpg filter=lfs diff=lfs merge=lfs -text
78
+ video/frame_0056.jpg filter=lfs diff=lfs merge=lfs -text
79
+ SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l12_together.axmodel filter=lfs diff=lfs merge=lfs -text
80
+ SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l1_together.axmodel filter=lfs diff=lfs merge=lfs -text
81
+ SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l25_together.axmodel filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,182 @@
1
- ---
2
- license: mit
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ language:
4
+ - en
5
+ - zh
6
+ base_model:
7
+ - HuggingFaceTB/SmolVLM2-500M-Video-Instruct
8
+ pipeline_tag: image-text-to-text
9
+ library_name: transformers
10
+ tags:
11
+ - Int8
12
+ - VLM
13
+ ---
14
+
15
+ # SmolVLM2-500M-Video-Instruct
16
+
17
+ This version of SmolVLM2-500M-Video-Instruct has been converted to run on the Axera NPU using **w8a16** quantization.
18
+
19
+ Compatible with Pulsar2 version: 5.0
20
+
21
+ ## Convert tools links:
22
+
23
+ If you are interested in model conversion, you can export the axmodel yourself starting from the original repo:
24
+
25
+ - https://huggingface.co/HuggingFaceTB/SmolVLM2-500M-Video-Instruct
26
+
27
+ [Pulsar2 Link, How to Convert LLM from Huggingface to axmodel](https://pulsar2-docs.readthedocs.io/en/latest/appendix/build_llm.html)
28
+
29
+
30
+
31
+ ## Support Platform
32
+
33
+ - AX650
34
+ - AX650N DEMO Board
35
+ - [M4N-Dock(爱芯派Pro)](https://wiki.sipeed.com/hardware/zh/maixIV/m4ndock/m4ndock.html)
36
+ - [M.2 Accelerator card](https://axcl-docs.readthedocs.io/zh-cn/latest/doc_guide_hardware.html)
37
+
38
+ **Image Process**
39
+ |Chips| input size | image num | image encoder | ttft(168 tokens) | w8a16 | CMM | Flash |
40
+ |--|--|--|--|--|--|--|--|
41
+ |AX650| 512*512 | 1 | 516 ms | 510 ms | 35.23 tokens/sec| 773 MB | 813MB |
42
+
43
+ **Video Process**
44
+ |Chips| input size | image num | image encoder |ttft(600 tokens) | w8a16 | CMM | Flash |
45
+ |--|--|--|--|--|--|--|--|
46
+ |AX650| 512*512 | 8 | 1052 ms | 1523 ms | 35.32 tokens/sec| 773 MB | 813MB |
47
+
48
+
49
+ The CMM value is the amount of CMM memory (DDR) that the model consumes. Ensure that the CMM memory allocation on the development board is greater than this value.
50
+
51
+ ## How to use
52
+
53
+ Download all files from this repository to the device
54
+
55
+ **If you are using an AX650 board**
56
+
57
+ ### Demo Run
58
+
59
+ #### Image understanding demo
60
+ Set the `video` parameter in run_ax650.sh to 0.
61
+
62
+ - input text
63
+
64
+ ```
65
+ describe this image
66
+ ```
67
+
68
+ - input image
69
+
70
+ ![](./video/frame_0000.jpg)
71
+
72
+ ```
73
+ root@ax650 ~/SmolVLM2-500M-Video-Instruct_Ax650 # run_ax650.sh
74
+ prompt >> describe this image
75
+ image >> video/frame_0000.jpg
76
+ read image
77
+ [I][ EncodeImage][ 409]: pixel_values size 5
78
+ [I][ EncodeImage][ 437]: image encode time : 516.138977 ms, size : 5
79
+ [I][ Encode][ 488]: img_embed.size :5, is video:0, num_media_tokens:64, real num of image:
80
+ [I][ Encode][ 498]: input_ids size:344
81
+ [I][ Encode][ 508]: offset 5
82
+ [I][ Encode][ 508]: offset 71
83
+ [I][ Encode][ 508]: offset 138
84
+ [I][ Encode][ 508]: offset 204
85
+ [I][ Encode][ 508]: offset 271
86
+ [I][ Encode][ 530]: img_embed.size:5, 36864
87
+ [I][ Encode][ 546]: out_embed size:198144
88
+ [I][ Encode][ 547]: input_ids size 344
89
+ [I][ Encode][ 549]: position_ids size:344
90
+ [I][ Run][ 568]: input token num : 344, prefill_split_num : 3
91
+ [I][ Run][ 602]: input_num_token:128
92
+ [I][ Run][ 602]: input_num_token:128
93
+ [I][ Run][ 602]: input_num_token:88
94
+ [I][ Run][ 791]: ttft: 271.32 ms
95
+ In the image, there are two animals, one on the left and the other on the right, both of which are bears. The bear on the left is standing on all fours, its body oriented towards the right side of the image. It has a black and white coat with a blue patch on its chest. The bear on the right is standing on all fours, its body oriented towards the left side of the image. It has a brown and white coat with a blue patch on its chest. Both bears are standing on a rocky terrain, with a mountainous background in the background. The sky in the background is a gradient of orange and yellow, suggesting a sunny day.
96
+
97
+ [N][ Run][ 918]: hit eos,avg 76.61 token/s
98
+ ```
99
+
100
+ #### Video understanding demo
101
+ Set the `video` parameter in run_ax650.sh to 1.
102
+
103
+ - input text
104
+
105
+ ```
106
+ 描述这个视频
107
+ ```
108
+
109
+ - input video
110
+
111
+ ./video
112
+
113
+ ```
114
+ root@ax650 ~/SmolVLM2-500M-Video-Instruct_Ax650 # run_ax650.sh
115
+ prompt >> describe this video
116
+ video >> video
117
+ video/frame_0000.jpg
118
+ video/frame_0008.jpg
119
+ video/frame_0016.jpg
120
+ video/frame_0024.jpg
121
+ video/frame_0032.jpg
122
+ video/frame_0040.jpg
123
+ video/frame_0048.jpg
124
+ video/frame_0056.jpg
125
+ [I][ EncodeImage][ 409]: pixel_values size 8
126
+ [I][ EncodeImage][ 437]: image encode time : 834.026978 ms, size : 8
127
+ [I][ Encode][ 488]: img_embed.size :8, is video:1, num_media_tokens:64, real num of image:
128
+ [I][ Encode][ 498]: input_ids size:656
129
+ [I][ Encode][ 508]: offset 43
130
+ [I][ Encode][ 508]: offset 120
131
+ [I][ Encode][ 508]: offset 197
132
+ [I][ Encode][ 508]: offset 274
133
+ [I][ Encode][ 508]: offset 351
134
+ [I][ Encode][ 508]: offset 428
135
+ [I][ Encode][ 508]: offset 505
136
+ [I][ Encode][ 508]: offset 582
137
+ [I][ Encode][ 530]: img_embed.size:8, 36864
138
+ [I][ Encode][ 546]: out_embed size:377856
139
+ [I][ Encode][ 547]: input_ids size 656
140
+ [I][ Encode][ 549]: position_ids size:656
141
+ [I][ Run][ 568]: input token num : 656, prefill_split_num : 6
142
+ [I][ Run][ 602]: input_num_token:128
143
+ [I][ Run][ 602]: input_num_token:128
144
+ [I][ Run][ 602]: input_num_token:128
145
+ [I][ Run][ 602]: input_num_token:128
146
+ [I][ Run][ 602]: input_num_token:128
147
+ [I][ Run][ 602]: input_num_token:16
148
+ [I][ Run][ 791]: ttft: 827.08 ms
149
+ The video depicts two Siberian foxes in a rocky terrain, engaged in a playful interaction. The fox on the left is standing on its hind legs, while the one on the right is lying down. They are both looking at each other, possibly in a playful or affectionate manner. The background is a natural landscape with a mountainous terrain, suggesting a location where these foxes might be found. The video does not provide any specific actions or movements of the foxes, but the interaction between them is captured in a way that suggests a playful or affectionate moment.
150
+
151
+ [N][ Run][ 918]: hit eos,avg 75.46 token/s
152
+
153
+ ```
154
+
155
+ ### Gradio demo
156
+
157
+
158
+ #### start openai style api server
159
+ ```shell
160
+ ./run_api_ax650.sh
161
+ ```
162
+
163
+ #### start gradio demo
164
+ If the API server is not running on the same machine, please modify the API URL in the Gradio web UI.
165
+ ```shell
166
+ python gradio_demo.py
167
+ ```
168
+
169
+ ![image](https://cdn-uploads.huggingface.co/production/uploads/64b7837c17570fdff9b906b9/Og9fPNi0chg768gicse7M.png)
170
+
171
+
172
+ ### HTTP demo
173
+
174
+ #### start openai style api server
175
+ ```shell
176
+ ./run_api_ax650.sh
177
+ ```
178
+
179
+ #### run http demo
180
+ ```
181
+ python3 openai_cli.py
182
+ ```
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l0_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4368582f7a6b252ff052d34c3add8d4cb0518934eccbd0ca5345ab7672b3e9cd
3
+ size 12234691
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l10_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6975bbfb2551099ed26253ef54a2ed1e22570f60c2a0768568a34a002b9fc2f9
3
+ size 12234691
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l11_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01d1e822daec2902a9eeccf929079b674fa59b0d35d0ba079d96fa7a78717658
3
+ size 12234691
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l12_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:261b127b372daf1bc9b2b7af19fedaf61325f39ab2ddafe6a3b76db3ef3f3104
3
+ size 12234691
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l13_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3359120628124f6e94368e5579bc767925b435dc60caa17afa76a2fce6fe7eac
3
+ size 12234691
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l14_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37a0284e572c602c5014c6b56ae3b3a12215f568fd606381088cf257a0139a42
3
+ size 12234691
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l15_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:74a2c79b377fdf3e659a92f3254d34b5d96ed42016b42b18ee100ae73302b856
3
+ size 12234691
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l16_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fff6baa15e8fc65af5f5faf350e6668774caf6ea4ef770787eb74893061055c1
3
+ size 12234691
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l17_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:95072933679eddbb2e33c2d4bb117b3ef96cd4e6241248171dae918be87e180d
3
+ size 12234691
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l18_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ef7858dd612e9285b8e0ee9c1bb09255a4e0a6ea550d955022f8b7bc1ff3015
3
+ size 12234691
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l19_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8179fc75698ed3b0825cc575b85636b8e9f7d357bf04ba7646a57efabd2f1982
3
+ size 12234691
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l1_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:975c15c763af80cd64b47e62cef16db1dc25b4e4a3b7f4f5505b08b8366bb4ba
3
+ size 12234691
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l20_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:39ee5e1641f6332c315d5af11c2b4cdbdb288f72deb67f752008ff3ca148354c
3
+ size 12234691
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l21_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da484595aa75613f93b9c1dbec579557bd3fff8422643ff17c5353e8ecb828c0
3
+ size 12234691
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l22_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef87a5693552635d27076e4da0ee7b3d7137b68ed10f9a1ad38ddfd2ddeaeeb3
3
+ size 12234691
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l23_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:34cda76ab2481ca9cf92ad90e06cc871224cf4138833d66ebe9f5094c930c5e7
3
+ size 12234691
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l24_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f502e157e88956c2ba1401c669d815126ecd655b5d9904f2ce67d5734893149
3
+ size 12234691
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l25_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:26c3f92d25311c7ca0cbc184596c8a2cf87910499a7c02c4ca4a8364bb4d9c15
3
+ size 12234691
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l26_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7ae1d96a36b5bcb4137cbf7c9b29afc736603c5fbe75f80b13561799d40dfcd
3
+ size 12234691
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l27_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d85de855b64a3651a4351de684e199065dbf394b80b84977fc3eeb85de14a627
3
+ size 12234691
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l28_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1889c4bbe4f9212bcce9c93e7fd53ea7662acd866107025475b4e418b6a009c3
3
+ size 12234691
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l29_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09ce1afbde43c5fbf9647876017336d40c96f1d2350b88d7a9b67d68d9088f7f
3
+ size 12234691
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l2_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08adf88e73dc01edecb78b6d66163361227dad7d51565a9ba3531c26faea4429
3
+ size 12234691
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l30_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:afc323915789b6c2c7a2c517100e0fa5b19bb73af4a1e7614c877c6896617a94
3
+ size 12234691
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l31_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:550dcede45169f6512a0fe265802441ca2e8036f2e829619b3243a5cc8d936b7
3
+ size 12234691
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l3_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6aa6f61e880811aa7e27b6be48b57d72a29e751badecb3da4b505c30507b56b4
3
+ size 12234691
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l4_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19bdda3fed4a07326e7b32f2967bfb0f5a86b38455f2aa637fdd94dcfcdcf02c
3
+ size 12234691
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l5_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0233f08a89a7ee07d6121d1dd1c5226a9e0e62d8a050da5167d8e6c2ec273a0f
3
+ size 12234691
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l6_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98446428c35db49a4218b6252101f2d7b1b10ec8f5d2ef893b107b5999d509cf
3
+ size 12234691
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l7_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8951a9bfc22b346da8736230da01afa294d5a35af9993b335d20fb785f10a7e
3
+ size 12234691
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l8_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44171cfe42b7cc640b8987466c116926876b003cf4341b560062c2b9a3b8af0f
3
+ size 12234691
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_p128_l9_together.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:146d1b6a5bc3d37000c42a26e2df0503ebe6de6feda769c093fa43e9d643af86
3
+ size 12234691
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/llama_post.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f48bac7ef886adc346c8e1d3757e76c9441ad1e7a1d9863ef47d7a4a2b6a4e3
3
+ size 51580701
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/model.embed_tokens.weight.bfloat16.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f221975e25aa56b36684ff886153f88569d8289ee9758ec80cf50c0105bb1a4a
3
+ size 94617600
SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/vision_model_1x3x512x512_NHwC_U8.axmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4fbb70b1409a6beb113bba499cbfb3c9722548e0caf9ae5741961e991e4ec82
3
+ size 114557007
axera_logo.png ADDED

Git LFS Details

  • SHA256: 6f3729509adf9e0c8baffcda3d7c1228f7d6bcd74374fc592c2995a3c1a3dfc1
  • Pointer size: 131 Bytes
  • Size of remote file: 157 kB
gradio_demo.py ADDED
@@ -0,0 +1,261 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # gradio_chat_single_turn.py
2
+ import re
3
+ import subprocess
4
+ import gradio as gr
5
+ import base64, cv2, os, tempfile
6
+ from openai import OpenAI
7
+ import requests
8
+
9
def get_all_local_ips():
    """Return the non-loopback IPv4 addresses reported by ``ip a``.

    Returns:
        list[str]: dotted-quad addresses in the order they appear in the
        command output. An empty list is returned when the ``ip`` command
        cannot be executed (e.g. it is not installed on this host).
    """
    try:
        result = subprocess.run(['ip', 'a'], capture_output=True, text=True)
    except OSError:
        # `ip` is missing or not executable; callers only need a best-effort
        # list, so report "no addresses" instead of crashing.
        return []
    output = result.stdout or ""

    # Match every IPv4 entry ("inet a.b.c.d"); "inet6" lines do not match.
    ips = re.findall(r'inet (\d+\.\d+\.\d+\.\d+)', output)

    # Filter out loopback addresses.
    return [ip for ip in ips if not ip.startswith('127.')]
20
+
21
+
22
+
23
+ # ---------- Helpers ----------
24
def img_to_data_url_from_cvframe(frame):
    """Encode an image as a base64 JPEG ``data:`` URL.

    Args:
        frame: image object accepted by ``cv2.imencode``.

    Returns:
        str: ``data:image/jpeg;base64,<payload>`` encoded at JPEG quality 85.

    Raises:
        ValueError: if ``frame`` is None.
        RuntimeError: if ``cv2.imencode`` reports failure.
    """
    import base64, cv2
    if frame is None:
        raise ValueError("frame is None")
    ok, buf = cv2.imencode(".jpg", frame, [int(cv2.IMWRITE_JPEG_QUALITY), 85])
    if not ok:
        # Do not base64-encode an invalid buffer; surface the failure.
        raise RuntimeError("cv2.imencode failed")
    b64 = base64.b64encode(buf).decode("ascii")
    return f"data:image/jpeg;base64,{b64}"
29
+
30
def img_to_data_url_from_path(img_path: str) -> str:
    """Load an image file and return it as a base64 JPEG ``data:`` URL.

    Args:
        img_path: path of the image to read with ``cv2.imread``.

    Returns:
        str: the data URL produced by ``img_to_data_url_from_cvframe``.

    Raises:
        FileNotFoundError: if ``cv2.imread`` cannot read the file
            (it returns None for missing or undecodable files).
    """
    import cv2, base64
    img = cv2.imread(img_path)
    if img is None:
        raise FileNotFoundError(f"Cannot read image: {img_path}")
    return img_to_data_url_from_cvframe(img)
34
+
35
def video_to_data_urls(video_path: str, frame_stride: int = 30, max_frames: int = 8):
    """Sample frames from a video and return them as base64 JPEG data URLs.

    Frames are taken every ``frame_stride`` frames starting at frame 0. When
    that would yield more than ``max_frames`` frames, the stride is widened to
    ``total_frames // max_frames`` so the samples span the whole video.

    Args:
        video_path: path passed to ``cv2.VideoCapture``.
        frame_stride: requested distance between sampled frames; values
            below 1 are treated as 1.
        max_frames: upper bound on the number of frames returned; a value
            of 0 or less yields no frames.

    Returns:
        tuple[list[str], str | None]: the data URLs, and the first of them
        (or None when no frame could be extracted).
    """
    import cv2, base64

    frame_stride = max(1, int(frame_stride))  # a stride of 0 would divide by zero below
    max_frames = int(max_frames)
    if max_frames <= 0:
        return [], None

    cap = cv2.VideoCapture(video_path)
    try:
        total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        if total / frame_stride > max_frames:
            frame_stride = total // max_frames

        urls = []
        idx = 0
        while len(urls) < max_frames and idx < total:
            cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
            ret, frame = cap.read()
            if not ret:
                break
            ok, buf = cv2.imencode(".jpg", frame, [int(cv2.IMWRITE_JPEG_QUALITY), 85])
            if not ok:
                break
            b64 = base64.b64encode(buf).decode("ascii")
            urls.append(f"data:image/jpeg;base64,{b64}")
            idx += frame_stride
    finally:
        # Always release the capture handle, even if decoding raised.
        cap.release()

    first_preview = urls[0] if urls else None
    return urls, first_preview
62
+
63
def save_preview_image_from_data_url(data_url: str) -> str:
    """Decode a base64 ``data:`` URL into a temporary ``.jpg`` file.

    Only used to show a thumbnail in the Chatbot.

    Args:
        data_url: string of the form ``<header>,<base64 payload>``.

    Returns:
        str: path of the created temporary file (named ``preview_*.jpg``),
        or ``""`` when ``data_url`` is empty/None or contains no comma.
        The file is not deleted by this function.
    """
    if not data_url:
        return ""
    _header, sep, b64 = data_url.partition(",")
    if not sep:
        return ""
    raw = base64.b64decode(b64)
    fd, tmp_path = tempfile.mkstemp(suffix=".jpg", prefix="preview_")
    # Write through the descriptor mkstemp already opened instead of closing
    # it and reopening the path.
    with os.fdopen(fd, "wb") as f:
        f.write(raw)
    return tmp_path
75
+
76
def build_messages(prompt: str, image_path: str | None, video_path: str | None,
                   prefer_video: bool, frame_stride: int, max_frames: int):
    """Build the single user message plus preview data for one request.

    Media selection: the video is used when it exists and either
    ``prefer_video`` is set or no existing image was supplied; otherwise an
    existing image is used; otherwise the message is text-only.

    Returns:
        tuple: ``(message, preview_data_url, media_desc)`` where ``message``
        is a ``{"role": "user", "content": [...]}`` dict, ``preview_data_url``
        is a data URL (or None), and ``media_desc`` is a short description of
        the attached media (``""`` when there is none).
    """
    parts = []
    text = prompt.strip() if prompt else ""
    if text:
        parts.append({"type": "text", "text": text})

    has_video = bool(video_path) and os.path.exists(video_path)
    has_image = bool(image_path) and os.path.exists(image_path)

    if has_video and (prefer_video or not has_image):
        frame_urls, preview = video_to_data_urls(
            video_path, frame_stride=frame_stride, max_frames=max_frames
        )
        parts.append({"type": "image_url", "is_video":True, "image_url": frame_urls})
        description = f"(视频抽帧:{len(frame_urls)} 帧,步长 {frame_stride})"
        return {"role": "user", "content": parts}, preview, description

    if has_image:
        image_url = img_to_data_url_from_path(image_path)
        parts.append({"type": "image_url", "image_url": image_url})
        return {"role": "user", "content": parts}, image_url, "(已附带图片)"

    # No usable media: fall back to a text-only message (possibly empty).
    if not parts:
        parts = [{"type": "text", "text": prompt or ""}]
    return {"role": "user", "content": parts}, None, ""
101
+
102
+ # ---------- Gradio callback (single-turn, stream) ----------
103
def run_single_turn(prompt, image_file, video_file, prefer_video, frame_stride, max_frames,
                    base_url, model, api_key, chatbot_state):
    """Gradio streaming callback for one single-turn request.

    Every call resets the chat history, so the Chatbot only ever shows the
    user/assistant pair of the current turn. The function is a generator: it
    yields the chat state first with the user bubble, then after each
    streamed chunk of the assistant answer.

    Args:
        prompt: user text (may be empty/None).
        image_file, video_file: a file path string, an object with a
            ``.name`` path attribute, or None.
        prefer_video: use the video when both image and video are given.
        frame_stride, max_frames: video sampling settings (cast to int).
        base_url, model, api_key: OpenAI-compatible endpoint settings.
        chatbot_state: previous state; ignored (history is reset).

    Yields:
        list[tuple[str, str]]: ``[(user_markdown, assistant_markdown)]``.
        On failure the assistant text of the current bubble ends with
        ``**Error:** <message>``.
    """
    # Single-turn: drop any previous history.
    chatbot_state = []
    try:
        # Resolve file paths.
        image_path = image_file if isinstance(image_file, str) else (image_file.name if image_file else None)
        video_path = video_file if isinstance(video_file, str) else (video_file.name if video_file else None)

        # Build the request message and the preview.
        messages, preview_data_url, media_desc = build_messages(
            prompt=prompt or "",
            image_path=image_path,
            video_path=video_path,
            prefer_video=bool(prefer_video),
            frame_stride=int(frame_stride),
            max_frames=int(max_frames),
        )

        # User bubble (Markdown): text + media description + preview image.
        user_md = (prompt or "").strip()
        if media_desc:
            user_md = (user_md + "\n\n" if user_md else "") + f"> {media_desc}"
        if preview_data_url:
            user_md = (user_md + "\n\n" if user_md else "") + f"![preview]({preview_data_url})"

        # Assistant text starts empty and is filled in while streaming.
        chatbot_state.append((user_md or "(空提示)", ""))
        yield chatbot_state  # render the user bubble right away

        # Call the backend (streaming).
        client = OpenAI(api_key=api_key or "not-needed", base_url=base_url.strip())
        stream = client.chat.completions.create(
            model=model.strip(),
            messages=messages,
            stream=True,
        )

        bot_chunks = []
        for ev in stream:
            # Skip chunks that carry no choices instead of raising IndexError.
            if not getattr(ev, "choices", None):
                continue
            delta = getattr(ev.choices[0], "delta", None)
            if delta and getattr(delta, "content", None):
                bot_chunks.append(delta.content)
                chatbot_state[-1] = (chatbot_state[-1][0], "".join(bot_chunks))
                yield chatbot_state

        # Final state; make an empty answer visible.
        chatbot_state[-1] = (chatbot_state[-1][0], "".join(bot_chunks) if bot_chunks else "(empty response)")
        yield chatbot_state

    except Exception as e:
        error_md = f"**Error:** {e}"
        if chatbot_state:
            # Reuse the bubble of this turn (keeping any partial answer)
            # rather than appending a duplicate user bubble.
            user_text, partial = chatbot_state[-1]
            chatbot_state[-1] = (user_text, f"{partial}\n\n{error_md}" if partial else error_md)
        else:
            chatbot_state.append(("(request)", error_md))
        yield chatbot_state
169
+
170
+ # ---------- Gradio UI ----------
171
# Build the Gradio UI: logo header, chat window, prompt/media inputs,
# endpoint settings, and the send / stop / clear actions.
with gr.Blocks(css="""
#chat,
#chat * {
  font-size: 18px !important;
  line-height: 1.6 !important;
}

#chat .message,
#chat [data-testid="bot"],
#chat [data-testid="user"] {
  font-size: 18px !important;
}
""", title="AXERA SmolVLM2") as demo:
    axera_logo = img_to_data_url_from_path("./axera_logo.png")
    # Built without leading indentation so Markdown does not treat the HTML
    # as an indented code block.
    gr.Markdown(
        '<div style="display: flex; align-items: center; gap: 10px;">\n'
        f'<img src="{axera_logo}" alt="axera_logo" style="height: 60px;">\n'
        '</div>\n'
    )

    chatbot = gr.Chatbot(
        label="对话",
        bubble_full_width=False,
        height=500,
        avatar_images=(None, None),  # replace with avatar images if desired
        latex_delimiters=[{"left": "$$", "right": "$$", "display": True},
                          {"left": "$", "right": "$", "display": False}],
        show_copy_button=True,
        render_markdown=True,
        elem_id="chat"
    )

    with gr.Row():
        with gr.Column(scale=2):
            prompt = gr.Textbox(label="Prompt", placeholder="输入你的提示语", lines=2)
            with gr.Row():
                send_btn = gr.Button("发送 ▶️", variant="primary")
                clear_btn = gr.Button("清空")
                stop_btn = gr.Button("停止 ■", variant="stop")
            with gr.Row():
                image = gr.Image(type="filepath", label="上传图片(可选)")
                video = gr.Video(label="上传视频(可选)")

        with gr.Column(scale=1):
            base_url = gr.Textbox(value="http://localhost:8000/v1", label="Base URL")
            model = gr.Textbox(value="AXERA-TECH/SmolVLM2-500M-Video-Instruct", label="Model")
            api_key = gr.Textbox(value="not-needed", label="API Key", type="password")
            with gr.Row():
                prefer_video = gr.Checkbox(True, label="如果有视频,优先使用视频抽帧")
                frame_stride = gr.Slider(1, 90, value=30, step=1, label="视频抽帧间隔")
                max_frames = gr.Slider(1, 8, value=8, step=1, label="最多抽帧数")

    # Single-turn chat still needs a state object to carry the current bubbles.
    state = gr.State([])

    send_btn.click(
        fn=run_single_turn,
        inputs=[prompt, image, video, prefer_video, frame_stride, max_frames, base_url, model, api_key, state],
        outputs=chatbot,
        show_progress=True,
        queue=True,
    )

    def stop_stream(api_base):
        """Ask the API server behind ``api_base`` to stop the current stream.

        The request goes to ``<api_base>/stop``. Failures are printed, never
        raised, and nothing is returned (the chat content is left untouched).
        """
        url = (api_base or "http://localhost:8000/v1").strip().rstrip("/") + "/stop"
        try:
            response = requests.get(url, timeout=5)
        except requests.RequestException as exc:
            print(f"Failed to stop stream: {exc}")
            return
        if response.status_code == 200:
            print("Stream stopped successfully")
        else:
            print(f"Failed to stop stream: {response.status_code} - {response.text}")

    # No outputs: the callback returns nothing, and binding it to the chatbot
    # would overwrite the conversation with an empty value.
    stop_btn.click(
        fn=stop_stream,
        inputs=[base_url],
        outputs=None,
        queue=True,
    )

    def clear_all():
        """Reset chat, prompt, media inputs and sampling options to defaults."""
        return [], "", None, None, True, 30, 8
    clear_btn.click(clear_all, None, [chatbot, prompt, image, video, prefer_video, frame_stride, max_frames])
255
+
256
if __name__ == "__main__":
    # Print one reachable URL per local address, then listen on all interfaces.
    for addr in get_all_local_ips():
        print(f"* Running on local URL: http://{addr}:7860")
    demo.launch(server_name="0.0.0.0", server_port=7860)
main_api_ax650 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ccb96231d5f0f6767b11d1c3174b1564c232bf3d7834bc0918189f06b18f17c
3
+ size 6931024
main_ax650 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2424fb3c239998edf2d04d852deb4d738ec1d6b82e681a59dd8e2b6ed4204e96
3
+ size 6803824
openai_cli.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import glob
3
+ from openai import OpenAI
4
+ import cv2
5
+
6
+ BASE_URL = "http://localhost:8000/v1"
7
+
8
def img_to_data_url(img_path: str):
    """Load an image from disk and return it as a base64 JPEG ``data:`` URL.

    Args:
        img_path: Path handed to ``cv2.imread``.

    Returns:
        A string of the form ``data:image/jpeg;base64,<payload>``.

    Raises:
        FileNotFoundError: if ``cv2.imread`` returns ``None`` for ``img_path``.
        RuntimeError: if ``cv2.imencode`` reports failure.
    """
    image = cv2.imread(img_path)
    if image is None:
        raise FileNotFoundError(f"Cannot read image: {img_path}")

    # Re-encode as JPEG regardless of the format of the file on disk.
    encoded_ok, jpeg_buf = cv2.imencode(".jpg", image)
    if not encoded_ok:
        raise RuntimeError("cv2.imencode failed")

    b64 = base64.b64encode(jpeg_buf).decode("ascii")
    return f"data:image/jpeg;base64,{b64}"
17
+
18
+
19
def test(openai_messages):
    """Stream one chat completion from the server and echo it to stdout.

    Args:
        openai_messages: Value forwarded unchanged as ``messages`` to
            ``client.chat.completions.create``.
            NOTE(review): the callers in this file pass a single message dict
            rather than a list of messages; confirm the server expects that.

    Returns:
        The concatenated streamed text with surrounding whitespace stripped.
    """
    client = OpenAI(api_key="not-needed", base_url=BASE_URL)

    stream = client.chat.completions.create(
        model="AXERA-TECH/SmolVLM2-500M-Video-Instruct",
        messages=openai_messages,
        stream=True,
    )
    out_chunks = []
    for ev in stream:
        # Skip chunks that carry no choices instead of failing on choices[0].
        if not ev.choices:
            continue
        delta = ev.choices[0].delta
        if delta and delta.content:
            out_chunks.append(delta.content)
            print(delta.content, end="", flush=True)
    print()
    # The assembled reply used to be computed and dropped; return it so
    # callers can use it.
    return "".join(out_chunks).strip()
35
+
36
def test_image():
    """Send ``video/frame_0000.jpg`` with an image prompt through ``test``."""
    image_part = {
        "type": "image_url",
        "image_url": img_to_data_url("video/frame_0000.jpg"),
    }
    text_part = {"type": "text", "text": "Describe this image"}
    # Text comes first in the content list, followed by the image.
    test({"role": "user", "content": [text_part, image_part]})
49
+
50
def test_video():
    """Send every ``video/*.jpg`` frame, in sorted path order, through ``test``."""
    frame_paths = sorted(glob.glob("video/*.jpg"))
    frame_urls = list(map(img_to_data_url, frame_paths))

    # All frames travel in a single content part flagged with "is_video".
    test(
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this video"},
                {"type": "image_url", "is_video": True, "image_url": frame_urls},
            ],
        }
    )
65
+
66
# Run both demos in order, printing a banner before each one.
for banner, run_demo in (("Test image", test_image), ("Test video", test_video)):
    print(banner)
    run_demo()
run_api_ax650.sh ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# SmolVLM2-500M: launch the API server binary (main_api_ax650).
# NOTE(review): the model directory was changed from
# "SmolVLM2-500M-Video-Instruct_Ax650/" to the directory name that
# run_ax650.sh uses; confirm it matches the directory on the target device.
AXMODEL_DIR=SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024
# The vision encoder lives in the same directory as the LLM layers.
VIT="${AXMODEL_DIR}/vision_model_1x3x512x512_NHwC_U8.axmodel"
LAYER_NUM=32
EMBED_SIZE=960


./main_api_ax650 \
  --template_filename_axmodel "${AXMODEL_DIR}/llama_p128_l%d_together.axmodel" \
  --axmodel_num "${LAYER_NUM}" \
  --filename_image_encoder_axmodedl "${VIT}" \
  --use_mmap_load_embed 1 \
  --filename_tokenizer_model smolvlm2_tokenizer.txt \
  --filename_post_axmodel "${AXMODEL_DIR}/llama_post.axmodel" \
  --filename_tokens_embed "${AXMODEL_DIR}/model.embed_tokens.weight.bfloat16.bin" \
  --tokens_embed_num 49280 \
  --tokens_embed_size "${EMBED_SIZE}" \
  --post_config_path post_config1.json
19
+
run_ax650.sh ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# SmolVLM2-500M: launch the interactive command-line binary (main_ax650).
AXMODEL_DIR=SmolVLM2-500M-Video-Instruct_Ax650-C128-P768-CTX1024/
# AXMODEL_DIR already ends with "/", so the encoder path is appended directly.
VIT="${AXMODEL_DIR}vision_model_1x3x512x512_NHwC_U8.axmodel"
LAYER_NUM=32
EMBED_SIZE=960

./main_ax650 \
  --template_filename_axmodel "${AXMODEL_DIR}/llama_p128_l%d_together.axmodel" \
  --axmodel_num "${LAYER_NUM}" \
  --filename_image_encoder_axmodedl "${VIT}" \
  --bos 0 --eos 0 \
  --dynamic_load_axmodel_layer 0 \
  --use_mmap_load_embed 1 \
  --filename_tokenizer_model smolvlm2_tokenizer.txt \
  --filename_post_axmodel "${AXMODEL_DIR}/llama_post.axmodel" \
  --filename_tokens_embed "${AXMODEL_DIR}/model.embed_tokens.weight.bfloat16.bin" \
  --tokens_embed_num 49280 \
  --tokens_embed_size "${EMBED_SIZE}" \
  --live_print 1 \
  --continue 1 \
  --video 0 \
  --post_config_path post_config1.json

# --video 0 selects image understanding; --video 1 selects video understanding.
# For images, pass either a single image path (single-image understanding) or
# a directory containing several images (multi-image understanding).
# For video, pass the path of a directory holding the video's frames; only one
# video is supported.
smolvlm2_tokenizer.txt ADDED
The diff for this file is too large to render. See raw diff
 
video/frame_0000.jpg ADDED

Git LFS Details

  • SHA256: d0cea2769fd052ce3b24c3982a17135dbffd600cd612014c3cffe014c0224ffa
  • Pointer size: 130 Bytes
  • Size of remote file: 54.1 kB
video/frame_0008.jpg ADDED

Git LFS Details

  • SHA256: c812aed3407b41d474d859fedd4d9eaab971482e1dd0e22c5da16a627a740394
  • Pointer size: 130 Bytes
  • Size of remote file: 52.7 kB
video/frame_0016.jpg ADDED

Git LFS Details

  • SHA256: 3cc72377820bd9c47a41ebcae744acd8b3952b54e02854a9cf0b4a70e49def60
  • Pointer size: 130 Bytes
  • Size of remote file: 48.9 kB
video/frame_0024.jpg ADDED

Git LFS Details

  • SHA256: afee75df68ffda9f5ae59b0ba3badf29e56a60acce64554ecc9e49f20854c47c
  • Pointer size: 130 Bytes
  • Size of remote file: 49.2 kB
video/frame_0032.jpg ADDED

Git LFS Details

  • SHA256: 1cea98a54747fb32c1bf7375aae020b3703ee70da6eb967d1a7d590d9f997038
  • Pointer size: 130 Bytes
  • Size of remote file: 49.1 kB