yongqiang committed
Commit 1601280 · Initialize the repository
(This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.)
- .gitattributes +44 -0
- .gitignore +3 -0
- README.md +148 -0
- assets/demo_1.png +3 -0
- assets/demo_2.png +3 -0
- config.json +0 -0
- examples/image_0.jpg +3 -0
- examples/image_1.jpg +3 -0
- examples/image_2.png +3 -0
- examples/image_3.png +3 -0
- examples/laorenshuaidao.mp4 +3 -0
- examples/red-panda.mp4 +3 -0
- examples/tuboshu.mp4 +3 -0
- gradio_demo.py +392 -0
- infer_axmodel.py +186 -0
- infer_torch.py +212 -0
- internvl3-5_axmodel/model.embed_tokens.weight.bfloat16.bin +3 -0
- internvl3-5_axmodel/model.embed_tokens.weight.float32.bin +3 -0
- internvl3-5_axmodel/model.embed_tokens.weight.npy +3 -0
- internvl3-5_axmodel/qwen3_p128_l0_together.axmodel +3 -0
- internvl3-5_axmodel/qwen3_p128_l10_together.axmodel +3 -0
- internvl3-5_axmodel/qwen3_p128_l11_together.axmodel +3 -0
- internvl3-5_axmodel/qwen3_p128_l12_together.axmodel +3 -0
- internvl3-5_axmodel/qwen3_p128_l13_together.axmodel +3 -0
- internvl3-5_axmodel/qwen3_p128_l14_together.axmodel +3 -0
- internvl3-5_axmodel/qwen3_p128_l15_together.axmodel +3 -0
- internvl3-5_axmodel/qwen3_p128_l16_together.axmodel +3 -0
- internvl3-5_axmodel/qwen3_p128_l17_together.axmodel +3 -0
- internvl3-5_axmodel/qwen3_p128_l18_together.axmodel +3 -0
- internvl3-5_axmodel/qwen3_p128_l19_together.axmodel +3 -0
- internvl3-5_axmodel/qwen3_p128_l1_together.axmodel +3 -0
- internvl3-5_axmodel/qwen3_p128_l20_together.axmodel +3 -0
- internvl3-5_axmodel/qwen3_p128_l21_together.axmodel +3 -0
- internvl3-5_axmodel/qwen3_p128_l22_together.axmodel +3 -0
- internvl3-5_axmodel/qwen3_p128_l23_together.axmodel +3 -0
- internvl3-5_axmodel/qwen3_p128_l24_together.axmodel +3 -0
- internvl3-5_axmodel/qwen3_p128_l25_together.axmodel +3 -0
- internvl3-5_axmodel/qwen3_p128_l26_together.axmodel +3 -0
- internvl3-5_axmodel/qwen3_p128_l27_together.axmodel +3 -0
- internvl3-5_axmodel/qwen3_p128_l2_together.axmodel +3 -0
- internvl3-5_axmodel/qwen3_p128_l3_together.axmodel +3 -0
- internvl3-5_axmodel/qwen3_p128_l4_together.axmodel +3 -0
- internvl3-5_axmodel/qwen3_p128_l5_together.axmodel +3 -0
- internvl3-5_axmodel/qwen3_p128_l6_together.axmodel +3 -0
- internvl3-5_axmodel/qwen3_p128_l7_together.axmodel +3 -0
- internvl3-5_axmodel/qwen3_p128_l8_together.axmodel +3 -0
- internvl3-5_axmodel/qwen3_p128_l9_together.axmodel +3 -0
- internvl3-5_axmodel/qwen3_post.axmodel +3 -0
- internvl3-5_tokenizer/added_tokens.json +37 -0
- internvl3-5_tokenizer/config.json +89 -0
.gitattributes
ADDED
@@ -0,0 +1,44 @@
```
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
*.axmodel filter=lfs diff=lfs merge=lfs -text
main_api_ax650 filter=lfs diff=lfs merge=lfs -text
main_api_axcl_x86 filter=lfs diff=lfs merge=lfs -text
main_ax650 filter=lfs diff=lfs merge=lfs -text
main_axcl_x86 filter=lfs diff=lfs merge=lfs -text
*.png filter=lfs diff=lfs merge=lfs -text
*.jpg filter=lfs diff=lfs merge=lfs -text
*.mp4 filter=lfs diff=lfs merge=lfs -text
internvl3-5_tokenizer/tokenizer.json filter=lfs diff=lfs merge=lfs -text
```
.gitignore
ADDED
@@ -0,0 +1,3 @@
```
__pycache__
*tmp/
```
README.md
ADDED
@@ -0,0 +1,148 @@
---
library_name: transformers
license: bsd-3-clause
base_model:
- OpenGVLab/InternVL3_5-2B
tags:
- InternVL3
- InternVL3_5-2B
- InternVL3_5-2B_GPTQ_INT4
- Int8
- VLM
pipeline_tag: image-text-to-text
language:
- en
---

# InternVL3_5-2B_GPTQ_INT4

This version of InternVL3_5-2B_GPTQ_INT4 has been converted to run on the Axera NPU using **w4a16** quantization.

Compatible with Pulsar2 version: 5.1-patch1.

Please note that the model's context length is 2k and the maximum prefill length is 1k.

## Conversion tool links

If you are interested in model conversion, you can try exporting the axmodel from the original repo:

https://huggingface.co/OpenGVLab/InternVL3_5-2B

[How to Convert LLM from Huggingface to axmodel](https://github.com/AXERA-TECH/InternVL3_5-2B_GPTQ_INT4.axera/tree/main/model_convert)

[AXera NPU HOST LLM Runtime](https://github.com/AXERA-TECH/ax-llm/tree/ax-internvl)

[AXera NPU AXCL LLM Runtime](https://github.com/AXERA-TECH/ax-llm/tree/axcl-internvl)

## Support Platform

- AX650
  - AX650N DEMO Board
  - [M4N-Dock(爱芯派Pro)](https://wiki.sipeed.com/hardware/zh/maixIV/m4ndock/m4ndock.html)
  - [M.2 Accelerator card](https://axcl-docs.readthedocs.io/zh-cn/latest/doc_guide_hardware.html)

|Chip|Image encoder (448×448)|TTFT|Decode speed (w8a16)|
|--|--|--|--|
|AX650| 364.412 ms | 4951.50 ms | 28.07 tokens/sec|

## How to use

Download all files from this repository to the device:

```
$ tree -L 1
.
├── assets
├── config.json
├── examples
├── gradio_demo.py
├── infer_axmodel.py
├── infer_torch.py
├── internvl3-5_axmodel
├── internvl3-5_tokenizer
├── README.md
├── utils
└── vit-models

6 directories, 5 files
```

#### Install transformers

```
pip install transformers==4.57.1
```

#### Inference on an AX650 host, such as the M4N-Dock (爱芯派Pro) or the AX650 DEMO board

Start an interactive conversation through the `Gradio` demo:

```bash
$ python3 gradio_demo.py --hf_model internvl3-5_tokenizer/ --axmodel_path internvl3-5_axmodel/ --vit_model vit-models/internvl_vit_model_1x3x448x448.axmodel
```

Plain-text dialogue:

![gradio_demo_1](assets/demo_1.png)

Image understanding:

![gradio_demo_2](assets/demo_2.png)

---

Run the following command on the Axera board to start a chat conversation:

```sh
$ python3 infer_axmodel.py --hf_model internvl3-5_tokenizer/ --axmodel_path internvl3-5_axmodel/ --question "请计算函数[y=2x^2+2]的导数, 并提供 markdown 格式的推理过程"
```

Output:

```bash
[INFO] Using provider: AxEngineExecutionProvider
[INFO] Model type: 2 (triple core)
[INFO] Compiler version: 5.1-dirty 0fdbfe15-dirty
Model loaded successfully!
slice_indices: [0]
Slice prefill done: 0
answer >> 函数 \( y = 2x^2 + 2 \) 的导数可以通过求导法则来计算。首先,我们对函数中的每一项分别求导:

1. 对于 \( 2x^2 \),使用幂法则求导:
   \[
   \frac{d}{dx}(2x^2) = 2 \cdot 2x = 4x
   \]

2. 对于常数项 \( 2 \),其导数为 0,因为常数的导数为 0。

将这两部分的结果相加,得到函数 \( y \) 的导数:
\[
y' = 4x
\]

因此,函数 \( y = 2x^2 + 2 \) 的导数为 \( y' = 4x \)。
```

Enter the following command to run the single-image understanding task:

```sh
$ python3 infer_axmodel.py --hf_model internvl3-5_tokenizer/ --axmodel_path internvl3-5_axmodel/ --question "请描述这幅图" -i examples/image_0.jpg --vit_model vit-models/internvl_vit_model_1x3x448x448.axmodel
```

![image_0](examples/image_0.jpg)

Output:

```bash
[INFO] Model type: 2 (triple core)
[INFO] Compiler version: 5.1-dirty 0fdbfe15-dirty
Model loaded successfully!
slice_indices: [0, 1, 2]
Slice prefill done: 0
Slice prefill done: 1
Slice prefill done: 2
answer >> 这是一张红熊猫的照片。红熊猫是一种红棕色的哺乳动物,通常生活在亚洲的森林中。它们以捕食昆虫和小型无脊椎动物为生。图片中,红熊猫正坐在一个木制的平台上,背景是绿色的树木和植被,显得非常自然和生动。红熊猫的表情看起来很友好,似乎在观察或等待什么。
```
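For reference, the inference scripts in this repo assemble the single-turn chat prompt by hand rather than through `tokenizer.apply_chat_template`. A minimal sketch of the template, mirroring `gradio_demo.py` below (each image contributes one `<img>...</img>` block of 256 `<IMG_CONTEXT>` tokens, which is the layout the axmodel expects):

```python
# Minimal sketch of the prompt layout used by the scripts in this repo.
# SYSTEM_PROMPT and the 256-token image block mirror gradio_demo.py.
IMG_CONTEXT_REPEAT = 256

def build_prompt(question: str, num_images: int = 0) -> str:
    prompt = (
        "<|im_start|>system\n"
        "你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型, 英文名叫 InternVL3, "
        "是一个有用无害的人工智能助手, 擅长思考和回答用户的问题. 请你在回答问题时使用简体中文."
        "<|im_end|>\n"
    )
    prompt += f"<|im_start|>user\n{question}"
    for _ in range(num_images):
        # each image is one <img>...</img> block of 256 context tokens
        prompt += "\n<img>" + "<IMG_CONTEXT>" * IMG_CONTEXT_REPEAT + "</img>"
    prompt += "<|im_end|>\n<|im_start|>assistant\n"
    return prompt
```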
assets/demo_1.png
ADDED (Git LFS)
assets/demo_2.png
ADDED (Git LFS)
config.json
ADDED (file without changes)
examples/image_0.jpg
ADDED (Git LFS)
examples/image_1.jpg
ADDED (Git LFS)
examples/image_2.png
ADDED (Git LFS)
examples/image_3.png
ADDED (Git LFS)
examples/laorenshuaidao.mp4
ADDED
@@ -0,0 +1,3 @@
```
version https://git-lfs.github.com/spec/v1
oid sha256:8f5c00b37b23af3d01d133da880eb7f6e50d4af608e3575784be7063eb137011
size 2704112
```
examples/red-panda.mp4
ADDED
@@ -0,0 +1,3 @@
```
version https://git-lfs.github.com/spec/v1
oid sha256:d921c07bb97224d65a37801541d246067f0d506f08723ffa1ad85c217907ccb8
size 1867237
```
examples/tuboshu.mp4
ADDED
@@ -0,0 +1,3 @@
```
version https://git-lfs.github.com/spec/v1
oid sha256:ced4d95877b9a7f8b48f79bdfe4287eff8837f20348daec2f2e2987459ec1712
size 5952043
```
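The media and weight files in this commit are stored as Git LFS pointers (`version` / `oid` / `size`). If a download looks corrupted, a quick sanity check is to compare the local file against its pointer; a minimal sketch (the file name, oid, and size below are copied from the `examples/tuboshu.mp4` pointer above):

```python
import hashlib
import os

def verify_lfs_file(path: str, expected_oid: str, expected_size: int) -> bool:
    """Check a downloaded LFS object against the sha256 oid and size in its pointer."""
    if os.path.getsize(path) != expected_size:
        return False
    h = hashlib.sha256()
    with open(path, "rb") as f:
        # hash in 1 MiB chunks so large weight files do not need to fit in memory
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest() == expected_oid

print(verify_lfs_file(
    "examples/tuboshu.mp4",
    "ced4d95877b9a7f8b48f79bdfe4287eff8837f20348daec2f2e2987459ec1712",
    5952043,
))
```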
gradio_demo.py
ADDED
@@ -0,0 +1,392 @@
```python
import argparse
import os
import time
from typing import Generator, List, Optional

import gradio as gr
import numpy as np
import torch
import torchvision.transforms as T
from ml_dtypes import bfloat16
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoConfig, AutoTokenizer

from utils.infer_func import InferManager
from axengine import InferenceSession

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
IMG_PLACEHOLDER_TOKEN_ID = 151669  # <img>
IMG_CONTEXT_REPEAT = 256  # number of image context tokens expected by the model


SYSTEM_PROMPT = (
    "<|im_start|>system\n"
    "你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型, 英文名叫 InternVL3, "
    "是一个有用无害的人工智能助手, 擅长思考和回答用户的问题. 请你在回答问题时使用简体中文."
    "<|im_end|>\n"
)


def build_transform(input_size: int):
    transform = T.Compose([
        T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])
    return transform


def dynamic_preprocess(image: Image.Image, min_num: int = 1, max_num: int = 12, image_size: int = 448,
                       use_thumbnail: bool = False):
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    target_ratios = set(
        (i, j)
        for n in range(min_num, max_num + 1)
        for i in range(1, n + 1)
        for j in range(1, n + 1)
        if i * j <= max_num and i * j >= min_num
    )
    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

    def find_closest_aspect_ratio(ar: float, ratios: List[tuple]):
        best_ratio_diff = float("inf")
        best_ratio = (1, 1)
        area = orig_width * orig_height
        for ratio in ratios:
            target_aspect_ratio = ratio[0] / ratio[1]
            ratio_diff = abs(ar - target_aspect_ratio)
            if ratio_diff < best_ratio_diff:
                best_ratio_diff = ratio_diff
                best_ratio = ratio
            elif ratio_diff == best_ratio_diff:
                if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                    best_ratio = ratio
        return best_ratio

    target_aspect_ratio = find_closest_aspect_ratio(aspect_ratio, target_ratios)
    target_width = image_size * target_aspect_ratio[0]
    target_height = image_size * target_aspect_ratio[1]
    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

    resized_img = image.resize((target_width, target_height))
    processed_images = []
    for i in range(blocks):
        box = (
            (i % (target_width // image_size)) * image_size,
            (i // (target_width // image_size)) * image_size,
            ((i % (target_width // image_size)) + 1) * image_size,
            ((i // (target_width // image_size)) + 1) * image_size,
        )
        split_img = resized_img.crop(box)
        processed_images.append(split_img)
    assert len(processed_images) == blocks
    if use_thumbnail and len(processed_images) != 1:
        processed_images.append(image.resize((image_size, image_size)))
    return processed_images


def load_image(image_file: Image.Image, input_size: int = 448, max_num: int = 12):
    transform = build_transform(input_size=input_size)
    images = dynamic_preprocess(image_file, image_size=input_size, use_thumbnail=True, max_num=max_num)
    pixel_values = [transform(img) for img in images]
    pixel_values = torch.stack(pixel_values)
    return pixel_values


class InternVLGradioDemo:
    def __init__(self, hf_model: str, axmodel_dir: str, vit_axmodel: str, max_seq_len: int = 2047):
        self.hf_model = hf_model
        self.axmodel_dir = axmodel_dir
        self.vit_axmodel = vit_axmodel
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        self.embeds = np.load(os.path.join(axmodel_dir, "model.embed_tokens.weight.npy"))
        self.tokenizer = AutoTokenizer.from_pretrained(self.hf_model)
        config = AutoConfig.from_pretrained(self.hf_model, trust_remote_code=True)
        if hasattr(config, 'llm_config') and config.llm_config is not None:
            self.cfg = config.llm_config
        else:
            self.cfg = config

        self.vit_session = InferenceSession(self.vit_axmodel)
        self.infer_manager = InferManager(self.cfg, self.axmodel_dir, max_seq_len=max_seq_len)

    def _build_single_turn_prompt(self, user_text: str, vit_features: List[np.ndarray]):
        prompt = SYSTEM_PROMPT
        prompt += f"<|im_start|>user\n{user_text}"
        for _ in vit_features:
            prompt += "\n<img>" + "<IMG_CONTEXT>" * IMG_CONTEXT_REPEAT + "</img>"
        prompt += "<|im_end|>\n<|im_start|>assistant\n"
        return prompt

    def _insert_vision_features(self, token_ids: List[int], prefill_data: np.ndarray, vit_features: List[np.ndarray]):
        image_start_indices = np.where(np.array(token_ids) == IMG_PLACEHOLDER_TOKEN_ID)[0].tolist()
        if len(image_start_indices) != len(vit_features):
            raise ValueError("图片数量与占位符数量不一致, 请检查输入和模板生成逻辑")
        for idx, image_start_index in enumerate(image_start_indices):
            insert_pos = image_start_index + 1
            prefill_data[insert_pos: insert_pos + IMG_CONTEXT_REPEAT] = vit_features[idx][0, :, :]
        return prefill_data

    def _run_model(self, prompt: str, vit_features: List[np.ndarray]):
        """Non-streaming inference; kept in case a one-shot result is needed."""
        for k_cache in self.infer_manager.k_caches:
            k_cache.fill(0)
        for v_cache in self.infer_manager.v_caches:
            v_cache.fill(0)

        token_ids = self.tokenizer.encode(prompt)
        prefill_data = np.take(self.embeds, token_ids, axis=0).astype(bfloat16)
        if vit_features:
            prefill_data = self._insert_vision_features(token_ids, prefill_data, vit_features)

        eos_token_id = None
        if isinstance(self.cfg.eos_token_id, list) and len(self.cfg.eos_token_id) > 1:
            eos_token_id = self.cfg.eos_token_id

        slice_len = 128
        token_ids = self.infer_manager.prefill(self.tokenizer, token_ids, prefill_data, slice_len=slice_len)
        return self.infer_manager.decode(
            self.tokenizer,
            token_ids,
            self.embeds,
            slice_len=slice_len,
            eos_token_id=eos_token_id,
            stream=False,
        )

    def _stream_generate(self, prompt: str, vit_features: List[np.ndarray]):
        """Streaming generation: yield the accumulated text token by token, plus timing info (TTFT and average decode ms/token)."""
        # reset kv cache per request
        for k_cache in self.infer_manager.k_caches:
            k_cache.fill(0)
        for v_cache in self.infer_manager.v_caches:
            v_cache.fill(0)

        token_ids = self.tokenizer.encode(prompt)
        prefill_data = np.take(self.embeds, token_ids, axis=0).astype(bfloat16)
        if vit_features:
            prefill_data = self._insert_vision_features(token_ids, prefill_data, vit_features)

        eos_token_id = None
        if isinstance(self.cfg.eos_token_id, list) and len(self.cfg.eos_token_id) > 1:
            eos_token_id = self.cfg.eos_token_id

        slice_len = 128
        t_start = time.time()
        token_ids = self.infer_manager.prefill(self.tokenizer, token_ids, prefill_data, slice_len=slice_len)

        # copy of the decode logic, reimplemented by hand for streaming output
        mask = np.zeros((1, 1, self.infer_manager.max_seq_len + 1), dtype=np.float32).astype(bfloat16)
        mask[:, :, :self.infer_manager.max_seq_len] -= 65536
        seq_len = len(token_ids) - 1
        if slice_len > 0:
            mask[:, :, :seq_len] = 0

        ttft_ms: Optional[float] = None
        decode_tokens = 0
        decode_elapsed_ms: float = 0.0
        generated_text = ""
        yield generated_text, ttft_ms, None, None, False

        for step_idx in range(self.infer_manager.max_seq_len):
            if slice_len > 0 and step_idx < seq_len:
                continue
            cur_token = token_ids[step_idx]
            indices = np.array([step_idx], np.uint32).reshape((1, 1))
            data = self.embeds[cur_token, :].reshape((1, 1, self.cfg.hidden_size)).astype(bfloat16)
            for layer_idx in range(self.cfg.num_hidden_layers):
                input_feed = {
                    "K_cache": self.infer_manager.k_caches[layer_idx],
                    "V_cache": self.infer_manager.v_caches[layer_idx],
                    "indices": indices,
                    "input": data,
                    "mask": mask,
                }
                outputs = self.infer_manager.decoder_sessions[layer_idx].run(None, input_feed, shape_group=0)
                self.infer_manager.k_caches[layer_idx][:, step_idx, :] = outputs[0][:, :, :]
                self.infer_manager.v_caches[layer_idx][:, step_idx, :] = outputs[1][:, :, :]
                data = outputs[2]
            mask[..., step_idx] = 0
            if step_idx < seq_len - 1:
                continue
            post_out = self.infer_manager.post_process_session.run(None, {"input": data})[0]
            next_token, possible_tokens, possible_probs = self.infer_manager.post_process(post_out, temperature=0.7)
            if eos_token_id is not None and next_token in eos_token_id:
                ttft_ms = ttft_ms or (time.time() - t_start) * 1000
                break
            if next_token == self.tokenizer.eos_token_id:
                ttft_ms = ttft_ms or (time.time() - t_start) * 1000
                break

            token_ids.append(next_token)
            # decode from the full token list so multi-byte UTF-8 characters are not split into mojibake;
            # only the newly generated tokens (from seq_len onward) are decoded
            generated_text = self.tokenizer.decode(token_ids[seq_len:], skip_special_tokens=True)

            if ttft_ms is None:
                ttft_ms = (time.time() - t_start) * 1000
            else:
                decode_tokens += 1
                decode_elapsed_ms = (time.time() - t_start) * 1000 - ttft_ms

            avg_decode = (decode_elapsed_ms / decode_tokens) if decode_tokens > 0 else None
            yield generated_text, ttft_ms, avg_decode, decode_tokens, False

        total_ms = (time.time() - t_start) * 1000
        avg_decode = (decode_elapsed_ms / decode_tokens) if decode_tokens > 0 else None
        yield generated_text, ttft_ms, avg_decode, decode_tokens, True

    def chat(self, user_input: str, image: Optional[Image.Image]) -> Generator:
        user_text = (user_input or "").strip()
        if not user_text and image is None:
            yield [], gr.update(), gr.update(), gr.update(), gr.update()
            return

        # show a placeholder first, keep the uploaded image in place, and placeholder the speed metrics
        yield [(user_text, "处理中…")], gr.update(value=""), gr.update(), gr.update(value="<div style='text-align: right; font-size: 13px; color: #6b7280; font-family: monospace;'>TTFT -- ms | Decode -- ms/token | Tokens --</div>"), gr.update(interactive=False)

        vit_outputs = []
        if image is not None:
            pixel_values = load_image(image, input_size=448, max_num=1)
            vit_output = self.vit_session.run(None, {"image": pixel_values.numpy()})[0]
            vit_outputs.append(vit_output.copy())

        prompt = self._build_single_turn_prompt(user_text, vit_outputs)

        chatbot_history = [(user_text, "")]  # filled in during streaming
        for partial, ttft_ms, avg_decode_ms, decode_tokens, finished in self._stream_generate(prompt, vit_outputs):
            chatbot_history[-1] = (user_text, partial)
            ttft_disp = f"{ttft_ms:.0f}" if ttft_ms is not None else "--"
            decode_disp = f"{avg_decode_ms:.1f}" if avg_decode_ms is not None else "--"
            tok_disp = f"{decode_tokens}" if decode_tokens is not None else "--"
            metrics_text = f"<div style='text-align: right; font-size: 13px; color: #6b7280; font-family: monospace;'>TTFT {ttft_disp} ms | Decode {decode_disp} ms/token | Tokens {tok_disp}</div>"
            if finished:
                yield chatbot_history, gr.update(value=""), gr.update(), gr.update(value=metrics_text), gr.update(interactive=True)
            else:
                yield chatbot_history, gr.update(value=""), gr.update(), gr.update(value=metrics_text), gr.update(interactive=False)

    @staticmethod
    def build_ui(demo: "InternVLGradioDemo", server_name: str = "0.0.0.0", server_port: int = 7860, share: bool = False):
        # custom JavaScript: Enter sends, Shift+Enter inserts a newline
        custom_js = """
        function() {
            // bind events once the DOM has loaded
            setTimeout(() => {
                const textareas = document.querySelectorAll('#user-input textarea');
                textareas.forEach(textarea => {
                    // remove any stale listener
                    textarea.removeEventListener('keydown', textarea._customKeyHandler);

                    textarea._customKeyHandler = function(e) {
                        if (e.key === 'Enter') {
                            if (e.shiftKey) {
                                // Shift+Enter: insert a newline
                                e.preventDefault();
                                const start = this.selectionStart;
                                const end = this.selectionEnd;
                                const value = this.value;
                                this.value = value.substring(0, start) + '\\n' + value.substring(end);
                                this.selectionStart = this.selectionEnd = start + 1;
                                // fire an input event so Gradio notices the change
                                this.dispatchEvent(new Event('input', { bubbles: true }));
                            } else {
                                // Enter: send the message
                                e.preventDefault();
                                const sendBtn = document.querySelector('#send-btn');
                                if (sendBtn) {
                                    sendBtn.click();
                                }
                            }
                        }
                    };
                    textarea.addEventListener('keydown', textarea._customKeyHandler);
                });
            }, 500);
        }
        """

        with gr.Blocks(title="InternVL3_5-2B_GPTQ_INT4 AX Gradio Demo", theme=gr.themes.Soft(), js=custom_js) as iface:
            gr.HTML("""<style>
            #image-pane img {object-fit: contain; max-height: 380px;}
            #chat-wrap {position: relative;}
            #metrics-display {position: absolute; right: 12px; bottom: 12px; z-index: 5; pointer-events: none; text-align: right;}
            #metrics-display > div {display: inline-block;}
            </style>""")
            gr.Markdown("""### InternVL3_5-2B_GPTQ_INT4 图文对话演示\n上传一张图片 (可选),输入问题,获取中文回答。""")

            with gr.Row():
                # left side: chat box and input area
                with gr.Column(scale=5):
                    with gr.Group(elem_id="chat-wrap"):
                        chatbot = gr.Chatbot(height=500, label="对话")
                        metrics_md = gr.Markdown("<div style='text-align: right; font-size: 13px; color: #6b7280; font-family: monospace;'>TTFT -- ms | Decode -- ms/token | Tokens --</div>", elem_id="metrics-display")

                    with gr.Row():
                        user_input = gr.Textbox(
                            placeholder="按 Enter 发送,Shift+Enter 换行",
                            lines=2,
                            scale=7,
                            max_lines=5,
                            show_label=False,
                            elem_id="user-input",
                        )
                        with gr.Column(scale=1, min_width=100):
                            send_btn = gr.Button("发送", variant="primary", size="sm", elem_id="send-btn")
                            clear_btn = gr.Button("清空对话", variant="secondary", size="sm")

                # right side: image upload and usage notes
                with gr.Column(scale=3):
                    image_input = gr.Image(
                        type="pil",
                        label="上传图片 (可选)",
                        height=380,
                        image_mode="RGB",
                        show_download_button=False,
                        elem_id="image-pane",
                    )
                    gr.Markdown("""- 支持单张图像理解\n- 仅当前问题与回答,不保留历史\n- 处理时间取决于硬件,请耐心等待""")

            def _clear():
                return [], gr.update(value=""), gr.update(), gr.update(value="<div style='text-align: right; font-size: 13px; color: #6b7280; font-family: monospace;'>TTFT -- ms | Decode -- ms/token | Tokens --</div>"), gr.update(interactive=True)

            send_btn.click(
                fn=demo.chat,
                inputs=[user_input, image_input],
                outputs=[chatbot, user_input, image_input, metrics_md, send_btn],
                show_progress=False,
                queue=True,
            )
            # user_input.submit is intentionally not wired up; the custom JS handles Enter to send and Shift+Enter for a newline
            clear_btn.click(fn=_clear, inputs=None, outputs=[chatbot, user_input, image_input, metrics_md, send_btn])

        iface.queue().launch(server_name=server_name, server_port=server_port, share=share)


def parse_args():
    parser = argparse.ArgumentParser(description="InternVL3-5-2B AX gradio demo")
    parser.add_argument("--hf_model", type=str, default="./InternVL3_5-2B",
                        help="HuggingFace model path")
    parser.add_argument("--axmodel_path", type=str, default="./InternVL3_5-2B_axmodel",
                        help="LLM axmodel directory")
    parser.add_argument("--vit_model", type=str, default="./vit-models/internvl_vit_model_1x3x448x448.axmodel",
                        help="ViT axmodel path")
    parser.add_argument("--port", type=int, default=7860, help="Gradio port")
    parser.add_argument("--host", type=str, default="0.0.0.0", help="Gradio listen address")
    parser.add_argument("--share", action="store_true", help="enable gradio share")
    return parser.parse_args()


def main():
    args = parse_args()
    demo = InternVLGradioDemo(args.hf_model, args.axmodel_path, args.vit_model)
    InternVLGradioDemo.build_ui(demo, server_name=args.host, server_port=args.port, share=args.share)


if __name__ == "__main__":
    main()
```
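The class above can also be driven without the web UI. A rough headless sketch, assuming the files are laid out as in the README tree and the script is run from the repo root; it relies on the 5-tuple that `chat()` yields above, so treat it as illustrative rather than a supported entry point:

```python
# Hypothetical headless driver for InternVLGradioDemo; paths follow the README tree.
from PIL import Image

from gradio_demo import InternVLGradioDemo

demo = InternVLGradioDemo(
    hf_model="internvl3-5_tokenizer/",
    axmodel_dir="internvl3-5_axmodel/",
    vit_axmodel="vit-models/internvl_vit_model_1x3x448x448.axmodel",
)

image = Image.open("examples/image_0.jpg")
answer = ""
# chat() streams (history, *gradio updates); the answer accumulates in history[-1][1]
for history, *_ in demo.chat("请描述这幅图", image):
    if history:
        answer = history[-1][1]
print(answer)
```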
infer_axmodel.py
ADDED
@@ -0,0 +1,186 @@
```python
import argparse
import os

import numpy as np
import torch
import torchvision.transforms as T
from axengine import InferenceSession
from ml_dtypes import bfloat16
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoConfig, AutoTokenizer

from utils.infer_func import InferManager


IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    best_ratio_diff = float('inf')
    best_ratio = (1, 1)
    area = width * height
    for ratio in target_ratios:
        target_aspect_ratio = ratio[0] / ratio[1]
        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
        if ratio_diff < best_ratio_diff:
            best_ratio_diff = ratio_diff
            best_ratio = ratio
        elif ratio_diff == best_ratio_diff:
            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                best_ratio = ratio
    return best_ratio

def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    # calculate the existing image aspect ratio
    target_ratios = set(
        (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
        i * j <= max_num and i * j >= min_num)
    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

    # find the closest aspect ratio to the target
    target_aspect_ratio = find_closest_aspect_ratio(
        aspect_ratio, target_ratios, orig_width, orig_height, image_size)

    # calculate the target width and height
    target_width = image_size * target_aspect_ratio[0]
    target_height = image_size * target_aspect_ratio[1]
    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

    # resize the image
    resized_img = image.resize((target_width, target_height))
    processed_images = []
    for i in range(blocks):
        box = (
            (i % (target_width // image_size)) * image_size,
            (i // (target_width // image_size)) * image_size,
            ((i % (target_width // image_size)) + 1) * image_size,
            ((i // (target_width // image_size)) + 1) * image_size
        )
        # split the image
        split_img = resized_img.crop(box)
        processed_images.append(split_img)
    assert len(processed_images) == blocks
    if use_thumbnail and len(processed_images) != 1:
        thumbnail_img = image.resize((image_size, image_size))
        processed_images.append(thumbnail_img)
    return processed_images

def load_image(image_file, input_size=448, max_num=12):
    image = Image.open(image_file).convert('RGB')
    transform = build_transform(input_size=input_size)
    images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    pixel_values = [transform(image) for image in images]
    pixel_values = torch.stack(pixel_values)
    return pixel_values

if __name__ == "__main__":

    """
    python3 infer_axmodel.py --vit_model vit-models/internvl_vit_model_1x3x448x448.axmodel --images examples/image_0.jpg
    """
    prompt = None
    parser = argparse.ArgumentParser(description="Model configuration parameters")
    parser.add_argument("--hf_model", type=str, default="./InternVL3_5-1B",
                        help="Path to HuggingFace model")
    parser.add_argument("--axmodel_path", type=str, default="./InternVL3_5-1B_axmodel",
                        help="Path to the compiled LLM axmodel directory")
    parser.add_argument("--vit_model", type=str, default=None, help="Path to the compiled ViT axmodel")
    parser.add_argument("-i", "--images", nargs='+', type=str, default=None,
                        help="Path(s) to the test image(s).")
    parser.add_argument("-q", "--question", type=str, default="请你描述这幅图的内容.",
                        help="Your question that you want to ask the model.")
    args = parser.parse_args()

    hf_model_path = args.hf_model
    axmodel_path = args.axmodel_path
    images = args.images
    prompt = args.question

    device = "cuda" if torch.cuda.is_available() else "cpu"
    embeds = np.load(os.path.join(axmodel_path, "model.embed_tokens.weight.npy"))

    # load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(hf_model_path)
    config = AutoConfig.from_pretrained(hf_model_path, trust_remote_code=True)

    # model = AutoModelForCausalLM.from_pretrained(
    #     hf_model_path,
    # ).to(device)

    test_imgs_path = args.images
    vit_axmodel_path = args.vit_model

    # set the max number of tiles in `max_num`
    pixel_values_list = []
    if test_imgs_path is not None:
        for img_path in test_imgs_path:
            pixel_values = load_image(img_path, input_size=448, max_num=1)
            pixel_values_list.append(pixel_values)
        print(f"输入图像数: {len(pixel_values_list)}")
        print("preprocess image done!")

    # extract img features with the ViT (only when images were given, so text-only runs do not need --vit_model)
    vit_output_list = []
    if pixel_values_list:
        vit_session = InferenceSession(vit_axmodel_path)
        for idx, pixel_values in enumerate(pixel_values_list):
            vit_output = vit_session.run(None, {"image": pixel_values.numpy()})[0]
            vit_output_list.append(vit_output.copy())  # copy so the ViT outputs do not alias the same buffer
        print(f"vit_output.shape is {vit_output_list[0].shape}, vit feature extract done!")

    prompt = "<|im_start|>system\n你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型, 英文名叫 InternVL3, 是一个有用无害的人工智能助手, 擅长思考和回答用户的问题. 请你在回答问题时使用简体中文.<|im_end|>\n"
    question = args.question
    prompt += "<|im_start|>user\n" + question

    if len(pixel_values_list) > 0:
        for idx in range(len(pixel_values_list)):
            prompt += "\n<img>" + "<IMG_CONTEXT>" * 256 + "</img>\n"
    prompt += "<|im_end|>\n<|im_start|>assistant\n"
    print(f"prompt is {prompt}")
    token_ids = tokenizer.encode(prompt)
    # image understanding: locate the <img> placeholders in the token stream
    image_start_indices = np.where(np.array(token_ids) == 151669)[0].tolist()  # <img> tag 151669, 151665
    prefill_data = np.take(embeds, token_ids, axis=0)
    prefill_data = prefill_data.astype(bfloat16)
    token_len = len(token_ids)

    for idx, image_start_index in enumerate(image_start_indices):
        image_insert_index = image_start_index + 1
        prefill_data[image_insert_index : image_insert_index + 256] = vit_output_list[idx][0, :, :]
    ##################################

    if hasattr(config, 'llm_config') and config.llm_config is not None:  # compatible with the GPTQ INT4 model config
        cfg = config.llm_config
    else:
        cfg = config

    eos_token_id = None
    if isinstance(cfg.eos_token_id, list) and len(cfg.eos_token_id) > 1:
        eos_token_id = cfg.eos_token_id

    slice_len = 128
    prefill_max_len = 1024 - 1
    max_seq_len = 2048 - 1  # prefill + decode max length

    imer = InferManager(cfg, axmodel_path, max_seq_len=max_seq_len)  # prefill + decode max length
    token_ids = imer.prefill(tokenizer, token_ids, prefill_data, slice_len=slice_len)
    imer.decode(tokenizer, token_ids, embeds, slice_len=slice_len, eos_token_id=eos_token_id)
    print("\n")
```
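The `p128` in the `qwen3_p128_l*_together.axmodel` file names corresponds to `slice_len = 128` above: prefill runs in 128-token slices, which is why the README logs show `slice_indices: [0]` for the text-only prompt and `[0, 1, 2]` once an image adds its 256 `<IMG_CONTEXT>` tokens. A rough sketch of that arithmetic (the token counts are illustrative):

```python
import math

SLICE_LEN = 128  # matches the "p128" in the axmodel file names

def num_prefill_slices(prompt_tokens: int) -> int:
    # each prefill pass feeds at most SLICE_LEN tokens to the NPU
    return math.ceil(prompt_tokens / SLICE_LEN)

print(num_prefill_slices(90))            # short text-only prompt -> 1 slice
print(num_prefill_slices(90 + 2 + 256))  # plus <img>, 256 x <IMG_CONTEXT>, </img> -> 3 slices
```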
infer_torch.py
ADDED
@@ -0,0 +1,212 @@
```python
import numpy as np
import torch
import torchvision.transforms as T
# from decord import VideoReader, cpu  # only needed for the commented-out video example below
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    best_ratio_diff = float('inf')
    best_ratio = (1, 1)
    area = width * height
    for ratio in target_ratios:
        target_aspect_ratio = ratio[0] / ratio[1]
        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
        if ratio_diff < best_ratio_diff:
            best_ratio_diff = ratio_diff
            best_ratio = ratio
        elif ratio_diff == best_ratio_diff:
            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                best_ratio = ratio
    return best_ratio

def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    # calculate the existing image aspect ratio
    target_ratios = set(
        (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
        i * j <= max_num and i * j >= min_num)
    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

    # find the closest aspect ratio to the target
    target_aspect_ratio = find_closest_aspect_ratio(
        aspect_ratio, target_ratios, orig_width, orig_height, image_size)

    # calculate the target width and height
    target_width = image_size * target_aspect_ratio[0]
    target_height = image_size * target_aspect_ratio[1]
    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

    # resize the image
    resized_img = image.resize((target_width, target_height))
    processed_images = []
    for i in range(blocks):
        box = (
            (i % (target_width // image_size)) * image_size,
            (i // (target_width // image_size)) * image_size,
            ((i % (target_width // image_size)) + 1) * image_size,
            ((i // (target_width // image_size)) + 1) * image_size
        )
        # split the image
        split_img = resized_img.crop(box)
        processed_images.append(split_img)
    assert len(processed_images) == blocks
    if use_thumbnail and len(processed_images) != 1:
        thumbnail_img = image.resize((image_size, image_size))
        processed_images.append(thumbnail_img)
    return processed_images

def load_image(image_file, input_size=448, max_num=12):
    image = Image.open(image_file).convert('RGB')
    transform = build_transform(input_size=input_size)
    images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    pixel_values = [transform(image) for image in images]
    pixel_values = torch.stack(pixel_values)
    return pixel_values

path = './InternVL3_5-1B'
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    load_in_8bit=False,
    low_cpu_mem_usage=True,
    use_flash_attn=True,
    trust_remote_code=True,
    device_map="auto").eval()
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)

# set the max number of tiles in `max_num`
pixel_values = load_image('./examples/image_1.jpg', input_size=448, max_num=1).to(torch.bfloat16).cuda()
generation_config = dict(max_new_tokens=1024, do_sample=True)

# pure-text conversation (纯文本对话)
question = '中国的首都'
response, history = model.chat(tokenizer, None, question, generation_config, history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')


# single-image single-round conversation (单图单轮对话)
question = '<image>\n请你描述这幅图的内容.'
response = model.chat(tokenizer, pixel_values, question, generation_config)
print(f'User: {question}\nAssistant: {response}')

# # single-image multi-round conversation (单图多轮对话)
# question = '<image>\nPlease describe the image in detail.'
# response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
# print(f'User: {question}\nAssistant: {response}')

# question = 'Please write a poem according to the image.'
# response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=history, return_history=True)
# print(f'User: {question}\nAssistant: {response}')

# # multi-image multi-round conversation, combined images (多图多轮对话,拼接图像)
# pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
# pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
# pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)

# question = '<image>\nDescribe the two images in detail.'
# response, history = model.chat(tokenizer, pixel_values, question, generation_config,
#                                history=None, return_history=True)
# print(f'User: {question}\nAssistant: {response}')

# question = 'What are the similarities and differences between these two images.'
# response, history = model.chat(tokenizer, pixel_values, question, generation_config,
#                                history=history, return_history=True)
# print(f'User: {question}\nAssistant: {response}')

# # multi-image multi-round conversation, separate images (多图多轮对话,独立图像)
# pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
# pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
# pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
# num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]

# question = 'Image-1: <image>\nImage-2: <image>\nDescribe the two images in detail.'
# response, history = model.chat(tokenizer, pixel_values, question, generation_config,
#                                num_patches_list=num_patches_list,
#                                history=None, return_history=True)
# print(f'User: {question}\nAssistant: {response}')

# question = 'What are the similarities and differences between these two images.'
# response, history = model.chat(tokenizer, pixel_values, question, generation_config,
#                                num_patches_list=num_patches_list,
#                                history=history, return_history=True)
# print(f'User: {question}\nAssistant: {response}')

# # batch inference, single image per sample (单图批处理)
# pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
# pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
# num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
# pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)

# questions = ['<image>\nDescribe the image in detail.'] * len(num_patches_list)
# responses = model.batch_chat(tokenizer, pixel_values,
#                              num_patches_list=num_patches_list,
#                              questions=questions,
#                              generation_config=generation_config)
# for question, response in zip(questions, responses):
#     print(f'User: {question}\nAssistant: {response}')

# # video multi-round conversation (视频多轮对话)
# def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
#     if bound:
#         start, end = bound[0], bound[1]
#     else:
#         start, end = -100000, 100000
#     start_idx = max(first_idx, round(start * fps))
#     end_idx = min(round(end * fps), max_frame)
#     seg_size = float(end_idx - start_idx) / num_segments
#     frame_indices = np.array([
#         int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
#         for idx in range(num_segments)
#     ])
#     return frame_indices

# def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=32):
#     vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
#     max_frame = len(vr) - 1
#     fps = float(vr.get_avg_fps())

#     pixel_values_list, num_patches_list = [], []
#     transform = build_transform(input_size=input_size)
#     frame_indices = get_index(bound, fps, max_frame, first_idx=0, num_segments=num_segments)
#     for frame_index in frame_indices:
#         img = Image.fromarray(vr[frame_index].asnumpy()).convert('RGB')
#         img = dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num)
#         pixel_values = [transform(tile) for tile in img]
#         pixel_values = torch.stack(pixel_values)
#         num_patches_list.append(pixel_values.shape[0])
#         pixel_values_list.append(pixel_values)
#     pixel_values = torch.cat(pixel_values_list)
#     return pixel_values, num_patches_list

# video_path = './examples/red-panda.mp4'
# pixel_values, num_patches_list = load_video(video_path, num_segments=8, max_num=1)
# pixel_values = pixel_values.to(torch.bfloat16).cuda()
# video_prefix = ''.join([f'Frame{i+1}: <image>\n' for i in range(len(num_patches_list))])
# question = video_prefix + 'What is the red panda doing?'
# # Frame1: <image>\nFrame2: <image>\n...\nFrame8: <image>\n{question}
# response, history = model.chat(tokenizer, pixel_values, question, generation_config,
#                                num_patches_list=num_patches_list, history=None, return_history=True)
# print(f'User: {question}\nAssistant: {response}')

# question = 'Describe this video in detail.'
# response, history = model.chat(tokenizer, pixel_values, question, generation_config,
#                                num_patches_list=num_patches_list, history=history, return_history=True)
# print(f'User: {question}\nAssistant: {response}')
```
internvl3-5_axmodel/model.embed_tokens.weight.bfloat16.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d23be80431651d6c1dc8a9a89d35ffc0565d9114c0b4675d085dad1f7ab5d89f
+size 622329856
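Every weight shard below is tracked with Git LFS, so the diff records only a three-line pointer file: the spec version, the sha256 of the real blob, and its size in bytes. After a `git lfs pull`, a local copy can be verified against its pointer; a minimal sketch, using the path and values from the pointer above:

    import hashlib
    import os

    def verify_lfs_blob(path, expected_oid, expected_size):
        # Compare on-disk size and sha256 with the LFS pointer fields.
        assert os.path.getsize(path) == expected_size, "size mismatch"
        h = hashlib.sha256()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(1 << 20), b""):
                h.update(chunk)
        assert h.hexdigest() == expected_oid, "sha256 mismatch"

    verify_lfs_blob(
        "internvl3-5_axmodel/model.embed_tokens.weight.bfloat16.bin",
        "d23be80431651d6c1dc8a9a89d35ffc0565d9114c0b4675d085dad1f7ab5d89f",
        622329856,
    )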
internvl3-5_axmodel/model.embed_tokens.weight.float32.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1604cef8ba75bc3c615e8b2853734464f54a26abea11f441e98f18fc49be24ab
+size 1244659712
internvl3-5_axmodel/model.embed_tokens.weight.npy
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0fd10721363fc0a9e0bd780faad607e032524fb4f6bcccf78068a5f7fe5319fc
+size 1244659840
internvl3-5_axmodel/qwen3_p128_l0_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:71702b53c25639f32047fb391aa816d0a3fbdd076532eaca815a8ea86352e063
+size 35275739
internvl3-5_axmodel/qwen3_p128_l10_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e14a765456a52ffaeecc16f419d71727944233f71f13db9fecbc9cb2f8230e57
+size 35275739
internvl3-5_axmodel/qwen3_p128_l11_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:83ff2e11d13e501a978657a9a4e0f1fba5bd7b83ec93d6e45967fe8000dc79ba
+size 35275739
internvl3-5_axmodel/qwen3_p128_l12_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e4a517dfb88f693dde987f1b822eb3ca9003bd0b942113a04c21d9034afcda52
+size 35275739
internvl3-5_axmodel/qwen3_p128_l13_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c6d4675b57b3e87cd74b3ae15793699886ca5d3f3aab3227260909254f69c3d5
+size 35275739
internvl3-5_axmodel/qwen3_p128_l14_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:65dddb4171b6f688dd2bd998589e5057b8453d39e0a95268d3bf12fef5ec23f1
+size 35275739
internvl3-5_axmodel/qwen3_p128_l15_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:351dac6756d1c14f4786d1b55d34231e24641bc0d4e6899b6d455f9dcc6f4ac7
+size 35275739
internvl3-5_axmodel/qwen3_p128_l16_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9372ec58825360c4c1d66e090749ec74fb589e9538d4f63d2ffb4ed2210c625a
+size 35275739
internvl3-5_axmodel/qwen3_p128_l17_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7c1b676f03db77238ecc374a831968ddce9737a9abf4dbf7a95c0d14e50786b1
+size 35275739
internvl3-5_axmodel/qwen3_p128_l18_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:625f9855cd7b7c486686ec3a38711748340eb8087ce126557171ed49bf0f1a7d
+size 35275739
internvl3-5_axmodel/qwen3_p128_l19_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:96da8471a519e7cbd1236b7964e494e587d2350c6f000d3d4bc6266d8422b723
+size 35275739
internvl3-5_axmodel/qwen3_p128_l1_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:499cd84ed255cd5f19ea4f6e3886f6483f785d583ead804dee33cbc5b89c3950
+size 35275739
internvl3-5_axmodel/qwen3_p128_l20_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4c190f2267217f3ec07169db2cf34718a1f2a8360edc7a4737eeb1d8aaa832eb
+size 35275739
internvl3-5_axmodel/qwen3_p128_l21_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:93640f21e3a7a308a9313ce76e830d6ab9047ac78b788d59303cf2f9afdb5e73
+size 35275739
internvl3-5_axmodel/qwen3_p128_l22_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:884d394f2c0ba2b571daa7d9350429f1c153bce7f91df67472c5758c874ba82a
+size 35275739
internvl3-5_axmodel/qwen3_p128_l23_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:af327bcf09933e6087c100fbc5ac52a142b201f5e55db81ec4b1fefb3dfb8d37
+size 35275739
internvl3-5_axmodel/qwen3_p128_l24_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3ceee8ba3e472826e49a4977d01bb8f245f248250be56d4b04bbe6a81f6e03e9
+size 35275739
internvl3-5_axmodel/qwen3_p128_l25_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dc0817cecbb9809895154c0ed3d1053221767509bcf3e695df73ff6c4762083b
+size 35275739
internvl3-5_axmodel/qwen3_p128_l26_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7ecf825ae0bc56cc0bc96b7062fb6dedceb2daad0665f9ae6000efe9a412acf3
+size 35275739
internvl3-5_axmodel/qwen3_p128_l27_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:37050cbaa94e0a14a55b8d441775a0120deb35a3fc27207d3e3deb73635937c0
+size 35275739
internvl3-5_axmodel/qwen3_p128_l2_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:07b3ac58fa480f03751c8535238ff232fbc0af5068667c110dda74d5594f6b37
+size 35275739
internvl3-5_axmodel/qwen3_p128_l3_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0db1e97fa4d4c4c5329e104e25eb0f9670cbbcdfa8b058fb7cc1347212b42324
+size 35275739
internvl3-5_axmodel/qwen3_p128_l4_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:faf0b89b9d025cfb647e7d64845135e2cf37203be061e885c6bedd64b93e4d23
+size 35275739
internvl3-5_axmodel/qwen3_p128_l5_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b75eb62ae15769a503ccf2061e51133b9f2daf3061623ab2952137e1a147b36f
+size 35275739
internvl3-5_axmodel/qwen3_p128_l6_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:67086c8f9ffe5f46857c435108ad587a2744c448d81ecbcd80c223a4047cf71d
+size 35275739
internvl3-5_axmodel/qwen3_p128_l7_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9ed5a3a5e72d5bac083e803727001f5436e59e969885c55fe719cec99a2f3016
+size 35275739
internvl3-5_axmodel/qwen3_p128_l8_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6eb5ba6a32c93cb1b678d7a857e2aa80c44cb169e9d880d10f01a00ad06a30b8
+size 35275739
internvl3-5_axmodel/qwen3_p128_l9_together.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e192956cce5923f46923a7f1a94be90dd489d72a89e1abaafe090baa201ecd5a
+size 35275739
internvl3-5_axmodel/qwen3_post.axmodel
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:42f2237caf159f574c3d796adc93f2337c2b76fd18413ce90301781669fecdc4
+size 340033671
internvl3-5_tokenizer/added_tokens.json
ADDED
@@ -0,0 +1,37 @@
+{
+  "</box>": 151677,
+  "</img>": 151670,
+  "</quad>": 151673,
+  "</ref>": 151675,
+  "</think>": 151668,
+  "</tool_call>": 151658,
+  "</tool_response>": 151666,
+  "<IMG_CONTEXT>": 151671,
+  "<box>": 151676,
+  "<img>": 151669,
+  "<quad>": 151672,
+  "<ref>": 151674,
+  "<think>": 151667,
+  "<tool_call>": 151657,
+  "<tool_response>": 151665,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}
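On top of Qwen's special tokens, the tokenizer registers InternVL's vision tokens: <img>/</img> delimit an image, <IMG_CONTEXT> (id 151671) marks each position where a vision-encoder embedding is spliced into the text sequence, and <box>/<quad>/<ref> support grounding output. A minimal sketch, assuming the tokenizer loads straight from the internvl3-5_tokenizer directory:

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("internvl3-5_tokenizer")
    img_ctx_id = tok.convert_tokens_to_ids("<IMG_CONTEXT>")
    assert img_ctx_id == 151671  # matches added_tokens.json

    # Upstream InternVL expands an '<image>' placeholder into
    # <img> + N * <IMG_CONTEXT> + </img> before tokenizing (N = visual tokens per image).
    prompt = "<img>" + "<IMG_CONTEXT>" * 4 + "</img>\nDescribe the image."
    input_ids = tok(prompt)["input_ids"]
    print(input_ids.count(img_ctx_id))  # -> 4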
internvl3-5_tokenizer/config.json
ADDED
@@ -0,0 +1,89 @@
+{
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "debug": false,
+  "dtype": "bfloat16",
+  "eos_token_id": 151645,
+  "ep_size": 1,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 6144,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 40960,
+  "max_window_layers": 28,
+  "micro_forward": false,
+  "model_type": "qwen3",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 8,
+  "quantization_config": {
+    "bits": 4,
+    "checkpoint_format": "gptq",
+    "desc_act": false,
+    "group_size": 128,
+    "lm_head": false,
+    "meta": {
+      "act_group_aware": false,
+      "damp_auto_increment": 0.01,
+      "damp_percent": 0.01,
+      "mse": 0.0,
+      "quantizer": [
+        "gptqmodel:5.0.0-dev0"
+      ],
+      "static_groups": false,
+      "true_sequential": true,
+      "uri": "https://github.com/modelcloud/gptqmodel",
+      "v2": false,
+      "v2_alpha": 0.25
+    },
+    "pack_dtype": "int32",
+    "quant_method": "gptq",
+    "sym": true
+  },
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000,
+  "skip_checkpoint": false,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "transformers_version": "4.56.2",
+  "use_cache": false,
+  "use_deepep": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}
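The config describes the language half of the model: a 28-layer Qwen3 decoder with hidden size 2048, grouped-query attention (16 query heads sharing 8 KV heads, head_dim 128), and GPTQ 4-bit weights at group size 128. A quick consistency check against this file, assuming a bfloat16 KV cache (the on-device axmodel cache dtype may differ):

    import json

    with open("internvl3-5_tokenizer/config.json") as f:
        cfg = json.load(f)

    # hidden_size must equal query heads times head_dim: 16 * 128 = 2048.
    assert cfg["hidden_size"] == cfg["num_attention_heads"] * cfg["head_dim"]

    # Per-token KV cache: K and V per layer across 8 KV heads, 2 bytes each in bf16.
    kv_bytes = 2 * cfg["num_hidden_layers"] * cfg["num_key_value_heads"] * cfg["head_dim"] * 2
    print(f"KV cache per token: {kv_bytes // 1024} KiB")  # -> 112 KiB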