Commit 1601280 by yongqiang · 0 Parent(s)

Initialize the repository

This view is limited to 50 files because it contains too many changes.
Files changed (50):
  1. .gitattributes +44 -0
  2. .gitignore +3 -0
  3. README.md +148 -0
  4. assets/demo_1.png +3 -0
  5. assets/demo_2.png +3 -0
  6. config.json +0 -0
  7. examples/image_0.jpg +3 -0
  8. examples/image_1.jpg +3 -0
  9. examples/image_2.png +3 -0
  10. examples/image_3.png +3 -0
  11. examples/laorenshuaidao.mp4 +3 -0
  12. examples/red-panda.mp4 +3 -0
  13. examples/tuboshu.mp4 +3 -0
  14. gradio_demo.py +392 -0
  15. infer_axmodel.py +186 -0
  16. infer_torch.py +212 -0
  17. internvl3-5_axmodel/model.embed_tokens.weight.bfloat16.bin +3 -0
  18. internvl3-5_axmodel/model.embed_tokens.weight.float32.bin +3 -0
  19. internvl3-5_axmodel/model.embed_tokens.weight.npy +3 -0
  20. internvl3-5_axmodel/qwen3_p128_l0_together.axmodel +3 -0
  21. internvl3-5_axmodel/qwen3_p128_l10_together.axmodel +3 -0
  22. internvl3-5_axmodel/qwen3_p128_l11_together.axmodel +3 -0
  23. internvl3-5_axmodel/qwen3_p128_l12_together.axmodel +3 -0
  24. internvl3-5_axmodel/qwen3_p128_l13_together.axmodel +3 -0
  25. internvl3-5_axmodel/qwen3_p128_l14_together.axmodel +3 -0
  26. internvl3-5_axmodel/qwen3_p128_l15_together.axmodel +3 -0
  27. internvl3-5_axmodel/qwen3_p128_l16_together.axmodel +3 -0
  28. internvl3-5_axmodel/qwen3_p128_l17_together.axmodel +3 -0
  29. internvl3-5_axmodel/qwen3_p128_l18_together.axmodel +3 -0
  30. internvl3-5_axmodel/qwen3_p128_l19_together.axmodel +3 -0
  31. internvl3-5_axmodel/qwen3_p128_l1_together.axmodel +3 -0
  32. internvl3-5_axmodel/qwen3_p128_l20_together.axmodel +3 -0
  33. internvl3-5_axmodel/qwen3_p128_l21_together.axmodel +3 -0
  34. internvl3-5_axmodel/qwen3_p128_l22_together.axmodel +3 -0
  35. internvl3-5_axmodel/qwen3_p128_l23_together.axmodel +3 -0
  36. internvl3-5_axmodel/qwen3_p128_l24_together.axmodel +3 -0
  37. internvl3-5_axmodel/qwen3_p128_l25_together.axmodel +3 -0
  38. internvl3-5_axmodel/qwen3_p128_l26_together.axmodel +3 -0
  39. internvl3-5_axmodel/qwen3_p128_l27_together.axmodel +3 -0
  40. internvl3-5_axmodel/qwen3_p128_l2_together.axmodel +3 -0
  41. internvl3-5_axmodel/qwen3_p128_l3_together.axmodel +3 -0
  42. internvl3-5_axmodel/qwen3_p128_l4_together.axmodel +3 -0
  43. internvl3-5_axmodel/qwen3_p128_l5_together.axmodel +3 -0
  44. internvl3-5_axmodel/qwen3_p128_l6_together.axmodel +3 -0
  45. internvl3-5_axmodel/qwen3_p128_l7_together.axmodel +3 -0
  46. internvl3-5_axmodel/qwen3_p128_l8_together.axmodel +3 -0
  47. internvl3-5_axmodel/qwen3_p128_l9_together.axmodel +3 -0
  48. internvl3-5_axmodel/qwen3_post.axmodel +3 -0
  49. internvl3-5_tokenizer/added_tokens.json +37 -0
  50. internvl3-5_tokenizer/config.json +89 -0
.gitattributes ADDED
@@ -0,0 +1,44 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
*.axmodel filter=lfs diff=lfs merge=lfs -text
main_api_ax650 filter=lfs diff=lfs merge=lfs -text
main_api_axcl_x86 filter=lfs diff=lfs merge=lfs -text
main_ax650 filter=lfs diff=lfs merge=lfs -text
main_axcl_x86 filter=lfs diff=lfs merge=lfs -text
*.png filter=lfs diff=lfs merge=lfs -text
*.jpg filter=lfs diff=lfs merge=lfs -text
*.mp4 filter=lfs diff=lfs merge=lfs -text
internvl3-5_tokenizer/tokenizer.json filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED
@@ -0,0 +1,3 @@
__pycache__
*tmp/

README.md ADDED
@@ -0,0 +1,148 @@
---
library_name: transformers
license: bsd-3-clause
base_model:
- OpenGVLab/InternVL3_5-2B
tags:
- InternVL3
- InternVL3_5-2B
- InternVL3_5-2B_GPTQ_INT4
- Int8
- VLM
pipeline_tag: image-text-to-text
language:
- en
---

# InternVL3_5-2B_GPTQ_INT4

This version of InternVL3_5-2B_GPTQ_INT4 has been converted to run on the Axera NPU using **w4a16** quantization.

This model has been optimized with the following LoRA:

Compatible with Pulsar2 version: 5.1-patch1.

Please note that the model's context length is 2K tokens and the maximum prefill length is 1K tokens.

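As a quick sanity check, a prompt's token count can be compared against these limits before prefill. A minimal sketch (the constants mirror `prefill_max_len` and `max_seq_len` in `infer_axmodel.py`; the tokenizer path assumes the repository layout shown under "How to use"):

```python
# Minimal sketch: check a prompt against the 1K prefill / 2K context budget.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("internvl3-5_tokenizer/")
prefill_max_len = 1024 - 1  # maximum prefill length (1K)
max_seq_len = 2048 - 1      # prefill + decode budget (2K context)

token_ids = tokenizer.encode("请你描述这幅图的内容.")
assert len(token_ids) <= prefill_max_len, "prompt exceeds the prefill limit"
print(f"{len(token_ids)} prompt tokens; up to {max_seq_len - len(token_ids)} left for decoding")
```
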
## Conversion tool links

If you are interested in model conversion, you can export the axmodel yourself from the original repo:

https://huggingface.co/OpenGVLab/InternVL3_5-2B

[How to Convert LLM from Huggingface to axmodel](https://github.com/AXERA-TECH/InternVL3_5-2B_GPTQ_INT4.axera/tree/main/model_convert)

[AXera NPU HOST LLM Runtime](https://github.com/AXERA-TECH/ax-llm/tree/ax-internvl)

[AXera NPU AXCL LLM Runtime](https://github.com/AXERA-TECH/ax-llm/tree/axcl-internvl)

## Supported Platforms

- AX650
- AX650N DEMO Board
- [M4N-Dock(爱芯派Pro)](https://wiki.sipeed.com/hardware/zh/maixIV/m4ndock/m4ndock.html)
- [M.2 Accelerator card](https://axcl-docs.readthedocs.io/zh-cn/latest/doc_guide_hardware.html)

| Chip | Image encoder (448×448) | TTFT | Decode (w8a16) |
|--|--|--|--|
| AX650 | 364.412 ms | 4951.50 ms | 28.07 tokens/sec |

## How to use

Download all files from this repository to the device:

```
$ tree -L 1
.
├── assets
├── config.json
├── examples
├── gradio_demo.py
├── infer_axmodel.py
├── infer_torch.py
├── internvl3-5_axmodel
├── internvl3-5_tokenizer
├── README.md
├── utils
└── vit-models

6 directories, 5 files
```

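If the files are not already on the device, they can be fetched with `huggingface_hub`, for example. This is a sketch only: the `repo_id` below is an assumed placeholder, so substitute the actual id of this repository.

```python
# Sketch: download the repository contents into the current directory.
from huggingface_hub import snapshot_download

# NOTE: assumed repo id; replace it with this repository's real id.
snapshot_download(repo_id="AXERA-TECH/InternVL3_5-2B_GPTQ_INT4", local_dir=".")
```
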
#### Install transformers

```
pip install transformers==4.57.1
```

#### Inference on an AX650 host, such as the M4N-Dock(爱芯派Pro) or the AX650 DEMO Board

Start an interactive conversation through the Gradio demo:

```bash
$ python3 gradio_demo.py --hf_model internvl3-5_tokenizer/ --axmodel_path internvl3-5_axmodel/ --vit_model vit-models/internvl_vit_model_1x3x448x448.axmodel
```

Plain-text dialogue:

![demo_1](assets/demo_1.png)

Image understanding:

![demo_2](assets/demo_2.png)

---

Run the following command on the Axera board to start a chat conversation:

```sh
$ python3 infer_axmodel.py --hf_model internvl3-5_tokenizer/ --axmodel_path internvl3-5_axmodel/ --question "请计算函数[y=2x^2+2]的导数, 并提供 markdown 格式的推理过程"
```

Output:

```bash
[INFO] Using provider: AxEngineExecutionProvider
[INFO] Model type: 2 (triple core)
[INFO] Compiler version: 5.1-dirty 0fdbfe15-dirty
Model loaded successfully!
slice_indices: [0]
Slice prefill done: 0
answer >> 函数 \( y = 2x^2 + 2 \) 的导数可以通过求导法则来计算。首先,我们对函数中的每一项分别求导:

1. 对于 \( 2x^2 \),使用幂法则求导:
\[
\frac{d}{dx}(2x^2) = 2 \cdot 2x = 4x
\]

2. 对于常数项 \( 2 \),其导数为 0,因为常数的导数为 0。

将这两部分的结果相加,得到函数 \( y \) 的导数:
\[
y' = 4x
\]

因此,函数 \( y = 2x^2 + 2 \) 的导数为 \( y' = 4x \)。
```

Enter the following command to perform the single-image understanding task:

```sh
$ python3 infer_axmodel.py --hf_model internvl3-5_tokenizer/ --axmodel_path internvl3-5_axmodel/ --question "请描述这幅图" -i examples/image_0.jpg --vit_model vit-models/internvl_vit_model_1x3x448x448.axmodel
```

![image_0.jpg](examples/image_0.jpg)

Output:

```bash
[INFO] Model type: 2 (triple core)
[INFO] Compiler version: 5.1-dirty 0fdbfe15-dirty
Model loaded successfully!
slice_indices: [0, 1, 2]
Slice prefill done: 0
Slice prefill done: 1
Slice prefill done: 2
answer >> 这是一张红熊猫的照片。红熊猫是一种红棕色的哺乳动物,通常生活在亚洲的森林中。它们以捕食昆虫和小型无脊椎动物为生。图片中,红熊猫正坐在一个木制的平台上,背景是绿色的树木和植被,显得非常自然和生动。红熊猫的表情看起来很友好,似乎在观察或等待什么。
```
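
The `slice_indices` lines in the two logs above come from chunked prefill: the prompt is prefilled in `slice_len`-token slices (128 in these scripts), so a short text-only prompt needs one slice while an image prompt (256 `<IMG_CONTEXT>` tokens plus text) needs three. The `InferManager` internals live under `utils/` and are not shown in this view, so the following is only an illustrative sketch of how the slice count falls out of the prompt length:

```python
# Illustrative sketch (not the actual InferManager implementation):
# prefill runs once per slice_len-token chunk of the prompt.
import math

def prefill_slice_indices(num_prompt_tokens: int, slice_len: int = 128) -> list:
    return list(range(math.ceil(num_prompt_tokens / slice_len)))

print(prefill_slice_indices(60))   # [0]       -> short text-only prompt
print(prefill_slice_indices(330))  # [0, 1, 2] -> image prompt with 256 <IMG_CONTEXT> tokens
```
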
assets/demo_1.png ADDED

Git LFS Details

  • SHA256: 6340140c81bf679b2ba9aa494e1526f6db9b6e435221dc6c582b2887f8d8e9a6
  • Pointer size: 131 Bytes
  • Size of remote file: 395 kB
assets/demo_2.png ADDED

Git LFS Details

  • SHA256: 4c7d95c191d1afbf33ea4054a561a2540fd6c0d59dd78e397b88d41c0ed1fd33
  • Pointer size: 132 Bytes
  • Size of remote file: 1.31 MB
config.json ADDED
File without changes
examples/image_0.jpg ADDED

Git LFS Details

  • SHA256: c587294b3bf637dacbb3c96324c127187a2f242c94f639633a0d8a2775a9a399
  • Pointer size: 130 Bytes
  • Size of remote file: 78.1 kB
examples/image_1.jpg ADDED

Git LFS Details

  • SHA256: 08487494b8dc08d44bc36491adf3ab89ff30d13a3122da86f3cd67cad89eeee8
  • Pointer size: 131 Bytes
  • Size of remote file: 126 kB
examples/image_2.png ADDED

Git LFS Details

  • SHA256: 622ae2d01ff4467fa69a7888728d776650117a0f4887e96ba0fb9a8a6d77b3c3
  • Pointer size: 131 Bytes
  • Size of remote file: 355 kB
examples/image_3.png ADDED

Git LFS Details

  • SHA256: 729e80e77d8611778859d2f232cb7f2a8fda04ed67dd8dcc3e7cd7a657367402
  • Pointer size: 131 Bytes
  • Size of remote file: 394 kB
examples/laorenshuaidao.mp4 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8f5c00b37b23af3d01d133da880eb7f6e50d4af608e3575784be7063eb137011
size 2704112

examples/red-panda.mp4 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d921c07bb97224d65a37801541d246067f0d506f08723ffa1ad85c217907ccb8
size 1867237

examples/tuboshu.mp4 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ced4d95877b9a7f8b48f79bdfe4287eff8837f20348daec2f2e2987459ec1712
size 5952043

gradio_demo.py ADDED
@@ -0,0 +1,392 @@
import argparse
import os
import time
from typing import Generator, List, Optional

import gradio as gr
import numpy as np
import torch
import torchvision.transforms as T
from ml_dtypes import bfloat16
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoConfig, AutoTokenizer

from utils.infer_func import InferManager
from axengine import InferenceSession

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
IMG_PLACEHOLDER_TOKEN_ID = 151669  # <img>
IMG_CONTEXT_REPEAT = 256  # number of image context tokens expected by the model


SYSTEM_PROMPT = (
    "<|im_start|>system\n"
    "你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型, 英文名叫 InternVL3, "
    "是一个有用无害的人工智能助手, 擅长思考和回答用户的问题. 请你在回答问题时使用简体中文."
    "<|im_end|>\n"
)


def build_transform(input_size: int):
    transform = T.Compose([
        T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])
    return transform


def dynamic_preprocess(image: Image.Image, min_num: int = 1, max_num: int = 12, image_size: int = 448,
                       use_thumbnail: bool = False):
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    target_ratios = set(
        (i, j)
        for n in range(min_num, max_num + 1)
        for i in range(1, n + 1)
        for j in range(1, n + 1)
        if i * j <= max_num and i * j >= min_num
    )
    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

    def find_closest_aspect_ratio(ar: float, ratios: List[tuple]):
        best_ratio_diff = float("inf")
        best_ratio = (1, 1)
        area = orig_width * orig_height
        for ratio in ratios:
            target_aspect_ratio = ratio[0] / ratio[1]
            ratio_diff = abs(ar - target_aspect_ratio)
            if ratio_diff < best_ratio_diff:
                best_ratio_diff = ratio_diff
                best_ratio = ratio
            elif ratio_diff == best_ratio_diff:
                if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                    best_ratio = ratio
        return best_ratio

    target_aspect_ratio = find_closest_aspect_ratio(aspect_ratio, target_ratios)
    target_width = image_size * target_aspect_ratio[0]
    target_height = image_size * target_aspect_ratio[1]
    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

    resized_img = image.resize((target_width, target_height))
    processed_images = []
    for i in range(blocks):
        box = (
            (i % (target_width // image_size)) * image_size,
            (i // (target_width // image_size)) * image_size,
            ((i % (target_width // image_size)) + 1) * image_size,
            ((i // (target_width // image_size)) + 1) * image_size,
        )
        split_img = resized_img.crop(box)
        processed_images.append(split_img)
    assert len(processed_images) == blocks
    if use_thumbnail and len(processed_images) != 1:
        processed_images.append(image.resize((image_size, image_size)))
    return processed_images


def load_image(image_file: Image.Image, input_size: int = 448, max_num: int = 12):
    transform = build_transform(input_size=input_size)
    images = dynamic_preprocess(image_file, image_size=input_size, use_thumbnail=True, max_num=max_num)
    pixel_values = [transform(img) for img in images]
    pixel_values = torch.stack(pixel_values)
    return pixel_values


class InternVLGradioDemo:
    def __init__(self, hf_model: str, axmodel_dir: str, vit_axmodel: str, max_seq_len: int = 2047):
        self.hf_model = hf_model
        self.axmodel_dir = axmodel_dir
        self.vit_axmodel = vit_axmodel
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        self.embeds = np.load(os.path.join(axmodel_dir, "model.embed_tokens.weight.npy"))
        self.tokenizer = AutoTokenizer.from_pretrained(self.hf_model)
        config = AutoConfig.from_pretrained(self.hf_model, trust_remote_code=True)
        if hasattr(config, 'llm_config') and config.llm_config is not None:
            self.cfg = config.llm_config
        else:
            self.cfg = config

        self.vit_session = InferenceSession(self.vit_axmodel)
        self.infer_manager = InferManager(self.cfg, self.axmodel_dir, max_seq_len=max_seq_len)

    def _build_single_turn_prompt(self, user_text: str, vit_features: List[np.ndarray]):
        prompt = SYSTEM_PROMPT
        prompt += f"<|im_start|>user\n{user_text}"
        for _ in vit_features:
            prompt += "\n<img>" + "<IMG_CONTEXT>" * IMG_CONTEXT_REPEAT + "</img>"
        prompt += "<|im_end|>\n<|im_start|>assistant\n"
        return prompt

    def _insert_vision_features(self, token_ids: List[int], prefill_data: np.ndarray, vit_features: List[np.ndarray]):
        image_start_indices = np.where(np.array(token_ids) == IMG_PLACEHOLDER_TOKEN_ID)[0].tolist()
        if len(image_start_indices) != len(vit_features):
            raise ValueError("图片数量与占位符数量不一致, 请检查输入和模板生成逻辑")
        for idx, image_start_index in enumerate(image_start_indices):
            insert_pos = image_start_index + 1
            prefill_data[insert_pos: insert_pos + IMG_CONTEXT_REPEAT] = vit_features[idx][0, :, :]
        return prefill_data

    def _run_model(self, prompt: str, vit_features: List[np.ndarray]):
        """Non-streaming inference, kept in case a one-shot result is needed."""
        for k_cache in self.infer_manager.k_caches:
            k_cache.fill(0)
        for v_cache in self.infer_manager.v_caches:
            v_cache.fill(0)

        token_ids = self.tokenizer.encode(prompt)
        prefill_data = np.take(self.embeds, token_ids, axis=0).astype(bfloat16)
        if vit_features:
            prefill_data = self._insert_vision_features(token_ids, prefill_data, vit_features)

        eos_token_id = None
        if isinstance(self.cfg.eos_token_id, list) and len(self.cfg.eos_token_id) > 1:
            eos_token_id = self.cfg.eos_token_id

        slice_len = 128
        token_ids = self.infer_manager.prefill(self.tokenizer, token_ids, prefill_data, slice_len=slice_len)
        return self.infer_manager.decode(
            self.tokenizer,
            token_ids,
            self.embeds,
            slice_len=slice_len,
            eos_token_id=eos_token_id,
            stream=False,
        )

    def _stream_generate(self, prompt: str, vit_features: List[np.ndarray]):
        """Streaming generation: yields the accumulated text token by token, with timing info (TTFT and average decode ms/token)."""
        # reset kv cache per request
        for k_cache in self.infer_manager.k_caches:
            k_cache.fill(0)
        for v_cache in self.infer_manager.v_caches:
            v_cache.fill(0)

        token_ids = self.tokenizer.encode(prompt)
        prefill_data = np.take(self.embeds, token_ids, axis=0).astype(bfloat16)
        if vit_features:
            prefill_data = self._insert_vision_features(token_ids, prefill_data, vit_features)

        eos_token_id = None
        if isinstance(self.cfg.eos_token_id, list) and len(self.cfg.eos_token_id) > 1:
            eos_token_id = self.cfg.eos_token_id

        slice_len = 128
        t_start = time.time()
        token_ids = self.infer_manager.prefill(self.tokenizer, token_ids, prefill_data, slice_len=slice_len)

        # replicate the decode logic here so output can be streamed manually
        mask = np.zeros((1, 1, self.infer_manager.max_seq_len + 1), dtype=np.float32).astype(bfloat16)
        mask[:, :, :self.infer_manager.max_seq_len] -= 65536
        seq_len = len(token_ids) - 1
        if slice_len > 0:
            mask[:, :, :seq_len] = 0

        ttft_ms: Optional[float] = None
        decode_tokens = 0
        decode_elapsed_ms: float = 0.0
        generated_text = ""
        yield generated_text, ttft_ms, None, None, False

        for step_idx in range(self.infer_manager.max_seq_len):
            if slice_len > 0 and step_idx < seq_len:
                continue
            cur_token = token_ids[step_idx]
            indices = np.array([step_idx], np.uint32).reshape((1, 1))
            data = self.embeds[cur_token, :].reshape((1, 1, self.cfg.hidden_size)).astype(bfloat16)
            for layer_idx in range(self.cfg.num_hidden_layers):
                input_feed = {
                    "K_cache": self.infer_manager.k_caches[layer_idx],
                    "V_cache": self.infer_manager.v_caches[layer_idx],
                    "indices": indices,
                    "input": data,
                    "mask": mask,
                }
                outputs = self.infer_manager.decoder_sessions[layer_idx].run(None, input_feed, shape_group=0)
                self.infer_manager.k_caches[layer_idx][:, step_idx, :] = outputs[0][:, :, :]
                self.infer_manager.v_caches[layer_idx][:, step_idx, :] = outputs[1][:, :, :]
                data = outputs[2]
            mask[..., step_idx] = 0
            if step_idx < seq_len - 1:
                continue
            post_out = self.infer_manager.post_process_session.run(None, {"input": data})[0]
            next_token, possible_tokens, possible_probs = self.infer_manager.post_process(post_out, temperature=0.7)
            if eos_token_id is not None and next_token in eos_token_id:
                ttft_ms = ttft_ms or (time.time() - t_start) * 1000
                break
            if next_token == self.tokenizer.eos_token_id:
                ttft_ms = ttft_ms or (time.time() - t_start) * 1000
                break

            token_ids.append(next_token)
            # decode from the full token list so multi-byte UTF-8 characters are
            # not truncated into garbled output; only the newly generated tokens
            # (from seq_len onward) are decoded
            generated_text = self.tokenizer.decode(token_ids[seq_len:], skip_special_tokens=True)

            if ttft_ms is None:
                ttft_ms = (time.time() - t_start) * 1000
            else:
                decode_tokens += 1
                decode_elapsed_ms = (time.time() - t_start) * 1000 - ttft_ms

            avg_decode = (decode_elapsed_ms / decode_tokens) if decode_tokens > 0 else None
            yield generated_text, ttft_ms, avg_decode, decode_tokens, False

        avg_decode = (decode_elapsed_ms / decode_tokens) if decode_tokens > 0 else None
        yield generated_text, ttft_ms, avg_decode, decode_tokens, True

    def chat(self, user_input: str, image: Optional[Image.Image]) -> Generator:
        user_text = (user_input or "").strip()
        if not user_text and image is None:
            yield [], gr.update(), gr.update(), gr.update(), gr.update()
            return

        # show a placeholder reply first (keeping the uploaded image in place),
        # together with placeholder speed metrics
        yield [(user_text, "处理中…")], gr.update(value=""), gr.update(), gr.update(value="<div style='text-align: right; font-size: 13px; color: #6b7280; font-family: monospace;'>TTFT -- ms&nbsp;&nbsp;|&nbsp;&nbsp;Decode -- ms/token&nbsp;&nbsp;|&nbsp;&nbsp;Tokens --</div>"), gr.update(interactive=False)

        vit_outputs = []
        if image is not None:
            pixel_values = load_image(image, input_size=448, max_num=1)
            vit_output = self.vit_session.run(None, {"image": pixel_values.numpy()})[0]
            vit_outputs.append(vit_output.copy())

        prompt = self._build_single_turn_prompt(user_text, vit_outputs)

        chatbot_history = [(user_text, "")]  # filled in while streaming
        for partial, ttft_ms, avg_decode_ms, decode_tokens, finished in self._stream_generate(prompt, vit_outputs):
            chatbot_history[-1] = (user_text, partial)
            ttft_disp = f"{ttft_ms:.0f}" if ttft_ms is not None else "--"
            decode_disp = f"{avg_decode_ms:.1f}" if avg_decode_ms is not None else "--"
            tok_disp = f"{decode_tokens}" if decode_tokens is not None else "--"
            metrics_text = f"<div style='text-align: right; font-size: 13px; color: #6b7280; font-family: monospace;'>TTFT {ttft_disp} ms&nbsp;&nbsp;|&nbsp;&nbsp;Decode {decode_disp} ms/token&nbsp;&nbsp;|&nbsp;&nbsp;Tokens {tok_disp}</div>"
            if finished:
                yield chatbot_history, gr.update(value=""), gr.update(), gr.update(value=metrics_text), gr.update(interactive=True)
            else:
                yield chatbot_history, gr.update(value=""), gr.update(), gr.update(value=metrics_text), gr.update(interactive=False)

    @staticmethod
    def build_ui(demo: "InternVLGradioDemo", server_name: str = "0.0.0.0", server_port: int = 7860, share: bool = False):
        # custom JavaScript: Enter sends the message, Shift+Enter inserts a newline
        custom_js = """
        function() {
            // bind the handlers once the DOM has loaded
            setTimeout(() => {
                const textareas = document.querySelectorAll('#user-input textarea');
                textareas.forEach(textarea => {
                    // remove any previously attached listener
                    textarea.removeEventListener('keydown', textarea._customKeyHandler);

                    textarea._customKeyHandler = function(e) {
                        if (e.key === 'Enter') {
                            if (e.shiftKey) {
                                // Shift+Enter: insert a newline
                                e.preventDefault();
                                const start = this.selectionStart;
                                const end = this.selectionEnd;
                                const value = this.value;
                                this.value = value.substring(0, start) + '\\n' + value.substring(end);
                                this.selectionStart = this.selectionEnd = start + 1;
                                // fire an input event so Gradio picks up the change
                                this.dispatchEvent(new Event('input', { bubbles: true }));
                            } else {
                                // Enter: send the message
                                e.preventDefault();
                                const sendBtn = document.querySelector('#send-btn');
                                if (sendBtn) {
                                    sendBtn.click();
                                }
                            }
                        }
                    };
                    textarea.addEventListener('keydown', textarea._customKeyHandler);
                });
            }, 500);
        }
        """

        with gr.Blocks(title="InternVL3_5-2B_GPTQ_INT4 AX Gradio Demo", theme=gr.themes.Soft(), js=custom_js) as iface:
            gr.HTML("""<style>
            #image-pane img {object-fit: contain; max-height: 380px;}
            #chat-wrap {position: relative;}
            #metrics-display {position: absolute; right: 12px; bottom: 12px; z-index: 5; pointer-events: none; text-align: right;}
            #metrics-display > div {display: inline-block;}
            </style>""")
            gr.Markdown("""### InternVL3_5-2B_GPTQ_INT4 图文对话演示\n上传一张图片 (可选),输入问题,获取中文回答。""")

            with gr.Row():
                # left column: chat box and input area
                with gr.Column(scale=5):
                    with gr.Group(elem_id="chat-wrap"):
                        chatbot = gr.Chatbot(height=500, label="对话")
                        metrics_md = gr.Markdown("<div style='text-align: right; font-size: 13px; color: #6b7280; font-family: monospace;'>TTFT -- ms&nbsp;&nbsp;|&nbsp;&nbsp;Decode -- ms/token&nbsp;&nbsp;|&nbsp;&nbsp;Tokens --</div>", elem_id="metrics-display")

                    with gr.Row():
                        user_input = gr.Textbox(
                            placeholder="按 Enter 发送,Shift+Enter 换行",
                            lines=2,
                            scale=7,
                            max_lines=5,
                            show_label=False,
                            elem_id="user-input",
                        )
                        with gr.Column(scale=1, min_width=100):
                            send_btn = gr.Button("发送", variant="primary", size="sm", elem_id="send-btn")
                            clear_btn = gr.Button("清空对话", variant="secondary", size="sm")

                # right column: image upload and usage notes
                with gr.Column(scale=3):
                    image_input = gr.Image(
                        type="pil",
                        label="上传图片 (可选)",
                        height=380,
                        image_mode="RGB",
                        show_download_button=False,
                        elem_id="image-pane",
                    )
                    gr.Markdown("""- 支持单张图像理解\n- 仅当前问题与回答,不保留历史\n- 处理时间取决于硬件,请耐心等待""")

            def _clear():
                return [], gr.update(value=""), gr.update(), gr.update(value="<div style='text-align: right; font-size: 13px; color: #6b7280; font-family: monospace;'>TTFT -- ms&nbsp;&nbsp;|&nbsp;&nbsp;Decode -- ms/token&nbsp;&nbsp;|&nbsp;&nbsp;Tokens --</div>"), gr.update(interactive=True)

            send_btn.click(
                fn=demo.chat,
                inputs=[user_input, image_input],
                outputs=[chatbot, user_input, image_input, metrics_md, send_btn],
                show_progress=False,
                queue=True,
            )
            # user_input.submit is intentionally not wired up; the custom JS
            # handles Enter-to-send and Shift+Enter-for-newline instead
            clear_btn.click(fn=_clear, inputs=None, outputs=[chatbot, user_input, image_input, metrics_md, send_btn])

        iface.queue().launch(server_name=server_name, server_port=server_port, share=share)


def parse_args():
    parser = argparse.ArgumentParser(description="InternVL3-5-2B AX gradio demo")
    parser.add_argument("--hf_model", type=str, default="./InternVL3_5-2B",
                        help="Path to the HuggingFace model")
    parser.add_argument("--axmodel_path", type=str, default="./InternVL3_5-2B_axmodel",
                        help="Directory holding the LLM axmodel files")
    parser.add_argument("--vit_model", type=str, default="./vit-models/internvl_vit_model_1x3x448x448.axmodel",
                        help="Path to the ViT axmodel")
    parser.add_argument("--port", type=int, default=7860, help="Gradio port")
    parser.add_argument("--host", type=str, default="0.0.0.0", help="Gradio listen address")
    parser.add_argument("--share", action="store_true", help="enable gradio share")
    return parser.parse_args()


def main():
    args = parse_args()
    demo = InternVLGradioDemo(args.hf_model, args.axmodel_path, args.vit_model)
    InternVLGradioDemo.build_ui(demo, server_name=args.host, server_port=args.port, share=args.share)


if __name__ == "__main__":
    main()

infer_axmodel.py ADDED
@@ -0,0 +1,186 @@
import argparse
import os

import numpy as np
import torch
import torchvision.transforms as T
from axengine import InferenceSession
from ml_dtypes import bfloat16
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoConfig, AutoTokenizer

from utils.infer_func import InferManager


IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    best_ratio_diff = float('inf')
    best_ratio = (1, 1)
    area = width * height
    for ratio in target_ratios:
        target_aspect_ratio = ratio[0] / ratio[1]
        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
        if ratio_diff < best_ratio_diff:
            best_ratio_diff = ratio_diff
            best_ratio = ratio
        elif ratio_diff == best_ratio_diff:
            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                best_ratio = ratio
    return best_ratio

def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    # calculate the candidate tiling grids
    target_ratios = set(
        (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
        i * j <= max_num and i * j >= min_num)
    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

    # find the closest aspect ratio to the target
    target_aspect_ratio = find_closest_aspect_ratio(
        aspect_ratio, target_ratios, orig_width, orig_height, image_size)

    # calculate the target width and height
    target_width = image_size * target_aspect_ratio[0]
    target_height = image_size * target_aspect_ratio[1]
    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

    # resize the image
    resized_img = image.resize((target_width, target_height))
    processed_images = []
    for i in range(blocks):
        box = (
            (i % (target_width // image_size)) * image_size,
            (i // (target_width // image_size)) * image_size,
            ((i % (target_width // image_size)) + 1) * image_size,
            ((i // (target_width // image_size)) + 1) * image_size
        )
        # split the image
        split_img = resized_img.crop(box)
        processed_images.append(split_img)
    assert len(processed_images) == blocks
    if use_thumbnail and len(processed_images) != 1:
        thumbnail_img = image.resize((image_size, image_size))
        processed_images.append(thumbnail_img)
    return processed_images

def load_image(image_file, input_size=448, max_num=12):
    image = Image.open(image_file).convert('RGB')
    transform = build_transform(input_size=input_size)
    images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    pixel_values = [transform(image) for image in images]
    pixel_values = torch.stack(pixel_values)
    return pixel_values

if __name__ == "__main__":

    """
    python3 infer_axmodel.py --vit_model vit-models/internvl_vit_model_1x3x448x448.axmodel --images examples/image_0.jpg
    """
    parser = argparse.ArgumentParser(description="Model configuration parameters")
    parser.add_argument("--hf_model", type=str, default="./InternVL3_5-1B",
                        help="Path to the HuggingFace model")
    parser.add_argument("--axmodel_path", type=str, default="./InternVL3_5-1B_axmodel",
                        help="Path to the compiled axmodel of the LLM")
    parser.add_argument("--vit_model", type=str, default=None,
                        help="Path to the compiled axmodel of the ViT encoder")
    parser.add_argument("-i", "--images", nargs='+', type=str, default=None,
                        help="Path(s) to the test image(s).")
    parser.add_argument("-q", "--question", type=str, default="请你描述这幅图的内容.",
                        help="Your question that you want to ask the model.")
    args = parser.parse_args()

    hf_model_path = args.hf_model
    axmodel_path = args.axmodel_path

    device = "cuda" if torch.cuda.is_available() else "cpu"
    embeds = np.load(os.path.join(axmodel_path, "model.embed_tokens.weight.npy"))

    # load the tokenizer and the model config
    tokenizer = AutoTokenizer.from_pretrained(hf_model_path)
    config = AutoConfig.from_pretrained(hf_model_path, trust_remote_code=True)

    test_imgs_path = args.images
    vit_axmodel_path = args.vit_model

    # set the max number of tiles in `max_num`
    pixel_values_list = []
    if test_imgs_path is not None:
        for img_path in test_imgs_path:
            pixel_values = load_image(img_path, input_size=448, max_num=1)
            pixel_values_list.append(pixel_values)
        print(f"number of input images: {len(pixel_values_list)}")
        print("preprocess image done!")

    # extract image features with the ViT (only when images were given)
    vit_output_list = []
    if pixel_values_list:
        vit_session = InferenceSession(vit_axmodel_path)
        for idx, pixel_values in enumerate(pixel_values_list):
            vit_output = vit_session.run(None, {"image": pixel_values.numpy()})[0]
            vit_output_list.append(vit_output.copy())  # copy so each ViT output keeps its own memory

        print(f"vit_output.shape is {vit_output_list[0].shape}, vit feature extract done!")

    prompt = "<|im_start|>system\n你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型, 英文名叫 InternVL3, 是一个有用无害的人工智能助手, 擅长思考和回答用户的问题. 请你在回答问题时使用简体中文.<|im_end|>\n"
    question = args.question
    prompt += "<|im_start|>user\n" + question

    if len(pixel_values_list) > 0:
        for idx in range(len(pixel_values_list)):
            prompt += "\n<img>" + "<IMG_CONTEXT>" * 256 + "</img>\n"
    prompt += "<|im_end|>\n<|im_start|>assistant\n"
    print(f"prompt is {prompt}")
    token_ids = tokenizer.encode(prompt)
    # image understanding: find the <img> placeholders (<img> tag is token id 151669)
    image_start_indices = np.where(np.array(token_ids) == 151669)[0].tolist()
    prefill_data = np.take(embeds, token_ids, axis=0)
    prefill_data = prefill_data.astype(bfloat16)

    # splice the ViT features into the embedding sequence right after each <img> token
    for idx, image_start_index in enumerate(image_start_indices):
        image_insert_index = image_start_index + 1
        prefill_data[image_insert_index : image_insert_index + 256] = vit_output_list[idx][0, :, :]

    if hasattr(config, 'llm_config') and config.llm_config is not None:  # compatibility with the GPTQ INT4 model config
        cfg = config.llm_config
    else:
        cfg = config

    eos_token_id = None
    if isinstance(cfg.eos_token_id, list) and len(cfg.eos_token_id) > 1:
        eos_token_id = cfg.eos_token_id

    slice_len = 128
    prefill_max_len = 1024 - 1  # maximum prefill length
    max_seq_len = 2048 - 1  # prefill + decode max length

    imer = InferManager(cfg, axmodel_path, max_seq_len=max_seq_len)
    token_ids = imer.prefill(tokenizer, token_ids, prefill_data, slice_len=slice_len)
    imer.decode(tokenizer, token_ids, embeds, slice_len=slice_len, eos_token_id=eos_token_id)
    print("\n")

infer_torch.py ADDED
@@ -0,0 +1,212 @@
import numpy as np
import torch
import torchvision.transforms as T
from decord import VideoReader, cpu  # only needed for the commented-out video example below
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    best_ratio_diff = float('inf')
    best_ratio = (1, 1)
    area = width * height
    for ratio in target_ratios:
        target_aspect_ratio = ratio[0] / ratio[1]
        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
        if ratio_diff < best_ratio_diff:
            best_ratio_diff = ratio_diff
            best_ratio = ratio
        elif ratio_diff == best_ratio_diff:
            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                best_ratio = ratio
    return best_ratio

def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    # calculate the candidate tiling grids
    target_ratios = set(
        (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
        i * j <= max_num and i * j >= min_num)
    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

    # find the closest aspect ratio to the target
    target_aspect_ratio = find_closest_aspect_ratio(
        aspect_ratio, target_ratios, orig_width, orig_height, image_size)

    # calculate the target width and height
    target_width = image_size * target_aspect_ratio[0]
    target_height = image_size * target_aspect_ratio[1]
    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

    # resize the image
    resized_img = image.resize((target_width, target_height))
    processed_images = []
    for i in range(blocks):
        box = (
            (i % (target_width // image_size)) * image_size,
            (i // (target_width // image_size)) * image_size,
            ((i % (target_width // image_size)) + 1) * image_size,
            ((i // (target_width // image_size)) + 1) * image_size
        )
        # split the image
        split_img = resized_img.crop(box)
        processed_images.append(split_img)
    assert len(processed_images) == blocks
    if use_thumbnail and len(processed_images) != 1:
        thumbnail_img = image.resize((image_size, image_size))
        processed_images.append(thumbnail_img)
    return processed_images

def load_image(image_file, input_size=448, max_num=12):
    image = Image.open(image_file).convert('RGB')
    transform = build_transform(input_size=input_size)
    images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    pixel_values = [transform(image) for image in images]
    pixel_values = torch.stack(pixel_values)
    return pixel_values

path = './InternVL3_5-1B'
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    load_in_8bit=False,
    low_cpu_mem_usage=True,
    use_flash_attn=True,
    trust_remote_code=True,
    device_map="auto").eval()
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)

# set the max number of tiles in `max_num`
pixel_values = load_image('./examples/image_1.jpg', input_size=448, max_num=1).to(torch.bfloat16).cuda()
generation_config = dict(max_new_tokens=1024, do_sample=True)

# pure-text conversation
question = '中国的首都'
response, history = model.chat(tokenizer, None, question, generation_config, history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')


# single-image single-round conversation
question = '<image>\n请你描述这幅图的内容.'
response = model.chat(tokenizer, pixel_values, question, generation_config)
print(f'User: {question}\nAssistant: {response}')

# # single-image multi-round conversation
# question = '<image>\nPlease describe the image in detail.'
# response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
# print(f'User: {question}\nAssistant: {response}')

# question = 'Please write a poem according to the image.'
# response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=history, return_history=True)
# print(f'User: {question}\nAssistant: {response}')

# # multi-image multi-round conversation, combined images
# pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
# pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
# pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)

# question = '<image>\nDescribe the two images in detail.'
# response, history = model.chat(tokenizer, pixel_values, question, generation_config,
#                                history=None, return_history=True)
# print(f'User: {question}\nAssistant: {response}')

# question = 'What are the similarities and differences between these two images.'
# response, history = model.chat(tokenizer, pixel_values, question, generation_config,
#                                history=history, return_history=True)
# print(f'User: {question}\nAssistant: {response}')

# # multi-image multi-round conversation, separate images
# pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
# pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
# pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
# num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]

# question = 'Image-1: <image>\nImage-2: <image>\nDescribe the two images in detail.'
# response, history = model.chat(tokenizer, pixel_values, question, generation_config,
#                                num_patches_list=num_patches_list,
#                                history=None, return_history=True)
# print(f'User: {question}\nAssistant: {response}')

# question = 'What are the similarities and differences between these two images.'
# response, history = model.chat(tokenizer, pixel_values, question, generation_config,
#                                num_patches_list=num_patches_list,
#                                history=history, return_history=True)
# print(f'User: {question}\nAssistant: {response}')

# # batch inference, single image per sample
# pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
# pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
# num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
# pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)

# questions = ['<image>\nDescribe the image in detail.'] * len(num_patches_list)
# responses = model.batch_chat(tokenizer, pixel_values,
#                              num_patches_list=num_patches_list,
#                              questions=questions,
#                              generation_config=generation_config)
# for question, response in zip(questions, responses):
#     print(f'User: {question}\nAssistant: {response}')

# # video multi-round conversation
# def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
#     if bound:
#         start, end = bound[0], bound[1]
#     else:
#         start, end = -100000, 100000
#     start_idx = max(first_idx, round(start * fps))
#     end_idx = min(round(end * fps), max_frame)
#     seg_size = float(end_idx - start_idx) / num_segments
#     frame_indices = np.array([
#         int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
#         for idx in range(num_segments)
#     ])
#     return frame_indices

# def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=32):
#     vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
#     max_frame = len(vr) - 1
#     fps = float(vr.get_avg_fps())

#     pixel_values_list, num_patches_list = [], []
#     transform = build_transform(input_size=input_size)
#     frame_indices = get_index(bound, fps, max_frame, first_idx=0, num_segments=num_segments)
#     for frame_index in frame_indices:
#         img = Image.fromarray(vr[frame_index].asnumpy()).convert('RGB')
#         img = dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num)
#         pixel_values = [transform(tile) for tile in img]
#         pixel_values = torch.stack(pixel_values)
#         num_patches_list.append(pixel_values.shape[0])
#         pixel_values_list.append(pixel_values)
#     pixel_values = torch.cat(pixel_values_list)
#     return pixel_values, num_patches_list

# video_path = './examples/red-panda.mp4'
# pixel_values, num_patches_list = load_video(video_path, num_segments=8, max_num=1)
# pixel_values = pixel_values.to(torch.bfloat16).cuda()
# video_prefix = ''.join([f'Frame{i+1}: <image>\n' for i in range(len(num_patches_list))])
# question = video_prefix + 'What is the red panda doing?'
# # Frame1: <image>\nFrame2: <image>\n...\nFrame8: <image>\n{question}
# response, history = model.chat(tokenizer, pixel_values, question, generation_config,
#                                num_patches_list=num_patches_list, history=None, return_history=True)
# print(f'User: {question}\nAssistant: {response}')

# question = 'Describe this video in detail.'
# response, history = model.chat(tokenizer, pixel_values, question, generation_config,
#                                num_patches_list=num_patches_list, history=history, return_history=True)
# print(f'User: {question}\nAssistant: {response}')

internvl3-5_axmodel/model.embed_tokens.weight.bfloat16.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d23be80431651d6c1dc8a9a89d35ffc0565d9114c0b4675d085dad1f7ab5d89f
size 622329856

internvl3-5_axmodel/model.embed_tokens.weight.float32.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1604cef8ba75bc3c615e8b2853734464f54a26abea11f441e98f18fc49be24ab
size 1244659712

internvl3-5_axmodel/model.embed_tokens.weight.npy ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0fd10721363fc0a9e0bd780faad607e032524fb4f6bcccf78068a5f7fe5319fc
size 1244659840

internvl3-5_axmodel/qwen3_p128_l0_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:71702b53c25639f32047fb391aa816d0a3fbdd076532eaca815a8ea86352e063
size 35275739

internvl3-5_axmodel/qwen3_p128_l10_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e14a765456a52ffaeecc16f419d71727944233f71f13db9fecbc9cb2f8230e57
size 35275739

internvl3-5_axmodel/qwen3_p128_l11_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:83ff2e11d13e501a978657a9a4e0f1fba5bd7b83ec93d6e45967fe8000dc79ba
size 35275739

internvl3-5_axmodel/qwen3_p128_l12_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e4a517dfb88f693dde987f1b822eb3ca9003bd0b942113a04c21d9034afcda52
size 35275739

internvl3-5_axmodel/qwen3_p128_l13_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c6d4675b57b3e87cd74b3ae15793699886ca5d3f3aab3227260909254f69c3d5
size 35275739

internvl3-5_axmodel/qwen3_p128_l14_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:65dddb4171b6f688dd2bd998589e5057b8453d39e0a95268d3bf12fef5ec23f1
size 35275739

internvl3-5_axmodel/qwen3_p128_l15_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:351dac6756d1c14f4786d1b55d34231e24641bc0d4e6899b6d455f9dcc6f4ac7
size 35275739

internvl3-5_axmodel/qwen3_p128_l16_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9372ec58825360c4c1d66e090749ec74fb589e9538d4f63d2ffb4ed2210c625a
size 35275739

internvl3-5_axmodel/qwen3_p128_l17_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7c1b676f03db77238ecc374a831968ddce9737a9abf4dbf7a95c0d14e50786b1
size 35275739

internvl3-5_axmodel/qwen3_p128_l18_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:625f9855cd7b7c486686ec3a38711748340eb8087ce126557171ed49bf0f1a7d
size 35275739

internvl3-5_axmodel/qwen3_p128_l19_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:96da8471a519e7cbd1236b7964e494e587d2350c6f000d3d4bc6266d8422b723
size 35275739

internvl3-5_axmodel/qwen3_p128_l1_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:499cd84ed255cd5f19ea4f6e3886f6483f785d583ead804dee33cbc5b89c3950
size 35275739

internvl3-5_axmodel/qwen3_p128_l20_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4c190f2267217f3ec07169db2cf34718a1f2a8360edc7a4737eeb1d8aaa832eb
size 35275739

internvl3-5_axmodel/qwen3_p128_l21_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:93640f21e3a7a308a9313ce76e830d6ab9047ac78b788d59303cf2f9afdb5e73
size 35275739

internvl3-5_axmodel/qwen3_p128_l22_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:884d394f2c0ba2b571daa7d9350429f1c153bce7f91df67472c5758c874ba82a
size 35275739

internvl3-5_axmodel/qwen3_p128_l23_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:af327bcf09933e6087c100fbc5ac52a142b201f5e55db81ec4b1fefb3dfb8d37
size 35275739

internvl3-5_axmodel/qwen3_p128_l24_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3ceee8ba3e472826e49a4977d01bb8f245f248250be56d4b04bbe6a81f6e03e9
size 35275739

internvl3-5_axmodel/qwen3_p128_l25_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:dc0817cecbb9809895154c0ed3d1053221767509bcf3e695df73ff6c4762083b
size 35275739

internvl3-5_axmodel/qwen3_p128_l26_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7ecf825ae0bc56cc0bc96b7062fb6dedceb2daad0665f9ae6000efe9a412acf3
size 35275739

internvl3-5_axmodel/qwen3_p128_l27_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:37050cbaa94e0a14a55b8d441775a0120deb35a3fc27207d3e3deb73635937c0
size 35275739

internvl3-5_axmodel/qwen3_p128_l2_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:07b3ac58fa480f03751c8535238ff232fbc0af5068667c110dda74d5594f6b37
size 35275739

internvl3-5_axmodel/qwen3_p128_l3_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0db1e97fa4d4c4c5329e104e25eb0f9670cbbcdfa8b058fb7cc1347212b42324
size 35275739

internvl3-5_axmodel/qwen3_p128_l4_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:faf0b89b9d025cfb647e7d64845135e2cf37203be061e885c6bedd64b93e4d23
size 35275739

internvl3-5_axmodel/qwen3_p128_l5_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b75eb62ae15769a503ccf2061e51133b9f2daf3061623ab2952137e1a147b36f
size 35275739

internvl3-5_axmodel/qwen3_p128_l6_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:67086c8f9ffe5f46857c435108ad587a2744c448d81ecbcd80c223a4047cf71d
size 35275739

internvl3-5_axmodel/qwen3_p128_l7_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9ed5a3a5e72d5bac083e803727001f5436e59e969885c55fe719cec99a2f3016
size 35275739

internvl3-5_axmodel/qwen3_p128_l8_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6eb5ba6a32c93cb1b678d7a857e2aa80c44cb169e9d880d10f01a00ad06a30b8
size 35275739

internvl3-5_axmodel/qwen3_p128_l9_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e192956cce5923f46923a7f1a94be90dd489d72a89e1abaafe090baa201ecd5a
size 35275739

internvl3-5_axmodel/qwen3_post.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:42f2237caf159f574c3d796adc93f2337c2b76fd18413ce90301781669fecdc4
size 340033671

internvl3-5_tokenizer/added_tokens.json ADDED
@@ -0,0 +1,37 @@
{
  "</box>": 151677,
  "</img>": 151670,
  "</quad>": 151673,
  "</ref>": 151675,
  "</think>": 151668,
  "</tool_call>": 151658,
  "</tool_response>": 151666,
  "<IMG_CONTEXT>": 151671,
  "<box>": 151676,
  "<img>": 151669,
  "<quad>": 151672,
  "<ref>": 151674,
  "<think>": 151667,
  "<tool_call>": 151657,
  "<tool_response>": 151665,
  "<|box_end|>": 151649,
  "<|box_start|>": 151648,
  "<|endoftext|>": 151643,
  "<|file_sep|>": 151664,
  "<|fim_middle|>": 151660,
  "<|fim_pad|>": 151662,
  "<|fim_prefix|>": 151659,
  "<|fim_suffix|>": 151661,
  "<|im_end|>": 151645,
  "<|im_start|>": 151644,
  "<|image_pad|>": 151655,
  "<|object_ref_end|>": 151647,
  "<|object_ref_start|>": 151646,
  "<|quad_end|>": 151651,
  "<|quad_start|>": 151650,
  "<|repo_name|>": 151663,
  "<|video_pad|>": 151656,
  "<|vision_end|>": 151653,
  "<|vision_pad|>": 151654,
  "<|vision_start|>": 151652
}
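
The runtime scripts above hardcode the `<img>` id (151669); equivalently, it can be looked up from the tokenizer shipped in this directory. A small sketch:

```python
# Look up the <img> token id from the shipped tokenizer instead of hardcoding it.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("internvl3-5_tokenizer/")
img_token_id = tokenizer.convert_tokens_to_ids("<img>")
assert img_token_id == 151669  # matches added_tokens.json above
```
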
internvl3-5_tokenizer/config.json ADDED
@@ -0,0 +1,89 @@
{
  "architectures": [
    "Qwen3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "debug": false,
  "dtype": "bfloat16",
  "eos_token_id": 151645,
  "ep_size": 1,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 6144,
  "layer_types": [
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention"
  ],
  "max_position_embeddings": 40960,
  "max_window_layers": 28,
  "micro_forward": false,
  "model_type": "qwen3",
  "num_attention_heads": 16,
  "num_hidden_layers": 28,
  "num_key_value_heads": 8,
  "quantization_config": {
    "bits": 4,
    "checkpoint_format": "gptq",
    "desc_act": false,
    "group_size": 128,
    "lm_head": false,
    "meta": {
      "act_group_aware": false,
      "damp_auto_increment": 0.01,
      "damp_percent": 0.01,
      "mse": 0.0,
      "quantizer": [
        "gptqmodel:5.0.0-dev0"
      ],
      "static_groups": false,
      "true_sequential": true,
      "uri": "https://github.com/modelcloud/gptqmodel",
      "v2": false,
      "v2_alpha": 0.25
    },
    "pack_dtype": "int32",
    "quant_method": "gptq",
    "sym": true
  },
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000,
  "skip_checkpoint": false,
  "sliding_window": null,
  "tie_word_embeddings": false,
  "transformers_version": "4.56.2",
  "use_cache": false,
  "use_deepep": false,
  "use_sliding_window": false,
  "vocab_size": 151936
}
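
A few of these fields determine the attention and KV-cache geometry the runtime scripts rely on; a small consistency check using values copied from this config (with `max_seq_len` taken from the scripts):

```python
# Consistency check of the decoder config above (Qwen3 with grouped-query attention).
hidden_size = 2048
num_attention_heads = 16
head_dim = 128
num_key_value_heads = 8
num_hidden_layers = 28

assert num_attention_heads * head_dim == hidden_size  # 16 * 128 == 2048
kv_width = num_key_value_heads * head_dim             # 1024 values per K (or V) entry
max_seq_len = 2048 - 1                                # runtime budget used by the scripts
print(f"{num_hidden_layers} layers; K and V caches each hold {max_seq_len} x {kv_width} values per layer")
```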