Commit 1601280 by yongqiang · 0 Parent(s)

Initialize the repository

This view is limited to 50 files because it contains too many changes.
Files changed (50):
  1. .gitattributes +44 -0
  2. .gitignore +3 -0
  3. README.md +148 -0
  4. assets/demo_1.png +3 -0
  5. assets/demo_2.png +3 -0
  6. config.json +0 -0
  7. examples/image_0.jpg +3 -0
  8. examples/image_1.jpg +3 -0
  9. examples/image_2.png +3 -0
  10. examples/image_3.png +3 -0
  11. examples/laorenshuaidao.mp4 +3 -0
  12. examples/red-panda.mp4 +3 -0
  13. examples/tuboshu.mp4 +3 -0
  14. gradio_demo.py +392 -0
  15. infer_axmodel.py +186 -0
  16. infer_torch.py +212 -0
  17. internvl3-5_axmodel/model.embed_tokens.weight.bfloat16.bin +3 -0
  18. internvl3-5_axmodel/model.embed_tokens.weight.float32.bin +3 -0
  19. internvl3-5_axmodel/model.embed_tokens.weight.npy +3 -0
  20. internvl3-5_axmodel/qwen3_p128_l0_together.axmodel +3 -0
  21. internvl3-5_axmodel/qwen3_p128_l10_together.axmodel +3 -0
  22. internvl3-5_axmodel/qwen3_p128_l11_together.axmodel +3 -0
  23. internvl3-5_axmodel/qwen3_p128_l12_together.axmodel +3 -0
  24. internvl3-5_axmodel/qwen3_p128_l13_together.axmodel +3 -0
  25. internvl3-5_axmodel/qwen3_p128_l14_together.axmodel +3 -0
  26. internvl3-5_axmodel/qwen3_p128_l15_together.axmodel +3 -0
  27. internvl3-5_axmodel/qwen3_p128_l16_together.axmodel +3 -0
  28. internvl3-5_axmodel/qwen3_p128_l17_together.axmodel +3 -0
  29. internvl3-5_axmodel/qwen3_p128_l18_together.axmodel +3 -0
  30. internvl3-5_axmodel/qwen3_p128_l19_together.axmodel +3 -0
  31. internvl3-5_axmodel/qwen3_p128_l1_together.axmodel +3 -0
  32. internvl3-5_axmodel/qwen3_p128_l20_together.axmodel +3 -0
  33. internvl3-5_axmodel/qwen3_p128_l21_together.axmodel +3 -0
  34. internvl3-5_axmodel/qwen3_p128_l22_together.axmodel +3 -0
  35. internvl3-5_axmodel/qwen3_p128_l23_together.axmodel +3 -0
  36. internvl3-5_axmodel/qwen3_p128_l24_together.axmodel +3 -0
  37. internvl3-5_axmodel/qwen3_p128_l25_together.axmodel +3 -0
  38. internvl3-5_axmodel/qwen3_p128_l26_together.axmodel +3 -0
  39. internvl3-5_axmodel/qwen3_p128_l27_together.axmodel +3 -0
  40. internvl3-5_axmodel/qwen3_p128_l2_together.axmodel +3 -0
  41. internvl3-5_axmodel/qwen3_p128_l3_together.axmodel +3 -0
  42. internvl3-5_axmodel/qwen3_p128_l4_together.axmodel +3 -0
  43. internvl3-5_axmodel/qwen3_p128_l5_together.axmodel +3 -0
  44. internvl3-5_axmodel/qwen3_p128_l6_together.axmodel +3 -0
  45. internvl3-5_axmodel/qwen3_p128_l7_together.axmodel +3 -0
  46. internvl3-5_axmodel/qwen3_p128_l8_together.axmodel +3 -0
  47. internvl3-5_axmodel/qwen3_p128_l9_together.axmodel +3 -0
  48. internvl3-5_axmodel/qwen3_post.axmodel +3 -0
  49. internvl3-5_tokenizer/added_tokens.json +37 -0
  50. internvl3-5_tokenizer/config.json +89 -0
.gitattributes ADDED
@@ -0,0 +1,44 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
*.axmodel filter=lfs diff=lfs merge=lfs -text
main_api_ax650 filter=lfs diff=lfs merge=lfs -text
main_api_axcl_x86 filter=lfs diff=lfs merge=lfs -text
main_ax650 filter=lfs diff=lfs merge=lfs -text
main_axcl_x86 filter=lfs diff=lfs merge=lfs -text
*.png filter=lfs diff=lfs merge=lfs -text
*.jpg filter=lfs diff=lfs merge=lfs -text
*.mp4 filter=lfs diff=lfs merge=lfs -text
internvl3-5_tokenizer/tokenizer.json filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED
@@ -0,0 +1,3 @@
__pycache__
*tmp/

README.md ADDED
@@ -0,0 +1,148 @@
---
library_name: transformers
license: bsd-3-clause
base_model:
- OpenGVLab/InternVL3_5-2B
tags:
- InternVL3
- InternVL3_5-2B
- InternVL3_5-2B_GPTQ_INT4
- Int8
- VLM
pipeline_tag: image-text-to-text
language:
- en
---

# InternVL3_5-2B_GPTQ_INT4

This version of InternVL3_5-2B_GPTQ_INT4 has been converted to run on the Axera NPU using **w4a16** quantization.

This model has been optimized with the following LoRA:

Compatible with Pulsar2 version: 5.1-patch1.

Please note that the model's context length is 2K tokens and the maximum prefill length is 1K tokens.

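As a quick sanity check, a prompt's token count can be compared against these limits before prefill. A minimal sketch (the constants mirror `prefill_max_len` and `max_seq_len` in `infer_axmodel.py`; the tokenizer path assumes the repository layout shown under "How to use"):

```python
# Minimal sketch: check a prompt against the 1K prefill / 2K context budget.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("internvl3-5_tokenizer/")
prefill_max_len = 1024 - 1  # maximum prefill length (1K)
max_seq_len = 2048 - 1      # prefill + decode budget (2K context)

token_ids = tokenizer.encode("请你描述这幅图的内容.")
assert len(token_ids) <= prefill_max_len, "prompt exceeds the prefill limit"
print(f"{len(token_ids)} prompt tokens; up to {max_seq_len - len(token_ids)} left for decoding")
```
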
## Conversion tool links

If you are interested in model conversion, you can export the axmodel yourself from the original repo:

https://huggingface.co/OpenGVLab/InternVL3_5-2B

[How to Convert LLM from Huggingface to axmodel](https://github.com/AXERA-TECH/InternVL3_5-2B_GPTQ_INT4.axera/tree/main/model_convert)

[AXera NPU HOST LLM Runtime](https://github.com/AXERA-TECH/ax-llm/tree/ax-internvl)

[AXera NPU AXCL LLM Runtime](https://github.com/AXERA-TECH/ax-llm/tree/axcl-internvl)

## Supported Platforms

- AX650
- AX650N DEMO Board
- [M4N-Dock(爱芯派Pro)](https://wiki.sipeed.com/hardware/zh/maixIV/m4ndock/m4ndock.html)
- [M.2 Accelerator card](https://axcl-docs.readthedocs.io/zh-cn/latest/doc_guide_hardware.html)

| Chip | Image encoder (448×448) | TTFT | Decode (w8a16) |
|--|--|--|--|
| AX650 | 364.412 ms | 4951.50 ms | 28.07 tokens/sec |

## How to use

Download all files from this repository to the device:

```
$ tree -L 1
.
├── assets
├── config.json
├── examples
├── gradio_demo.py
├── infer_axmodel.py
├── infer_torch.py
├── internvl3-5_axmodel
├── internvl3-5_tokenizer
├── README.md
├── utils
└── vit-models

6 directories, 5 files
```

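If the files are not already on the device, they can be fetched with `huggingface_hub`, for example. This is a sketch only: the `repo_id` below is an assumed placeholder, so substitute the actual id of this repository.

```python
# Sketch: download the repository contents into the current directory.
from huggingface_hub import snapshot_download

# NOTE: assumed repo id; replace it with this repository's real id.
snapshot_download(repo_id="AXERA-TECH/InternVL3_5-2B_GPTQ_INT4", local_dir=".")
```
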
#### Install transformers

```
pip install transformers==4.57.1
```

#### Inference on an AX650 host, such as the M4N-Dock(爱芯派Pro) or the AX650 DEMO Board

Start an interactive conversation through the Gradio demo:

```bash
$ python3 gradio_demo.py --hf_model internvl3-5_tokenizer/ --axmodel_path internvl3-5_axmodel/ --vit_model vit-models/internvl_vit_model_1x3x448x448.axmodel
```

Plain-text dialogue:

![demo_1](assets/demo_1.png)

Image understanding:

![demo_2](assets/demo_2.png)

---

Run the following command on the Axera board to start a chat conversation:

```sh
$ python3 infer_axmodel.py --hf_model internvl3-5_tokenizer/ --axmodel_path internvl3-5_axmodel/ --question "请计算函数[y=2x^2+2]的导数, 并提供 markdown 格式的推理过程"
```

Output:

```bash
[INFO] Using provider: AxEngineExecutionProvider
[INFO] Model type: 2 (triple core)
[INFO] Compiler version: 5.1-dirty 0fdbfe15-dirty
Model loaded successfully!
slice_indices: [0]
Slice prefill done: 0
answer >> 函数 \( y = 2x^2 + 2 \) 的导数可以通过求导法则来计算。首先,我们对函数中的每一项分别求导:

1. 对于 \( 2x^2 \),使用幂法则求导:
\[
\frac{d}{dx}(2x^2) = 2 \cdot 2x = 4x
\]

2. 对于常数项 \( 2 \),其导数为 0,因为常数的导数为 0。

将这两部分的结果相加,得到函数 \( y \) 的导数:
\[
y' = 4x
\]

因此,函数 \( y = 2x^2 + 2 \) 的导数为 \( y' = 4x \)。
```

Enter the following command to perform the single-image understanding task:

```sh
$ python3 infer_axmodel.py --hf_model internvl3-5_tokenizer/ --axmodel_path internvl3-5_axmodel/ --question "请描述这幅图" -i examples/image_0.jpg --vit_model vit-models/internvl_vit_model_1x3x448x448.axmodel
```

![image_0.jpg](examples/image_0.jpg)

Output:

```bash
[INFO] Model type: 2 (triple core)
[INFO] Compiler version: 5.1-dirty 0fdbfe15-dirty
Model loaded successfully!
slice_indices: [0, 1, 2]
Slice prefill done: 0
Slice prefill done: 1
Slice prefill done: 2
answer >> 这是一张红熊猫的照片。红熊猫是一种红棕色的哺乳动物,通常生活在亚洲的森林中。它们以捕食昆虫和小型无脊椎动物为生。图片中,红熊猫正坐在一个木制的平台上,背景是绿色的树木和植被,显得非常自然和生动。红熊猫的表情看起来很友好,似乎在观察或等待什么。
```
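
The `slice_indices` lines in the two logs above come from chunked prefill: the prompt is prefilled in `slice_len`-token slices (128 in these scripts), so a short text-only prompt needs one slice while an image prompt (256 `<IMG_CONTEXT>` tokens plus text) needs three. The `InferManager` internals live under `utils/` and are not shown in this view, so the following is only an illustrative sketch of how the slice count falls out of the prompt length:

```python
# Illustrative sketch (not the actual InferManager implementation):
# prefill runs once per slice_len-token chunk of the prompt.
import math

def prefill_slice_indices(num_prompt_tokens: int, slice_len: int = 128) -> list:
    return list(range(math.ceil(num_prompt_tokens / slice_len)))

print(prefill_slice_indices(60))   # [0]       -> short text-only prompt
print(prefill_slice_indices(330))  # [0, 1, 2] -> image prompt with 256 <IMG_CONTEXT> tokens
```
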
assets/demo_1.png ADDED

Git LFS Details

  • SHA256: 6340140c81bf679b2ba9aa494e1526f6db9b6e435221dc6c582b2887f8d8e9a6
  • Pointer size: 131 Bytes
  • Size of remote file: 395 kB
assets/demo_2.png ADDED

Git LFS Details

  • SHA256: 4c7d95c191d1afbf33ea4054a561a2540fd6c0d59dd78e397b88d41c0ed1fd33
  • Pointer size: 132 Bytes
  • Size of remote file: 1.31 MB
config.json ADDED
File without changes
examples/image_0.jpg ADDED

Git LFS Details

  • SHA256: c587294b3bf637dacbb3c96324c127187a2f242c94f639633a0d8a2775a9a399
  • Pointer size: 130 Bytes
  • Size of remote file: 78.1 kB
examples/image_1.jpg ADDED

Git LFS Details

  • SHA256: 08487494b8dc08d44bc36491adf3ab89ff30d13a3122da86f3cd67cad89eeee8
  • Pointer size: 131 Bytes
  • Size of remote file: 126 kB
examples/image_2.png ADDED

Git LFS Details

  • SHA256: 622ae2d01ff4467fa69a7888728d776650117a0f4887e96ba0fb9a8a6d77b3c3
  • Pointer size: 131 Bytes
  • Size of remote file: 355 kB
examples/image_3.png ADDED

Git LFS Details

  • SHA256: 729e80e77d8611778859d2f232cb7f2a8fda04ed67dd8dcc3e7cd7a657367402
  • Pointer size: 131 Bytes
  • Size of remote file: 394 kB
examples/laorenshuaidao.mp4 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8f5c00b37b23af3d01d133da880eb7f6e50d4af608e3575784be7063eb137011
size 2704112

examples/red-panda.mp4 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d921c07bb97224d65a37801541d246067f0d506f08723ffa1ad85c217907ccb8
size 1867237

examples/tuboshu.mp4 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ced4d95877b9a7f8b48f79bdfe4287eff8837f20348daec2f2e2987459ec1712
size 5952043

gradio_demo.py ADDED
@@ -0,0 +1,392 @@
import argparse
import os
import time
from typing import Generator, List, Optional

import gradio as gr
import numpy as np
import torch
import torchvision.transforms as T
from ml_dtypes import bfloat16
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoConfig, AutoTokenizer

from utils.infer_func import InferManager
from axengine import InferenceSession

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
IMG_PLACEHOLDER_TOKEN_ID = 151669  # <img>
IMG_CONTEXT_REPEAT = 256  # number of image context tokens expected by the model


SYSTEM_PROMPT = (
    "<|im_start|>system\n"
    "你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型, 英文名叫 InternVL3, "
    "是一个有用无害的人工智能助手, 擅长思考和回答用户的问题. 请你在回答问题时使用简体中文."
    "<|im_end|>\n"
)


def build_transform(input_size: int):
    transform = T.Compose([
        T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])
    return transform


def dynamic_preprocess(image: Image.Image, min_num: int = 1, max_num: int = 12, image_size: int = 448,
                       use_thumbnail: bool = False):
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    target_ratios = set(
        (i, j)
        for n in range(min_num, max_num + 1)
        for i in range(1, n + 1)
        for j in range(1, n + 1)
        if i * j <= max_num and i * j >= min_num
    )
    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

    def find_closest_aspect_ratio(ar: float, ratios: List[tuple]):
        best_ratio_diff = float("inf")
        best_ratio = (1, 1)
        area = orig_width * orig_height
        for ratio in ratios:
            target_aspect_ratio = ratio[0] / ratio[1]
            ratio_diff = abs(ar - target_aspect_ratio)
            if ratio_diff < best_ratio_diff:
                best_ratio_diff = ratio_diff
                best_ratio = ratio
            elif ratio_diff == best_ratio_diff:
                if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                    best_ratio = ratio
        return best_ratio

    target_aspect_ratio = find_closest_aspect_ratio(aspect_ratio, target_ratios)
    target_width = image_size * target_aspect_ratio[0]
    target_height = image_size * target_aspect_ratio[1]
    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

    resized_img = image.resize((target_width, target_height))
    processed_images = []
    for i in range(blocks):
        box = (
            (i % (target_width // image_size)) * image_size,
            (i // (target_width // image_size)) * image_size,
            ((i % (target_width // image_size)) + 1) * image_size,
            ((i // (target_width // image_size)) + 1) * image_size,
        )
        split_img = resized_img.crop(box)
        processed_images.append(split_img)
    assert len(processed_images) == blocks
    if use_thumbnail and len(processed_images) != 1:
        processed_images.append(image.resize((image_size, image_size)))
    return processed_images


def load_image(image_file: Image.Image, input_size: int = 448, max_num: int = 12):
    transform = build_transform(input_size=input_size)
    images = dynamic_preprocess(image_file, image_size=input_size, use_thumbnail=True, max_num=max_num)
    pixel_values = [transform(img) for img in images]
    pixel_values = torch.stack(pixel_values)
    return pixel_values


class InternVLGradioDemo:
    def __init__(self, hf_model: str, axmodel_dir: str, vit_axmodel: str, max_seq_len: int = 2047):
        self.hf_model = hf_model
        self.axmodel_dir = axmodel_dir
        self.vit_axmodel = vit_axmodel
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        self.embeds = np.load(os.path.join(axmodel_dir, "model.embed_tokens.weight.npy"))
        self.tokenizer = AutoTokenizer.from_pretrained(self.hf_model)
        config = AutoConfig.from_pretrained(self.hf_model, trust_remote_code=True)
        if hasattr(config, 'llm_config') and config.llm_config is not None:
            self.cfg = config.llm_config
        else:
            self.cfg = config

        self.vit_session = InferenceSession(self.vit_axmodel)
        self.infer_manager = InferManager(self.cfg, self.axmodel_dir, max_seq_len=max_seq_len)

    def _build_single_turn_prompt(self, user_text: str, vit_features: List[np.ndarray]):
        prompt = SYSTEM_PROMPT
        prompt += f"<|im_start|>user\n{user_text}"
        for _ in vit_features:
            prompt += "\n<img>" + "<IMG_CONTEXT>" * IMG_CONTEXT_REPEAT + "</img>"
        prompt += "<|im_end|>\n<|im_start|>assistant\n"
        return prompt

    def _insert_vision_features(self, token_ids: List[int], prefill_data: np.ndarray, vit_features: List[np.ndarray]):
        image_start_indices = np.where(np.array(token_ids) == IMG_PLACEHOLDER_TOKEN_ID)[0].tolist()
        if len(image_start_indices) != len(vit_features):
            raise ValueError("图片数量与占位符数量不一致, 请检查输入和模板生成逻辑")
        for idx, image_start_index in enumerate(image_start_indices):
            insert_pos = image_start_index + 1
            prefill_data[insert_pos: insert_pos + IMG_CONTEXT_REPEAT] = vit_features[idx][0, :, :]
        return prefill_data

    def _run_model(self, prompt: str, vit_features: List[np.ndarray]):
        """Non-streaming inference, kept in case a one-shot result is needed."""
        for k_cache in self.infer_manager.k_caches:
            k_cache.fill(0)
        for v_cache in self.infer_manager.v_caches:
            v_cache.fill(0)

        token_ids = self.tokenizer.encode(prompt)
        prefill_data = np.take(self.embeds, token_ids, axis=0).astype(bfloat16)
        if vit_features:
            prefill_data = self._insert_vision_features(token_ids, prefill_data, vit_features)

        eos_token_id = None
        if isinstance(self.cfg.eos_token_id, list) and len(self.cfg.eos_token_id) > 1:
            eos_token_id = self.cfg.eos_token_id

        slice_len = 128
        token_ids = self.infer_manager.prefill(self.tokenizer, token_ids, prefill_data, slice_len=slice_len)
        return self.infer_manager.decode(
            self.tokenizer,
            token_ids,
            self.embeds,
            slice_len=slice_len,
            eos_token_id=eos_token_id,
            stream=False,
        )

    def _stream_generate(self, prompt: str, vit_features: List[np.ndarray]):
        """Streaming generation: yields the accumulated text token by token, with timing info (TTFT and average decode ms/token)."""
        # reset kv cache per request
        for k_cache in self.infer_manager.k_caches:
            k_cache.fill(0)
        for v_cache in self.infer_manager.v_caches:
            v_cache.fill(0)

        token_ids = self.tokenizer.encode(prompt)
        prefill_data = np.take(self.embeds, token_ids, axis=0).astype(bfloat16)
        if vit_features:
            prefill_data = self._insert_vision_features(token_ids, prefill_data, vit_features)

        eos_token_id = None
        if isinstance(self.cfg.eos_token_id, list) and len(self.cfg.eos_token_id) > 1:
            eos_token_id = self.cfg.eos_token_id

        slice_len = 128
        t_start = time.time()
        token_ids = self.infer_manager.prefill(self.tokenizer, token_ids, prefill_data, slice_len=slice_len)

        # replicate the decode logic here so output can be streamed manually
        mask = np.zeros((1, 1, self.infer_manager.max_seq_len + 1), dtype=np.float32).astype(bfloat16)
        mask[:, :, :self.infer_manager.max_seq_len] -= 65536
        seq_len = len(token_ids) - 1
        if slice_len > 0:
            mask[:, :, :seq_len] = 0

        ttft_ms: Optional[float] = None
        decode_tokens = 0
        decode_elapsed_ms: float = 0.0
        generated_text = ""
        yield generated_text, ttft_ms, None, None, False

        for step_idx in range(self.infer_manager.max_seq_len):
            if slice_len > 0 and step_idx < seq_len:
                continue
            cur_token = token_ids[step_idx]
            indices = np.array([step_idx], np.uint32).reshape((1, 1))
            data = self.embeds[cur_token, :].reshape((1, 1, self.cfg.hidden_size)).astype(bfloat16)
            for layer_idx in range(self.cfg.num_hidden_layers):
                input_feed = {
                    "K_cache": self.infer_manager.k_caches[layer_idx],
                    "V_cache": self.infer_manager.v_caches[layer_idx],
                    "indices": indices,
                    "input": data,
                    "mask": mask,
                }
                outputs = self.infer_manager.decoder_sessions[layer_idx].run(None, input_feed, shape_group=0)
                self.infer_manager.k_caches[layer_idx][:, step_idx, :] = outputs[0][:, :, :]
                self.infer_manager.v_caches[layer_idx][:, step_idx, :] = outputs[1][:, :, :]
                data = outputs[2]
            mask[..., step_idx] = 0
            if step_idx < seq_len - 1:
                continue
            post_out = self.infer_manager.post_process_session.run(None, {"input": data})[0]
            next_token, possible_tokens, possible_probs = self.infer_manager.post_process(post_out, temperature=0.7)
            if eos_token_id is not None and next_token in eos_token_id:
                ttft_ms = ttft_ms or (time.time() - t_start) * 1000
                break
            if next_token == self.tokenizer.eos_token_id:
                ttft_ms = ttft_ms or (time.time() - t_start) * 1000
                break

            token_ids.append(next_token)
            # decode from the full token list so multi-byte UTF-8 characters are
            # not truncated into garbled output; only the newly generated tokens
            # (from seq_len onward) are decoded
            generated_text = self.tokenizer.decode(token_ids[seq_len:], skip_special_tokens=True)

            if ttft_ms is None:
                ttft_ms = (time.time() - t_start) * 1000
            else:
                decode_tokens += 1
                decode_elapsed_ms = (time.time() - t_start) * 1000 - ttft_ms

            avg_decode = (decode_elapsed_ms / decode_tokens) if decode_tokens > 0 else None
            yield generated_text, ttft_ms, avg_decode, decode_tokens, False

        avg_decode = (decode_elapsed_ms / decode_tokens) if decode_tokens > 0 else None
        yield generated_text, ttft_ms, avg_decode, decode_tokens, True

    def chat(self, user_input: str, image: Optional[Image.Image]) -> Generator:
        user_text = (user_input or "").strip()
        if not user_text and image is None:
            yield [], gr.update(), gr.update(), gr.update(), gr.update()
            return

        # show a placeholder reply first (keeping the uploaded image in place),
        # together with placeholder speed metrics
        yield [(user_text, "处理中…")], gr.update(value=""), gr.update(), gr.update(value="<div style='text-align: right; font-size: 13px; color: #6b7280; font-family: monospace;'>TTFT -- ms&nbsp;&nbsp;|&nbsp;&nbsp;Decode -- ms/token&nbsp;&nbsp;|&nbsp;&nbsp;Tokens --</div>"), gr.update(interactive=False)

        vit_outputs = []
        if image is not None:
            pixel_values = load_image(image, input_size=448, max_num=1)
            vit_output = self.vit_session.run(None, {"image": pixel_values.numpy()})[0]
            vit_outputs.append(vit_output.copy())

        prompt = self._build_single_turn_prompt(user_text, vit_outputs)

        chatbot_history = [(user_text, "")]  # filled in while streaming
        for partial, ttft_ms, avg_decode_ms, decode_tokens, finished in self._stream_generate(prompt, vit_outputs):
            chatbot_history[-1] = (user_text, partial)
            ttft_disp = f"{ttft_ms:.0f}" if ttft_ms is not None else "--"
            decode_disp = f"{avg_decode_ms:.1f}" if avg_decode_ms is not None else "--"
            tok_disp = f"{decode_tokens}" if decode_tokens is not None else "--"
            metrics_text = f"<div style='text-align: right; font-size: 13px; color: #6b7280; font-family: monospace;'>TTFT {ttft_disp} ms&nbsp;&nbsp;|&nbsp;&nbsp;Decode {decode_disp} ms/token&nbsp;&nbsp;|&nbsp;&nbsp;Tokens {tok_disp}</div>"
            if finished:
                yield chatbot_history, gr.update(value=""), gr.update(), gr.update(value=metrics_text), gr.update(interactive=True)
            else:
                yield chatbot_history, gr.update(value=""), gr.update(), gr.update(value=metrics_text), gr.update(interactive=False)

    @staticmethod
    def build_ui(demo: "InternVLGradioDemo", server_name: str = "0.0.0.0", server_port: int = 7860, share: bool = False):
        # custom JavaScript: Enter sends the message, Shift+Enter inserts a newline
        custom_js = """
        function() {
            // bind the handlers once the DOM has loaded
            setTimeout(() => {
                const textareas = document.querySelectorAll('#user-input textarea');
                textareas.forEach(textarea => {
                    // remove any previously attached listener
                    textarea.removeEventListener('keydown', textarea._customKeyHandler);

                    textarea._customKeyHandler = function(e) {
                        if (e.key === 'Enter') {
                            if (e.shiftKey) {
                                // Shift+Enter: insert a newline
                                e.preventDefault();
                                const start = this.selectionStart;
                                const end = this.selectionEnd;
                                const value = this.value;
                                this.value = value.substring(0, start) + '\\n' + value.substring(end);
                                this.selectionStart = this.selectionEnd = start + 1;
                                // fire an input event so Gradio picks up the change
                                this.dispatchEvent(new Event('input', { bubbles: true }));
                            } else {
                                // Enter: send the message
                                e.preventDefault();
                                const sendBtn = document.querySelector('#send-btn');
                                if (sendBtn) {
                                    sendBtn.click();
                                }
                            }
                        }
                    };
                    textarea.addEventListener('keydown', textarea._customKeyHandler);
                });
            }, 500);
        }
        """

        with gr.Blocks(title="InternVL3_5-2B_GPTQ_INT4 AX Gradio Demo", theme=gr.themes.Soft(), js=custom_js) as iface:
            gr.HTML("""<style>
            #image-pane img {object-fit: contain; max-height: 380px;}
            #chat-wrap {position: relative;}
            #metrics-display {position: absolute; right: 12px; bottom: 12px; z-index: 5; pointer-events: none; text-align: right;}
            #metrics-display > div {display: inline-block;}
            </style>""")
            gr.Markdown("""### InternVL3_5-2B_GPTQ_INT4 图文对话演示\n上传一张图片 (可选),输入问题,获取中文回答。""")

            with gr.Row():
                # left column: chat box and input area
                with gr.Column(scale=5):
                    with gr.Group(elem_id="chat-wrap"):
                        chatbot = gr.Chatbot(height=500, label="对话")
                        metrics_md = gr.Markdown("<div style='text-align: right; font-size: 13px; color: #6b7280; font-family: monospace;'>TTFT -- ms&nbsp;&nbsp;|&nbsp;&nbsp;Decode -- ms/token&nbsp;&nbsp;|&nbsp;&nbsp;Tokens --</div>", elem_id="metrics-display")

                    with gr.Row():
                        user_input = gr.Textbox(
                            placeholder="按 Enter 发送,Shift+Enter 换行",
                            lines=2,
                            scale=7,
                            max_lines=5,
                            show_label=False,
                            elem_id="user-input",
                        )
                        with gr.Column(scale=1, min_width=100):
                            send_btn = gr.Button("发送", variant="primary", size="sm", elem_id="send-btn")
                            clear_btn = gr.Button("清空对话", variant="secondary", size="sm")

                # right column: image upload and usage notes
                with gr.Column(scale=3):
                    image_input = gr.Image(
                        type="pil",
                        label="上传图片 (可选)",
                        height=380,
                        image_mode="RGB",
                        show_download_button=False,
                        elem_id="image-pane",
                    )
                    gr.Markdown("""- 支持单张图像理解\n- 仅当前问题与回答,不保留历史\n- 处理时间取决于硬件,请耐心等待""")

            def _clear():
                return [], gr.update(value=""), gr.update(), gr.update(value="<div style='text-align: right; font-size: 13px; color: #6b7280; font-family: monospace;'>TTFT -- ms&nbsp;&nbsp;|&nbsp;&nbsp;Decode -- ms/token&nbsp;&nbsp;|&nbsp;&nbsp;Tokens --</div>"), gr.update(interactive=True)

            send_btn.click(
                fn=demo.chat,
                inputs=[user_input, image_input],
                outputs=[chatbot, user_input, image_input, metrics_md, send_btn],
                show_progress=False,
                queue=True,
            )
            # user_input.submit is intentionally not wired up; the custom JS
            # handles Enter-to-send and Shift+Enter-for-newline instead
            clear_btn.click(fn=_clear, inputs=None, outputs=[chatbot, user_input, image_input, metrics_md, send_btn])

        iface.queue().launch(server_name=server_name, server_port=server_port, share=share)


def parse_args():
    parser = argparse.ArgumentParser(description="InternVL3-5-2B AX gradio demo")
    parser.add_argument("--hf_model", type=str, default="./InternVL3_5-2B",
                        help="Path to the HuggingFace model")
    parser.add_argument("--axmodel_path", type=str, default="./InternVL3_5-2B_axmodel",
                        help="Directory holding the LLM axmodel files")
    parser.add_argument("--vit_model", type=str, default="./vit-models/internvl_vit_model_1x3x448x448.axmodel",
                        help="Path to the ViT axmodel")
    parser.add_argument("--port", type=int, default=7860, help="Gradio port")
    parser.add_argument("--host", type=str, default="0.0.0.0", help="Gradio listen address")
    parser.add_argument("--share", action="store_true", help="enable gradio share")
    return parser.parse_args()


def main():
    args = parse_args()
    demo = InternVLGradioDemo(args.hf_model, args.axmodel_path, args.vit_model)
    InternVLGradioDemo.build_ui(demo, server_name=args.host, server_port=args.port, share=args.share)


if __name__ == "__main__":
    main()

infer_axmodel.py ADDED
@@ -0,0 +1,186 @@
import argparse
import os

import numpy as np
import torch
import torchvision.transforms as T
from axengine import InferenceSession
from ml_dtypes import bfloat16
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoConfig, AutoTokenizer

from utils.infer_func import InferManager


IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    best_ratio_diff = float('inf')
    best_ratio = (1, 1)
    area = width * height
    for ratio in target_ratios:
        target_aspect_ratio = ratio[0] / ratio[1]
        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
        if ratio_diff < best_ratio_diff:
            best_ratio_diff = ratio_diff
            best_ratio = ratio
        elif ratio_diff == best_ratio_diff:
            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                best_ratio = ratio
    return best_ratio

def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    # calculate the candidate tiling grids
    target_ratios = set(
        (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
        i * j <= max_num and i * j >= min_num)
    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

    # find the closest aspect ratio to the target
    target_aspect_ratio = find_closest_aspect_ratio(
        aspect_ratio, target_ratios, orig_width, orig_height, image_size)

    # calculate the target width and height
    target_width = image_size * target_aspect_ratio[0]
    target_height = image_size * target_aspect_ratio[1]
    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

    # resize the image
    resized_img = image.resize((target_width, target_height))
    processed_images = []
    for i in range(blocks):
        box = (
            (i % (target_width // image_size)) * image_size,
            (i // (target_width // image_size)) * image_size,
            ((i % (target_width // image_size)) + 1) * image_size,
            ((i // (target_width // image_size)) + 1) * image_size
        )
        # split the image
        split_img = resized_img.crop(box)
        processed_images.append(split_img)
    assert len(processed_images) == blocks
    if use_thumbnail and len(processed_images) != 1:
        thumbnail_img = image.resize((image_size, image_size))
        processed_images.append(thumbnail_img)
    return processed_images

def load_image(image_file, input_size=448, max_num=12):
    image = Image.open(image_file).convert('RGB')
    transform = build_transform(input_size=input_size)
    images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    pixel_values = [transform(image) for image in images]
    pixel_values = torch.stack(pixel_values)
    return pixel_values

if __name__ == "__main__":

    """
    python3 infer_axmodel.py --vit_model vit-models/internvl_vit_model_1x3x448x448.axmodel --images examples/image_0.jpg
    """
    parser = argparse.ArgumentParser(description="Model configuration parameters")
    parser.add_argument("--hf_model", type=str, default="./InternVL3_5-1B",
                        help="Path to the HuggingFace model")
    parser.add_argument("--axmodel_path", type=str, default="./InternVL3_5-1B_axmodel",
                        help="Path to the compiled axmodel of the LLM")
    parser.add_argument("--vit_model", type=str, default=None,
                        help="Path to the compiled axmodel of the ViT encoder")
    parser.add_argument("-i", "--images", nargs='+', type=str, default=None,
                        help="Path(s) to the test image(s).")
    parser.add_argument("-q", "--question", type=str, default="请你描述这幅图的内容.",
                        help="Your question that you want to ask the model.")
    args = parser.parse_args()

    hf_model_path = args.hf_model
    axmodel_path = args.axmodel_path

    device = "cuda" if torch.cuda.is_available() else "cpu"
    embeds = np.load(os.path.join(axmodel_path, "model.embed_tokens.weight.npy"))

    # load the tokenizer and the model config
    tokenizer = AutoTokenizer.from_pretrained(hf_model_path)
    config = AutoConfig.from_pretrained(hf_model_path, trust_remote_code=True)

    test_imgs_path = args.images
    vit_axmodel_path = args.vit_model

    # set the max number of tiles in `max_num`
    pixel_values_list = []
    if test_imgs_path is not None:
        for img_path in test_imgs_path:
            pixel_values = load_image(img_path, input_size=448, max_num=1)
            pixel_values_list.append(pixel_values)
        print(f"number of input images: {len(pixel_values_list)}")
        print("preprocess image done!")

    # extract image features with the ViT (only when images were given)
    vit_output_list = []
    if pixel_values_list:
        vit_session = InferenceSession(vit_axmodel_path)
        for idx, pixel_values in enumerate(pixel_values_list):
            vit_output = vit_session.run(None, {"image": pixel_values.numpy()})[0]
            vit_output_list.append(vit_output.copy())  # copy so each ViT output keeps its own memory

        print(f"vit_output.shape is {vit_output_list[0].shape}, vit feature extract done!")

    prompt = "<|im_start|>system\n你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型, 英文名叫 InternVL3, 是一个有用无害的人工智能助手, 擅长思考和回答用户的问题. 请你在回答问题时使用简体中文.<|im_end|>\n"
    question = args.question
    prompt += "<|im_start|>user\n" + question

    if len(pixel_values_list) > 0:
        for idx in range(len(pixel_values_list)):
            prompt += "\n<img>" + "<IMG_CONTEXT>" * 256 + "</img>\n"
    prompt += "<|im_end|>\n<|im_start|>assistant\n"
    print(f"prompt is {prompt}")
    token_ids = tokenizer.encode(prompt)
    # image understanding: find the <img> placeholders (<img> tag is token id 151669)
    image_start_indices = np.where(np.array(token_ids) == 151669)[0].tolist()
    prefill_data = np.take(embeds, token_ids, axis=0)
    prefill_data = prefill_data.astype(bfloat16)

    # splice the ViT features into the embedding sequence right after each <img> token
    for idx, image_start_index in enumerate(image_start_indices):
        image_insert_index = image_start_index + 1
        prefill_data[image_insert_index : image_insert_index + 256] = vit_output_list[idx][0, :, :]

    if hasattr(config, 'llm_config') and config.llm_config is not None:  # compatibility with the GPTQ INT4 model config
        cfg = config.llm_config
    else:
        cfg = config

    eos_token_id = None
    if isinstance(cfg.eos_token_id, list) and len(cfg.eos_token_id) > 1:
        eos_token_id = cfg.eos_token_id

    slice_len = 128
    prefill_max_len = 1024 - 1  # maximum prefill length
    max_seq_len = 2048 - 1  # prefill + decode max length

    imer = InferManager(cfg, axmodel_path, max_seq_len=max_seq_len)
    token_ids = imer.prefill(tokenizer, token_ids, prefill_data, slice_len=slice_len)
    imer.decode(tokenizer, token_ids, embeds, slice_len=slice_len, eos_token_id=eos_token_id)
    print("\n")

infer_torch.py ADDED
@@ -0,0 +1,212 @@
import numpy as np
import torch
import torchvision.transforms as T
from decord import VideoReader, cpu  # only needed for the commented-out video example below
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    best_ratio_diff = float('inf')
    best_ratio = (1, 1)
    area = width * height
    for ratio in target_ratios:
        target_aspect_ratio = ratio[0] / ratio[1]
        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
        if ratio_diff < best_ratio_diff:
            best_ratio_diff = ratio_diff
            best_ratio = ratio
        elif ratio_diff == best_ratio_diff:
            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                best_ratio = ratio
    return best_ratio

def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    # calculate the candidate tiling grids
    target_ratios = set(
        (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
        i * j <= max_num and i * j >= min_num)
    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

    # find the closest aspect ratio to the target
    target_aspect_ratio = find_closest_aspect_ratio(
        aspect_ratio, target_ratios, orig_width, orig_height, image_size)

    # calculate the target width and height
    target_width = image_size * target_aspect_ratio[0]
    target_height = image_size * target_aspect_ratio[1]
    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

    # resize the image
    resized_img = image.resize((target_width, target_height))
    processed_images = []
    for i in range(blocks):
        box = (
            (i % (target_width // image_size)) * image_size,
            (i // (target_width // image_size)) * image_size,
            ((i % (target_width // image_size)) + 1) * image_size,
            ((i // (target_width // image_size)) + 1) * image_size
        )
        # split the image
        split_img = resized_img.crop(box)
        processed_images.append(split_img)
    assert len(processed_images) == blocks
    if use_thumbnail and len(processed_images) != 1:
        thumbnail_img = image.resize((image_size, image_size))
        processed_images.append(thumbnail_img)
    return processed_images

def load_image(image_file, input_size=448, max_num=12):
    image = Image.open(image_file).convert('RGB')
    transform = build_transform(input_size=input_size)
    images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    pixel_values = [transform(image) for image in images]
    pixel_values = torch.stack(pixel_values)
    return pixel_values

path = './InternVL3_5-1B'
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    load_in_8bit=False,
    low_cpu_mem_usage=True,
    use_flash_attn=True,
    trust_remote_code=True,
    device_map="auto").eval()
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)

# set the max number of tiles in `max_num`
pixel_values = load_image('./examples/image_1.jpg', input_size=448, max_num=1).to(torch.bfloat16).cuda()
generation_config = dict(max_new_tokens=1024, do_sample=True)

# pure-text conversation
question = '中国的首都'
response, history = model.chat(tokenizer, None, question, generation_config, history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')


# single-image single-round conversation
question = '<image>\n请你描述这幅图的内容.'
response = model.chat(tokenizer, pixel_values, question, generation_config)
print(f'User: {question}\nAssistant: {response}')

# # single-image multi-round conversation
# question = '<image>\nPlease describe the image in detail.'
# response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
# print(f'User: {question}\nAssistant: {response}')

# question = 'Please write a poem according to the image.'
# response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=history, return_history=True)
# print(f'User: {question}\nAssistant: {response}')

# # multi-image multi-round conversation, combined images
# pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
# pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
# pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)

# question = '<image>\nDescribe the two images in detail.'
# response, history = model.chat(tokenizer, pixel_values, question, generation_config,
#                                history=None, return_history=True)
# print(f'User: {question}\nAssistant: {response}')

# question = 'What are the similarities and differences between these two images.'
# response, history = model.chat(tokenizer, pixel_values, question, generation_config,
#                                history=history, return_history=True)
# print(f'User: {question}\nAssistant: {response}')

# # multi-image multi-round conversation, separate images
# pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
# pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
# pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
# num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]

# question = 'Image-1: <image>\nImage-2: <image>\nDescribe the two images in detail.'
# response, history = model.chat(tokenizer, pixel_values, question, generation_config,
#                                num_patches_list=num_patches_list,
#                                history=None, return_history=True)
# print(f'User: {question}\nAssistant: {response}')

# question = 'What are the similarities and differences between these two images.'
# response, history = model.chat(tokenizer, pixel_values, question, generation_config,
#                                num_patches_list=num_patches_list,
#                                history=history, return_history=True)
# print(f'User: {question}\nAssistant: {response}')

# # batch inference, single image per sample
# pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
# pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
# num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
# pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)

# questions = ['<image>\nDescribe the image in detail.'] * len(num_patches_list)
# responses = model.batch_chat(tokenizer, pixel_values,
#                              num_patches_list=num_patches_list,
#                              questions=questions,
#                              generation_config=generation_config)
# for question, response in zip(questions, responses):
#     print(f'User: {question}\nAssistant: {response}')

# # video multi-round conversation
# def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
#     if bound:
#         start, end = bound[0], bound[1]
#     else:
#         start, end = -100000, 100000
#     start_idx = max(first_idx, round(start * fps))
#     end_idx = min(round(end * fps), max_frame)
#     seg_size = float(end_idx - start_idx) / num_segments
#     frame_indices = np.array([
#         int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
#         for idx in range(num_segments)
#     ])
#     return frame_indices

# def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=32):
#     vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
#     max_frame = len(vr) - 1
#     fps = float(vr.get_avg_fps())

#     pixel_values_list, num_patches_list = [], []
#     transform = build_transform(input_size=input_size)
#     frame_indices = get_index(bound, fps, max_frame, first_idx=0, num_segments=num_segments)
#     for frame_index in frame_indices:
#         img = Image.fromarray(vr[frame_index].asnumpy()).convert('RGB')
#         img = dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num)
#         pixel_values = [transform(tile) for tile in img]
#         pixel_values = torch.stack(pixel_values)
#         num_patches_list.append(pixel_values.shape[0])
#         pixel_values_list.append(pixel_values)
#     pixel_values = torch.cat(pixel_values_list)
#     return pixel_values, num_patches_list

# video_path = './examples/red-panda.mp4'
# pixel_values, num_patches_list = load_video(video_path, num_segments=8, max_num=1)
# pixel_values = pixel_values.to(torch.bfloat16).cuda()
# video_prefix = ''.join([f'Frame{i+1}: <image>\n' for i in range(len(num_patches_list))])
# question = video_prefix + 'What is the red panda doing?'
# # Frame1: <image>\nFrame2: <image>\n...\nFrame8: <image>\n{question}
# response, history = model.chat(tokenizer, pixel_values, question, generation_config,
#                                num_patches_list=num_patches_list, history=None, return_history=True)
# print(f'User: {question}\nAssistant: {response}')

# question = 'Describe this video in detail.'
# response, history = model.chat(tokenizer, pixel_values, question, generation_config,
#                                num_patches_list=num_patches_list, history=history, return_history=True)
# print(f'User: {question}\nAssistant: {response}')

internvl3-5_axmodel/model.embed_tokens.weight.bfloat16.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d23be80431651d6c1dc8a9a89d35ffc0565d9114c0b4675d085dad1f7ab5d89f
size 622329856

internvl3-5_axmodel/model.embed_tokens.weight.float32.bin ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1604cef8ba75bc3c615e8b2853734464f54a26abea11f441e98f18fc49be24ab
size 1244659712

internvl3-5_axmodel/model.embed_tokens.weight.npy ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0fd10721363fc0a9e0bd780faad607e032524fb4f6bcccf78068a5f7fe5319fc
size 1244659840

internvl3-5_axmodel/qwen3_p128_l0_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:71702b53c25639f32047fb391aa816d0a3fbdd076532eaca815a8ea86352e063
size 35275739

internvl3-5_axmodel/qwen3_p128_l10_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e14a765456a52ffaeecc16f419d71727944233f71f13db9fecbc9cb2f8230e57
size 35275739

internvl3-5_axmodel/qwen3_p128_l11_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:83ff2e11d13e501a978657a9a4e0f1fba5bd7b83ec93d6e45967fe8000dc79ba
size 35275739

internvl3-5_axmodel/qwen3_p128_l12_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e4a517dfb88f693dde987f1b822eb3ca9003bd0b942113a04c21d9034afcda52
size 35275739

internvl3-5_axmodel/qwen3_p128_l13_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c6d4675b57b3e87cd74b3ae15793699886ca5d3f3aab3227260909254f69c3d5
size 35275739

internvl3-5_axmodel/qwen3_p128_l14_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:65dddb4171b6f688dd2bd998589e5057b8453d39e0a95268d3bf12fef5ec23f1
size 35275739

internvl3-5_axmodel/qwen3_p128_l15_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:351dac6756d1c14f4786d1b55d34231e24641bc0d4e6899b6d455f9dcc6f4ac7
size 35275739

internvl3-5_axmodel/qwen3_p128_l16_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9372ec58825360c4c1d66e090749ec74fb589e9538d4f63d2ffb4ed2210c625a
size 35275739

internvl3-5_axmodel/qwen3_p128_l17_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7c1b676f03db77238ecc374a831968ddce9737a9abf4dbf7a95c0d14e50786b1
size 35275739

internvl3-5_axmodel/qwen3_p128_l18_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:625f9855cd7b7c486686ec3a38711748340eb8087ce126557171ed49bf0f1a7d
size 35275739

internvl3-5_axmodel/qwen3_p128_l19_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:96da8471a519e7cbd1236b7964e494e587d2350c6f000d3d4bc6266d8422b723
size 35275739

internvl3-5_axmodel/qwen3_p128_l1_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:499cd84ed255cd5f19ea4f6e3886f6483f785d583ead804dee33cbc5b89c3950
size 35275739

internvl3-5_axmodel/qwen3_p128_l20_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4c190f2267217f3ec07169db2cf34718a1f2a8360edc7a4737eeb1d8aaa832eb
size 35275739

internvl3-5_axmodel/qwen3_p128_l21_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:93640f21e3a7a308a9313ce76e830d6ab9047ac78b788d59303cf2f9afdb5e73
size 35275739

internvl3-5_axmodel/qwen3_p128_l22_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:884d394f2c0ba2b571daa7d9350429f1c153bce7f91df67472c5758c874ba82a
size 35275739

internvl3-5_axmodel/qwen3_p128_l23_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:af327bcf09933e6087c100fbc5ac52a142b201f5e55db81ec4b1fefb3dfb8d37
size 35275739

internvl3-5_axmodel/qwen3_p128_l24_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3ceee8ba3e472826e49a4977d01bb8f245f248250be56d4b04bbe6a81f6e03e9
size 35275739

internvl3-5_axmodel/qwen3_p128_l25_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:dc0817cecbb9809895154c0ed3d1053221767509bcf3e695df73ff6c4762083b
size 35275739

internvl3-5_axmodel/qwen3_p128_l26_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7ecf825ae0bc56cc0bc96b7062fb6dedceb2daad0665f9ae6000efe9a412acf3
size 35275739

internvl3-5_axmodel/qwen3_p128_l27_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:37050cbaa94e0a14a55b8d441775a0120deb35a3fc27207d3e3deb73635937c0
size 35275739

internvl3-5_axmodel/qwen3_p128_l2_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:07b3ac58fa480f03751c8535238ff232fbc0af5068667c110dda74d5594f6b37
size 35275739

internvl3-5_axmodel/qwen3_p128_l3_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0db1e97fa4d4c4c5329e104e25eb0f9670cbbcdfa8b058fb7cc1347212b42324
size 35275739

internvl3-5_axmodel/qwen3_p128_l4_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:faf0b89b9d025cfb647e7d64845135e2cf37203be061e885c6bedd64b93e4d23
size 35275739

internvl3-5_axmodel/qwen3_p128_l5_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b75eb62ae15769a503ccf2061e51133b9f2daf3061623ab2952137e1a147b36f
size 35275739

internvl3-5_axmodel/qwen3_p128_l6_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:67086c8f9ffe5f46857c435108ad587a2744c448d81ecbcd80c223a4047cf71d
size 35275739

internvl3-5_axmodel/qwen3_p128_l7_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9ed5a3a5e72d5bac083e803727001f5436e59e969885c55fe719cec99a2f3016
size 35275739

internvl3-5_axmodel/qwen3_p128_l8_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6eb5ba6a32c93cb1b678d7a857e2aa80c44cb169e9d880d10f01a00ad06a30b8
size 35275739

internvl3-5_axmodel/qwen3_p128_l9_together.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e192956cce5923f46923a7f1a94be90dd489d72a89e1abaafe090baa201ecd5a
size 35275739

internvl3-5_axmodel/qwen3_post.axmodel ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:42f2237caf159f574c3d796adc93f2337c2b76fd18413ce90301781669fecdc4
size 340033671

internvl3-5_tokenizer/added_tokens.json ADDED
@@ -0,0 +1,37 @@
{
  "</box>": 151677,
  "</img>": 151670,
  "</quad>": 151673,
  "</ref>": 151675,
  "</think>": 151668,
  "</tool_call>": 151658,
  "</tool_response>": 151666,
  "<IMG_CONTEXT>": 151671,
  "<box>": 151676,
  "<img>": 151669,
  "<quad>": 151672,
  "<ref>": 151674,
  "<think>": 151667,
  "<tool_call>": 151657,
  "<tool_response>": 151665,
  "<|box_end|>": 151649,
  "<|box_start|>": 151648,
  "<|endoftext|>": 151643,
  "<|file_sep|>": 151664,
  "<|fim_middle|>": 151660,
  "<|fim_pad|>": 151662,
  "<|fim_prefix|>": 151659,
  "<|fim_suffix|>": 151661,
  "<|im_end|>": 151645,
  "<|im_start|>": 151644,
  "<|image_pad|>": 151655,
  "<|object_ref_end|>": 151647,
  "<|object_ref_start|>": 151646,
  "<|quad_end|>": 151651,
  "<|quad_start|>": 151650,
  "<|repo_name|>": 151663,
  "<|video_pad|>": 151656,
  "<|vision_end|>": 151653,
  "<|vision_pad|>": 151654,
  "<|vision_start|>": 151652
}
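
The runtime scripts above hardcode the `<img>` id (151669); equivalently, it can be looked up from the tokenizer shipped in this directory. A small sketch:

```python
# Look up the <img> token id from the shipped tokenizer instead of hardcoding it.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("internvl3-5_tokenizer/")
img_token_id = tokenizer.convert_tokens_to_ids("<img>")
assert img_token_id == 151669  # matches added_tokens.json above
```
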
internvl3-5_tokenizer/config.json ADDED
@@ -0,0 +1,89 @@
{
  "architectures": [
    "Qwen3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "debug": false,
  "dtype": "bfloat16",
  "eos_token_id": 151645,
  "ep_size": 1,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 6144,
  "layer_types": [
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention",
    "full_attention"
  ],
  "max_position_embeddings": 40960,
  "max_window_layers": 28,
  "micro_forward": false,
  "model_type": "qwen3",
  "num_attention_heads": 16,
  "num_hidden_layers": 28,
  "num_key_value_heads": 8,
  "quantization_config": {
    "bits": 4,
    "checkpoint_format": "gptq",
    "desc_act": false,
    "group_size": 128,
    "lm_head": false,
    "meta": {
      "act_group_aware": false,
      "damp_auto_increment": 0.01,
      "damp_percent": 0.01,
      "mse": 0.0,
      "quantizer": [
        "gptqmodel:5.0.0-dev0"
      ],
      "static_groups": false,
      "true_sequential": true,
      "uri": "https://github.com/modelcloud/gptqmodel",
      "v2": false,
      "v2_alpha": 0.25
    },
    "pack_dtype": "int32",
    "quant_method": "gptq",
    "sym": true
  },
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000,
  "skip_checkpoint": false,
  "sliding_window": null,
  "tie_word_embeddings": false,
  "transformers_version": "4.56.2",
  "use_cache": false,
  "use_deepep": false,
  "use_sliding_window": false,
  "vocab_size": 151936
}
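
A few of these fields determine the attention and KV-cache geometry the runtime scripts rely on; a small consistency check using values copied from this config (with `max_seq_len` taken from the scripts):

```python
# Consistency check of the decoder config above (Qwen3 with grouped-query attention).
hidden_size = 2048
num_attention_heads = 16
head_dim = 128
num_key_value_heads = 8
num_hidden_layers = 28

assert num_attention_heads * head_dim == hidden_size  # 16 * 128 == 2048
kv_width = num_key_value_heads * head_dim             # 1024 values per K (or V) entry
max_seq_len = 2048 - 1                                # runtime budget used by the scripts
print(f"{num_hidden_layers} layers; K and V caches each hold {max_seq_len} x {kv_width} values per layer")
```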