Prince-1 committed
Commit 4904233 · verified · 1 parent: 53f9194

Add files using upload-large-folder tool

.gitattributes CHANGED
@@ -152,3 +152,5 @@ onnx/vpm.blocks.23.mlp.linear_fc2.weight filter=lfs diff=lfs merge=lfs -text
  onnx/vpm.blocks.14.mlp.linear_fc1.weight filter=lfs diff=lfs merge=lfs -text
  onnx/vpm.blocks.1.attn.proj.weight filter=lfs diff=lfs merge=lfs -text
  chandra.rkllm filter=lfs diff=lfs merge=lfs -text
+ data/demo.jpg filter=lfs diff=lfs merge=lfs -text
+ chandra_quant_w8a8_rk3588.rkllm filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -8,6 +8,9 @@ base_model:
  - datalab-to/chandra
  ---

+ # NOTE
+ rkllm requires `setuptools`
+
  # Chandra

  Chandra is an OCR model that outputs markdown, HTML, and JSON. It is highly accurate at extracting text from images and PDFs, while preserving layout information.
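A note this terse is easy to miss; presumably the rkllm-toolkit pulls in `setuptools` at import time, and newer Python environments no longer ship it by default. A minimal, hypothetical pre-flight guard (not part of this commit) that could sit at the top of the export scripts:

# Hypothetical guard, not part of this commit: fail fast if setuptools is
# absent before the rkllm import chain needs it.
try:
    import setuptools  # noqa: F401  (presence check only)
except ImportError as exc:
    raise SystemExit("setuptools is missing; run `pip install setuptools` first") from exc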
chandra_quant_w8a8_rk3588.rkllm ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bf52d7c5cc7760680af626b9450d93a5df779d92d6ba2e592846a62a22b78224
+ size 8863852436
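The three `+` lines above are a standard Git LFS pointer (spec v1): only this stub is committed, while the ~8.9 GB weights live in LFS storage. A small sketch, assuming only the format shown, that parses such a pointer:

# Sketch: parse a Git LFS pointer file like the one above (spec v1).
def parse_lfs_pointer(text: str) -> dict:
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    return {
        "version": fields["version"],
        "sha256": fields["oid"].removeprefix("sha256:"),
        "size_bytes": int(fields["size"]),
    }

pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:bf52d7c5cc7760680af626b9450d93a5df779d92d6ba2e592846a62a22b78224
size 8863852436"""
print(parse_lfs_pointer(pointer))  # size_bytes == 8863852436 (~8.9 GB)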
data/datasets.json ADDED
@@ -0,0 +1,22 @@
+ [
+ {"image_path": "data/datasets", "image": "1.jpg", "input": "Question: What is correct Python code to generate the content of the image?\nOptions:\nA. for x in range(6):\n print(x)\nelse:\n print(\"Finally finished!\")\n\nB. thisdict = {\n \"brand\": \"Ford\",\n \"model\": \"Mustang\",\n \"year\": 1964\n}\n\nprint(len(thisdict))\nC. x = 1\ny = 2.8\nz = 1j\n\nprint(type(x))\nprint(type(y))\nprint(type(z))\n\nD. fruits = [\"apple\", \"banana\", \"cherry\"]\nfor x in fruits:\n print(x)\nPlease select the correct answer from the options above. \n", "target":"D"},
+ {"image_path": "data/datasets", "image": "2.jpg", "input": "Question: What is correct Python code to generate the content of the image?\nOptions:\nA. class Person:\n def __init__(self, name, age):\n self.name = name\n self.age = age\n\np1 = Person(\"John\", 36)\n\nprint(p1.name)\nprint(p1.age)\nB. fruits = [\"apple\", \"banana\", \"cherry\"]\nfor x in fruits:\n print(x)\nC. x = min(5, 10, 25)\ny = max(5, 10, 25)\n\nprint(x)\nprint(y)\nD. a = 33\nb = 200\nif b > a:\n print(\"b is greater than a\")\nPlease select the correct answer from the options above. \n", "target":"D"},
+ {"image_path": "data/datasets", "image": "21.jpg", "input": "Question: Which one is the correct caption of this image?\nOptions:\nA. A man rides a surfboard on a large wave.\nB. a young boy barefoot holding an umbrella touching the horn of a cow\nC. A giraffe standing by a stall in a field.\nD. A stop sign that has been vandalized with graffiti.\nPlease select the correct answer from the options above. \n", "target":"B"},
+ {"image_path": "data/datasets", "image": "22.jpg", "input": "Question: Which one is the correct caption of this image?\nOptions:\nA. A narrow kitchen filled with appliances and cooking utensils.\nB. A person with glasses and a tie in a room.\nC. Tray of vegetables with cucumber, carrots, broccoli and celery.\nD. A pretty young woman riding a surfboard on a wave in the ocean.\nPlease select the correct answer from the options above. \n", "target":"A"},
+ {"image_path": "data/datasets", "image": "241.jpg", "input": "Hint: The passage below describes an experiment. Read the passage and then follow the instructions below.\n\nMadelyn applied a thin layer of wax to the underside of her snowboard and rode the board straight down a hill. Then, she removed the wax and rode the snowboard straight down the hill again. She repeated the rides four more times, alternating whether she rode with a thin layer of wax on the board or not. Her friend Tucker timed each ride. Madelyn and Tucker calculated the average time it took to slide straight down the hill on the snowboard with wax compared to the average time on the snowboard without wax.\nFigure: snowboarding down a hill.\nQuestion: Identify the question that Madelyn and Tucker's experiment can best answer.\nOptions:\nA. Does Madelyn's snowboard slide down a hill in less time when it has a thin layer of wax or a thick layer of wax?\nB. Does Madelyn's snowboard slide down a hill in less time when it has a layer of wax or when it does not have a layer of wax?\nPlease select the correct answer from the options above. \n", "target":"B"},
+ {"image_path": "data/datasets", "image": "252.jpg", "input": "Hint: People can use the engineering-design process to develop solutions to problems. One step in the process is testing if a potential solution meets the requirements of the design.\nThe passage below describes how the engineering-design process was used to test a solution to a problem. Read the passage. Then answer the question below.\n\nLaura and Isabella were making batches of concrete for a construction project. To make the concrete, they mixed together dry cement powder, gravel, and water. Then, they checked if each batch was firm enough using a test called a slump test.\nThey poured some of the fresh concrete into an upside-down metal cone. They left the concrete in the metal cone for 30 seconds. Then, they lifted the cone to see if the concrete stayed in a cone shape or if it collapsed. If the concrete in a batch collapsed, they would know the batch should not be used.\nFigure: preparing a concrete slump test.\nQuestion: Which of the following could Laura and Isabella's test show?\nOptions:\nA. if the concrete from each batch took the same amount of time to dry\nB. if a new batch of concrete was firm enough to use\nPlease select the correct answer from the options above. \n", "target":"B"},
+ {"image_path": "data/datasets", "image": "362.jpg", "input": "Hint: Native copper has the following properties:\nsolid\nnot made by living things\nfound in nature\nfixed crystal structure\nmade of the metal copper\nQuestion: Is native copper a mineral?\nOptions:\nA. no\nB. yes\nPlease select the correct answer from the options above. \n", "target":"B"},
+ {"image_path": "data/datasets", "image": "364.jpg", "input": "Hint: Plastic has the following properties:\nsolid\nno fixed crystal structure\nnot a pure substance\nmade in a factory\nQuestion: Is plastic a mineral?\nOptions:\nA. yes\nB. no\nPlease select the correct answer from the options above. \n", "target":"B"},
+ {"image_path": "data/datasets", "image": "448.jpg", "input": "Hint: Read the text.\nButterflies and moths are easily mistaken for each other, but one distinction between them often appears during their pupal stage. When most butterfly caterpillars reach full size, they attach themselves to a leaf or other object and shed their skin a final time, forming a chrysalis, a hard, shell-like skin, which protects the pupa inside. The chrysalis may be dull and rough or shiny and smooth, usually blending into its surroundings. Most moth caterpillars, by contrast, create a cocoon to protect the pupa, rather than forming a chrysalis. The cocoons usually resemble hard silk pouches, but some moths also incorporate materials like hairs and twigs.\nQuestion: Which term matches the picture?\nOptions:\nA. cocoon\nB. chrysalis\nPlease select the correct answer from the options above. \n", "target":"B"},
+ {"image_path": "data/datasets", "image": "477.jpg", "input": "Hint: Read the text.\nHeat transfer can occur in different ways. Two common ways are through conduction and convection. Conduction occurs when molecules from one object collide with molecules from another object. Burning your hand by touching a hot car door on a sunny summer day is an example of conduction.\nConvection is another form of heat transfer. When a liquid or gas is heated, the heated matter rises upward, away from the heat source. Hot bubbles rising in a pot of water boiling on a stove is an example of convection.\nQuestion: Which term matches the picture?\nOptions:\nA. conduction\nB. convection\nPlease select the correct answer from the options above. \n", "target":"B"},
+ {"image_path": "data/datasets", "image": "1231.jpg", "input": "Question: Which image is more brightful?\nOptions:\nA. The first image\nB. The second image\nPlease select the correct answer from the options above. \n", "target":"A"},
+ {"image_path": "data/datasets", "image": "1232.jpg", "input": "Question: Which image is more brightful?\nOptions:\nA. The first image\nB. The second image\nPlease select the correct answer from the options above. \n", "target":"A"},
+ {"image_path": "data/datasets", "image": "1085.jpg", "input": "Question: is this place crowded?\nOptions:\nA. yes\nB. no\nPlease select the correct answer from the options above. \n", "target":"A"},
+ {"image_path": "data/datasets", "image": "1086.jpg", "input": "Question: is this place crowded?\nOptions:\nA. yes\nB. no\nPlease select the correct answer from the options above. \n", "target":"A"},
+ {"image_path": "data/datasets", "image": "1128.jpg", "input": "Question: In this picture, are the two dolphins the same size?\nOptions:\nA. same\nB. Not the same\nC. Can't judge\nPlease select the correct answer from the options above. \n", "target":"B"},
+ {"image_path": "data/datasets", "image": "1129.jpg", "input": "Question: In this picture, are the two butterfly wings the same shape?\nOptions:\nA. same\nB. Not the same\nC. Can't judge\nPlease select the correct answer from the options above. \n", "target":"B"},
+ {"image_path": "data/datasets", "image": "1200.jpg", "input": "Question: What will happen next?\nOptions:\nA. the motorcyle is gonna go forward\nB. the motorcyle is gonna crash\nC. the motorcyle is gonna go backward\nD. both A,B, and C\nPlease select the correct answer from the options above. \n", "target":"B"},
+ {"image_path": "data/datasets", "image": "1201.jpg", "input": "Question: What will happen next?\nOptions:\nA. this person is gonna stay still\nB. this person is gonna keep walking\nC. this person is gonna fall into the water\nD. both A,B, and C\nPlease select the correct answer from the options above. \n", "target":"C"},
+ {"image_path": "data/datasets", "image": "1554.jpg", "input": "Question: The object shown in this figure:\nOptions:\nA. Is a colorless, flammable liquid that is commonly used as a solvent and fuel\nB. Has a boiling point of 64.7°C\nC. Can be toxic if ingested or absorbed through the skin\nD. None of these options are correct.\nPlease select the correct answer from the options above. \n", "target":"C"},
+ {"image_path": "data/datasets", "image": "1555.jpg", "input": "Question: The object shown in this figure:\nOptions:\nA. Is a lustrous, white metal that is highly reflective and ductile\nB. Has the highest electrical and thermal conductivity of all metals\nC. Has a boiling point of 2,162°C\nD. All of these options are correct.\nPlease select the correct answer from the options above. \n", "target":"D"}
+ ]
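`data/datasets.json` is the calibration set that `export_rkllm.py` (below) passes to `llm.build(...)` for w8a8 quantization. A minimal sketch, assuming the schema shown above, to sanity-check the entries before a conversion run:

# Sketch: validate the calibration entries in data/datasets.json
# (assumes the schema shown above: image_path, image, input, target).
import json, os

with open("data/datasets.json") as f:
    entries = json.load(f)

for e in entries:
    img = os.path.join(e["image_path"], e["image"])
    assert os.path.isfile(img), f"missing calibration image: {img}"
    assert e["target"], "every entry needs a reference answer"
print(f"{len(entries)} calibration samples OK")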
data/datasets/1.jpg ADDED
data/datasets/1085.jpg ADDED
data/datasets/1086.jpg ADDED
data/datasets/1128.jpg ADDED
data/datasets/1129.jpg ADDED
data/datasets/1200.jpg ADDED
data/datasets/1201.jpg ADDED
data/datasets/1231.jpg ADDED
data/datasets/1232.jpg ADDED
data/datasets/1554.jpg ADDED
data/datasets/1555.jpg ADDED
data/datasets/2.jpg ADDED
data/datasets/21.jpg ADDED
data/datasets/22.jpg ADDED
data/datasets/241.jpg ADDED
data/datasets/252.jpg ADDED
data/datasets/362.jpg ADDED
data/datasets/364.jpg ADDED
data/datasets/448.jpg ADDED
data/datasets/477.jpg ADDED
data/demo.jpg ADDED

Git LFS Details

  • SHA256: 58c5c9898c5359bcf53797711e3d954c8ef529e141cb012ffc433376933839e7
  • Pointer size: 131 Bytes
  • Size of remote file: 245 kB
export_rkllm.py ADDED
@@ -0,0 +1,52 @@
+ import os
+ import argparse
+ from rkllm.api import RKLLM
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--path', type=str, default='Qwen/Qwen2-VL-2B-Instruct', help='model path', required=False)
+ parser.add_argument('--target-platform', type=str, default='rk3588', help='target platform', required=False)
+ parser.add_argument('--num_npu_core', type=int, default=3, help='npu core num', required=False)
+ parser.add_argument('--quantized_dtype', type=str, default='w8a8', help='quantized dtype', required=False)
+ parser.add_argument('--device', type=str, default='cpu', help='device', required=False)
+ parser.add_argument('--savepath', type=str, default='qwen2_vl_2b_instruct.rkllm', help='save path (unused: the output path is derived from --path below)', required=False)
+ args = parser.parse_args()
+
+ modelpath = args.path
+ target_platform = args.target_platform
+ num_npu_core = args.num_npu_core
+ quantized_dtype = args.quantized_dtype
+
+ # Output path: ./rkllm/<model-dir-name>_<dtype>_<platform>.rkllm
+ savepath = os.path.join("./rkllm", os.path.basename(modelpath).lower() + "_" + quantized_dtype + "_" + target_platform + ".rkllm")
+ os.makedirs(os.path.dirname(savepath), exist_ok=True)
+
+ llm = RKLLM()
+ # Load the Hugging Face model
+ # Use 'export CUDA_VISIBLE_DEVICES=2' to pin a specific GPU device
+ ret = llm.load_huggingface(model=modelpath, device=args.device)
+ if ret != 0:
+     print('Load model failed!')
+     exit(ret)
+
+ # Build (quantize) the model, calibrating on the multimodal dataset below
+ dataset = 'data/datasets.json'
+
+ qparams = None
+ ret = llm.build(do_quantization=True, optimization_level=1, quantized_dtype=quantized_dtype,
+                 quantized_algorithm='normal', target_platform=target_platform, num_npu_core=num_npu_core, extra_qparams=qparams, dataset=dataset)
+
+ if ret != 0:
+     print('Build model failed!')
+     exit(ret)
+
+ # Export the .rkllm model
+ ret = llm.export_rkllm(savepath)
+ if ret != 0:
+     print('Export model failed!')
+     exit(ret)
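The output name is derived from the model directory, not from `--savepath`. Assuming the local checkout was named `chandra_quant` (an assumption, not shown in this commit), the derivation reproduces the file added above:

# Sketch: how export_rkllm.py derives its output name. With a hypothetical
# local model directory "CKPT/chandra_quant", this matches the committed file.
import os

modelpath = "CKPT/chandra_quant"  # hypothetical local checkout
savepath = os.path.join(
    "./rkllm",
    os.path.basename(modelpath).lower() + "_" + "w8a8" + "_" + "rk3588" + ".rkllm",
)
print(savepath)  # ./rkllm/chandra_quant_w8a8_rk3588.rkllm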
export_vision.py ADDED
@@ -0,0 +1,323 @@
+ import torch
+ import os
+ import math
+ import argparse
+ from transformers import AutoModel
+
+ # Wraps the MiniCPM-V 2.6 vision tower and resampler so they export as a
+ # single ONNX graph with one 'pixel' input.
+ class minicpm_v_2_6_vision(torch.nn.Module):
+     def __init__(self, vlm, batch_size, in_h, in_w):
+         super(minicpm_v_2_6_vision, self).__init__()
+         self.vpm = vlm.vpm
+         self.resampler = vlm.resampler
+         patch_size = vlm.config.patch_size
+         num_patches_per_side = vlm.vpm.embeddings.num_patches_per_side
+         tgt_sizes = torch.Tensor([[(in_h // patch_size), math.ceil(in_w / patch_size)]]).type(torch.int32)
+         patch_attention_mask = torch.ones(
+             size=(batch_size, in_h // patch_size, in_w // patch_size),
+             dtype=torch.bool, device=vlm.device,
+         )
+         max_im_h, max_im_w = in_h, in_w
+         max_nb_patches_h, max_nb_patches_w = max_im_h // patch_size, max_im_w // patch_size
+         boundaries = torch.arange(1 / num_patches_per_side, 1.0, 1 / num_patches_per_side)
+         position_ids = torch.full(
+             size=(batch_size, max_nb_patches_h * max_nb_patches_w),
+             fill_value=0,
+         )
+         for batch_idx, p_attn_mask in enumerate(patch_attention_mask):
+             if tgt_sizes is not None:
+                 nb_patches_h = tgt_sizes[batch_idx][0]
+                 nb_patches_w = tgt_sizes[batch_idx][1]
+             else:
+                 nb_patches_h = p_attn_mask[:, 0].sum()
+                 nb_patches_w = p_attn_mask[0].sum()
+
+             fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h)
+             fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w)
+
+             bucket_coords_h = torch.bucketize(fractional_coords_h, boundaries, right=True)
+             bucket_coords_w = torch.bucketize(fractional_coords_w, boundaries, right=True)
+
+             pos_ids = (bucket_coords_h[:, None] * num_patches_per_side + bucket_coords_w).flatten()
+             position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids
+
+         position_ids = position_ids.to(vlm.device)
+         self.position_ids = position_ids
+
+         patch_len = tgt_sizes[:, 0] * tgt_sizes[:, 1]
+         max_patch_len = torch.max(patch_len)
+         key_padding_mask = torch.zeros((batch_size, max_patch_len), dtype=torch.bool, device=vlm.device)
+         pos_embed = []
+         for i in range(batch_size):
+             tgt_h, tgt_w = tgt_sizes[i]
+             pos_embed.append(self.resampler.pos_embed[:tgt_h, :tgt_w, :].reshape((tgt_h * tgt_w, -1)).to(torch.float32))  # patches * D
+             key_padding_mask[i, patch_len[i]:] = True
+
+         self.pos_embed = torch.nn.utils.rnn.pad_sequence(
+             pos_embed, batch_first=True, padding_value=0.0).permute(1, 0, 2)  # BLD => L * B * D
+
+     def forward(self, pixel_values):
+         batch_size = pixel_values.size(0)
+         # patch embedding
+         patch_embeds = self.vpm.embeddings.patch_embedding(pixel_values)
+         embeddings = patch_embeds.flatten(2).transpose(1, 2)
+         hidden_states = embeddings + self.vpm.embeddings.position_embedding(self.position_ids)
+         # encoder
+         encoder_outputs = self.vpm.encoder(inputs_embeds=hidden_states)
+         last_hidden_state = encoder_outputs[0]
+         last_hidden_state = self.vpm.post_layernorm(last_hidden_state)
+         # resampler
+         x = self.resampler.kv_proj(last_hidden_state)  # B * L * D
+         x = self.resampler.ln_kv(x).permute(1, 0, 2)  # L * B * D
+
+         q = self.resampler.ln_q(self.resampler.query)  # Q * D
+
+         out = self.resampler.attn(
+             self.resampler._repeat(q, batch_size),  # Q * B * D
+             x + self.pos_embed,  # L * B * D + L * B * D
+             x)[0]
+         # out: Q * B * D
+         x = out.permute(1, 0, 2)  # B * Q * D
+
+         x = self.resampler.ln_post(x)
+         x = x @ self.resampler.proj
+         return x
+
+ # Re-implements Qwen2.5-VL patch packing (temporal/patch/merge reshape) so the
+ # visual tower takes a plain NCHW tensor plus grid_thw.
+ class qwen2_5_vl_3b_vision(torch.nn.Module):
+     def __init__(self, vlm, batch_size):
+         super(qwen2_5_vl_3b_vision, self).__init__()
+         self.merge_size = 2
+         self.temporal_patch_size = 2
+         self.patch_size = 14
+         self.channel = 3
+         self.vpm = vlm.visual
+         self.batch_size = batch_size
+
+     def forward(self, pixel_value, grid_thw):
+         if self.batch_size == 1:
+             patches = pixel_value.repeat(self.temporal_patch_size, 1, 1, 1)
+         elif self.batch_size % self.temporal_patch_size == 1:
+             repeat_image = pixel_value[-1:, ...].repeat(2, 1, 1, 1)
+             patches = torch.cat((pixel_value, repeat_image), dim=0)
+         else:
+             patches = pixel_value
+         grid_t, grid_h, grid_w = grid_thw[0][0], grid_thw[0][1], grid_thw[0][2]
+         patches = patches.reshape(grid_t, self.temporal_patch_size, self.channel,
+                                   grid_h // self.merge_size, self.merge_size, self.patch_size, grid_w // self.merge_size, self.merge_size, self.patch_size)
+         patches = patches.permute(0, 3, 6, 4, 7, 2, 1, 5, 8)
+         flatten_patches = patches.reshape(grid_t * grid_h * grid_w, self.channel * self.temporal_patch_size * self.patch_size * self.patch_size)
+
+         return self.vpm(flatten_patches, grid_thw)
+
+ # Same packing as above for Qwen3-VL, which uses 16x16 patches.
+ class qwen3_vl_vision(torch.nn.Module):
+     def __init__(self, vlm, batch_size):
+         super(qwen3_vl_vision, self).__init__()
+         self.merge_size = 2
+         self.temporal_patch_size = 2
+         self.patch_size = 16
+         self.channel = 3
+         self.vpm = vlm.visual
+         self.batch_size = batch_size
+
+     def forward(self, pixel_value, grid_thw):
+         if self.batch_size == 1:
+             patches = pixel_value.repeat(self.temporal_patch_size, 1, 1, 1)
+         elif self.batch_size % self.temporal_patch_size == 1:
+             repeat_image = pixel_value[-1:, ...].repeat(2, 1, 1, 1)
+             patches = torch.cat((pixel_value, repeat_image), dim=0)
+         else:
+             patches = pixel_value
+         grid_t, grid_h, grid_w = grid_thw[0][0], grid_thw[0][1], grid_thw[0][2]
+         patches = patches.reshape(grid_t, self.temporal_patch_size, self.channel,
+                                   grid_h // self.merge_size, self.merge_size, self.patch_size, grid_w // self.merge_size, self.merge_size, self.patch_size)
+         patches = patches.permute(0, 3, 6, 4, 7, 2, 1, 5, 8)
+         flatten_patches = patches.reshape(grid_t * grid_h * grid_w, self.channel * self.temporal_patch_size * self.patch_size * self.patch_size)
+
+         return self.vpm(flatten_patches, grid_thw)
+
+ class smolvlm_vision(torch.nn.Module):
+     def __init__(self, vlm):
+         super(smolvlm_vision, self).__init__()
+         self.vpm = vlm.model.vision_model
+         self.connector = vlm.model.connector
+
+     def forward(self, pixel_values):
+         # Get sequence from the vision encoder
+         image_hidden_states = self.vpm(pixel_values).last_hidden_state
+         # Modality projection & resampling
+         image_hidden_states = self.connector(image_hidden_states)
+         print("image_features:", image_hidden_states.shape)
+         return image_hidden_states
+
+ class vila1_5_3b_vision(torch.nn.Module):
+     def __init__(self, vlm):
+         super(vila1_5_3b_vision, self).__init__()
+         self.vlm = vlm
+
+     def forward(self, pixel_values):
+         # Get sequence from the vision encoder
+         out = self.vlm.encode_images(pixel_values)
+         return out
+
+ # DeepSeek-OCR: SAM features and vision-tower features are concatenated,
+ # projected, then flattened with newline/separator embeddings appended.
+ class deepseekocr_vision(torch.nn.Module):
+     def __init__(self, model):
+         super(deepseekocr_vision, self).__init__()
+         self.sam_model = model.sam_model
+         self.vision_model = model.vision_model
+         self.view_seperator = model.view_seperator
+         self.image_newline = model.image_newline
+         self.projector = model.projector
+
+     def forward(self, pixel_value):
+         global_features_1 = self.sam_model(pixel_value)
+         global_features_2 = self.vision_model(pixel_value, global_features_1)
+         global_features = torch.cat((global_features_2[:, 1:], global_features_1.flatten(2).permute(0, 2, 1)), dim=-1)
+         global_features = self.projector(global_features)
+         print('=====================')
+         print('BASE: ', global_features.shape)
+         print('NO PATCHES')
+         print('=====================')
+         _, hw, n_dim = global_features.shape
+         h = w = int(hw ** 0.5)
+         global_features = global_features.view(h, w, n_dim)
+         global_features = torch.cat(
+             [global_features, self.image_newline[None, None, :].expand(h, 1, n_dim)], dim=1
+         )
+         global_features = global_features.view(-1, n_dim)
+         global_local_features = torch.cat([global_features, self.view_seperator[None, :]], dim=0)
+         return global_local_features
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument('--path', type=str, default='CKPT/MiniCPM-V-2_6', help='model path', required=False)
+     parser.add_argument('--model_name', type=str, default='minicpm-v-2_6', help='model name', required=False)
+     parser.add_argument('--batch_size', type=int, default=1, help='batch size', required=False)
+     parser.add_argument('--height', type=int, default=448, help='image height', required=False)
+     parser.add_argument('--width', type=int, default=448, help='image width', required=False)
+     parser.add_argument('--device', type=str, default="cpu", help='cpu or cuda', required=False)
+
+     args = parser.parse_args()
+
+     path = args.path
+     model_name = args.model_name
+     savepath = os.path.join("./onnx", model_name + "_vision.onnx")
+     device_type = args.device
+     os.makedirs(os.path.dirname(savepath), exist_ok=True)
+
+     if model_name == 'minicpm-v-2_6':
+         model = AutoModel.from_pretrained(
+             path, trust_remote_code=True, dtype=torch.float32,
+         )
+         model = model.to(device=device_type, dtype=torch.float32)
+         model.eval()
+         model = minicpm_v_2_6_vision(model, args.batch_size, args.height, args.width)
+         # The wrapper is a plain nn.Module with no .device attribute, so place
+         # the dummy input on the CLI-selected device instead.
+         pixel_values = torch.randn(args.batch_size, 3, args.height, args.width, device=device_type, dtype=torch.float32)
+         out = model(pixel_values)
+         print("Output shape:", out.shape)
+         torch.onnx.export(model,
+                           pixel_values,
+                           savepath,
+                           input_names=['pixel'],
+                           opset_version=18)
+     elif model_name == 'qwen2_5-vl-3b':
+         from transformers import Qwen2_5_VLForConditionalGeneration
+         model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+             path,
+             dtype=torch.float32,  # rknn currently only supports float32, so set the dtype explicitly; if the dtype was already fixed when loading the weights, set "use_flash_attn" to false in config.json yourself
+             low_cpu_mem_usage=True, _attn_implementation="eager",
+             trust_remote_code=True).eval().to(device_type)
+         pixel_values = torch.randn(args.batch_size, 3, args.height, args.width, device=model.device, dtype=torch.float32)
+         grid_thw = torch.tensor([[args.batch_size // 2 if args.batch_size % 2 == 0 else args.batch_size // 2 + 1, args.height // 14, args.width // 14]], dtype=torch.int64)
+         model.eval()
+         model = qwen2_5_vl_3b_vision(model, args.batch_size)
+         out = model(pixel_values, grid_thw)
+         print("Output shape:", out.shape)
+         torch.onnx.export(model,
+                           (pixel_values, grid_thw),
+                           savepath,
+                           input_names=['pixel', 'grid_thw'],
+                           dynamic_axes={'pixel': {2: 'height', 3: 'width'}},
+                           opset_version=15)
+     elif model_name == 'qwen3-vl':
+         from transformers import Qwen3VLForConditionalGeneration
+         model = Qwen3VLForConditionalGeneration.from_pretrained(
+             path,
+             dtype=torch.float32,  # rknn currently only supports float32, so set the dtype explicitly; if the dtype was already fixed when loading the weights, set "use_flash_attn" to false in config.json yourself
+             low_cpu_mem_usage=True, _attn_implementation="eager",
+             trust_remote_code=True).eval().to(device_type)
+
+         # Fixed resolution and grid
+         HEIGHT = 224
+         WIDTH = 224
+         BATCH = 1
+
+         pixel_values = torch.randn(
+             BATCH, 3, HEIGHT, WIDTH,
+             device=model.device,
+             dtype=torch.float32
+         )
+
+         grid_thw = torch.tensor(
+             [[1, HEIGHT // 16, WIDTH // 16]],
+             dtype=torch.int64
+         )
+
+         # pixel_values = torch.randn(args.batch_size, 3, args.height, args.width, device=model.device, dtype=torch.float32)
+         # grid_thw = torch.tensor([[args.batch_size // 2 if args.batch_size % 2 == 0 else args.batch_size // 2 + 1, args.height // 16, args.width // 16]], dtype=torch.int64)
+         model.eval()
+         model = qwen3_vl_vision(model, args.batch_size)
+         out = model(pixel_values, grid_thw)
+         print("Output shape:", out[0].shape)
+         torch.onnx.export(model,
+                           (pixel_values, grid_thw),
+                           savepath,
+                           input_names=['pixel', 'grid_thw'],
+                           # dynamic_axes={'pixel': {2: 'height', 3: 'width'}},
+                           opset_version=18
+                           )
+     elif model_name == 'smolvlm':
+         from transformers import SmolVLMForConditionalGeneration
+         model = SmolVLMForConditionalGeneration.from_pretrained(
+             path,
+             dtype=torch.float32,
+             _attn_implementation="eager",
+         ).to(device_type)
+         pixel_values = torch.randn(args.batch_size, 3, args.height, args.width, device=model.device, dtype=torch.float32)
+         print("pixel_values:", pixel_values.shape)
+         model = smolvlm_vision(model)
+         model = model.to(torch.float32).eval()
+         out = model(pixel_values)
+         torch.onnx.export(model,
+                           pixel_values,
+                           savepath,
+                           input_names=['pixel'],
+                           dynamic_axes={'pixel': {2: 'height', 3: 'width'}},
+                           opset_version=18)
+     elif model_name == 'internvl3-1b':
+         model = AutoModel.from_pretrained(
+             path,
+             torch_dtype=torch.float32,
+             low_cpu_mem_usage=True,
+             trust_remote_code=True).eval().to(device_type)
+         pixel_values = torch.randn(args.batch_size, 3, args.height, args.width, device=model.device, dtype=torch.float32)
+         model.forward = model.extract_feature
+         model = model.to(torch.float32).eval()
+         torch.onnx.export(model, pixel_values, savepath, input_names=['pixel'])
+     elif model_name == 'deepseekocr':
+         model = AutoModel.from_pretrained(
+             path,
+             _attn_implementation='eager',
+             torch_dtype=torch.float32,
+             low_cpu_mem_usage=True,
+             trust_remote_code=True).eval().to(device_type)
+         pixel_values = torch.randn(args.batch_size, 3, args.height, args.width, device=model.device, dtype=torch.float32)
+         model = deepseekocr_vision(model.model)
+         model = model.to(torch.float32).eval()
+         torch.onnx.export(model, pixel_values, savepath, input_names=['pixel'], opset_version=18)
+     else:
+         raise ValueError(f"Unsupported model name: {model_name}")
+
+     print(f"Exported to {savepath}")