---
license: mit
---

# IflyBotVLM

## Introduction

IflyBotVLM is an 8B open-source vision-language model (VLM) designed to serve as an embodied brain.

## Model Architecture

## Model Performance

IflyBotVLM demonstrates strong performance across a range of challenging benchmarks.

![image/png](https://huggingface.co/datasets/IflyBot/IflyBotVLM-Repo/resolve/main/images/benchmark_performance.png)

![image/png](https://huggingface.co/datasets/IflyBot/IflyBotVLM-Repo/resolve/main/images/table-performances.png)

IflyBotVLM-8B achieves state-of-the-art (SOTA) or near-SOTA results on ten benchmarks covering spatial comprehension, spatial perception, and temporal task planning: Where2Place, Refspatial-bench, ShareRobot-affordance, ShareRobot-trajectory, BLINK (spatial), EmbSpatial, ERQA, CVBench, SAT, and EgoPlan2.

## Quick Start

### Using 🤗 Transformers to Chat

We provide example code for running IflyBotVLM with `transformers`.

> Please use `transformers>=4.37.2` to ensure the model works as expected.

<details>
<summary>Python code</summary>

```python
import torch
import torchvision.transforms as T
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

# ImageNet normalization statistics used by the vision encoder.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)


class IflyRoboInference:
    def __init__(self, model_path=''):
        self.model = AutoModel.from_pretrained(
            model_path,
            torch_dtype=torch.bfloat16,
            load_in_8bit=False,
            low_cpu_mem_usage=True,
            use_flash_attn=True,
            trust_remote_code=True,
            device_map="balanced").eval()  # "auto" and "balanced" both work
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=False)
        self.generation_config = dict(
            do_sample=True,
            temperature=0.5,
            top_p=0.0,
            top_k=1,
            max_new_tokens=16384
        )

    def build_transform(self, input_size):
        # Convert to RGB, resize to a square tile, and normalize with ImageNet statistics.
        MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
        transform = T.Compose([
            T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
            T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
            T.ToTensor(),
            T.Normalize(mean=MEAN, std=STD)
        ])
        return transform

    def find_closest_aspect_ratio(self, aspect_ratio, target_ratios, width, height, image_size):
        # Pick the tiling grid (cols, rows) whose aspect ratio is closest to the input image.
        best_ratio_diff = float('inf')
        best_ratio = (1, 1)
        area = width * height
        for ratio in target_ratios:
            target_aspect_ratio = ratio[0] / ratio[1]
            ratio_diff = abs(aspect_ratio - target_aspect_ratio)
            if ratio_diff < best_ratio_diff:
                best_ratio_diff = ratio_diff
                best_ratio = ratio
            elif ratio_diff == best_ratio_diff:
                # Tie-break: prefer the larger grid only if the image is big enough to fill it.
                if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                    best_ratio = ratio
        return best_ratio

    def dynamic_preprocess(self, image, min_num=1, max_num=12, image_size=896, use_thumbnail=False):
        # Split the image into min_num..max_num square tiles whose grid best matches
        # the original aspect ratio, optionally appending a global thumbnail tile.
        orig_width, orig_height = image.size
        aspect_ratio = orig_width / orig_height

        target_ratios = set(
            (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
            i * j <= max_num and i * j >= min_num)
        target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

        target_aspect_ratio = self.find_closest_aspect_ratio(
            aspect_ratio, target_ratios, orig_width, orig_height, image_size)

        target_width = image_size * target_aspect_ratio[0]
        target_height = image_size * target_aspect_ratio[1]
        blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

        resized_img = image.resize((target_width, target_height))
        processed_images = []
        for i in range(blocks):
            box = (
                (i % (target_width // image_size)) * image_size,
                (i // (target_width // image_size)) * image_size,
                ((i % (target_width // image_size)) + 1) * image_size,
                ((i // (target_width // image_size)) + 1) * image_size
            )
            split_img = resized_img.crop(box)
            processed_images.append(split_img)
        assert len(processed_images) == blocks
        if use_thumbnail and len(processed_images) != 1:
            thumbnail_img = image.resize((image_size, image_size))
            processed_images.append(thumbnail_img)
        return processed_images

    def load_image(self, image_file, input_size=896, max_num=12):
        # Tile an image file and return a (num_tiles, 3, input_size, input_size) tensor.
        image = Image.open(image_file).convert('RGB')
        transform = self.build_transform(input_size=input_size)
        images = self.dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
        pixel_values = [transform(image) for image in images]
        pixel_values = torch.stack(pixel_values)
        return pixel_values

    def forward_multi_image(self, image_paths: list, question: dict):
        # Preprocess one or more images, concatenate their tiles, and run a single chat turn.
        pixel_values = []
        num_patches_list = []
        resize_size = 448
        for image_path in image_paths:
            pixel_value = self.load_image(image_path, input_size=resize_size).to(torch.bfloat16).cuda()
            pixel_values.append(pixel_value)
            num_patches_list.append(pixel_value.size(0))
        pixel_values = torch.cat(tuple(pixel_values), dim=0)
        print(question)
        # num_patches_list tells chat() how many tiles belong to each <image> placeholder.
        response, history = self.model.chat(self.tokenizer, pixel_values, question["prompt"],
                                            self.generation_config, num_patches_list=num_patches_list,
                                            history=None, return_history=True)
        print(response)


def test_spatial_from_blink():
    hf_path = "IflyBot/IflyBotVLM"
    ifly_robo_infer = IflyRoboInference(hf_path)
    question = {
        "idx": "val_Spatial_Relation_143",
        "sub_task": "Spatial Relation",
        "prompt": "<image> Is the person behind the cup?\nSelect from the following choices.\n(A) yes\n(B) no.\nPlease answer directly with only the letter of the correct option and nothing else."
    }
    image_path = [
        "./examples-images/val_Spatial_Relation_143_1.jpg"
    ]
    ifly_robo_infer.forward_multi_image(image_path, question)


def test_visual_correspondence_from_blink():
    hf_path = "IflyBot/IflyBotVLM"
    ifly_robo_infer = IflyRoboInference(hf_path)
    question = {
        "idx": "val_Visual_Correspondence_1",
        "sub_task": "Visual Correspondence",
        "prompt": "<image> <image> A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\n(A) Point A\n(B) Point B\n(C) Point C\n(D) Point D.\nPlease answer directly with only the letter of the correct option and nothing else."
    }
    image_path = [
        "./examples-images/val_Visual_Correspondence_1_1.jpg",
        "./examples-images/val_Visual_Correspondence_1_2.jpg"
    ]
    ifly_robo_infer.forward_multi_image(image_path, question)


if __name__ == '__main__':
    test_spatial_from_blink()
    test_visual_correspondence_from_blink()
    # test_task_plan_from_egoplan2()  # EgoPlan2 task-planning example; not defined in this snippet
```

</details>
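
Because `chat()` returns a `history` object, the same preprocessed image can be queried over multiple turns. The following is a minimal multi-turn sketch, assuming IflyBotVLM follows the InternVL-style `chat` interface used above and that the `IflyRoboInference` class from the snippet is already defined; the image path and prompts are illustrative only.

```python
# Minimal multi-turn sketch (assumption: InternVL-style chat API as in the snippet above;
# the image path and prompts below are placeholders, not part of any benchmark).
import torch

infer = IflyRoboInference("IflyBot/IflyBotVLM")
pixel_values = infer.load_image("./examples-images/val_Spatial_Relation_143_1.jpg",
                                input_size=448).to(torch.bfloat16).cuda()

# First turn: the <image> placeholder binds the prompt to the preprocessed tiles.
question = "<image> Describe the spatial layout of the objects in this image."
response, history = infer.model.chat(infer.tokenizer, pixel_values, question,
                                     infer.generation_config, history=None,
                                     return_history=True)
print(response)

# Follow-up turn: pass the returned history so the previous exchange stays in context.
follow_up = "Which of these objects is closest to the camera?"
response, history = infer.model.chat(infer.tokenizer, pixel_values, follow_up,
                                     infer.generation_config, history=history,
                                     return_history=True)
print(response)
```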