---
license: mit
---

# IflyBotVLM

## Introduction

IflyBotVLM is an 8B open-source vision-language model (VLM) designed to serve as an embodied brain.

## Model Architecture

## Model Performance

IflyBotVLM demonstrates superior performance across various challenging benchmarks.



IflyBotVLM-8B achieves state-of-the-art (SOTA) or near-SOTA performance on ten spatial comprehension, spatial perception, and temporal task-planning benchmarks: Where2Place, RefSpatial-Bench, ShareRobot-affordance, ShareRobot-trajectory, BLINK (spatial), EmbSpatial, ERQA, CVBench, SAT, and EgoPlan2.

## Quick Start

### Using 🤗 Transformers to Chat

We provide example code below for running `IflyBotVLM` with `transformers`.

> Please use transformers>=4.37.2 to ensure the model works normally.
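
If you are unsure which version is installed, the following minimal check (an illustrative sketch, not part of the official example; it uses the `packaging` helper that transformers already depends on) fails early on older installations:

```python
# Illustrative version guard; upgrade with `pip install -U "transformers>=4.37.2"` if it fails.
from packaging.version import Version
import transformers

assert Version(transformers.__version__) >= Version("4.37.2"), (
    f"transformers {transformers.__version__} is too old; please install >= 4.37.2"
)
```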

<details>
<summary>Python code</summary>

```python
import torch
import torchvision.transforms as T
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer


IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)


class IflyRoboInference:
    def __init__(self, model_path=''):
        self.model = AutoModel.from_pretrained(
            model_path,
            torch_dtype=torch.bfloat16,
            load_in_8bit=False,
            low_cpu_mem_usage=True,
            use_flash_attn=True,
            trust_remote_code=True,
            device_map="balanced").eval()  # "auto" or "balanced"
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=False)
        # top_k=1 together with do_sample=True makes decoding effectively greedy.
        self.generation_config = dict(
            do_sample=True,
            temperature=0.5,
            top_p=0.0,
            top_k=1,
            max_new_tokens=16384
        )

    def build_transform(self, input_size):
        # Resize to a square tile and apply standard ImageNet normalization.
        MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
        transform = T.Compose([
            T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
            T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
            T.ToTensor(),
            T.Normalize(mean=MEAN, std=STD)
        ])
        return transform

    def find_closest_aspect_ratio(self, aspect_ratio, target_ratios, width, height, image_size):
        # Pick the tiling grid whose aspect ratio best matches the input image.
        best_ratio_diff = float('inf')
        best_ratio = (1, 1)
        area = width * height
        for ratio in target_ratios:
            target_aspect_ratio = ratio[0] / ratio[1]
            ratio_diff = abs(aspect_ratio - target_aspect_ratio)
            if ratio_diff < best_ratio_diff:
                best_ratio_diff = ratio_diff
                best_ratio = ratio
            elif ratio_diff == best_ratio_diff:
                if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                    best_ratio = ratio
        return best_ratio

    def dynamic_preprocess(self, image, min_num=1, max_num=12, image_size=896, use_thumbnail=False):
        # Split the image into up to `max_num` square tiles that preserve its aspect ratio.
        orig_width, orig_height = image.size
        aspect_ratio = orig_width / orig_height

        target_ratios = set(
            (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
            i * j <= max_num and i * j >= min_num)
        target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

        target_aspect_ratio = self.find_closest_aspect_ratio(
            aspect_ratio, target_ratios, orig_width, orig_height, image_size)

        target_width = image_size * target_aspect_ratio[0]
        target_height = image_size * target_aspect_ratio[1]
        blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

        resized_img = image.resize((target_width, target_height))
        processed_images = []
        for i in range(blocks):
            box = (
                (i % (target_width // image_size)) * image_size,
                (i // (target_width // image_size)) * image_size,
                ((i % (target_width // image_size)) + 1) * image_size,
                ((i // (target_width // image_size)) + 1) * image_size
            )
            split_img = resized_img.crop(box)
            processed_images.append(split_img)
        assert len(processed_images) == blocks
        if use_thumbnail and len(processed_images) != 1:
            thumbnail_img = image.resize((image_size, image_size))
            processed_images.append(thumbnail_img)
        return processed_images

    def load_image(self, image_file, input_size=896, max_num=12):
        image = Image.open(image_file).convert('RGB')
        transform = self.build_transform(input_size=input_size)
        images = self.dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
        pixel_values = [transform(image) for image in images]
        pixel_values = torch.stack(pixel_values)
        return pixel_values

    def forward_multi_image(self, image_paths: list, question: dict):
        pixel_values = []
        num_patches_list = []
        resize_size = 448
        for image_path in image_paths:
            pixel_value = self.load_image(image_path, input_size=resize_size).to(torch.bfloat16).cuda()
            pixel_values.append(pixel_value)
            num_patches_list.append(pixel_value.size(0))
        pixel_values = torch.cat(tuple(pixel_values), dim=0)
        print(question)
        # num_patches_list tells chat() how many tiles belong to each <image> placeholder in the prompt.
        response, history = self.model.chat(self.tokenizer, pixel_values, question["prompt"], self.generation_config,
                                            num_patches_list=num_patches_list, history=None, return_history=True)
        print(response)
        return response


def test_spatial_from_blink():
    hf_path = "IflyBot/IflyBotVLM"
    ifly_robo_infer = IflyRoboInference(hf_path)
    question = {
        "idx": "val_Spatial_Relation_143",
        "sub_task": "Spatial Relation",
        "prompt": "<image> Is the person behind the cup?\nSelect from the following choices.\n(A) yes\n(B) no.\nPlease answer directly with only the letter of the correct option and nothing else."
    }
    image_path = [
        "./examples-images/val_Spatial_Relation_143_1.jpg"
    ]
    ifly_robo_infer.forward_multi_image(image_path, question)


def test_visual_correspondence_from_blink():
    hf_path = "IflyBot/IflyBotVLM"
    ifly_robo_infer = IflyRoboInference(hf_path)
    question = {
        "idx": "val_Visual_Correspondence_1",
        "sub_task": "Visual Correspondence",
        "prompt": "<image> <image> A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\n(A) Point A\n(B) Point B\n(C) Point C\n(D) Point D.\nPlease answer directly with only the letter of the correct option and nothing else."
    }
    image_path = [
        "./examples-images/val_Visual_Correspondence_1_1.jpg",
        "./examples-images/val_Visual_Correspondence_1_2.jpg"
    ]
    ifly_robo_infer.forward_multi_image(image_path, question)


if __name__ == '__main__':
    test_spatial_from_blink()
    test_visual_correspondence_from_blink()
    # test_task_plan_from_egoplan2()  # not defined in this snippet; kept for reference
```

</details>
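
As a usage example, the snippet below is a minimal sketch of running a single-image question of your own: it assumes the `IflyRoboInference` class from the code above is in scope, and the image path and prompt are placeholders.

```python
# Hypothetical single-image query reusing the IflyRoboInference class defined above.
infer = IflyRoboInference("IflyBot/IflyBotVLM")
question = {
    "idx": "custom_0",
    "sub_task": "Spatial Relation",
    "prompt": "<image> Is the mug to the left of the laptop?\nPlease answer directly with yes or no."
}
infer.forward_multi_image(["./examples-images/your_image.jpg"], question)
```

Note that the prompt should contain one `<image>` placeholder per image passed to `forward_multi_image`, as in the two test functions above.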