metadata
license: mit
IflyBotVLM
Introduction
IflyBotVLM is an 8B open-source vision-language model (VLM) designed to serve as an embodied brain.
Model Architecture
Model Performance
IflyBotVLM demonstrates superior performance across various challenging benchmarks.
IflyBotVLM-8B achieves state-of-the-art (SOTA) or near-SOTA performance on ten spatial comprehension, spatial perception, and temporal task planning benchmarks: Where2Place, Refspatial-bench, ShareRobot-affordance, ShareRobot-trajectory, BLINK(spatial), EmbSpatial, ERQA, CVBench, SAT, EgoPlan2.
Quick Start
Using 🤗 Transformers to Chat
We provide example code to run IflyBotVLM using transformers.
Please use transformers>=4.37.2 to ensure the model works normally.
Python code
import math
import numpy as np
import torch
import torchvision.transforms as T
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer,AutoConfig
from tqdm import tqdm
import json
# Per-channel (R, G, B) mean and standard deviation handed to T.Normalize
# in IflyRoboInference.build_transform.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
class IflyRoboInference:
    """Inference helper around a chat-capable vision-language model.

    Loads the model and tokenizer with ``from_pretrained``, splits each input
    image into square tiles, and sends the stacked tiles together with a text
    prompt to the model's ``chat`` method.
    """

    def __init__(self, model_path=''):
        """Load the model and tokenizer and define the generation settings.

        Args:
            model_path: Local path or hub identifier forwarded to
                ``AutoModel.from_pretrained`` and
                ``AutoTokenizer.from_pretrained``.
        """
        self.model = AutoModel.from_pretrained(
            model_path,
            torch_dtype=torch.bfloat16,
            load_in_8bit=False,
            low_cpu_mem_usage=True,
            use_flash_attn=True,
            trust_remote_code=True,
            device_map="balanced",  # "auto", "balanced"
        ).eval()
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_path, trust_remote_code=True, use_fast=False)
        # NOTE(review): sampling is switched on, but top_k=1 keeps only the
        # single most likely token, so decoding is presumably equivalent to
        # greedy search -- confirm against the model's generate() behavior.
        self.generation_config = dict(
            do_sample=True,
            temperature=0.5,
            top_p=0.0,
            top_k=1,
            max_new_tokens=16384,
        )

    def build_transform(self, input_size):
        """Build the per-tile preprocessing pipeline.

        Args:
            input_size: Side length, in pixels, every tile is resized to.

        Returns:
            A ``T.Compose`` that converts the image to RGB when needed,
            resizes it to ``(input_size, input_size)`` with bicubic
            interpolation, converts it to a tensor, and normalizes it with
            ``IMAGENET_MEAN`` / ``IMAGENET_STD``.
        """
        MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
        transform = T.Compose([
            T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
            T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
            T.ToTensor(),
            T.Normalize(mean=MEAN, std=STD),
        ])
        return transform

    def find_closest_aspect_ratio(self, aspect_ratio, target_ratios, width, height, image_size):
        """Pick the tile grid whose aspect ratio best matches the image.

        Args:
            aspect_ratio: Width divided by height of the source image.
            target_ratios: Iterable of ``(columns, rows)`` candidates, visited
                in order.
            width: Source image width in pixels.
            height: Source image height in pixels.
            image_size: Side length of one square tile in pixels.

        Returns:
            The ``(columns, rows)`` candidate with the smallest absolute
            aspect-ratio difference, or ``(1, 1)`` when ``target_ratios`` is
            empty.
        """
        best_ratio_diff = float('inf')
        best_ratio = (1, 1)
        area = width * height
        for ratio in target_ratios:
            target_aspect_ratio = ratio[0] / ratio[1]
            ratio_diff = abs(aspect_ratio - target_aspect_ratio)
            if ratio_diff < best_ratio_diff:
                best_ratio_diff = ratio_diff
                best_ratio = ratio
            elif ratio_diff == best_ratio_diff:
                # On a tie, switch to this grid only when the source image
                # covers more than half of the grid's total pixel area.
                if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                    best_ratio = ratio
        return best_ratio

    def dynamic_preprocess(self, image, min_num=1, max_num=12, image_size=896, use_thumbnail=False):
        """Resize an image onto a tile grid and cut it into square tiles.

        Args:
            image: PIL-style image exposing ``size``, ``resize`` and ``crop``.
            min_num: Minimum number of tiles a candidate grid may contain.
            max_num: Maximum number of tiles a candidate grid may contain.
            image_size: Side length of each square tile in pixels.
            use_thumbnail: When true and more than one tile was produced,
                append the whole image resized to a single tile.

        Returns:
            List of tile images in row-major order, optionally followed by
            the thumbnail.
        """
        orig_width, orig_height = image.size
        aspect_ratio = orig_width / orig_height

        # Every (columns, rows) grid whose tile count lies within
        # [min_num, max_num], ordered by tile count.
        target_ratios = set(
            (i, j)
            for n in range(min_num, max_num + 1)
            for i in range(1, n + 1)
            for j in range(1, n + 1)
            if min_num <= i * j <= max_num)
        target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

        cols, rows = self.find_closest_aspect_ratio(
            aspect_ratio, target_ratios, orig_width, orig_height, image_size)
        resized_img = image.resize((image_size * cols, image_size * rows))

        processed_images = []
        for index in range(cols * rows):
            row, col = divmod(index, cols)
            left = col * image_size
            upper = row * image_size
            processed_images.append(
                resized_img.crop((left, upper, left + image_size, upper + image_size)))

        if use_thumbnail and len(processed_images) != 1:
            processed_images.append(image.resize((image_size, image_size)))
        return processed_images

    def load_image(self, image_file, input_size=896, max_num=12):
        """Load an image file and convert it into a stack of tile tensors.

        Args:
            image_file: Path or file object accepted by ``Image.open``.
            input_size: Side length of each tile in pixels.
            max_num: Maximum number of grid tiles (a thumbnail is requested
                in addition).

        Returns:
            Tensor produced by ``torch.stack`` over the transformed tiles,
            one entry per tile along the leading dimension.
        """
        # The context manager closes the underlying file handle once the
        # pixel data has been copied into the converted RGB image.
        with Image.open(image_file) as raw_image:
            image = raw_image.convert('RGB')
        transform = self.build_transform(input_size=input_size)
        images = self.dynamic_preprocess(
            image, image_size=input_size, use_thumbnail=True, max_num=max_num)
        pixel_values = [transform(tile) for tile in images]
        return torch.stack(pixel_values)

    def forward_multi_image(self, image_paths: list, question: dict):
        """Run one chat turn over one or more images and print the exchange.

        Args:
            image_paths: Non-empty list of image files to load.
            question: Mapping whose ``"prompt"`` entry is the text sent to the
                model; the whole mapping is printed before the call.

        Returns:
            The response returned by the model's ``chat`` method.

        Raises:
            ValueError: If ``image_paths`` is empty.
        """
        if not image_paths:
            raise ValueError("image_paths must contain at least one image")

        resize_size = 448
        pixel_values = []
        # NOTE(review): the per-image tile counts are collected but never
        # handed to chat(); confirm whether the model's chat() expects them
        # for multi-image prompts.
        num_patches_list = []
        for image_path in image_paths:
            pixel_value = self.load_image(
                image_path, input_size=resize_size).to(torch.bfloat16).cuda()
            pixel_values.append(pixel_value)
            num_patches_list.append(pixel_value.size(0))
        pixel_values = torch.cat(tuple(pixel_values), dim=0)

        print(question)
        response, _history = self.model.chat(
            self.tokenizer, pixel_values, question["prompt"],
            self.generation_config, history=None, return_history=True)
        print(response)
        return response
def test_spatial_from_blink():
    """Run the single-image BLINK spatial-relation example through the model."""
    sample = {
        "idx": "val_Spatial_Relation_143",
        "sub_task": "Spatial Relation",
        "prompt": (
            "<image> Is the person behind the cup?\n"
            "Select from the following choices.\n"
            "(A) yes\n"
            "(B) no.\n"
            "Please answer directly with only the letter of the correct option and nothing else."
        ),
    }
    image_files = ["./examples-images/val_Spatial_Relation_143_1.jpg"]

    # Loading the model is the expensive step, so it happens after the
    # request data has been assembled.
    runner = IflyRoboInference("IflyBot/IflyBotVLM")
    runner.forward_multi_image(image_files, sample)
def test_visual_correspondence_from_blink():
    """Run the two-image BLINK visual-correspondence example through the model."""
    sample = {
        "idx": "val_Visual_Correspondence_1",
        "sub_task": "Visual Correspondence",
        "prompt": (
            "<image> <image> A point is circled on the first image, labeled with REF. "
            "We change the camera position or lighting and shoot the second image. "
            "You are given multiple red-circled points on the second image, "
            "choices of \"A, B, C, D\" are drawn beside each circle. "
            "Which point on the second image corresponds to the point in the first image? "
            "Select from the following options.\n"
            "(A) Point A\n"
            "(B) Point B\n"
            "(C) Point C\n"
            "(D) Point D.\n"
            "Please answer directly with only the letter of the correct option and nothing else."
        ),
    }
    # One file per "<image>" placeholder in the prompt, in the same order.
    image_files = [
        "./examples-images/val_Visual_Correspondence_1_1.jpg",
        "./examples-images/val_Visual_Correspondence_1_2.jpg",
    ]

    runner = IflyRoboInference("IflyBot/IflyBotVLM")
    runner.forward_multi_image(image_files, sample)
if __name__ == '__main__':
    # Run the bundled BLINK examples. test_task_plan_from_egoplan2() is not
    # defined anywhere in this script, so calling it would end the run with
    # a NameError; it is therefore not invoked here.
    test_spatial_from_blink()
    test_visual_correspondence_from_blink()

