---
license: mit
---

# IflyBotVLM

## Introduction

IflyBotVLM is an 8B-parameter open-source vision-language model (VLM) designed to serve as an embodied brain.

## Model Architecture

## Model Performance

IflyBotVLM demonstrates superior performance across various challenging benchmarks.


IflyBotVLM-8B achieves state-of-the-art (SOTA) or near-SOTA performance on ten benchmarks covering spatial comprehension, spatial perception, and temporal task planning: Where2Place, Refspatial-bench, ShareRobot-affordance, ShareRobot-trajectory, BLINK (spatial), EmbSpatial, ERQA, CVBench, SAT, and EgoPlan2.

## Quick Start

### Using 🤗 Transformers to Chat

We provide example code below for running IflyBotVLM with `transformers` (the usage follows the InternVL3-8B interface).

Please use `transformers>=4.37.2` (e.g. `pip install "transformers>=4.37.2"`) to ensure the model works correctly.

```python
import torch
import torchvision.transforms as T
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer


IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

class IflyRoboInference:
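    # Thin inference wrapper: loads the model and tokenizer once and exposes a
    # multi-image chat helper built on the tile-based preprocessing defined below.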
    def __init__(self, model_path=''):
        self.model = AutoModel.from_pretrained(
            model_path,
            torch_dtype=torch.bfloat16,
            load_in_8bit=False,
            low_cpu_mem_usage=True,
            use_flash_attn=True,
            trust_remote_code=True,
            device_map="balanced").eval()  # "auto", "balanced"
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=False)
        self.generation_config = dict(
            do_sample=True,
            temperature=0.5,
            top_p=0.0,
            top_k=1,  # restricting sampling to the single top token makes decoding effectively greedy
            max_new_tokens=16384
        )

    def build_transform(self, input_size):
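        # Per-tile transform: convert to RGB, resize to input_size, and normalize with ImageNet statistics.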
        MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
        transform = T.Compose([
            T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
            T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
            T.ToTensor(),
            T.Normalize(mean=MEAN, std=STD)
        ])
        return transform

    def find_closest_aspect_ratio(self, aspect_ratio, target_ratios, width, height, image_size):
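        # Choose the tiling grid (cols, rows) whose aspect ratio is closest to the input image;
        # on ties, prefer the larger grid when the original image has enough area to fill it.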
        best_ratio_diff = float('inf')
        best_ratio = (1, 1)
        area = width * height
        for ratio in target_ratios:
            target_aspect_ratio = ratio[0] / ratio[1]
            ratio_diff = abs(aspect_ratio - target_aspect_ratio)
            if ratio_diff < best_ratio_diff:
                best_ratio_diff = ratio_diff
                best_ratio = ratio
            elif ratio_diff == best_ratio_diff:
                if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                    best_ratio = ratio
        return best_ratio

    def dynamic_preprocess(self, image, min_num=1, max_num=12, image_size=896, use_thumbnail=False):
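        # Resize the image to the selected grid (cols x rows of image_size tiles), crop it into
        # individual tiles, and optionally append a full-image thumbnail as an extra global view.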
        orig_width, orig_height = image.size
        aspect_ratio = orig_width / orig_height

        target_ratios = set(
            (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
            i * j <= max_num and i * j >= min_num)
        target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

        target_aspect_ratio = self.find_closest_aspect_ratio(
            aspect_ratio, target_ratios, orig_width, orig_height, image_size)

        target_width = image_size * target_aspect_ratio[0]
        target_height = image_size * target_aspect_ratio[1]
        blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

        resized_img = image.resize((target_width, target_height))
        processed_images = []
        for i in range(blocks):
            box = (
                (i % (target_width // image_size)) * image_size,
                (i // (target_width // image_size)) * image_size,
                ((i % (target_width // image_size)) + 1) * image_size,
                ((i // (target_width // image_size)) + 1) * image_size
            )
            split_img = resized_img.crop(box)
            processed_images.append(split_img)
        assert len(processed_images) == blocks
        if use_thumbnail and len(processed_images) != 1:
            thumbnail_img = image.resize((image_size, image_size))
            processed_images.append(thumbnail_img)
        return processed_images

    def load_image(self, image_file, input_size=896, max_num=12):
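        # Open an image file, tile it with dynamic_preprocess, and stack the transformed tiles into one tensor.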
        image = Image.open(image_file).convert('RGB')
        transform = self.build_transform(input_size=input_size)
        images = self.dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
        pixel_values = [transform(image) for image in images]
        pixel_values = torch.stack(pixel_values)
        return pixel_values


    def forward_multi_image(self, image_paths: list, question: dict):
        # Preprocess each image into tiles, concatenate them along the batch dimension,
        # and chat with a prompt containing one <image> placeholder per input image.
        pixel_values = []
        num_patches_list = []
        resize_size = 448
        for image_path in image_paths:
            pixel_value = self.load_image(image_path, input_size=resize_size).to(torch.bfloat16).cuda()
            pixel_values.append(pixel_value)
            num_patches_list.append(pixel_value.size(0))
        pixel_values = torch.cat(tuple(pixel_values), dim=0)
        print(question)
        # num_patches_list tells chat() how many tiles belong to each <image> tag in the prompt.
        response, history = self.model.chat(
            self.tokenizer, pixel_values, question["prompt"], self.generation_config,
            num_patches_list=num_patches_list, history=None, return_history=True)
        print(response)
        return response


def test_spatial_from_blink():
    hf_path = "IflyBot/IflyBotVLM"
    ifly_robo_infer = IflyRoboInference(hf_path)
    question = {
        "idx": "val_Spatial_Relation_143",
        "sub_task" : "Spatial Relation",
        "prompt": "<image> Is the person behind the cup?\nSelect from the following choices.\n(A) yes\n(B) no.\nPlease answer directly with only the letter of the correct option and nothing else."
    }
    image_path = [
        "./examples-images/val_Spatial_Relation_143_1.jpg"
    ]
    ifly_robo_infer.forward_multi_image(image_path, question)


def test_visual_correspondence_from_blink():
    hf_path = "IflyBot/IflyBotVLM"
    ifly_robo_infer = IflyRoboInference(hf_path)
    question = {
        "idx": "val_Visual_Correspondence_1",
        "sub_task" : "Visual Correspondence",
        "prompt": "<image> <image> A point is circled on the first image, labeled with REF. We change the camera position or lighting and shoot the second image. You are given multiple red-circled points on the second image, choices of \"A, B, C, D\" are drawn beside each circle. Which point on the second image corresponds to the point in the first image? Select from the following options.\n(A) Point A\n(B) Point B\n(C) Point C\n(D) Point D.\nPlease answer directly with only the letter of the correct option and nothing else."
    }
    image_path = [
        "./examples-images/val_Visual_Correspondence_1_1.jpg",
        "./examples-images/val_Visual_Correspondence_1_2.jpg"
    ]
    ifly_robo_infer.forward_multi_image(image_path, question)


if __name__ == '__main__':
    test_spatial_from_blink()
    test_visual_correspondence_from_blink()
    # test_task_plan_from_egoplan2()  # temporal task-planning example; its definition is not included in this snippet
```
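
The image paths used in the examples are placeholders relative to the working directory; replace them with local copies of the corresponding images before running. To try the script, save it as, say, `demo.py` (a name chosen here only for illustration) and run `python demo.py`. In each prompt, one `<image>` tag corresponds to one entry in the image path list, so keep the two counts in sync when writing your own queries.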