mx262
/

MiniMonkey

Safetensors

internvl_chat

custom_code

Model card Files Files and versions

xet

Community

mx262 committed on Sep 5, 2024

Commit

59a5edb

verified ·

1 Parent(s): 0624b31

Upload internvl_chat.py

Browse files

Files changed (1) hide show

internvl_chat.py +318 -37

internvl_chat.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import torch
-from transformers import AutoTokenizer, AutoModel, CLIPImageProcessor
 import warnings
 from PIL import Image
 from .base import BaseModel
@@ -7,11 +7,13 @@ from ..smp import *
 from ..dataset import DATASET_TYPE
 import pandas as pd
 import string
 import torchvision.transforms as T
 import transformers
 from torchvision.transforms.functional import InterpolationMode
-import random
 IMAGENET_MEAN = (0.485, 0.456, 0.406)
 IMAGENET_STD = (0.229, 0.224, 0.225)
@@ -143,35 +145,94 @@ def load_image2(image_file, input_size=448, target_aspect_ratio=(1,1), min_num=1
     pixel_values = torch.stack(pixel_values)
     return pixel_values
 class InternVLChat(BaseModel):
     INSTALL_REQ = False
-    INTERLEAVE = False
-    def __init__(self, model_path='OpenGVLab/InternVL-Chat-V1-5', load_in_8bit=False, **kwargs):
         assert model_path is not None
         assert version_cmp(transformers.__version__, '4.36.2', 'ge')
         self.model_path = model_path
         self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=False)
-        device = torch.cuda.current_device()
-        self.device = device
-        self.model = AutoModel.from_pretrained(model_path, torch_dtype=torch.bfloat16,
-                                               trust_remote_code=True,
-                                               load_in_8bit=load_in_8bit).eval()
-        if not load_in_8bit:
-            self.model = self.model.to(device)
-        self.image_size = self.model.config.vision_config.image_size
-        if 'V1-1' in model_path:
-            kwargs_default = dict(do_sample=False, max_new_tokens=1024, top_p=None, num_beams=5)
         else:
-            kwargs_default = dict(do_sample=False, max_new_tokens=1024, top_p=None, num_beams=1)
-        kwargs_default.update(kwargs)
-        self.kwargs = kwargs_default
         warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
     def use_custom_prompt(self, dataset):
-        return True
     def build_multi_choice_prompt(self, line, dataset=None):
         question = line['question']
@@ -196,28 +257,41 @@ class InternVLChat(BaseModel):
         return prompt
     def build_prompt(self, line, dataset=None):
         assert self.use_custom_prompt(dataset)
         assert dataset is None or isinstance(dataset, str)
         tgt_path = self.dump_image(line, dataset)
-        if 'V1-1' in self.model_path:
             kwargs_default = dict(do_sample=False, max_new_tokens=1024, top_p=None, num_beams=5)
         else:
             kwargs_default = dict(do_sample=False, max_new_tokens=1024, top_p=None, num_beams=1)
         self.kwargs = kwargs_default
         if dataset is not None and listinstr(['MME'], dataset):
             question = line['question']
             prompt = question + ' Answer the question using a single word or phrase.'
-            if 'V1-2' not in self.model_path:
-                self.kwargs = dict(do_sample=True, max_new_tokens=5, top_k=50, num_beams=5, top_p=0.9)
         elif dataset is not None and listinstr(['HallusionBench'], dataset):
             question = line['question']
             prompt = question + ' Please answer yes or no. Answer the question using a single word or phrase.'
-        elif dataset is not None and DATASET_TYPE(dataset) == 'multi-choice':
             prompt = self.build_multi_choice_prompt(line, dataset)
         elif dataset is not None and DATASET_TYPE(dataset) == 'VQA':
-            if 'MathVista' in dataset:
                 prompt = line['question']
             elif listinstr(['LLaVABench'], dataset):
                 question = line['question']
@@ -229,14 +303,11 @@ class InternVLChat(BaseModel):
                 prompt = question + '\nAnswer the question using a single word or phrase.'
         else:
             prompt = line['question']
         message = [dict(type='text', value=prompt)]
         message.extend([dict(type='image', value=s) for s in tgt_path])
         return message
-    def generate(self, message, dataset=None):
-        prompt, image_path = self.message_to_promptimg(message)
         if dataset is not None and listinstr(['ChartQA_TEST'], dataset):
             self.max_num = 12
             self.max_num2 = 3
@@ -245,33 +316,243 @@ class InternVLChat(BaseModel):
             self.max_num2 = 15
             self.min_num = 14
             self.min_num2 = 5
-        elif dataset is not None and listinstr(['InfoVQA_VAL', 'InfoVQA_TEST'], dataset):
             self.max_num = 23
             self.max_num2 = 5
             self.min_num = 15
             self.min_num2 = 3
-        elif dataset is not None and listinstr(['OCRBench'], dataset):
             self.max_num = 24
             self.max_num2 = 8
             self.min_num = 9
             self.min_num2 = 5
         else:
             self.max_num = 8
             self.max_num2 = 4
             self.min_num = 3
             self.min_num2 = 1
-        pixel_values, target_aspect_ratio = load_image(image_path, min_num=self.min_num, max_num=self.max_num)
-        pixel_values = pixel_values.cuda().to(torch.bfloat16)
-        pixel_values2 = load_image2(image_path, target_aspect_ratio=target_aspect_ratio, min_num=self.min_num2, max_num=self.max_num2)
-        pixel_values2 = pixel_values2.cuda().to(torch.bfloat16)
-        pixel_values = torch.cat((pixel_values[:-1],  pixel_values2[:-1], pixel_values[-1:]), 0)
         with torch.no_grad():
-            response = self.model.chat(self.tokenizer, pixel_values=pixel_values, target_aspect_ratio=target_aspect_ratio,
                                        question=prompt, generation_config=self.kwargs)
-        response = response.split('[UNUSED_TOKEN_145]')[0]
         return response
     def generate_inner(self, message, dataset=None):
-        return self.generate(message, dataset)

 import torch
+from transformers import AutoTokenizer, AutoConfig, AutoModel, CLIPImageProcessor
 import warnings
 from PIL import Image
 from .base import BaseModel
 from ..dataset import DATASET_TYPE
 import pandas as pd
 import string
+import torch.distributed as dist
 import torchvision.transforms as T
 import transformers
 from torchvision.transforms.functional import InterpolationMode
+import re
 IMAGENET_MEAN = (0.485, 0.456, 0.406)
 IMAGENET_STD = (0.229, 0.224, 0.225)
     pixel_values = torch.stack(pixel_values)
     return pixel_values
+# This function is used to split InternVL2-Llama3-76B
+def split_model(model_name):
+    """Build a ``device_map`` that spreads an InternVL2 language model over several GPUs.
+
+    The visible GPUs are divided evenly between the ``world_size`` ranks reported by
+    ``get_rank_and_world_size()``; this rank uses devices ``rank``,
+    ``rank + world_size``, ``rank + 2 * world_size`` and so on. The
+    ``vision_model``, ``mlp1``, embedding, norm and output-head entries, plus the
+    last decoder layer, are pinned to device ``rank``.
+
+    Args:
+        model_name: Checkpoint name without the organisation prefix; must be a key
+            of the layer-count table below.
+
+    Returns:
+        dict mapping module names to CUDA device indices.
+
+    Raises:
+        KeyError: If ``model_name`` is not in the layer-count table.
+    """
+    import math
+    device_map = {}
+    num_gpus = torch.cuda.device_count()
+    rank, world_size = get_rank_and_world_size()
+    num_gpus = num_gpus // world_size
+    num_layers = {'InternVL2-8B': 32, 'InternVL2-26B': 48,
+                  'InternVL2-40B': 60, 'InternVL2-Llama3-76B': 80}[model_name]
+    # Since the first GPU will be used for ViT, treat it as 0.8 GPU.
+    layers_per_gpu = math.ceil(num_layers / (num_gpus - 0.2))
+    quotas = [layers_per_gpu] * num_gpus
+    quotas[0] = math.ceil(quotas[0] * 0.8)
+    layer_cnt = 0
+    for gpu_idx, quota in enumerate(quotas):
+        # The rounded-up quotas can add up to more than num_layers; stop at the
+        # last real layer instead of mapping layers that do not exist.
+        for _ in range(min(quota, num_layers - layer_cnt)):
+            device_map[f'language_model.model.layers.{layer_cnt}'] = rank + world_size * gpu_idx
+            layer_cnt += 1
+    device_map['vision_model'] = rank
+    device_map['mlp1'] = rank
+    device_map['language_model.model.tok_embeddings'] = rank
+    device_map['language_model.model.embed_tokens'] = rank
+    device_map['language_model.output'] = rank
+    device_map['language_model.model.norm'] = rank
+    device_map['language_model.lm_head'] = rank
+    # The last decoder layer is moved next to the output head on the first device.
+    device_map[f'language_model.model.layers.{num_layers - 1}'] = rank
+    return device_map
 class InternVLChat(BaseModel):
     INSTALL_REQ = False
+    INTERLEAVE = True
+    def __init__(self, model_path='OpenGVLab/InternVL-Chat-V1-5', load_in_8bit=False, version='V1.0', **kwargs):
+        """Load the tokenizer and model and record the generation settings.
+
+        Args:
+            model_path: Checkpoint identifier passed to ``from_pretrained``; must not be ``None``.
+            load_in_8bit: Forwarded to ``AutoModel.from_pretrained``; when false the
+                non-sharded model is moved to the current CUDA device.
+            version: Version tag stored on ``self.version``; the ``generate_inner`` /
+                ``chat_inner`` dispatchers branch on it.
+            **kwargs: Stored on ``self.kwargs`` as the generation config.
+        """
         assert model_path is not None
         assert version_cmp(transformers.__version__, '4.36.2', 'ge')
         self.model_path = model_path
         self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=False)
+        # Regular expression to match the pattern 'Image' followed by a number, e.g. Image1
+        self.pattern = r'Image(\d+)'
+        # Replacement pattern to insert a hyphen between 'Image' and the number, e.g. Image-1
+        self.replacement = r'Image-\1'
+        # Convert InternVL2 response to dataset format
+        # e.g. Image1 -> Image-1
+        # Regular expression to match the pattern 'Image-' followed by a number
+        self.reverse_pattern = r'Image-(\d+)'
+        # Replacement pattern to remove the hyphen (Image-1 -> Image1)
+        self.reverse_replacement = r'Image\1'
+        # Recorded for both loading paths: generate_v1_2 moves its inputs to
+        # self.device, which used to be unset when the sharded branch was taken.
+        self.device = torch.cuda.current_device()
+        if listinstr(['InternVL2-Llama3-76B'], model_path):
+            device_map = split_model(model_path.split('/')[-1])
+            self.model = AutoModel.from_pretrained(
+                model_path,
+                torch_dtype=torch.bfloat16,
+                load_in_8bit=load_in_8bit,
+                trust_remote_code=True,
+                low_cpu_mem_usage=True,
+                device_map=device_map).eval()
         else:
+            self.model = AutoModel.from_pretrained(
+                model_path,
+                torch_dtype=torch.bfloat16,
+                trust_remote_code=True,
+                load_in_8bit=load_in_8bit).eval()
+            if not load_in_8bit:
+                self.model = self.model.to(self.device)
+        self.image_size = self.model.config.vision_config.image_size
+        self.version = version
+        self.kwargs = kwargs
         warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
     def use_custom_prompt(self, dataset):
+        """Return ``False`` for MMDU datasets and ``True`` otherwise (including ``None``)."""
+        # For Multi-Turn we don't have custom prompt
+        is_multi_turn = dataset is not None and listinstr(['MMDU'], dataset)
+        return not is_multi_turn
     def build_multi_choice_prompt(self, line, dataset=None):
         question = line['question']
         return prompt
+    def build_video_prompt(self, prompt, dataset=None, max_nframe=64):
+        """Rewrite ``<image-N>`` placeholders in a video prompt.
+
+        Complete runs of eight consecutive placeholders (``<image-1>``..``<image-8>``,
+        ``<image-9>``..``<image-16>``, ...) are removed; every remaining
+        ``<image-N>`` with ``N <= max_nframe`` becomes ``FrameN``. For MMBench-Video
+        and Video-MME the ``'\\nAnswer:'`` marker is dropped and a dataset-specific
+        answer instruction is appended.
+
+        Args:
+            prompt: Prompt text containing ``<image-N>`` placeholders.
+            dataset: Dataset name, or ``None`` to skip the dataset-specific suffix.
+            max_nframe: Highest placeholder index that is rewritten.
+
+        Returns:
+            The rewritten prompt string.
+        """
+        for start in range(0, max_nframe, 8):
+            images_to_remove = ''.join([f'<image-{i}>' for i in range(start + 1, start + 9)])
+            prompt = prompt.replace(images_to_remove, '')
+        for i in range(max_nframe):
+            prompt = prompt.replace(f'<image-{i + 1}>', f'Frame{i + 1}')
+        # `dataset` defaults to None; guard it like the other listinstr call sites
+        # in this class instead of handing None to listinstr.
+        if dataset is None:
+            return prompt
+        if listinstr(['MMBench-Video'], dataset):
+            prompt = prompt.replace('\nAnswer:', '')
+            prompt += '\nAnswer the question using a single word or phrase.'
+        elif listinstr(['Video-MME'], dataset):
+            prompt = prompt.replace('\nAnswer:', '')
+            prompt += "\nAnswer with the option's letter from the given choices directly."
+        return prompt
     def build_prompt(self, line, dataset=None):
         assert self.use_custom_prompt(dataset)
         assert dataset is None or isinstance(dataset, str)
         tgt_path = self.dump_image(line, dataset)
+        if self.version == 'V1.1':
             kwargs_default = dict(do_sample=False, max_new_tokens=1024, top_p=None, num_beams=5)
         else:
             kwargs_default = dict(do_sample=False, max_new_tokens=1024, top_p=None, num_beams=1)
         self.kwargs = kwargs_default
         if dataset is not None and listinstr(['MME'], dataset):
             question = line['question']
             prompt = question + ' Answer the question using a single word or phrase.'
         elif dataset is not None and listinstr(['HallusionBench'], dataset):
             question = line['question']
             prompt = question + ' Please answer yes or no. Answer the question using a single word or phrase.'
+        elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ':
             prompt = self.build_multi_choice_prompt(line, dataset)
         elif dataset is not None and DATASET_TYPE(dataset) == 'VQA':
+            if listinstr(['MathVista', 'MathVision'], dataset):
                 prompt = line['question']
             elif listinstr(['LLaVABench'], dataset):
                 question = line['question']
                 prompt = question + '\nAnswer the question using a single word or phrase.'
         else:
             prompt = line['question']
         message = [dict(type='text', value=prompt)]
         message.extend([dict(type='image', value=s) for s in tgt_path])
         return message
+    def set_max_num(self, dataset):
         if dataset is not None and listinstr(['ChartQA_TEST'], dataset):
             self.max_num = 12
             self.max_num2 = 3
             self.max_num2 = 15
             self.min_num = 14
             self.min_num2 = 5
+        elif dataset is not None and listinstr(['InfoVQA_VAL', 'InfoVQA_TEST', 'SEEDBench_IMG'], dataset):
             self.max_num = 23
             self.max_num2 = 5
             self.min_num = 15
             self.min_num2 = 3
+        elif dataset is not None and listinstr(['OCRBench', 'POPE'], dataset):
             self.max_num = 24
             self.max_num2 = 8
             self.min_num = 9
             self.min_num2 = 5
+        elif dataset is not None and listinstr(['MME', 'HallusionBench'], dataset):
+            self.max_num = 11
+            self.max_num2 = 6
+            self.min_num = 4
+            self.min_num2 = 2
+        elif dataset is not None and listinstr(['AI2D_TEST'], dataset):
+            self.max_num = 12
+            self.max_num2 = 6
+            self.min_num = 5
+            self.min_num2 = 2
+        elif dataset is not None and listinstr(['CCBench'], dataset):
+            self.max_num = 24
+            self.max_num2 = 8
+            self.min_num = 9
+            self.min_num2 = 4
         else:
             self.max_num = 8
             self.max_num2 = 4
             self.min_num = 3
             self.min_num2 = 1
+    def generate_v1_2(self, message, dataset=None):
+        """Answer a single-image prompt via the CLIP-preprocessed path (used for V1.1 / V1.2).
+
+        The image is converted to RGB, resized to ``image_size`` x ``image_size``,
+        run through a ``CLIPImageProcessor`` and passed to ``self.model.chat``.
+        Sets ``self.INTERLEAVE`` to ``False`` on the instance as a side effect.
+
+        Args:
+            message: Message list accepted by ``self.message_to_promptimg``.
+            dataset: Optional dataset name forwarded to ``message_to_promptimg``.
+
+        Returns:
+            The value returned by ``self.model.chat``.
+        """
+        self.INTERLEAVE = False
+        prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
+        image = Image.open(image_path).convert('RGB')
+        image = image.resize((self.image_size, self.image_size))
+        # The processor depends only on model_path, so build it once and reuse it
+        # instead of re-reading its config for every sample.
+        image_processor = getattr(self, '_image_processor', None)
+        if image_processor is None:
+            image_processor = CLIPImageProcessor.from_pretrained(self.model_path)
+            self._image_processor = image_processor
+        pixel_values = image_processor(images=image, return_tensors='pt').pixel_values
+        pixel_values = pixel_values.to(torch.bfloat16).to(self.device)
         with torch.no_grad():
+            response = self.model.chat(self.tokenizer, pixel_values=pixel_values,
                                        question=prompt, generation_config=self.kwargs)
+        return response
+    def generate_v1_5(self, message, dataset=None):
+        """Answer a prompt with zero or more images via the V1.5 path.
+
+        All text items are joined with newlines into one prompt; every image is
+        tiled with ``load_image(..., max_num=self.max_num)`` and the tiles are
+        concatenated along dim 0.
+
+        Args:
+            message: List of ``{'type': 'text' | 'image', 'value': ...}`` dicts.
+            dataset: Optional dataset name; names containing ``'Video'`` get the
+                video prompt rewrite.
+
+        Returns:
+            The value returned by ``self.model.chat``.
+        """
+        def load_tiles(file_name):
+            # Other call sites in this class unpack load_image's result as
+            # (pixel_values, target_aspect_ratio); accept either return shape
+            # rather than calling .cuda() on a tuple.
+            loaded = load_image(file_name, max_num=self.max_num)
+            if isinstance(loaded, tuple):
+                loaded = loaded[0]
+            return loaded.cuda().to(torch.bfloat16)
+
+        prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
+        # Guard None like the other listinstr call sites in this class.
+        if dataset is not None and listinstr(['Video'], dataset):
+            prompt = self.build_video_prompt(prompt, dataset)
+        image_paths = [x['value'] for x in message if x['type'] == 'image']
+        if image_paths:
+            pixel_values = torch.cat([load_tiles(path) for path in image_paths], dim=0)
+        else:
+            pixel_values = None
+        with torch.no_grad():
+            response = self.model.chat(
+                self.tokenizer,
+                pixel_values=pixel_values,
+                question=prompt,
+                generation_config=self.kwargs,
+                verbose=False)
+        return response
+    def generate_v2(self, message, dataset=None):
+        """Answer a prompt with zero or more images via the V2.0 path.
+
+        With exactly one image the prompt is ``'<image>\\n'`` plus the newline-joined
+        text. Otherwise text and ``<image-k>`` markers are interleaved in message
+        order and prefixed with one ``<image-k>: <image>`` declaration per image.
+        Each image is tiled twice (``load_image`` then ``load_image2``) and the two
+        tile sets are merged before being passed to ``self.model.chat``.
+
+        Args:
+            message: List of ``{'type': 'text' | 'image', 'value': ...}`` dicts.
+            dataset: Optional dataset name; names containing ``'Video'`` get the
+                video prompt rewrite.
+
+        Returns:
+            The value returned by ``self.model.chat``.
+        """
+        def load_tiles(file_name):
+            # First pass returns its tiles plus the aspect ratio it picked; the
+            # second pass is given that ratio together with its own tile budget.
+            tiles, target_aspect_ratio = load_image(file_name, min_num=self.min_num, max_num=self.max_num)
+            tiles = tiles.cuda().to(torch.bfloat16)
+            tiles2 = load_image2(file_name, target_aspect_ratio=target_aspect_ratio,
+                                 min_num=self.min_num2, max_num=self.max_num2)
+            tiles2 = tiles2.cuda().to(torch.bfloat16)
+            # Merge order: first pass without its last tile, second pass without
+            # its last tile, then the last tile of the first pass.
+            return torch.cat((tiles[:-1], tiles2[:-1], tiles[-1:]), 0)
+
+        image_path = [x['value'] for x in message if x['type'] == 'image']
+        image_num = len(image_path)
+        if image_num == 1:
+            prompt = '<image>\n' + '\n'.join([x['value'] for x in message if x['type'] == 'text'])
+        else:
+            prompt, image_idx = '', 1
+            for x in message:
+                if x['type'] == 'text':
+                    prompt += x['value']
+                elif x['type'] == 'image':
+                    prompt += f'<image-{image_idx}>'
+                    image_idx += 1
+            prompt = ' '.join([f'<image-{i + 1}>: <image>' for i in range(image_num)]) + '\n' + prompt
+        # Guard None like the other listinstr call sites in this class.
+        if dataset is not None and listinstr(['Video'], dataset):
+            prompt = self.build_video_prompt(prompt, dataset)
+        # Each file is loaded on its own. The previous loop handed the whole path
+        # list to load_image/load_image2 and also performed an extra load whose
+        # result was immediately overwritten.
+        pixel_values_list = [load_tiles(file_name) for file_name in image_path]
+        num_patches_list = [tiles.size(0) for tiles in pixel_values_list]
+        pixel_values = torch.cat(pixel_values_list, dim=0) if pixel_values_list else None
+        with torch.no_grad():
+            response = self.model.chat(
+                self.tokenizer,
+                pixel_values=pixel_values,
+                target_aspect_ratio=(1,1),
+                num_patches_list=num_patches_list,
+                question=prompt,
+                generation_config=self.kwargs,
+                verbose=False
+            )
         return response
     def generate_inner(self, message, dataset=None):
+        """Set the tile limits for ``dataset`` and dispatch on ``self.version``.
+
+        Raises:
+            ValueError: If ``self.version`` is not one of V1.1, V1.2, V1.5 or V2.0.
+        """
+        self.set_max_num(dataset)
+        print(f'InternVL model version: {self.version}')
+        routes = (
+            (('V1.1', 'V1.2'), self.generate_v1_2),
+            (('V1.5',), self.generate_v1_5),
+            (('V2.0',), self.generate_v2),
+        )
+        for versions, handler in routes:
+            if self.version in versions:
+                return handler(message, dataset)
+        raise ValueError(f'Unsupported version: {self.version}')
+    def build_history(self, message):
+        """Turn earlier conversation turns into ``(user_prompt, assistant_prompt)`` pairs.
+
+        ``message`` must alternate user / assistant turns and have even length.
+        Text items have ``self.pattern`` rewritten to ``self.replacement``; each
+        image item contributes ``'<image>\\n'`` to the prompt and its path to the
+        returned path list.
+
+        Returns:
+            ``(history, image_path, image_cnt)``: the list of prompt pairs, the
+            image paths in order of appearance, and how many images were seen.
+        """
+        image_path = []
+
+        def render(items):
+            # Flatten one turn's content list into a single prompt string.
+            pieces = []
+            for entry in items:
+                kind = entry['type']
+                if kind == 'text':
+                    pieces.append(re.sub(self.pattern, self.replacement, entry['value']))
+                elif kind == 'image':
+                    image_path.append(entry['value'])
+                    pieces.append('<image>\n')
+            return ''.join(pieces)
+
+        # Only previous messages
+        assert len(message) % 2 == 0
+        history = []
+        for user_turn, assistant_turn in zip(message[0::2], message[1::2]):
+            assert user_turn['role'] == 'user' and assistant_turn['role'] == 'assistant'
+            user_prompt = render(user_turn['content'])
+            history.append((user_prompt, render(assistant_turn['content'])))
+        # Every image appended exactly one path, so the list length is the image count.
+        return history, image_path, len(image_path)
+    def chat_inner_v2(self, message, dataset=None):
+        """Run one multi-turn chat step and return the model's reply.
+
+        All turns but the last are converted with ``self.build_history``; the
+        last turn becomes the question. Images from the history and from the
+        current turn are tiled and concatenated in order. ``Image-N`` in the
+        reply is rewritten back to ``ImageN``.
+
+        Args:
+            message: List of ``{'role': ..., 'content': [...]}`` turns; the last
+                one is the turn to answer.
+            dataset: Optional dataset name.
+
+        Returns:
+            The reply string.
+        """
+        if len(message) > 1:
+            history, image_path, _ = self.build_history(message[:-1])
+        else:
+            history, image_path = None, []
+        current_msg = message[-1]
+        question = ''
+        for msg in current_msg['content']:
+            if msg['type'] == 'text':
+                # Fix pattern as per InternVL
+                question += re.sub(self.pattern, self.replacement, msg['value'])
+            elif msg['type'] == 'image':
+                question += '<image>\n'
+                image_path.append(msg['value'])
+        # The number of images is taken from the collected paths; the previous
+        # counter started at 1 for a single-turn message and so was off by one.
+        pixel_values_list = []
+        for image_idx, file_name in enumerate(image_path):
+            upscale_flag = image_idx == 0 and dataset is not None and listinstr(['MMMU_DEV_VAL'], dataset)
+            # NOTE(review): `upscale` is forwarded as before; confirm that this
+            # file's load_image accepts it.
+            loaded = load_image(file_name, max_num=self.max_num, upscale=upscale_flag)
+            # Other call sites unpack load_image's result as
+            # (pixel_values, target_aspect_ratio); accept either return shape.
+            if isinstance(loaded, tuple):
+                loaded = loaded[0]
+            pixel_values_list.append(loaded.cuda().to(torch.bfloat16))
+        num_patches_list = [tiles.size(0) for tiles in pixel_values_list]
+        pixel_values = torch.cat(pixel_values_list, dim=0) if pixel_values_list else None
+        with torch.no_grad():
+            response, history = self.model.chat(
+                self.tokenizer,
+                pixel_values=pixel_values,
+                # `target_aspect_ratio` was never defined in this method; use the
+                # same constant that generate_v2 passes.
+                target_aspect_ratio=(1, 1),
+                num_patches_list=num_patches_list,
+                question=question,
+                generation_config=self.kwargs,
+                history=history,
+                return_history=True
+            )
+        response = re.sub(self.reverse_pattern, self.reverse_replacement, response)
+        return response
+    def chat_inner(self, message, dataset=None):
+        """Multi-turn entry point; only version ``'V2.0'`` is supported.
+
+        Sets the tile limits for ``dataset``, resets ``self.kwargs`` to the
+        multi-turn generation defaults and delegates to ``chat_inner_v2``.
+
+        Raises:
+            ValueError: For any ``self.version`` other than ``'V2.0'``.
+        """
+        self.set_max_num(dataset)
+        if self.version == 'V2.0':
+            self.kwargs = dict(do_sample=False, max_new_tokens=512, top_p=None, num_beams=1)
+            return self.chat_inner_v2(message, dataset)
+        raise ValueError(f'Unsupported version for Multi-Turn: {self.version}')