tuandunghcmut committed on Apr 11, 2025

Commit

e7887f2

verified ·

1 Parent(s): 0b87f0f

Add files using upload-large-folder tool

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

InternVL/.github/ISSUE_TEMPLATE/1-bug-report.yml +54 -0
InternVL/.github/ISSUE_TEMPLATE/2-feature-request.yml +31 -0
InternVL/.github/ISSUE_TEMPLATE/3-documentation.yml +23 -0
InternVL/internvl_g/eval/evaluate_caption.py +237 -0
InternVL/internvl_g/internvl/dist_utils.py +101 -0
InternVL/internvl_g/internvl/model/__init__.py +0 -0
InternVL/internvl_g/internvl/model/internvl_stage2_retrieval/__init__.py +87 -0
InternVL/internvl_g/internvl/model/internvl_stage2_retrieval/modeling_intern_vit.py +342 -0
InternVL/internvl_g/internvl/model/internvl_stage2_retrieval/modeling_internvl.py +669 -0
InternVL/internvl_g/internvl/model/internvl_stage2_retrieval/modeling_qllama.py +1073 -0
InternVL/internvl_g/internvl/train/__init__.py +0 -0
InternVL/internvl_g/internvl/train/dataset.py +283 -0
InternVL/internvl_g/internvl/train/internvl_stage2_finetune.py +286 -0
InternVL/internvl_g/internvl/train/trainer_monkey_patch.py +150 -0
InternVL/internvl_g/shell/finetune/internvl_stage2_finetune_coco_364_bs1024_ep5.sh +58 -0
InternVL/internvl_g/shell/finetune/internvl_stage2_finetune_flickr_364_bs1024_ep10.sh +58 -0
InternVL/internvl_g/shell/finetune/internvl_stage2_finetune_flickrcn_364_bs1024_ep10.sh +58 -0
InternVL/internvl_g/shell/head_finetune/internvl_stage2_finetune_coco_224_bs1024_ep5_head_4gpu.sh +59 -0
InternVL/internvl_g/shell/head_finetune/internvl_stage2_finetune_flickr_224_bs1024_ep10_head_4gpu.sh +59 -0
InternVL/internvl_g/shell/head_finetune/internvl_stage2_finetune_flickrcn_224_bs1024_ep10_head_4gpu.sh +59 -0
InternVL/internvl_g/shell/lora_finetune/internvl_stage2_finetune_coco_224_bs1024_ep5_lora16_4gpu.sh +61 -0
InternVL/internvl_g/shell/lora_finetune/internvl_stage2_finetune_flickr_224_bs1024_ep10_lora16_4gpu.sh +61 -0
InternVL/internvl_g/shell/lora_finetune/internvl_stage2_finetune_flickrcn_224_bs1024_ep10_lora16_4gpu.sh +61 -0
InternVL/segmentation/configs/_base_/datasets/ade20k_504x504.py +56 -0
InternVL/segmentation/configs/_base_/datasets/ade20k_504x504_1of16.py +56 -0
InternVL/segmentation/configs/_base_/datasets/cityscapes_1024x1024.py +35 -0
InternVL/segmentation/configs/_base_/models/apcnet_r50-d8.py +44 -0
InternVL/segmentation/configs/_base_/models/bisenetv1_r18-d32.py +68 -0
InternVL/segmentation/configs/_base_/models/danet_r50-d8.py +44 -0
InternVL/segmentation/configs/_base_/models/deeplabv3plus_r50-d8.py +46 -0
InternVL/segmentation/configs/_base_/models/dmnet_r50-d8.py +44 -0
InternVL/segmentation/configs/_base_/models/encnet_r50-d8.py +48 -0
InternVL/segmentation/configs/_base_/models/erfnet_fcn.py +32 -0
InternVL/segmentation/configs/_base_/models/fastfcn_r50-d32_jpu_psp.py +53 -0
InternVL/segmentation/configs/_base_/models/fcn_hr18.py +52 -0
InternVL/segmentation/configs/_base_/models/fpn_r50.py +36 -0
InternVL/segmentation/configs/_base_/models/isanet_r50-d8.py +45 -0
InternVL/segmentation/configs/_base_/models/lraspp_m-v3-d8.py +25 -0
InternVL/segmentation/configs/_base_/models/pointrend_r50.py +56 -0
InternVL/segmentation/configs/_base_/models/pspnet_unet_s5-d16.py +50 -0
InternVL/segmentation/configs/_base_/models/upernet_r50.py +44 -0
InternVL/segmentation/configs/_base_/schedules/schedule_10k.py +9 -0
InternVL/segmentation/configs/_base_/schedules/schedule_160k.py +9 -0
InternVL/segmentation/configs/_base_/schedules/schedule_20k.py +9 -0
InternVL/segmentation/configs/_base_/schedules/schedule_320k.py +9 -0
InternVL/segmentation/configs/_base_/schedules/schedule_40k.py +9 -0
InternVL/segmentation/configs/_base_/schedules/schedule_5k.py +9 -0
InternVL/segmentation/configs/_base_/schedules/schedule_80k.py +9 -0
InternVL/segmentation/configs/intern_vit_6b/few_shot/linear_intern_vit_6b_504_10k_ade20k_bs16_lr4e-5_1of8.py +72 -0
InternVL/segmentation/configs/intern_vit_6b/few_shot/linear_intern_vit_6b_504_20k_ade20k_bs16_lr4e-5_1of4.py +72 -0

InternVL/.github/ISSUE_TEMPLATE/1-bug-report.yml ADDED Viewed

	@@ -0,0 +1,54 @@

+name: 🐞 Bug report
+description: Create a report to help us reproduce and fix the bug
+title: "[Bug] "
+labels: ['Bug']
+body:
+- type: checkboxes
+  attributes:
+    label: Checklist
+    options:
+    - label: 1. I have searched related issues but cannot get the expected help.
+    - label: 2. The bug has not been fixed in the latest version.
+    - label: 3. Please note that if the bug-related issue you submitted lacks corresponding environment info and a minimal reproducible demo, it will be challenging for us to reproduce and resolve the issue, reducing the likelihood of receiving feedback.
+- type: textarea
+  attributes:
+    label: Describe the bug
+    description: A clear and concise description of what the bug is.
+  validations:
+    required: true
+- type: textarea
+  attributes:
+    label: Reproduction
+    description: |
+      1. What command or script did you run?
+    placeholder: |
+      A placeholder for the command.
+  validations:
+    required: true
+- type: textarea
+  attributes:
+    label: Environment
+    description: |
+      1. Please run `lmdeploy check_env` to collect necessary environment information and paste it here.
+      2. You may add additional information that may be helpful for locating the problem, such as
+         - Which **model** are you using?
+         - How you installed PyTorch \[e.g., pip, conda, source\]
+         - Other environment variables that may be related (such as `$PATH`, `$LD_LIBRARY_PATH`, `$PYTHONPATH`, etc.)
+    placeholder: Environment here.
+    render: Shell
+  validations:
+    required: true
+- type: textarea
+  attributes:
+    label: Error traceback
+    description: |
+      If applicable, paste the error traceback here.
+    placeholder: Logs and traceback here.
+    render: Shell
+- type: markdown
+  attributes:
+    value: >
+     If you have already identified the reason, you can provide the information here. If you are willing to create a PR to fix it, please also leave a comment here and that would be much appreciated!
+     Thanks for your bug report. We appreciate it a lot.

InternVL/.github/ISSUE_TEMPLATE/2-feature-request.yml ADDED Viewed

	@@ -0,0 +1,31 @@

+name: 🚀 Feature request
+description: Suggest an idea for this project
+title: "[Feature] "
+body:
+- type: markdown
+  attributes:
+    value: |
+      We strongly appreciate you creating a PR to implement this feature [here](https://github.com/OpenGVLab/InternVL/pulls)!
+      If you need our help, please fill in as much of the following form as you're able to.
+      **The less clear the description, the longer it will take to solve it.**
+- type: textarea
+  attributes:
+    label: Motivation
+    description: |
+      A clear and concise description of the motivation of the feature.
+      Ex1. It is inconvenient when \[....\].
+  validations:
+    required: true
+- type: textarea
+  attributes:
+    label: Related resources
+    description: |
+      If there is an official code release or third-party implementations, please also provide the information here, which would be very helpful.
+- type: textarea
+  attributes:
+    label: Additional context
+    description: |
+      Add any other context or screenshots about the feature request here.
+      If you would like to implement the feature and create a PR, please leave a comment here and that would be much appreciated.

InternVL/.github/ISSUE_TEMPLATE/3-documentation.yml ADDED Viewed

	@@ -0,0 +1,23 @@

+name: 📚 Documentation
+description: Report an issue related to the documentation.
+labels: "kind/doc,status/unconfirmed"
+title: "[Docs] "
+body:
+- type: textarea
+  attributes:
+    label: 📚 The doc issue
+    description: >
+      A clear and concise description of the issue.
+  validations:
+    required: true
+- type: textarea
+  attributes:
+    label: Suggest a potential alternative/fix
+    description: >
+      Tell us how we could improve the documentation in this regard.
+- type: markdown
+  attributes:
+    value: >
+      Thanks for contributing 🎉!

InternVL/internvl_g/eval/evaluate_caption.py ADDED Viewed

	@@ -0,0 +1,237 @@

+import argparse
+import itertools
+import json
+import os
+import random
+import time
+from functools import partial
+import torch
+import torchvision.transforms as T
+from internvl.model.internvl_stage2 import InternVLConfig, InternVLModel
+from PIL import Image
+from pycocoevalcap.eval import COCOEvalCap
+from pycocotools.coco import COCO
+from torchvision.transforms.functional import InterpolationMode
+from tqdm import tqdm
+from transformers import LlamaTokenizer
+# Registry of the caption benchmarks this script can evaluate, keyed by the
+# names accepted on the command line. 'root' is the image directory that
+# CaptionDataset joins file names onto. 'annotation' is either one file or,
+# for 'coco', a list whose first entry is loaded by CaptionDataset and whose
+# last entry is handed to pycocotools' COCO() as the ground truth.
+ds_collections = {
+    'flickr30k': {
+        'root': 'data/flickr30k/',
+        'annotation': 'data/flickr30k/flickr30k_test_karpathy.json',
+    },
+    'coco': {
+        'root': 'data/coco/',
+        'annotation': ['data/coco/annotations/coco_karpathy_test.json',
+                       'data/coco/annotations/coco_karpathy_test_gt.json'],
+    },
+    'nocaps': {
+        'root': 'data/nocaps/images',
+        'annotation': 'data/nocaps/nocaps_val_4500_captions.json',
+    },
+}
+class CaptionDataset(torch.utils.data.Dataset):
+    def __init__(self, name, root, annotation, prompt, input_size=224):
+        if name == 'coco':
+            self.images = json.load(open(annotation))
+        else:
+            self.images = json.load(open(annotation))['images']
+        self.name = name
+        self.prompt = prompt
+        self.root = root
+        self.transform = T.Compose([
+            T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
+            T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
+            T.ToTensor(),
+            T.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
+        ])
+    def __len__(self):
+        return len(self.images)
+    def __getitem__(self, idx):
+        if self.name == 'coco':
+            filename = self.images[idx]['image']
+            image_id = int(filename.split('_')[-1].replace('.jpg', ''))
+            image_path = os.path.join(self.root, filename)
+        else:
+            image_id = self.images[idx]['id']
+            if 'file_name' in self.images[idx]:
+                image_path = os.path.join(self.root, self.images[idx]['file_name'])
+            else:
+                image_path = os.path.join(self.root, self.images[idx]['image'])
+        image = Image.open(image_path)
+        pixel_values = self.transform(image).unsqueeze(0)
+        return {
+            'image_id': image_id,
+            'input_text': self.prompt,
+            'pixel_values': pixel_values
+        }
+def collate_fn(inputs, tokenizer):
+    pixel_values = torch.cat([_['pixel_values'] for _ in inputs], dim=0)
+    image_ids = [_['image_id'] for _ in inputs]
+    input_texts = [_['input_text'] for _ in inputs]
+    input_tokens = tokenizer(input_texts, return_tensors='pt')
+    return pixel_values, image_ids, input_tokens.input_ids, input_tokens.attention_mask
+class InferenceSampler(torch.utils.data.sampler.Sampler):
+    def __init__(self, size):
+        self._size = int(size)
+        assert size > 0
+        self._rank = torch.distributed.get_rank()
+        self._world_size = torch.distributed.get_world_size()
+        self._local_indices = self._get_local_indices(size, self._world_size, self._rank)
+    @staticmethod
+    def _get_local_indices(total_size, world_size, rank):
+        shard_size = total_size // world_size
+        left = total_size % world_size
+        shard_sizes = [shard_size + int(r < left) for r in range(world_size)]
+        begin = sum(shard_sizes[:rank])
+        end = min(sum(shard_sizes[:rank + 1]), total_size)
+        return range(begin, end)
+    def __iter__(self):
+        yield from self._local_indices
+    def __len__(self):
+        return len(self._local_indices)
+def evaluate_qllama_model():
+    prompts = ['English caption:']
+    print('prompts:', prompts)
+    config = InternVLConfig.from_pretrained(args.checkpoint)
+    model = InternVLModel.from_pretrained(args.checkpoint, config=config).eval()
+    model = model.to(torch.float16).cuda()
+    tokenizer = LlamaTokenizer.from_pretrained(args.checkpoint)
+    tokenizer.add_eos_token = False
+    random.seed(args.seed)
+    summaries = []
+    for prompt in prompts:
+        for ds_name in args.datasets:
+            annotation = ds_collections[ds_name]['annotation']
+            if type(annotation) == list:
+                annotation = annotation[0]
+            if model.config.force_image_size is not None:
+                image_size = model.config.force_image_size
+            else:
+                image_size = model.config.vision_config.image_size
+            dataset = CaptionDataset(
+                name=ds_name,
+                root=ds_collections[ds_name]['root'],
+                annotation=annotation,
+                prompt=prompt,
+                input_size=image_size,
+            )
+            dataloader = torch.utils.data.DataLoader(
+                dataset=dataset,
+                sampler=InferenceSampler(len(dataset)),
+                batch_size=args.batch_size,
+                num_workers=args.num_workers,
+                pin_memory=True,
+                drop_last=False,
+                collate_fn=partial(collate_fn, tokenizer=tokenizer),
+            )
+            image_ids, captions = [], []
+            for _, (pixel_values, ids, input_ids, attention_mask) in tqdm(enumerate(dataloader)):
+                pred = model.generate(
+                    pixel_values=pixel_values.cuda().to(torch.float16),
+                    input_ids=input_ids.cuda(),
+                    attention_mask=attention_mask.cuda(),
+                    do_sample=False,
+                    num_beams=args.num_beams,
+                    max_new_tokens=30,
+                    min_new_tokens=8,
+                    use_cache=True
+                )
+                image_ids.extend(ids)
+                caption = [tokenizer.decode(_.cpu(), skip_special_tokens=True).strip() for _ in pred]
+                captions.extend(caption)
+                print(caption)
+            torch.distributed.barrier()
+            world_size = torch.distributed.get_world_size()
+            merged_ids = [None for _ in range(world_size)]
+            merged_captions = [None for _ in range(world_size)]
+            torch.distributed.all_gather_object(merged_ids, image_ids)
+            torch.distributed.all_gather_object(merged_captions, captions)
+            merged_ids = [_ for _ in itertools.chain.from_iterable(merged_ids)]
+            merged_captions = [_ for _ in itertools.chain.from_iterable(merged_captions)]
+            average_length = sum(len(x.split()) for x in merged_captions) / len(merged_captions)
+            print(f'Average length: {average_length}')
+            if torch.distributed.get_rank() == 0:
+                print(f'Evaluating {ds_name} ...')
+                results = []
+                for image_id, caption in zip(merged_ids, merged_captions):
+                    results.append({
+                        'image_id': int(image_id),
+                        'caption': caption,
+                    })
+                time_prefix = time.strftime('%y%m%d%H%M%S', time.localtime())
+                results_file = f'{ds_name}_{time_prefix}.json'
+                results_file = os.path.join(args.out_dir, results_file)
+                json.dump(results, open(results_file, 'w'))
+                annotation = ds_collections[ds_name]['annotation']
+                if type(annotation) == list:
+                    annotation = annotation[-1]
+                coco = COCO(annotation)
+                coco_result = coco.loadRes(results_file)
+                coco_eval = COCOEvalCap(coco, coco_result)
+                coco_eval.evaluate()
+                summary = coco_eval.eval.items()
+                print([ds_name, prompt, average_length, summary])
+                summaries.append([ds_name, prompt, average_length, summary])
+            torch.distributed.barrier()
+    for summary in summaries:
+        print(summary)
+# Script entry point: parse the command line, join the distributed process
+# group and run the evaluation. ``args`` is deliberately a module-level name
+# because evaluate_qllama_model() reads it as a global.
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--checkpoint', type=str, default='')
+    parser.add_argument('--datasets', type=str, default='coco,flickr30k,nocaps')
+    parser.add_argument('--batch-size', type=int, default=1)
+    parser.add_argument('--num-workers', type=int, default=1)
+    parser.add_argument('--num-beams', type=int, default=5)
+    parser.add_argument('--out-dir', type=str, default='results')
+    parser.add_argument('--seed', type=int, default=0)
+    args = parser.parse_args()
+    os.makedirs(args.out_dir, exist_ok=True)
+    # --datasets is a comma-separated list of ds_collections keys.
+    args.datasets = args.datasets.split(',')
+    print('datasets:', args.datasets)
+    # NOTE(review): presumably required because collate_fn tokenizes without
+    # padding -- confirm before lifting this restriction.
+    assert args.batch_size == 1, 'Only batch size 1 is supported'
+    # World size and rank come from the launcher's environment variables and
+    # fall back to a single-process group when they are absent.
+    torch.distributed.init_process_group(
+        backend='nccl',
+        world_size=int(os.getenv('WORLD_SIZE', '1')),
+        rank=int(os.getenv('RANK', '0')),
+    )
+    torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0)))
+    evaluate_qllama_model()

InternVL/internvl_g/internvl/dist_utils.py ADDED Viewed

	@@ -0,0 +1,101 @@

+import os
+import socket
+import subprocess
+from datetime import timedelta
+import torch
+import torch.multiprocessing as mp
+from torch import distributed as dist
+timeout = timedelta(minutes=60)
+def _find_free_port():
+    # Copied from https://github.com/facebookresearch/detectron2/blob/main/detectron2/engine/launch.py # noqa: E501
+    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    # Binding to port 0 will cause the OS to find an available port for us
+    sock.bind(('', 0))
+    port = sock.getsockname()[1]
+    sock.close()
+    # NOTE: there is still a chance the port could be taken by other processes.
+    return port
+def _is_free_port(port):
+    ips = socket.gethostbyname_ex(socket.gethostname())[-1]
+    ips.append('localhost')
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        return all(s.connect_ex((ip, port)) != 0 for ip in ips)
+def init_dist(launcher, backend='nccl', **kwargs):
+    if mp.get_start_method(allow_none=True) is None:
+        mp.set_start_method('spawn')
+    if launcher == 'pytorch':
+        _init_dist_pytorch(backend, **kwargs)
+    elif launcher == 'mpi':
+        _init_dist_mpi(backend, **kwargs)
+    elif launcher == 'slurm':
+        _init_dist_slurm(backend, **kwargs)
+    else:
+        raise ValueError(f'Invalid launcher type: {launcher}')
+def _init_dist_pytorch(backend, **kwargs):
+    # TODO: use local_rank instead of rank % num_gpus
+    rank = int(os.environ['RANK'])
+    num_gpus = torch.cuda.device_count()
+    torch.cuda.set_device(rank % num_gpus)
+    dist.init_process_group(backend=backend, **kwargs)
+def _init_dist_mpi(backend, **kwargs):
+    local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
+    torch.cuda.set_device(local_rank)
+    if 'MASTER_PORT' not in os.environ:
+        # 29500 is torch.distributed default port
+        os.environ['MASTER_PORT'] = '29500'
+    if 'MASTER_ADDR' not in os.environ:
+        raise KeyError('The environment variable MASTER_ADDR is not set')
+    os.environ['WORLD_SIZE'] = os.environ['OMPI_COMM_WORLD_SIZE']
+    os.environ['RANK'] = os.environ['OMPI_COMM_WORLD_RANK']
+    dist.init_process_group(backend=backend, **kwargs)
+def _init_dist_slurm(backend, port=None):
+    """Initialize slurm distributed training environment.
+    If argument ``port`` is not specified, then the master port will be system
+    environment variable ``MASTER_PORT``. If ``MASTER_PORT`` is not in system
+    environment variable, then a default port ``29500`` will be used.
+    Args:
+        backend (str): Backend of torch.distributed.
+        port (int, optional): Master port. Defaults to None.
+    """
+    proc_id = int(os.environ['SLURM_PROCID'])
+    ntasks = int(os.environ['SLURM_NTASKS'])
+    node_list = os.environ['SLURM_NODELIST']
+    num_gpus = torch.cuda.device_count()
+    torch.cuda.set_device(proc_id % num_gpus)
+    addr = subprocess.getoutput(
+        f'scontrol show hostname {node_list} | head -n1')
+    # specify master port
+    if port is not None:
+        os.environ['MASTER_PORT'] = str(port)
+    elif 'MASTER_PORT' in os.environ:
+        pass  # use MASTER_PORT in the environment variable
+    else:
+        # if torch.distributed default port(29500) is available
+        # then use it, else find a free port
+        if _is_free_port(29500):
+            os.environ['MASTER_PORT'] = '29500'
+        else:
+            os.environ['MASTER_PORT'] = str(_find_free_port())
+    # use MASTER_ADDR in the environment variable if it already exists
+    if 'MASTER_ADDR' not in os.environ:
+        os.environ['MASTER_ADDR'] = addr
+    os.environ['WORLD_SIZE'] = str(ntasks)
+    os.environ['LOCAL_RANK'] = str(proc_id % num_gpus)
+    os.environ['RANK'] = str(proc_id)
+    dist.init_process_group(backend=backend, timeout=timeout)

InternVL/internvl_g/internvl/model/__init__.py ADDED Viewed

File without changes

InternVL/internvl_g/internvl/model/internvl_stage2_retrieval/__init__.py ADDED Viewed

	@@ -0,0 +1,87 @@

+# --------------------------------------------------------
+# InternVL
+# Copyright (c) 2023 OpenGVLab
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+import torch
+import torch.nn as nn
+import torchvision.transforms as T
+from torchvision.transforms import InterpolationMode
+from transformers import LlamaTokenizer
+from .configuration_intern_vit import InternVisionConfig
+from .configuration_internvl import InternVLConfig
+from .modeling_intern_vit import InternVisionModel
+from .modeling_internvl import InternVL_C, InternVL_G, InternVLModel
+__all__ = ['InternVisionConfig', 'InternVisionModel', 'InternVLConfig',
+           'InternVLModel', 'InternVL_C', 'InternVL_G']
+# Prefix the text "summarize:"
+class InternVLTokenizer(nn.Module):
+    def __init__(self, model_path):
+        super(InternVLTokenizer, self).__init__()
+        self.tokenizer = LlamaTokenizer.from_pretrained(model_path)
+        self.tokenizer.pad_token = ' '  # allow padding
+        self.tokenizer.add_eos_token = True
+    def forward(self, text, prefix='summarize:'):
+        if type(text) == str:
+            text = prefix + text
+        elif type(text) == list:
+            text = [prefix + item for item in text]
+        text = self.tokenizer(text, return_tensors='pt', max_length=80, truncation=True, padding='max_length').input_ids
+        return text
+def build_transform(task, image_size=224, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]):
+    if task == 'retrieval':
+        transform = T.Compose([
+            T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
+            T.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC),
+            T.ToTensor(),
+            T.Normalize(mean=mean, std=std)])
+    else:
+        transform = T.Compose([
+            T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
+            T.Resize(image_size, interpolation=InterpolationMode.BICUBIC),
+            T.CenterCrop(image_size),
+            T.ToTensor(),
+            T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])
+    return transform
+def load_internvl_c_huggingface(ckpt_path, device, task):
+    model = InternVL_C.from_pretrained(ckpt_path, torch_dtype=torch.float16).to(device)
+    if model.config.use_backbone_lora:
+        model.vision_model.merge_and_unload()
+        model.vision_model = model.vision_model.model
+    if model.config.use_qllama_lora:
+        model.qllama.merge_and_unload()
+        model.qllama = model.qllama.model
+    if model.config.force_image_size is not None:
+        image_size = model.config.force_image_size
+    else:
+        image_size = model.config.vision_config.image_size
+    transform = build_transform(task, image_size)
+    tokenizer = InternVLTokenizer(ckpt_path)
+    return model, transform, tokenizer
+def load_internvl_g_huggingface(ckpt_path, device, task):
+    model = InternVL_G.from_pretrained(ckpt_path, torch_dtype=torch.float16).to(device)
+    if model.config.use_backbone_lora:
+        model.vision_model.merge_and_unload()
+        model.vision_model = model.vision_model.model
+    if model.config.use_qllama_lora:
+        model.qllama.merge_and_unload()
+        model.qllama = model.qllama.model
+    if model.config.force_image_size is not None:
+        image_size = model.config.force_image_size
+    else:
+        image_size = model.config.vision_config.image_size
+    transform = build_transform(task, image_size)
+    tokenizer = InternVLTokenizer(ckpt_path)
+    return model, transform, tokenizer

InternVL/internvl_g/internvl/model/internvl_stage2_retrieval/modeling_intern_vit.py ADDED Viewed

	@@ -0,0 +1,342 @@

+# --------------------------------------------------------
+# InternVL
+# Copyright (c) 2023 OpenGVLab
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+from typing import Optional, Tuple, Union
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from einops import rearrange
+from timm.models.layers import DropPath
+from torch import nn
+from transformers.activations import ACT2FN
+from transformers.modeling_outputs import (BaseModelOutput,
+                                           BaseModelOutputWithPooling)
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import logging
+from .configuration_intern_vit import InternVisionConfig
+try:
+    from .flash_attention import FlashAttention
+    has_flash_attn = True
+except:
+    print('FlashAttention is not installed.')
+    has_flash_attn = False
+logger = logging.get_logger(__name__)
+class InternRMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+# Optional acceleration: when apex imports cleanly, rebind the module-level
+# name InternRMSNorm to apex's FusedRMSNorm, so code below that instantiates
+# InternRMSNorm gets the fused implementation instead of the class above.
+try:
+    from apex.normalization import FusedRMSNorm
+    InternRMSNorm = FusedRMSNorm  # noqa
+    logger.info('Discovered apex.normalization.FusedRMSNorm - will use it instead of InternRMSNorm')
+except ImportError:
+    # using the normal InternRMSNorm
+    pass
+except Exception:
+    # apex is present but broken: keep the pure-PyTorch class and only warn.
+    logger.warning('discovered apex but it failed to load, falling back to InternRMSNorm')
+    pass
+class InternVisionEmbeddings(nn.Module):
+    def __init__(self, config: InternVisionConfig):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.image_size = config.image_size
+        self.patch_size = config.patch_size
+        self.class_embedding = nn.Parameter(
+            torch.randn(1, 1, self.embed_dim),
+        )
+        self.patch_embedding = nn.Conv2d(
+            in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size
+        )
+        self.num_patches = (self.image_size // self.patch_size) ** 2
+        self.num_positions = self.num_patches + 1
+        self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim))
+    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
+        batch_size = pixel_values.shape[0]
+        target_dtype = self.patch_embedding.weight.dtype
+        patch_embeds = self.patch_embedding(pixel_values)  # shape = [*, width, grid, grid]
+        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
+        class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype)
+        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
+        embeddings = embeddings + self.position_embedding.to(target_dtype)
+        return embeddings
+class InternAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+    # Self-attention with a fused QKV projection. Uses FlashAttention when the
+    # config requests it and the import at module top succeeded; otherwise it
+    # computes softmax attention explicitly. Optionally RMS-normalizes queries
+    # and keys over the full embedding dimension (all heads flattened).
+    def __init__(self, config: InternVisionConfig):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        # Flash attention is used only if both requested and importable.
+        self.use_flash_attn = config.use_flash_attn and has_flash_attn
+        if config.use_flash_attn and not has_flash_attn:
+            print('Warning: Flash Attention is not available, use_flash_attn is set to False.')
+        self.head_dim = self.embed_dim // self.num_heads
+        if self.head_dim * self.num_heads != self.embed_dim:
+            raise ValueError(
+                f'embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:'
+                f' {self.num_heads}).'
+            )
+        # Standard 1/sqrt(head_dim) scaling applied to the queries.
+        self.scale = self.head_dim ** -0.5
+        # One linear layer produces queries, keys and values together.
+        self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias=config.qkv_bias)
+        self.attn_drop = nn.Dropout(config.attention_dropout)
+        self.proj_drop = nn.Dropout(config.dropout)
+        self.qk_normalization = config.qk_normalization
+        if self.qk_normalization:
+            self.q_norm = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)
+            self.k_norm = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)
+        if self.use_flash_attn:
+            self.inner_attn = FlashAttention(attention_dropout=config.attention_dropout)
+        self.proj = nn.Linear(self.embed_dim, self.embed_dim)
+    def _naive_attn(self, x):
+        # Explicit softmax attention; x is unpacked as (B, N, C).
+        B, N, C = x.shape
+        # (B, N, 3*C) -> (3, B, heads, N, head_dim)
+        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+        q, k, v = qkv.unbind(0)  # make torchscript happy (cannot use tensor as tuple)
+        if self.qk_normalization:
+            B_, H_, N_, D_ = q.shape
+            # Merge the heads back to (B, N, C) so the norm sees the whole
+            # embedding, then restore the (B, heads, N, head_dim) layout.
+            q = self.q_norm(q.transpose(1, 2).flatten(-2, -1)).view(B_, N_, H_, D_).transpose(1, 2)
+            k = self.k_norm(k.transpose(1, 2).flatten(-2, -1)).view(B_, N_, H_, D_).transpose(1, 2)
+        attn = ((q * self.scale) @ k.transpose(-2, -1))
+        attn = attn.softmax(dim=-1)
+        attn = self.attn_drop(attn)
+        # Weighted sum of values, heads concatenated back into (B, N, C).
+        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+    def _flash_attn(self, x, key_padding_mask=None, need_weights=False):
+        # FlashAttention path; works on the packed (b, s, 3, h, d) layout.
+        qkv = self.qkv(x)
+        qkv = rearrange(qkv, 'b s (three h d) -> b s three h d', three=3, h=self.num_heads)
+        if self.qk_normalization:
+            q, k, v = qkv.unbind(2)
+            # Normalize over the flattened (h * d) dimension, then re-pack.
+            q = self.q_norm(q.flatten(-2, -1)).view(q.shape)
+            k = self.k_norm(k.flatten(-2, -1)).view(k.shape)
+            qkv = torch.stack([q, k, v], dim=2)
+        context, _ = self.inner_attn(
+            qkv, key_padding_mask=key_padding_mask, need_weights=need_weights, causal=False
+        )
+        outs = self.proj(rearrange(context, 'b s h d -> b s (h d)'))
+        outs = self.proj_drop(outs)
+        return outs
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        # Dispatch to whichever implementation was selected in __init__.
+        x = self._naive_attn(hidden_states) if not self.use_flash_attn else self._flash_attn(hidden_states)
+        return x
+class InternMLP(nn.Module):
+    def __init__(self, config: InternVisionConfig):
+        super().__init__()
+        self.config = config
+        self.act = ACT2FN[config.hidden_act]
+        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
+        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.fc1(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.fc2(hidden_states)
+        return hidden_states
+class InternVisionEncoderLayer(nn.Module):
+    def __init__(self, config: InternVisionConfig, drop_path_rate: float):
+        super().__init__()
+        self.embed_dim = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.attn = InternAttention(config)
+        self.mlp = InternMLP(config)
+        self.norm1 = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)
+        self.norm2 = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)
+        self.ls1 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim))
+        self.ls2 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim))
+        self.drop_path1 = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
+        self.drop_path2 = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
+    def forward(
+            self,
+            hidden_states: torch.Tensor,
+    ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor], Optional[Tuple[torch.FloatTensor]]]:
+        """
+        Args:
+            hidden_states (`Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]`): input to the layer of shape `(batch, seq_len, embed_dim)`
+        """
+        hidden_states = hidden_states + self.drop_path1(self.attn(self.norm1(hidden_states)) * self.ls1)
+        hidden_states = hidden_states + self.drop_path2(self.mlp(self.norm2(hidden_states)) * self.ls2)
+        return hidden_states
+class InternVisionEncoder(nn.Module):
+    """
+    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
+    [`InternEncoderLayer`].
+    Args:
+        config (`InternConfig`):
+            The corresponding vision configuration for the `InternEncoder`.
+    """
+    def __init__(self, config: InternVisionConfig):
+        super().__init__()
+        self.config = config
+        # stochastic depth decay rule
+        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers)]
+        self.layers = nn.ModuleList([
+            InternVisionEncoderLayer(config, dpr[idx]) for idx in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = True
+    def forward(
+            self,
+            inputs_embeds,
+            output_hidden_states: Optional[bool] = None,
+            return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutput]:
+        r"""
+        Args:
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                Embedded representation of the inputs. Should be float, not int tokens.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        """
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        encoder_states = () if output_hidden_states else None
+        hidden_states = inputs_embeds
+        for idx, encoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)
+            if self.gradient_checkpointing and self.training:
+                layer_outputs = torch.utils.checkpoint.checkpoint(
+                    encoder_layer,
+                    hidden_states)
+            else:
+                layer_outputs = encoder_layer(
+                    hidden_states,
+                )
+            hidden_states = layer_outputs
+        if output_hidden_states:
+            encoder_states = encoder_states + (hidden_states,)
+        if not return_dict:
+            return tuple(v for v in [hidden_states, encoder_states] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=encoder_states
+        )
+class InternVisionModel(PreTrainedModel):
+    main_input_name = 'pixel_values'
+    config_class = InternVisionConfig
+    def __init__(self, config: InternVisionConfig):
+        super().__init__(config)
+        self.config = config
+        self.embeddings = InternVisionEmbeddings(config)
+        self.encoder = InternVisionEncoder(config)
+    def resize_pos_embeddings(self, old_size, new_size, patch_size):
+        pos_emb = self.embeddings.position_embedding
+        _, num_positions, embed_dim = pos_emb.shape
+        cls_emb = pos_emb[:, :1, :]
+        pos_emb = pos_emb[:, 1:, :].reshape(1, old_size // patch_size, old_size // patch_size, -1).permute(0, 3, 1, 2)
+        pos_emb = F.interpolate(pos_emb.float(), size=new_size // patch_size, mode='bicubic', align_corners=False)
+        pos_emb = pos_emb.to(cls_emb.dtype).reshape(1, embed_dim, -1).permute(0, 2, 1)
+        pos_emb = torch.cat([cls_emb, pos_emb], dim=1)
+        self.embeddings.position_embedding = nn.Parameter(pos_emb)
+        logger.info('Resized position embeddings from {} to {}'.format(old_size, new_size))
+    def get_input_embeddings(self):
+        return self.embeddings
+    def forward(
+            self,
+            pixel_values: Optional[torch.FloatTensor] = None,
+            output_hidden_states: Optional[bool] = None,
+            return_dict: Optional[bool] = None,
+            pixel_embeds: Optional[torch.FloatTensor] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPooling]:
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        if pixel_values is None and pixel_embeds is None:
+            raise ValueError('You have to specify pixel_values or pixel_embeds')
+        if pixel_embeds is not None:
+            hidden_states = pixel_embeds
+        else:
+            if len(pixel_values.shape) == 4:
+                hidden_states = self.embeddings(pixel_values)
+            else:
+                raise ValueError(f'wrong pixel_values size: {pixel_values.shape}')
+        encoder_outputs = self.encoder(
+            inputs_embeds=hidden_states,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        last_hidden_state = encoder_outputs.last_hidden_state
+        pooled_output = last_hidden_state[:, 0, :]
+        if not return_dict:
+            return (last_hidden_state, pooled_output) + encoder_outputs[1:]
+        return BaseModelOutputWithPooling(
+            last_hidden_state=last_hidden_state,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )

InternVL/internvl_g/internvl/model/internvl_stage2_retrieval/modeling_internvl.py ADDED Viewed

	@@ -0,0 +1,669 @@

+# --------------------------------------------------------
+# InternVL
+# Copyright (c) 2023 OpenGVLab
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+from dataclasses import dataclass
+from functools import partial
+from typing import Any, Optional, Tuple, Union
+import numpy as np
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from peft import LoraConfig, get_peft_model
+from timm.models.layers import DropPath
+from torch import nn
+from transformers import GenerationConfig
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import ModelOutput, logging
+from .configuration_internvl import InternVLConfig
+from .modeling_intern_vit import (InternVisionEmbeddings, InternVisionEncoder,
+                                  InternVisionModel)
+from .modeling_qllama import LlamaForCausalLM, _expand_mask, _make_causal_mask
+try:
+    from .flash_attention import FlashAttention  # v1/v2
+except:
+    print('FlashAttention is not installed.')
+logger = logging.get_logger(__name__)
+class InternVLPreTrainedModel(PreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+    config_class = InternVLConfig
+    base_model_prefix = 'internvl'
+    supports_gradient_checkpointing = True
+    _keys_to_ignore_on_load_missing = [
+        r'position_ids',
+    ]
+    _no_split_modules = ['InternAttention', 'LlamaDecoderLayer', 'LlamaForCausalLM']
+    _skip_keys_device_placement = 'past_key_values'
+    _keep_in_fp32_modules = ['wo']
+    # def _init_weights(self, module):
+    #     """Initialize the weights"""
+    #     factor = self.config.initializer_range
+    #     if isinstance(module, nn.Conv2d) or isinstance(module, nn.Embedding) or isinstance(module, nn.Linear):
+    #         module.weight.data.normal_(mean=0.0, std=factor)
+    #         if hasattr(module, 'bias') and module.bias is not None:
+    #             module.bias.data.zero_()
+    #     if isinstance(module, InternVisionEmbeddings):
+    #         if hasattr(self.config, 'vision_config'):
+    #             factor = self.config.vision_config.initializer_range
+    #         nn.init.trunc_normal_(module.position_embedding, mean=0.0, std=factor)
+    #         nn.init.trunc_normal_(module.class_embedding, mean=0.0, std=factor)
+    #     elif isinstance(module, nn.LayerNorm):
+    #         module.bias.data.zero_()
+    #         module.weight.data.fill_(1.0)
+    #     elif isinstance(module, nn.Linear) and module.bias is not None:
+    #         module.bias.data.zero_()
+    def _set_gradient_checkpointing(self, module, value=False):
+        if isinstance(module, InternVisionModel):
+            module.gradient_checkpointing = value
+        if isinstance(module, InternVisionEncoder):
+            module.gradient_checkpointing = value
+class CrossAttention(nn.Module):
+    def __init__(
+            self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0.,
+            proj_drop=0., attn_head_dim=None, out_dim=None):
+        super().__init__()
+        if out_dim is None:
+            out_dim = dim
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        if attn_head_dim is not None:
+            head_dim = attn_head_dim
+        all_head_dim = head_dim * self.num_heads
+        self.scale = qk_scale or head_dim ** -0.5
+        assert all_head_dim == dim
+        self.q = nn.Linear(dim, all_head_dim, bias=False)
+        self.k = nn.Linear(dim, all_head_dim, bias=False)
+        self.v = nn.Linear(dim, all_head_dim, bias=False)
+        if qkv_bias:
+            self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
+            self.k_bias = nn.Parameter(torch.zeros(all_head_dim))
+            self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
+        else:
+            self.q_bias = None
+            self.k_bias = None
+            self.v_bias = None
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(all_head_dim, out_dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+    def forward(self, x, k=None, v=None):
+        B, N, C = x.shape
+        N_k = k.shape[1]
+        N_v = v.shape[1]
+        q_bias, k_bias, v_bias = None, None, None
+        if self.q_bias is not None:
+            q_bias = self.q_bias
+            k_bias = self.k_bias
+            v_bias = self.v_bias
+        q = F.linear(input=x, weight=self.q.weight, bias=q_bias)
+        q = q.reshape(B, N, 1, self.num_heads, -1).permute(2, 0, 3, 1, 4).squeeze(0)  # (B, N_head, N_q, dim)
+        k = F.linear(input=k, weight=self.k.weight, bias=k_bias)
+        k = k.reshape(B, N_k, 1, self.num_heads, -1).permute(2, 0, 3, 1, 4).squeeze(0)
+        v = F.linear(input=v, weight=self.v.weight, bias=v_bias)
+        v = v.reshape(B, N_v, 1, self.num_heads, -1).permute(2, 0, 3, 1, 4).squeeze(0)
+        q = q * self.scale
+        attn = (q @ k.transpose(-2, -1))  # (B, N_head, N_q, N_k)
+        attn = attn.softmax(dim=-1)
+        attn = self.attn_drop(attn)
+        x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+class AttentiveBlock(nn.Module):
+    def __init__(self, dim, num_heads, qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
+                 drop_path=0., norm_layer=nn.LayerNorm, attn_head_dim=None, out_dim=None):
+        super().__init__()
+        self.norm1_q = norm_layer(dim)
+        self.norm1_k = norm_layer(dim)
+        self.norm1_v = norm_layer(dim)
+        self.cross_attn = CrossAttention(
+            dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop,
+            proj_drop=drop, attn_head_dim=attn_head_dim, out_dim=out_dim)
+        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+    def forward(self, x_q, x_kv, pos_q, pos_k, bool_masked_pos, rel_pos_bias=None):
+        x_q = self.norm1_q(x_q + pos_q)
+        x_k = self.norm1_k(x_kv + pos_k)
+        x_v = self.norm1_v(x_kv)
+        x = self.cross_attn(x_q, k=x_k, v=x_v)
+        return x
+class AttentionPoolingBlock(AttentiveBlock):
+    def forward(self, x):
+        x_q = x.mean(1, keepdim=True)
+        x_kv, pos_q, pos_k = x, 0, 0
+        x = super().forward(x_q, x_kv, pos_q, pos_k, bool_masked_pos=None, rel_pos_bias=None)
+        x = x.squeeze(1)
+        return x
+@dataclass
+class InternVLModelOutput(ModelOutput):
+    """
+    Class defining the outputs of [`InternVLModelOutput`].
+    """
+    loss: Optional[torch.FloatTensor] = None
+    loss_itm: Optional[torch.FloatTensor] = None
+    loss_itc: Optional[torch.FloatTensor] = None
+    loss_itg: Optional[torch.FloatTensor] = None
+    def to_tuple(self) -> Tuple[Any]:
+        return tuple(
+            self[k]
+            if k not in ['loss', 'loss_itm', 'loss_itc', 'loss_itg']
+            else getattr(self, k).to_tuple()
+            for k in self.keys()
+        )
+class GatherLayer(torch.autograd.Function):
+    """Gather tensors from all process, supporting backward propagation.
+    """
+    @staticmethod
+    def forward(ctx, input):
+        ctx.save_for_backward(input)
+        output = [torch.zeros_like(input) for _ in range(dist.get_world_size())]
+        dist.all_gather(output, input)
+        return torch.stack(output, 0)
+    @staticmethod
+    def backward(ctx, grads):
+        input, = ctx.saved_tensors
+        dist.all_reduce(grads)
+        grad_out = torch.zeros_like(input)
+        grad_out[:] = grads[dist.get_rank()]
+        return grad_out
+class InternVLModel(InternVLPreTrainedModel):
+    config_class = InternVLConfig
+    main_input_name = 'pixel_values'
+    def __init__(self, config: InternVLConfig):
+        super().__init__(config)
+        text_hidden_size = config.qllama_config.hidden_size
+        vision_hidden_size = config.vision_config.hidden_size
+        clip_embed_dim = config.clip_embed_dim
+        attn_pool_num_heads = config.attn_pool_num_heads
+        config.qllama_config.num_query_token = config.num_query_token
+        self.num_query_token = config.num_query_token
+        self.label_smoothing = config.label_smoothing
+        self.vision_model = InternVisionModel(config.vision_config)  # frozen
+        self.qllama = LlamaForCausalLM(config.qllama_config)  # frozen
+        self.query_tokens = nn.Parameter(  # trainable
+            torch.zeros(1, config.num_query_token, text_hidden_size)
+        )
+        self.text_projection = nn.Parameter(torch.empty(text_hidden_size, clip_embed_dim))  # frozen
+        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))  # trainable
+        self.clip_projector = AttentionPoolingBlock(  # frozen
+            dim=vision_hidden_size, num_heads=attn_pool_num_heads, qkv_bias=True, qk_scale=None,
+            drop=0., attn_drop=0., norm_layer=partial(nn.LayerNorm, eps=1e-5), out_dim=clip_embed_dim)
+        self.clip_projector2 = AttentionPoolingBlock(  # trainable
+            dim=text_hidden_size, num_heads=attn_pool_num_heads, qkv_bias=True, qk_scale=None,
+            drop=0., attn_drop=0., norm_layer=partial(nn.LayerNorm, eps=1e-5), out_dim=clip_embed_dim)
+        self.itm_head = nn.Linear(text_hidden_size, 2)  # trainable
+        self.gradient_checkpointing = True
+        # Initialize weights and apply final processing
+        # self.post_init()
+        if config.use_backbone_lora:
+            self.wrap_backbone_lora(r=config.use_backbone_lora, lora_alpha=config.use_backbone_lora * 2)
+        if config.use_qllama_lora:
+            self.wrap_qllama_lora(r=config.use_qllama_lora, lora_alpha=config.use_qllama_lora * 2)
+        if config.force_image_size:
+            self.vision_model.resize_pos_embeddings(
+                old_size=config.vision_config.image_size,
+                new_size=config.force_image_size,
+                patch_size=config.vision_config.patch_size
+            )
+    def wrap_backbone_lora(self, r=128, lora_alpha=256, lora_dropout=0.05):
+        lora_config = LoraConfig(
+            r=r,
+            target_modules=['attn.qkv', 'attn.proj', 'mlp.fc1', 'mlp.fc2'],
+            lora_alpha=lora_alpha,
+            lora_dropout=lora_dropout,
+        )
+        self.vision_model = get_peft_model(self.vision_model, lora_config)
+        self.vision_model.print_trainable_parameters()
+    def wrap_qllama_lora(self, r=128, lora_alpha=256, lora_dropout=0.05):
+        lora_config = LoraConfig(
+            r=r,
+            target_modules=['self_attn.q_proj', 'self_attn.k_proj', 'self_attn.v_proj', 'self_attn.o_proj',
+                            'mlp.gate_proj', 'mlp.down_proj', 'mlp.up_proj'],
+            lora_alpha=lora_alpha,
+            lora_dropout=lora_dropout,
+        )
+        self.qllama = get_peft_model(self.qllama, lora_config)
+        self.qllama.print_trainable_parameters()
+    def get_input_embeddings(self):
+        return self.qllama.get_input_embeddings()
+    def set_input_embeddings(self, value):
+        self.qllama.set_input_embeddings(value)
+    def set_output_embeddings(self, new_embeddings):
+        self.qllama.set_output_embeddings(new_embeddings)
+    def get_output_embeddings(self) -> nn.Module:
+        return self.qllama.get_output_embeddings()
+    @torch.no_grad()
+    def _prepare_attention_mask(
+            self,
+            image_attention_mask: torch.LongTensor,
+            attention_mask: torch.LongTensor,
+            input_embeds: torch.FloatTensor,
+            repeat_time: int,
+    ):
+        # itm, itc
+        attention_mask = torch.cat([image_attention_mask, attention_mask], dim=1)
+        expand_mask = _expand_mask(attention_mask, input_embeds.dtype).to(
+            input_embeds.device)  # [bsz, 1, tgt_seq_len, src_seq_len]
+        itm_mask_neg, itm_mask_pos, itc_mask = torch.chunk(expand_mask, repeat_time, dim=0)
+        itc_mask[:, :, :self.num_query_token, self.num_query_token:] = torch.finfo(input_embeds.dtype).min
+        itc_mask[:, :, self.num_query_token:, :self.num_query_token] = torch.finfo(input_embeds.dtype).min
+        itc_mask_causal = _make_causal_mask(
+            (itc_mask.shape[0], itc_mask.shape[2] - self.num_query_token),
+            input_embeds.dtype,
+            device=input_embeds.device
+        )
+        # use causal mask for text in itc
+        itc_mask[:, :, self.num_query_token:, self.num_query_token:] += itc_mask_causal
+        attention_mask = torch.cat([itm_mask_neg, itm_mask_pos, itc_mask], dim=0)
+        return attention_mask
+    def forward(
+            self,
+            pixel_values: torch.FloatTensor,
+            positive_input_ids: torch.FloatTensor,
+            positive_attention_mask: torch.LongTensor,
+            negative_input_ids: torch.FloatTensor,
+            negative_attention_mask: torch.LongTensor,
+            summarize_input_ids: torch.FloatTensor,
+            summarize_attention_mask: torch.LongTensor,
+            input_ids: torch.FloatTensor,
+            attention_mask: torch.LongTensor,
+            image_ids: torch.LongTensor,
+            labels: torch.LongTensor,
+            output_attentions: Optional[bool] = None,
+            output_hidden_states: Optional[bool] = None,
+            return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, InternVLModelOutput]:
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        # step 1: forward the images through the vision encoder,
+        # to get image embeddings of shape (batch_size, seq_len, hidden_size)
+        vision_outputs = self.vision_model(
+            pixel_values=pixel_values,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict)
+        image_embeds = vision_outputs[0]
+        backbone_embeds = self.clip_projector(image_embeds)
+        # step 2: prepare input_ids and attention_mask for two sub-tasks:
+        # 1) image-text matching; 2) image-text contrastive learning.
+        batch_size = input_ids.shape[0]
+        input_ids = torch.cat([negative_input_ids, positive_input_ids,
+                               summarize_input_ids], dim=0)  # [3 * batch_size, seq_len]
+        itm_attention_mask = torch.cat(
+            [negative_attention_mask, positive_attention_mask], dim=0)
+        attention_mask = torch.cat(
+            [itm_attention_mask, summarize_attention_mask], dim=0)  # [3 * batch_size, seq_len]
+        repeat_time = input_ids.size(0) // batch_size
+        # step 3: forward the input_ids and attention_mask through the text encoder.
+        input_embeds = self.get_input_embeddings()(input_ids)
+        query_tokens = self.query_tokens.repeat(repeat_time * batch_size, 1, 1)
+        input_embeds = torch.cat([query_tokens, input_embeds], dim=1)
+        image_attention_mask = torch.ones(query_tokens.size()[:-1], dtype=torch.long, device=image_embeds.device)
+        attention_mask = self._prepare_attention_mask(
+            image_attention_mask, attention_mask, input_embeds, repeat_time
+        )
+        if type(self.qllama.model) == LlamaForCausalLM:
+            outputs = self.qllama.model.model.forward_train(
+                inputs_embeds=input_embeds,
+                vision_hidden_states=image_embeds,
+                attention_mask=attention_mask,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+                repeat_time=repeat_time,
+            ).last_hidden_state
+        else:
+            outputs = self.qllama.model.forward_train(
+                inputs_embeds=input_embeds,
+                vision_hidden_states=image_embeds,
+                attention_mask=attention_mask,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+                repeat_time=repeat_time,
+            ).last_hidden_state
+        image_embeds = outputs[:, :self.num_query_token]
+        text_embeds = outputs[:, self.num_query_token:]
+        image_itm_neg, image_itm_pos, image_itc = image_embeds.chunk(repeat_time, dim=0)
+        text_itm_neg, text_itm_pos, text_itc = text_embeds.chunk(repeat_time, dim=0)
+        image_itm = torch.cat([image_itm_neg, image_itm_pos], dim=0)
+        ###============== Image-Text Matching ===================###
+        image_itm = self.itm_head(image_itm)
+        logits = image_itm.mean(dim=1)
+        itm_labels = torch.cat([
+            torch.zeros(batch_size, dtype=torch.long, device=logits.device),
+            torch.ones(batch_size, dtype=torch.long, device=logits.device)
+        ], dim=0)
+        loss_itm = F.cross_entropy(logits, itm_labels)
+        neg_match_acc = ((logits[:batch_size].argmax(dim=-1) == 0) / batch_size).sum()
+        pos_match_acc = ((logits[batch_size:].argmax(dim=-1) == 1) / batch_size).sum()
+        ###============== Image-Text Contrastive ===================###
+        image_itc = self.clip_projector2(image_itc)
+        selected = summarize_attention_mask.sum(1) - 1
+        text_itc = text_itc[torch.arange(text_itc.shape[0]), selected]
+        text_itc = text_itc @ self.text_projection
+        # normalized features
+        backbone_embeds = backbone_embeds / backbone_embeds.norm(dim=1, keepdim=True)
+        image_itc = image_itc / image_itc.norm(dim=1, keepdim=True)
+        text_itc = text_itc / text_itc.norm(dim=1, keepdim=True)
+        backbone_embeds_all = GatherLayer.apply(backbone_embeds).flatten(0, 1)
+        image_itc_all = GatherLayer.apply(image_itc).flatten(0, 1)
+        text_itc_all = GatherLayer.apply(text_itc).flatten(0, 1)
+        # cosine similarity as logits
+        logit_scale = self.logit_scale.exp()
+        sim_i2t = logit_scale * (image_itc @ text_itc_all.t())
+        sim_t2i = logit_scale * (text_itc @ image_itc_all.t())
+        backbone_i2t = logit_scale * (backbone_embeds @ text_itc_all.t())
+        backbone_t2i = logit_scale * (text_itc @ backbone_embeds_all.t())
+        image_ids = image_ids.view(-1, 1)
+        image_ids_all = GatherLayer.apply(image_ids).flatten(0, 1)
+        pos_idx = torch.eq(image_ids, image_ids_all.t()).float()
+        sim_targets = pos_idx / pos_idx.sum(1, keepdim=True)
+        loss_t2i = -torch.sum(F.log_softmax(sim_t2i, dim=1) * sim_targets, dim=1).mean()
+        loss_i2t = -torch.sum(F.log_softmax(sim_i2t, dim=1) * sim_targets, dim=1).mean()
+        loss_backbone_t2i = -torch.sum(F.log_softmax(backbone_t2i, dim=1) * sim_targets, dim=1).mean()
+        loss_backbone_i2t = -torch.sum(F.log_softmax(backbone_i2t, dim=1) * sim_targets, dim=1).mean()
+        loss_itc = (loss_t2i + loss_i2t) / 2 + (loss_backbone_t2i + loss_backbone_i2t) / 2
+        vision_sim = F.cosine_similarity(backbone_embeds.detach(), image_itc).mean()
+        loss = loss_itm + loss_itc
+        if dist.get_rank() == 0:
+            print(f'loss: {loss.item()}, loss_itm: {loss_itm.item()}, loss_itc: {loss_itc.item()}, '
+                  f'vision_similarity: {round(vision_sim.item(), 5)}, '
+                  f'logit scale: {round(1.0 / logit_scale.item(), 5)}, '
+                  f'pos_match_acc: {round(pos_match_acc.item(), 4)}, '
+                  f'neg_match_acc: {round(neg_match_acc.item(), 4)}')
+        return InternVLModelOutput(
+            loss=loss,
+            loss_itc=loss_itc.detach(),
+            loss_itm=loss_itm.detach(),
+        )
+    @torch.no_grad()
+    def generate(
+            self,
+            pixel_values: torch.FloatTensor,
+            input_ids: torch.FloatTensor,
+            attention_mask: torch.LongTensor,
+            generation_config: Optional[GenerationConfig] = None,
+            output_hidden_states: Optional[bool] = None,
+            return_dict: Optional[bool] = None,
+            **generate_kwargs,
+    ) -> torch.LongTensor:
+        vision_outputs = self.vision_model(
+            pixel_values=pixel_values,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict)
+        image_embeds = vision_outputs[0]
+        batch_size = image_embeds.shape[0]
+        input_embeds = self.get_input_embeddings()(input_ids)
+        query_tokens = self.query_tokens.repeat(batch_size, 1, 1)
+        input_embeds = torch.cat([query_tokens, input_embeds], dim=1)
+        image_attention_mask = torch.ones(query_tokens.size()[:-1], dtype=torch.long, device=image_embeds.device)
+        attention_mask = torch.cat([image_attention_mask, attention_mask], dim=1)
+        outputs = self.qllama.generate(
+            inputs_embeds=input_embeds,
+            attention_mask=attention_mask,
+            vision_hidden_states=image_embeds,
+            generation_config=generation_config,
+            use_zero_attention_mask=True,
+            **generate_kwargs,
+        )
+        return outputs
+    def get_text_features(
+            self,
+            input_ids: torch.Tensor,
+            attention_mask: torch.Tensor,
+            output_attentions: Optional[bool] = None,
+            output_hidden_states: Optional[bool] = None,
+            return_dict: Optional[bool] = None,
+    ):
+        r"""
+        Returns:
+            text_outputs (`CausalLMOutputWithPast`, or `tuple(torch.FloatTensor)` if `return_dict=False`):
+                The language model outputs. If `return_dict=True`, the output is a [`CausalLMOutputWithPast`] that
+                contains the language model logits, the past key values and the hidden states if
+                `output_hidden_states=True`.
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        input_embeds = self.get_input_embeddings()(input_ids)
+        attention_mask = _expand_mask(attention_mask, input_embeds.dtype).to(
+            input_embeds.device)  # [bsz, 1, tgt_seq_len, src_seq_len]
+        attention_mask += _make_causal_mask(
+            (attention_mask.shape[0], attention_mask.shape[2]),
+            input_embeds.dtype,
+            device=input_embeds.device
+        )
+        if type(self.qllama.model) == LlamaForCausalLM:
+            outputs = self.qllama.model.model.forward_train(
+                inputs_embeds=input_embeds,
+                vision_hidden_states=None,
+                attention_mask=attention_mask,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            ).last_hidden_state
+        else:
+            outputs = self.qllama.model.forward_train(
+                inputs_embeds=input_embeds,
+                vision_hidden_states=None,
+                attention_mask=attention_mask,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            ).last_hidden_state
+        return outputs
+    def get_image_features(
+            self,
+            pixel_values: torch.FloatTensor,
+            output_attentions: Optional[bool] = None,
+            output_hidden_states: Optional[bool] = None,
+            return_dict: Optional[bool] = None,
+    ):
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        vision_outputs = self.vision_model(
+            pixel_values=pixel_values,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict)
+        image_embeds = vision_outputs[0]
+        backbone_embeds = image_embeds
+        batch_size = image_embeds.shape[0]
+        input_embeds = self.query_tokens.repeat(batch_size, 1, 1)
+        attention_mask = torch.ones(input_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)
+        attention_mask = _expand_mask(attention_mask, input_embeds.dtype).to(
+            input_embeds.device)  # [bsz, 1, tgt_seq_len, src_seq_len]
+        if type(self.qllama.model) == LlamaForCausalLM:
+            outputs = self.qllama.model.model.forward_train(
+                inputs_embeds=input_embeds,
+                vision_hidden_states=image_embeds,
+                attention_mask=attention_mask,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            ).last_hidden_state
+        else:
+            outputs = self.qllama.model.forward_train(
+                inputs_embeds=input_embeds,
+                vision_hidden_states=image_embeds,
+                attention_mask=attention_mask,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            ).last_hidden_state
+        return backbone_embeds, outputs
+class InternVL_C(InternVLModel):
+    def encode_image(self, image):
+        vision_outputs = self.vision_model(
+            pixel_values=image,
+            output_hidden_states=False,
+            return_dict=True)
+        image_embeds = vision_outputs[0]
+        image_embeds = self.clip_projector(image_embeds)
+        return image_embeds
+    def encode_text(self, text):
+        attention_mask = text > 0
+        text_embeds = self.get_text_features(
+            input_ids=text,
+            attention_mask=attention_mask,
+            output_attentions=False,
+            output_hidden_states=False,
+            return_dict=True,
+        )
+        text_embeds = text_embeds[torch.arange(text_embeds.shape[0]), attention_mask.sum(1) - 1]
+        text_embeds = text_embeds @ self.text_projection
+        return text_embeds
+    def forward(self, image, text):
+        image_features = self.encode_image(image)
+        text_features = self.encode_text(text)
+        # normalized features
+        image_features = image_features / image_features.norm(dim=1, keepdim=True)
+        text_features = text_features / text_features.norm(dim=1, keepdim=True)
+        # cosine similarity as logits
+        logit_scale = self.logit_scale.exp()
+        logits_per_image = logit_scale * image_features @ text_features.t()
+        logits_per_text = logits_per_image.t()
+        return logits_per_image, logits_per_text
+class InternVL_G(InternVLModel):
+    def encode_image(self, image):
+        backbone_embeds, image_embeds = self.get_image_features(
+            pixel_values=image,
+            output_hidden_states=False,
+            return_dict=True,
+        )
+        backbone_embeds = self.clip_projector(backbone_embeds)
+        image_embeds = self.clip_projector2(image_embeds)
+        # ensemble
+        backbone_embeds = backbone_embeds / backbone_embeds.norm(dim=1, keepdim=True)
+        image_embeds = image_embeds / image_embeds.norm(dim=1, keepdim=True)
+        image_embeds = image_embeds + backbone_embeds
+        return image_embeds
+    def encode_text(self, text):
+        attention_mask = text > 0
+        text_embeds = self.get_text_features(
+            input_ids=text,
+            attention_mask=attention_mask,
+            output_attentions=False,
+            output_hidden_states=False,
+            return_dict=True,
+        )
+        text_embeds = text_embeds[torch.arange(text_embeds.shape[0]), attention_mask.sum(1) - 1]
+        text_embeds = text_embeds @ self.text_projection
+        return text_embeds
+    def forward(self, image, text):
+        image_features = self.encode_image(image)
+        text_features = self.encode_text(text)
+        # normalized features
+        image_features = image_features / image_features.norm(dim=1, keepdim=True)
+        text_features = text_features / text_features.norm(dim=1, keepdim=True)
+        # cosine similarity as logits
+        logit_scale = self.logit_scale.exp()
+        logits_per_image = logit_scale * image_features @ text_features.t()
+        logits_per_text = logits_per_image.t()
+        return logits_per_image, logits_per_text

InternVL/internvl_g/internvl/model/internvl_stage2_retrieval/modeling_qllama.py ADDED Viewed

	@@ -0,0 +1,1073 @@

+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch QLLaMA model."""
+import math
+from typing import List, Optional, Tuple, Union
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import CrossEntropyLoss
+from transformers import LlamaConfig
+from transformers.activations import ACT2FN
+from transformers.modeling_outputs import (BaseModelOutputWithPast,
+                                           CausalLMOutputWithPast)
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import (add_start_docstrings,
+                                add_start_docstrings_to_model_forward, logging,
+                                replace_return_docstrings)
+# Module-level logger obtained through the transformers logging helper.
+logger = logging.get_logger(__name__)
+# Config class name as a string. Not referenced in the visible part of this file; presumably
+# consumed by the docstring decorators imported above (TODO confirm).
+_CONFIG_FOR_DOC = 'LlamaConfig'
+# Copied from transformers.models.bart.modeling_bart._make_causal_mask
+def _make_causal_mask(
+        input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
+):
+    """
+    Make causal mask used for bi-directional self-attention.
+    """
+    bsz, tgt_len = input_ids_shape
+    mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device)
+    mask_cond = torch.arange(mask.size(-1), device=device)
+    mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
+    mask = mask.to(dtype)
+    if past_key_values_length > 0:
+        mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1)
+    return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
+# Copied from transformers.models.bart.modeling_bart._expand_mask
+def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
+    """
+    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+    """
+    bsz, src_len = mask.size()
+    tgt_len = tgt_len if tgt_len is not None else src_len
+    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
+    inverted_mask = 1.0 - expanded_mask
+    return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
+class LlamaRMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        LlamaRMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+    def forward(self, hidden_states):
+        variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        # convert into half-precision if necessary
+        if self.weight.dtype in [torch.float16, torch.bfloat16]:
+            hidden_states = hidden_states.to(self.weight.dtype)
+        return self.weight * hidden_states
+# Optional fast path: when apex is importable, rebind the module-level name `LlamaRMSNorm` to
+# apex's FusedRMSNorm with `eps` pre-bound to 1e-6, so every later `LlamaRMSNorm(...)` call in
+# this module builds the fused implementation instead of the class defined above.
+# Call sites below pass `eps=config.rms_norm_eps` explicitly, which overrides the pre-bound value.
+try:
+    from functools import partial
+    from apex.normalization import FusedRMSNorm
+    LlamaRMSNorm = partial(FusedRMSNorm, eps=1e-6)  # noqa
+    print('Discovered apex.normalization.FusedRMSNorm - will use it instead of LlamaRMSNorm')
+except ImportError:
+    # using the normal LlamaRMSNorm
+    pass
+except Exception:
+    # apex was found but raised something other than ImportError while loading:
+    # report it and keep the pure-PyTorch LlamaRMSNorm
+    print('discovered apex but it failed to load, falling back to LlamaRMSNorm')
+    pass
+class LlamaRotaryEmbedding(torch.nn.Module):
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
+        super().__init__()
+        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim))
+        self.register_buffer('inv_freq', inv_freq)
+        # Build here to make `torch.jit.trace` work.
+        self.max_seq_len_cached = max_position_embeddings
+        t = torch.arange(self.max_seq_len_cached, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
+        freqs = torch.einsum('i,j->ij', t, self.inv_freq)
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.register_buffer('cos_cached', emb.cos()[None, None, :, :], persistent=False)
+        self.register_buffer('sin_cached', emb.sin()[None, None, :, :], persistent=False)
+    def forward(self, x, seq_len=None):
+        # x: [bs, num_attention_heads, seq_len, head_size]
+        # This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case.
+        if seq_len > self.max_seq_len_cached:
+            self.max_seq_len_cached = seq_len
+            t = torch.arange(self.max_seq_len_cached, device=x.device, dtype=self.inv_freq.dtype)
+            freqs = torch.einsum('i,j->ij', t, self.inv_freq)
+            # Different from paper, but it uses a different permutation in order to obtain the same calculation
+            emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
+            self.register_buffer('cos_cached', emb.cos()[None, None, :, :], persistent=False)
+            self.register_buffer('sin_cached', emb.sin()[None, None, :, :], persistent=False)
+        return (
+            self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
+            self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
+        )
+class FixedLlamaRotaryEmbedding(torch.nn.Module):
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
+        super().__init__()
+        self.dim = dim
+        self.max_position_embeddings = max_position_embeddings
+        self.base = base
+        self.inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+        # Build here to make `torch.jit.trace` work.
+        self._set_cos_sin_cache(
+            seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
+        )
+    def _set_cos_sin_cache(self, seq_len, device, dtype):
+        self.max_seq_len_cached = seq_len
+        t = torch.arange(self.max_seq_len_cached, device=self.inv_freq.device, dtype=torch.float32)
+        freqs = torch.outer(t, self.inv_freq)
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.register_buffer('cos_cached', emb.cos()[None, None, :, :], persistent=False)
+        self.register_buffer('sin_cached', emb.sin()[None, None, :, :], persistent=False)
+    def forward(self, x, seq_len=None):
+        # x: [bs, num_attention_heads, seq_len, head_size]
+        if seq_len > self.max_seq_len_cached:
+            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
+        return (
+            self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
+            self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
+        )
+LlamaRotaryEmbedding = FixedLlamaRotaryEmbedding
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2:]
+    return torch.cat((-x2, x1), dim=-1)
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
+    gather_indices = position_ids[:, None, :, None]  # [bs, 1, seq_len, 1]
+    gather_indices = gather_indices.repeat(1, cos.shape[1], 1, cos.shape[3])
+    cos = torch.gather(cos.repeat(gather_indices.shape[0], 1, 1, 1), 2, gather_indices)
+    sin = torch.gather(sin.repeat(gather_indices.shape[0], 1, 1, 1), 2, gather_indices)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+class LlamaMLP(nn.Module):
+    def __init__(
+            self,
+            hidden_size: int,
+            intermediate_size: int,
+            hidden_act: str,
+    ):
+        super().__init__()
+        self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
+        self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
+        self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
+        self.act_fn = ACT2FN[hidden_act]
+    def forward(self, x):
+        return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+class LlamaAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+    def __init__(self, config: LlamaConfig):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.hidden_size // self.num_heads
+        self.max_position_embeddings = config.max_position_embeddings
+        if (self.head_dim * self.num_heads) != self.hidden_size:
+            raise ValueError(
+                f'hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}'
+                f' and `num_heads`: {self.num_heads}).'
+            )
+        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
+        self.k_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
+        self.v_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
+        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
+        self.rotary_emb = LlamaRotaryEmbedding(self.head_dim, max_position_embeddings=self.max_position_embeddings)
+    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+    def forward(
+            self,
+            hidden_states: torch.Tensor,
+            attention_mask: Optional[torch.Tensor] = None,
+            position_ids: Optional[torch.LongTensor] = None,
+            past_key_value: Optional[Tuple[torch.Tensor]] = None,
+            output_attentions: bool = False,
+            use_cache: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        bsz, q_len, _ = hidden_states.size()
+        query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None:
+            kv_seq_len += past_key_value[0].shape[-2]
+        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+        # [bsz, nh, t, hd]
+        if past_key_value is not None:
+            # reuse k, v, self_attention
+            key_states = torch.cat([past_key_value[0], key_states], dim=2)
+            value_states = torch.cat([past_key_value[1], value_states], dim=2)
+        past_key_value = (key_states, value_states) if use_cache else None
+        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+            raise ValueError(
+                f'Attention weights should be of size {(bsz * self.num_heads, q_len, kv_seq_len)}, but is'
+                f' {attn_weights.size()}'
+            )
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+                raise ValueError(
+                    f'Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}'
+                )
+            attn_weights = attn_weights + attention_mask
+            attn_weights = torch.max(attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min))
+        # upcast attention to fp32
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+        attn_output = torch.matmul(attn_weights, value_states)
+        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+            raise ValueError(
+                f'`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is'
+                f' {attn_output.size()}'
+            )
+        attn_output = attn_output.transpose(1, 2)
+        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+        attn_output = self.o_proj(attn_output)
+        if not output_attentions:
+            attn_weights = None
+        return attn_output, attn_weights, past_key_value
+class LlamaCrossAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+    def __init__(self, config: LlamaConfig):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.hidden_size // self.num_heads
+        self.max_position_embeddings = config.max_position_embeddings
+        self.vision_hidden_size = 3200
+        if (self.head_dim * self.num_heads) != self.hidden_size:
+            raise ValueError(
+                f'hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}'
+                f' and `num_heads`: {self.num_heads}).'
+            )
+        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
+        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
+        self.norm1 = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.k_proj = nn.Linear(self.vision_hidden_size, self.num_heads * self.head_dim, bias=False)
+        self.v_proj = nn.Linear(self.vision_hidden_size, self.num_heads * self.head_dim, bias=False)
+        self.norm2 = LlamaRMSNorm(self.vision_hidden_size, eps=config.rms_norm_eps)
+    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+    def forward(
+            self,
+            hidden_states: torch.Tensor,
+            vision_hidden_states: torch.Tensor,
+            repeat_time: int = 1,
+            attention_mask: Optional[torch.Tensor] = None,
+            past_key_value: Optional[Tuple[torch.Tensor]] = None,
+            output_attentions: bool = False,
+            use_cache: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        hidden_states = self.norm1(hidden_states)
+        bsz, q_len, _ = hidden_states.size()
+        query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        vision_hidden_states = self.norm2(vision_hidden_states)
+        bs_v, kv_len, _ = vision_hidden_states.size()
+        key_states = self.k_proj(vision_hidden_states).view(
+            bs_v, kv_len, self.num_heads, self.head_dim).transpose(1, 2)
+        value_states = self.v_proj(vision_hidden_states).view(
+            bs_v, kv_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.repeat(repeat_time, 1, 1, 1)
+        value_states = value_states.repeat(repeat_time, 1, 1, 1)
+        kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None:
+            kv_seq_len += past_key_value[0].shape[-2]
+        if past_key_value is not None:
+            # reuse k, v, self_attention
+            key_states = torch.cat([past_key_value[0], key_states], dim=2)
+            value_states = torch.cat([past_key_value[1], value_states], dim=2)
+        past_key_value = (key_states, value_states) if use_cache else None
+        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+            raise ValueError(
+                f'Attention weights should be of size {(bsz * self.num_heads, q_len, kv_seq_len)}, but is'
+                f' {attn_weights.size()}'
+            )
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+                raise ValueError(
+                    f'Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}'
+                )
+            attn_weights = attn_weights + attention_mask
+            attn_weights = torch.max(attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min))
+        # upcast attention to fp32
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+        attn_output = torch.matmul(attn_weights, value_states)
+        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+            raise ValueError(
+                f'`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is'
+                f' {attn_output.size()}'
+            )
+        attn_output = attn_output.transpose(1, 2)
+        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+        attn_output = self.o_proj(attn_output)
+        if not output_attentions:
+            attn_weights = None
+        return attn_output, attn_weights, past_key_value
+class LlamaDecoderLayer(nn.Module):
+    def __init__(self, config: LlamaConfig, use_cross_attn: bool):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = LlamaAttention(config=config)
+        self.cross_attn = LlamaCrossAttention(config=config) if use_cross_attn else None
+        self.mlp = LlamaMLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+        )
+        self.num_query_token = 96
+        self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+    def forward(
+            self,
+            hidden_states: torch.Tensor,
+            vision_hidden_states: torch.Tensor,
+            attention_mask: Optional[torch.Tensor] = None,
+            position_ids: Optional[torch.LongTensor] = None,
+            past_key_value: Optional[Tuple[torch.Tensor]] = None,
+            output_attentions: Optional[bool] = False,
+            use_cache: Optional[bool] = False,
+            repeat_time: int = 1,
+    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                (see `past_key_values`).
+            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
+        """
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        # Self Attention
+        hidden_states, self_attn_weights, present_key_value = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_value=past_key_value,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+        )
+        hidden_states = residual + hidden_states
+        # when using generate function and cache mode, the size of hidden_states is 1,
+        # so we should not use cross attention
+        if self.cross_attn is not None and hidden_states.size(1) >= self.num_query_token \
+                and vision_hidden_states is not None:
+            query_feats = hidden_states[:, :self.num_query_token, :]
+            text_feats = hidden_states[:, self.num_query_token:, :]
+            residual = query_feats
+            query_feats, _, _ = self.cross_attn(
+                hidden_states=query_feats,
+                vision_hidden_states=vision_hidden_states,
+                attention_mask=None,  # not use attention mask in cross attention
+                past_key_value=past_key_value,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+                repeat_time=repeat_time,
+            )
+            query_feats = residual + query_feats
+            hidden_states = torch.cat([query_feats, text_feats], dim=1)
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        outputs = (hidden_states,)
+        if output_attentions:
+            outputs += (self_attn_weights,)
+        if use_cache:
+            outputs += (present_key_value,)
+        return outputs
+# Shared docstring preamble handed to `add_start_docstrings` on the model classes below.
+LLAMA_START_DOCSTRING = r"""
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+    Parameters:
+        config ([`LlamaConfig`]):
+            Model configuration class with all the parameters of the model. Initializing with a config file does not
+            load the weights associated with the model, only the configuration. Check out the
+            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+@add_start_docstrings(
+    'The bare LLaMA Model outputting raw hidden-states without any specific head on top.',
+    LLAMA_START_DOCSTRING,
+)
+class LlamaPreTrainedModel(PreTrainedModel):
+    config_class = LlamaConfig
+    base_model_prefix = 'model'
+    supports_gradient_checkpointing = True
+    _no_split_modules = ['LlamaDecoderLayer']
+    _keys_to_ignore_on_load_unexpected = [r'decoder\.version']
+    def _init_weights(self, module):
+        std = self.config.initializer_range
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+    def _set_gradient_checkpointing(self, module, value=False):
+        if isinstance(module, LlamaModel):
+            module.gradient_checkpointing = value
+        if isinstance(module, LlamaDecoderLayer):
+            module.gradient_checkpointing = value
+# Argument documentation template for the model forward methods. Not referenced in the visible
+# part of this file; presumably attached via `add_start_docstrings_to_model_forward` (TODO confirm).
+# NOTE(review): the two "head is masked / not masked" bullets at the end of the `attention_mask`
+# entry look like leftovers from a head-mask argument that this template does not list.
+LLAMA_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+            it.
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+            [What are attention masks?](../glossary#attention-mask)
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+            `past_key_values`).
+            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
+            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
+            information on the default strategy.
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.n_positions - 1]`.
+            [What are position IDs?](../glossary#position-ids)
+        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
+            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+@add_start_docstrings(
+    'The bare LLaMA Model outputting raw hidden-states without any specific head on top.',
+    LLAMA_START_DOCSTRING,
+)
+class LlamaModel(LlamaPreTrainedModel):
+    """
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`];
+    every `config.cross_attention_frequency`-th layer (starting with layer 0) is constructed with the
+    `use_cross_attn` flag set, and every layer is handed `vision_hidden_states` in the forward pass.
+    Args:
+        config: LlamaConfig
+    """
+    def __init__(self, config: LlamaConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.cross_attention_frequency = config.cross_attention_frequency
+        self.num_query_token = config.num_query_token
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        # Layer `idx` gets the cross-attention flag iff idx is a multiple of `cross_attention_frequency`.
+        use_cross_attn = [idx % self.cross_attention_frequency == 0 for idx in range(config.num_hidden_layers)]
+        self.layers = nn.ModuleList(
+            [LlamaDecoderLayer(config, use_cross_attn[idx]) for idx in range(config.num_hidden_layers)])
+        self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        # NOTE: `post_init()` is deliberately left disabled here, so this constructor does not run it.
+        # self.post_init()
+    def get_input_embeddings(self):
+        """Return the token embedding module."""
+        return self.embed_tokens
+    def set_input_embeddings(self, value):
+        """Replace the token embedding module with `value`."""
+        self.embed_tokens = value
+    # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask
+    def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
+        """Combine the causal mask with the expanded padding mask.
+        A causal mask is only built when the target length (`input_shape[-1]`) is greater than 1. When
+        `attention_mask` is given it is expanded and added to the causal mask (or used alone if no causal
+        mask was built). Returns `None` when neither mask exists.
+        """
+        # create causal mask
+        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+        combined_attention_mask = None
+        if input_shape[-1] > 1:
+            combined_attention_mask = _make_causal_mask(
+                input_shape,
+                inputs_embeds.dtype,
+                device=inputs_embeds.device,
+                past_key_values_length=past_key_values_length,
+            )
+        if attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(
+                inputs_embeds.device
+            )
+            combined_attention_mask = (
+                expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
+            )
+        return combined_attention_mask
+    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
+    def forward(
+            self,
+            input_ids: torch.LongTensor = None,
+            attention_mask: Optional[torch.Tensor] = None,
+            position_ids: Optional[torch.LongTensor] = None,
+            past_key_values: Optional[List[torch.FloatTensor]] = None,
+            inputs_embeds: Optional[torch.FloatTensor] = None,
+            vision_hidden_states: Optional[torch.FloatTensor] = None,
+            repeat_time: Optional[int] = 1,
+            use_cache: Optional[bool] = None,
+            output_attentions: Optional[bool] = None,
+            output_hidden_states: Optional[bool] = None,
+            use_zero_attention_mask: Optional[bool] = None,
+            return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPast]:
+        # Run the decoder stack with an internally built causal + padding attention mask.
+        # Exactly one of `input_ids` / `inputs_embeds` must be given; `vision_hidden_states` and
+        # `repeat_time` are forwarded unchanged to every decoder layer. Unlike `forward_train`, this
+        # method never calls `torch.utils.checkpoint`.
+        # Fall back to the config for every flag the caller left as None.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        # retrieve input_ids and inputs_embeds
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError('You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time')
+        elif input_ids is not None:
+            batch_size, seq_length = input_ids.shape
+        elif inputs_embeds is not None:
+            batch_size, seq_length, _ = inputs_embeds.shape
+        else:
+            raise ValueError('You have to specify either decoder_input_ids or decoder_inputs_embeds')
+        seq_length_with_past = seq_length
+        past_key_values_length = 0
+        if past_key_values is not None:
+            # Cached length is read from dim 2 of the first tensor of the first layer's cache.
+            past_key_values_length = past_key_values[0][0].shape[2]
+            seq_length_with_past = seq_length_with_past + past_key_values_length
+        if position_ids is None:
+            # Default positions continue right after the cached tokens.
+            device = input_ids.device if input_ids is not None else inputs_embeds.device
+            position_ids = torch.arange(
+                past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
+            )
+            position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
+        else:
+            position_ids = position_ids.view(-1, seq_length).long()
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+        # build the attention mask: default to "attend to everything" (cached + current tokens)
+        if attention_mask is None:
+            attention_mask = torch.ones(
+                (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
+            )
+        attention_mask = self._prepare_decoder_attention_mask(
+            attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
+        )
+        if use_zero_attention_mask:
+            # Zero the additive mask inside the leading num_query_token x num_query_token block
+            # (in-place on the mask built just above).
+            attention_mask[:, :, :self.num_query_token, :self.num_query_token] = 0
+        hidden_states = inputs_embeds
+        if self.gradient_checkpointing and self.training:
+            # Only the cache flag is adjusted here; this method performs no checkpointed call.
+            if use_cache:
+                logger.warning_once(
+                    '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...'
+                )
+                use_cache = False
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        next_decoder_cache = () if use_cache else None
+        for idx, decoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                # Record the input of each layer; the final normalized output is appended after the loop.
+                all_hidden_states += (hidden_states,)
+            past_key_value = past_key_values[idx] if past_key_values is not None else None
+            layer_outputs = decoder_layer(
+                hidden_states,
+                vision_hidden_states,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                past_key_value=past_key_value,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+                repeat_time=repeat_time,
+            )
+            hidden_states = layer_outputs[0]
+            if use_cache:
+                # The cache entry sits after the attention weights when those are returned.
+                next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+        hidden_states = self.norm(hidden_states)
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+        next_cache = next_decoder_cache if use_cache else None
+        if not return_dict:
+            # Tuple output drops every entry that is None.
+            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
+    def forward_train(
+            self,
+            input_ids: torch.LongTensor = None,
+            attention_mask: Optional[torch.Tensor] = None,
+            position_ids: Optional[torch.LongTensor] = None,
+            past_key_values: Optional[List[torch.FloatTensor]] = None,
+            inputs_embeds: Optional[torch.FloatTensor] = None,
+            vision_hidden_states: Optional[torch.FloatTensor] = None,
+            repeat_time: Optional[int] = 1,
+            use_cache: Optional[bool] = None,
+            output_attentions: Optional[bool] = None,
+            output_hidden_states: Optional[bool] = None,
+            return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPast]:
+        # Training variant of `forward`. Differences visible in this method:
+        #   * `attention_mask` is passed to the layers exactly as received (the mask construction is
+        #     commented out below), so the caller presumably supplies an already prepared mask -- TODO confirm;
+        #   * there is no `use_zero_attention_mask` argument;
+        #   * layers run through `torch.utils.checkpoint` when gradient checkpointing is on and training.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        # retrieve input_ids and inputs_embeds
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError('You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time')
+        elif input_ids is not None:
+            batch_size, seq_length = input_ids.shape
+        elif inputs_embeds is not None:
+            batch_size, seq_length, _ = inputs_embeds.shape
+        else:
+            raise ValueError('You have to specify either decoder_input_ids or decoder_inputs_embeds')
+        # NOTE: `seq_length_with_past` is only referenced by the commented-out mask code below.
+        seq_length_with_past = seq_length
+        past_key_values_length = 0
+        if past_key_values is not None:
+            past_key_values_length = past_key_values[0][0].shape[2]
+            seq_length_with_past = seq_length_with_past + past_key_values_length
+        if position_ids is None:
+            device = input_ids.device if input_ids is not None else inputs_embeds.device
+            position_ids = torch.arange(
+                past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
+            )
+            position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
+        else:
+            position_ids = position_ids.view(-1, seq_length).long()
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+        # Mask construction intentionally disabled in the training path (kept for reference):
+        # if attention_mask is None:
+        #     attention_mask = torch.ones(
+        #         (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
+        #     )
+        # attention_mask = self._prepare_decoder_attention_mask(
+        #     attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
+        # )
+        hidden_states = inputs_embeds
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...'
+                )
+                use_cache = False
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        next_decoder_cache = () if use_cache else None
+        for idx, decoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            past_key_value = past_key_values[idx] if past_key_values is not None else None
+            if self.gradient_checkpointing and self.training:
+                # Wrap the layer so the non-tensor arguments are bound by closure; arguments are passed
+                # positionally, so their order must match `LlamaDecoderLayer.forward` (defined elsewhere --
+                # TODO confirm when changing either side).
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        # None for past_key_value
+                        return module(*inputs, output_attentions, None, repeat_time)
+                    return custom_forward
+                # The trailing None is the fifth positional input (no cached key/values while checkpointing).
+                layer_outputs = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(decoder_layer),
+                    hidden_states,
+                    vision_hidden_states,
+                    attention_mask,
+                    position_ids,
+                    None,
+                )
+            else:
+                layer_outputs = decoder_layer(
+                    hidden_states,
+                    vision_hidden_states,
+                    attention_mask=attention_mask,
+                    position_ids=position_ids,
+                    past_key_value=past_key_value,
+                    output_attentions=output_attentions,
+                    use_cache=use_cache,
+                    repeat_time=repeat_time,
+                )
+            hidden_states = layer_outputs[0]
+            if use_cache:
+                next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+        hidden_states = self.norm(hidden_states)
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+        next_cache = next_decoder_cache if use_cache else None
+        if not return_dict:
+            # Tuple output drops every entry that is None.
+            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+class LlamaForCausalLM(LlamaPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = LlamaModel(config)
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        # Initialize weights and apply final processing
+        # self.post_init()
+    def get_input_embeddings(self):
+        return self.model.embed_tokens
+    def set_input_embeddings(self, value):
+        self.model.embed_tokens = value
+    def get_output_embeddings(self):
+        return self.lm_head
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head = new_embeddings
+    def set_decoder(self, decoder):
+        self.model = decoder
+    def get_decoder(self):
+        return self.model
+    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
+    def forward(
+            self,
+            input_ids: torch.LongTensor = None,
+            attention_mask: Optional[torch.Tensor] = None,
+            position_ids: Optional[torch.LongTensor] = None,
+            past_key_values: Optional[List[torch.FloatTensor]] = None,
+            inputs_embeds: Optional[torch.FloatTensor] = None,
+            vision_hidden_states: Optional[torch.FloatTensor] = None,
+            labels: Optional[torch.LongTensor] = None,
+            use_cache: Optional[bool] = None,
+            output_attentions: Optional[bool] = None,
+            output_hidden_states: Optional[bool] = None,
+            use_zero_attention_mask: Optional[bool] = None,
+            return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        r"""
+        Args:
+            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+        Returns:
+        Example:
+        ```python
+        >>> from transformers import AutoTokenizer, LlamaForCausalLM
+        >>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
+        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
+        >>> prompt = "Hey, are you consciours? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you."
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            vision_hidden_states=vision_hidden_states,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            use_zero_attention_mask=use_zero_attention_mask,
+        )
+        hidden_states = outputs[0]
+        logits = self.lm_head(hidden_states)
+        loss = None
+        if labels is not None:
+            # Shift so that tokens < n predict n
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = CrossEntropyLoss()
+            shift_logits = shift_logits.view(-1, self.config.vocab_size)
+            shift_labels = shift_labels.view(-1)
+            # Enable model parallelism
+            shift_labels = shift_labels.to(shift_logits.device)
+            loss = loss_fct(shift_logits, shift_labels)
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+    def prepare_inputs_for_generation(
+            self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None,
+            vision_hidden_states=None, use_zero_attention_mask=None, **kwargs
+    ):
+        if past_key_values:
+            input_ids = input_ids[:, -1:]
+        position_ids = kwargs.get('position_ids', None)
+        if attention_mask is not None and position_ids is None:
+            # create position_ids on the fly for batch generation
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            if past_key_values:
+                position_ids = position_ids[:, -1].unsqueeze(-1)
+        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if inputs_embeds is not None and past_key_values is None:
+            model_inputs = {'inputs_embeds': inputs_embeds}
+        else:
+            model_inputs = {'input_ids': input_ids}
+        model_inputs.update(
+            {
+                'position_ids': position_ids,
+                'past_key_values': past_key_values,
+                'use_cache': kwargs.get('use_cache'),
+                'attention_mask': attention_mask,
+                'vision_hidden_states': vision_hidden_states,
+                'use_zero_attention_mask': use_zero_attention_mask,
+            }
+        )
+        return model_inputs
+    @staticmethod
+    def _reorder_cache(past_key_values, beam_idx):
+        reordered_past = ()
+        for layer_past in past_key_values:
+            reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
+        return reordered_past

InternVL/internvl_g/internvl/train/__init__.py ADDED Viewed

File without changes

InternVL/internvl_g/internvl/train/dataset.py ADDED Viewed

	@@ -0,0 +1,283 @@

+import json
+import random
+import re
+from typing import Dict
+import torch
+import torchvision.transforms as T
+from PIL import Image
+from torch.utils.data import Dataset
+from torchvision.transforms.functional import InterpolationMode
+def build_transform(input_size):
+    # match fine-tune setting with blip2
+    # https://github.com/salesforce/LAVIS/blob/main/lavis/processors/blip_processors.py
+    transform = T.Compose([
+        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
+        T.RandomResizedCrop(input_size, scale=(0.5, 1.0),
+                            interpolation=InterpolationMode.BICUBIC),
+        T.RandomHorizontalFlip(),
+        T.ToTensor(),
+        T.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
+    ])
+    return transform
+class FlickrDataset(Dataset):
+    """Dataset for supervised fine-tuning."""
+    def __init__(self, metas, tokenizer, data_args):
+        super(FlickrDataset, self).__init__()
+        f = open(metas['annotation'])
+        lines = f.readlines()[1:]
+        self.data_args = data_args
+        self.tokenizer = tokenizer
+        self.images = []
+        self.image_ids = []
+        self.captions = []
+        for line in lines:
+            image, caption = line.strip().split('.jpg,')
+            image_id = int(image)
+            caption = self.process_single_caption(caption)
+            image = image + '.jpg'
+            image_path = metas['root'] + '/' + image
+            self.images.append(image_path)
+            self.image_ids.append(image_id)
+            self.captions.append(caption)
+        print(f'There are {len(self.images)} images.')
+        print(f'There are {len(self.captions)} captions.')
+    def __len__(self):
+        return len(self.images)
+    def process_single_caption(self, caption, max_words=50):
+        caption = re.sub(r"([.!\"()*#:;~])", ' ', caption.lower())
+        caption = re.sub(r'\s{2,}', ' ', caption)
+        caption = caption.rstrip('\n')
+        caption = caption.strip(' ')
+        # truncate caption
+        caption_words = caption.split(' ')
+        if len(caption_words) > max_words:
+            caption = ' '.join(caption_words[: max_words])
+        return caption
+    def preprocess(self, image, caption, neg_caption):
+        model_inputs = dict()
+        # input image
+        image_transform = build_transform(input_size=self.data_args.force_image_size)
+        image = Image.open(image)
+        image = image.convert('RGB')
+        pixel_values = image_transform(image)
+        model_inputs['pixel_values'] = pixel_values
+        # for image-text matching
+        pos_model_inputs = self.tokenizer(
+            caption,
+            max_length=self.data_args.max_seq_length,
+            padding='max_length' if self.data_args.pad_to_max_length else False,
+            truncation=True,
+            return_tensors='pt',
+        )
+        model_inputs['positive_input_ids'] = pos_model_inputs['input_ids']
+        model_inputs['positive_attention_mask'] = pos_model_inputs['attention_mask']
+        neg_model_inputs = self.tokenizer(
+            neg_caption,
+            max_length=self.data_args.max_seq_length,
+            padding='max_length' if self.data_args.pad_to_max_length else False,
+            truncation=True,
+            return_tensors='pt',
+        )
+        model_inputs['negative_input_ids'] = neg_model_inputs['input_ids']
+        model_inputs['negative_attention_mask'] = neg_model_inputs['attention_mask']
+        # for image-text contrastive learning
+        summarize_model_inputs = self.tokenizer(
+            'summarize:' + caption,
+            max_length=self.data_args.max_seq_length,
+            padding='max_length' if self.data_args.pad_to_max_length else False,
+            truncation=True,
+            return_tensors='pt',
+        )
+        model_inputs['summarize_input_ids'] = summarize_model_inputs['input_ids']
+        model_inputs['summarize_attention_mask'] = summarize_model_inputs['attention_mask']
+        # for image-grounded text generation
+        prefix = f'English caption:'
+        content = caption
+        tokenized_prefix = self.tokenizer(
+            prefix, padding=False, truncation=True, return_tensors='pt',
+        )
+        prefix_input_ids = tokenized_prefix['input_ids'][:, :-1]  # remove eos
+        prefix_attention_mask = tokenized_prefix['attention_mask'][:, :-1]  # remove eos
+        tokenized_content = self.tokenizer(
+            content,
+            max_length=self.data_args.max_seq_length - prefix_input_ids.size(1) + 1,
+            padding='max_length' if self.data_args.pad_to_max_length else False,
+            truncation=True,
+            return_tensors='pt',
+        )
+        content_input_ids = tokenized_content['input_ids'][:, 1:]  # remove bos
+        content_attention_mask = tokenized_content['attention_mask'][:, 1:]  # remove bos
+        model_inputs['input_ids'] = torch.cat([prefix_input_ids, content_input_ids], dim=1)
+        model_inputs['attention_mask'] = torch.cat([prefix_attention_mask, content_attention_mask], dim=1)
+        labels = model_inputs['input_ids'].clone()
+        labels[labels == self.tokenizer.pad_token_id] = -100
+        labels[:, :prefix_input_ids.size(1) - 1] = -100
+        model_inputs['labels'] = labels
+        return model_inputs
+    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
+        i = i % len(self.images)
+        j = random.randint(0, len(self.images) - 1)
+        while self.image_ids[j] == self.image_ids[i]:
+            j = random.randint(0, len(self.images) - 1)
+        ret = self.preprocess(self.images[i], self.captions[i], self.captions[j])
+        # for image-text matching
+        ret['positive_input_ids'] = ret['positive_input_ids'][0]
+        ret['positive_attention_mask'] = ret['positive_attention_mask'][0]
+        ret['negative_input_ids'] = ret['negative_input_ids'][0]
+        ret['negative_attention_mask'] = ret['negative_attention_mask'][0]
+        # for image-text contrastive learning
+        ret['summarize_input_ids'] = ret['summarize_input_ids'][0]
+        ret['summarize_attention_mask'] = ret['summarize_attention_mask'][0]
+        # for image-grounded text generation
+        ret['input_ids'] = ret['input_ids'][0]
+        ret['attention_mask'] = ret['attention_mask'][0]
+        ret['labels'] = ret['labels'][0]
+        ret['image_ids'] = torch.Tensor([self.image_ids[i]]).long()
+        return ret
+class COCODataset(Dataset):
+    """Dataset for supervised fine-tuning."""
+    def __init__(self, metas, tokenizer, data_args):
+        super(COCODataset, self).__init__()
+        annotations = json.load(open(metas['annotation']))
+        self.data_args = data_args
+        self.tokenizer = tokenizer
+        self.images = []
+        self.image_ids = []
+        self.captions = []
+        for annotation in annotations:
+            image_id = int(annotation['image_id'].split('_')[-1])
+            caption = annotation['caption']
+            caption = self.process_single_caption(caption)
+            image = annotation['image']
+            image_path = metas['root'] + '/' + image
+            self.images.append(image_path)
+            self.image_ids.append(image_id)
+            self.captions.append(caption)
+        print(f'There are {len(self.images)} images.')
+        print(f'There are {len(self.captions)} captions.')
+    def __len__(self):
+        return len(self.images)
+    def process_single_caption(self, caption, max_words=50):
+        caption = re.sub(r"([.!\"()*#:;~])", ' ', caption.lower())
+        caption = re.sub(r'\s{2,}', ' ', caption)
+        caption = caption.rstrip('\n')
+        caption = caption.strip(' ')
+        # truncate caption
+        caption_words = caption.split(' ')
+        if len(caption_words) > max_words:
+            caption = ' '.join(caption_words[: max_words])
+        return caption
+    def preprocess(self, image, caption, neg_caption):
+        model_inputs = dict()
+        # input image
+        image_transform = build_transform(input_size=self.data_args.force_image_size)
+        image = Image.open(image)
+        image = image.convert('RGB')
+        pixel_values = image_transform(image)
+        model_inputs['pixel_values'] = pixel_values
+        # for image-text matching
+        pos_model_inputs = self.tokenizer(
+            caption,
+            max_length=self.data_args.max_seq_length,
+            padding='max_length' if self.data_args.pad_to_max_length else False,
+            truncation=True,
+            return_tensors='pt',
+        )
+        model_inputs['positive_input_ids'] = pos_model_inputs['input_ids']
+        model_inputs['positive_attention_mask'] = pos_model_inputs['attention_mask']
+        neg_model_inputs = self.tokenizer(
+            neg_caption,
+            max_length=self.data_args.max_seq_length,
+            padding='max_length' if self.data_args.pad_to_max_length else False,
+            truncation=True,
+            return_tensors='pt',
+        )
+        model_inputs['negative_input_ids'] = neg_model_inputs['input_ids']
+        model_inputs['negative_attention_mask'] = neg_model_inputs['attention_mask']
+        # for image-text contrastive learning
+        summarize_model_inputs = self.tokenizer(
+            'summarize:' + caption,
+            max_length=self.data_args.max_seq_length,
+            padding='max_length' if self.data_args.pad_to_max_length else False,
+            truncation=True,
+            return_tensors='pt',
+        )
+        model_inputs['summarize_input_ids'] = summarize_model_inputs['input_ids']
+        model_inputs['summarize_attention_mask'] = summarize_model_inputs['attention_mask']
+        # for image-grounded text generation
+        prefix = f'English caption:'
+        content = caption
+        tokenized_prefix = self.tokenizer(
+            prefix, padding=False, truncation=True, return_tensors='pt',
+        )
+        prefix_input_ids = tokenized_prefix['input_ids'][:, :-1]  # remove eos
+        prefix_attention_mask = tokenized_prefix['attention_mask'][:, :-1]  # remove eos
+        tokenized_content = self.tokenizer(
+            content,
+            max_length=self.data_args.max_seq_length - prefix_input_ids.size(1) + 1,
+            padding='max_length' if self.data_args.pad_to_max_length else False,
+            truncation=True,
+            return_tensors='pt',
+        )
+        content_input_ids = tokenized_content['input_ids'][:, 1:]  # remove bos
+        content_attention_mask = tokenized_content['attention_mask'][:, 1:]  # remove bos
+        model_inputs['input_ids'] = torch.cat([prefix_input_ids, content_input_ids], dim=1)
+        model_inputs['attention_mask'] = torch.cat([prefix_attention_mask, content_attention_mask], dim=1)
+        labels = model_inputs['input_ids'].clone()
+        labels[labels == self.tokenizer.pad_token_id] = -100
+        labels[:, :prefix_input_ids.size(1) - 1] = -100
+        model_inputs['labels'] = labels
+        return model_inputs
+    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
+        i = i % len(self.images)
+        j = random.randint(0, len(self.images) - 1)
+        while self.image_ids[j] == self.image_ids[i]:
+            j = random.randint(0, len(self.images) - 1)
+        ret = self.preprocess(self.images[i], self.captions[i], self.captions[j])
+        # for image-text matching
+        ret['positive_input_ids'] = ret['positive_input_ids'][0]
+        ret['positive_attention_mask'] = ret['positive_attention_mask'][0]
+        ret['negative_input_ids'] = ret['negative_input_ids'][0]
+        ret['negative_attention_mask'] = ret['negative_attention_mask'][0]
+        # for image-text contrastive learning
+        ret['summarize_input_ids'] = ret['summarize_input_ids'][0]
+        ret['summarize_attention_mask'] = ret['summarize_attention_mask'][0]
+        # for image-grounded text generation
+        ret['input_ids'] = ret['input_ids'][0]
+        ret['attention_mask'] = ret['attention_mask'][0]
+        ret['labels'] = ret['labels'][0]
+        ret['image_ids'] = torch.Tensor([self.image_ids[i]]).long()
+        return ret

InternVL/internvl_g/internvl/train/internvl_stage2_finetune.py ADDED Viewed

	@@ -0,0 +1,286 @@

+import logging
+import os
+import sys
+import warnings
+from dataclasses import dataclass, field
+from typing import Dict, Optional
+import torch.distributed as dist
+import transformers
+from internvl.dist_utils import init_dist
+from internvl.model.internvl_stage2_retrieval import (InternVLConfig,
+                                                      InternVLModel)
+from internvl.train.dataset import COCODataset, FlickrDataset
+from internvl.train.trainer_monkey_patch import replace_create_optimizer
+from PIL import Image, ImageFile, PngImagePlugin
+from transformers import (HfArgumentParser, LlamaTokenizer, Trainer,
+                          TrainingArguments, default_data_collator, set_seed)
+from transformers.trainer_utils import get_last_checkpoint
+from transformers.utils.logging import (enable_default_handler,
+                                        enable_explicit_format, set_verbosity)
+# Label value -100; presumably the ignore index for the loss -- it is not
+# referenced in the visible part of this file.
+IGNORE_INDEX = -100
+# Pillow: remove the upper bound on image pixel count (disables the
+# decompression-bomb check).
+Image.MAX_IMAGE_PIXELS = None
+# Pillow: load image files whose data is truncated instead of raising.
+ImageFile.LOAD_TRUNCATED_IMAGES = True
+# Raise Pillow's PNG text-chunk limit to 1024 * 2**20 bytes.
+MaximumDecompressedSize = 1024
+MegaByte = 2 ** 20
+PngImagePlugin.MAX_TEXT_CHUNK = MaximumDecompressedSize * MegaByte
+# Silence every Python warning for the whole process.
+warnings.filterwarnings('ignore')
+logger = logging.getLogger(__name__)
+os.environ['TOKENIZERS_PARALLELISM'] = 'true'
+# Dataset registry keyed by --dataset_name. main() passes the selected entry
+# to the dataset class as `metas`; each entry holds the image root directory
+# and the annotation file path.
+ds_collections = {
+    'flickr30k_en_train': {
+        'root': './data/flickr30k/Images/',
+        'annotation': './data/flickr30k/flickr30k_train_karpathy.txt',
+    },
+    'flickr30k_cn_train': {
+        'root': './data/flickr30k/Images/',
+        'annotation': './data/flickr30k/flickr30k_cn_train.txt',
+    },
+    'coco_karpathy_train': {
+        'root': './data/coco/',
+        'annotation': './data/coco/annotations/coco_karpathy_train.json',
+    },
+}
+@dataclass
+class ModelArguments:
+    """
+    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
+    """
+    model_name_or_path: str = field(
+        metadata={'help': 'Path to pretrained model or model identifier from huggingface.co/models'}
+    )
+    freeze_model: bool = field(
+        default=False,
+        metadata={'help': 'Set to True to freeze the entire model.'},
+    )
+    freeze_vision_model: bool = field(
+        default=False,
+        metadata={'help': 'Set to True to freeze the vision backbone of the model.'},
+    )
+    freeze_qllama: bool = field(
+        default=False,
+        metadata={'help': 'Set to True to freeze the QLLaMA of the model.'},
+    )
+    unfreeze_qllama_head: bool = field(
+        default=False,
+        metadata={'help': 'Set to True to unfreeze the head of the QLLaMA.'},
+    )
+    unfreeze_crossattn: bool = field(
+        default=False,
+        metadata={'help': 'Set to True to unfreeze the cross attention layers in the QLLaMA.'},
+    )
+    use_backbone_lora: int = field(
+        default=0, metadata={'help': 'If non-zero, indicates the use of LoRA in the vision backbone of the model'}
+    )
+    use_qllama_lora: int = field(
+        default=0, metadata={'help': 'If non-zero, indicates the use of LoRA in the QLLaMA of the model'}
+    )
+    use_custom_trainer: bool = field(
+        default=False, metadata={'help': 'Set to True to enable the use of a custom trainer.'},
+    )
+    drop_path_rate: float = field(
+        default=0.0, metadata={'help': 'Specify the value of drop path rate in the vision backbone. Default is 0.'}
+    )
+@dataclass
+class DataTrainingArguments:
+    """
+    Arguments pertaining to what data we are going to input our model for training and eval.
+    """
+    dataset_name: Optional[str] = field(
+        default='flickr30k_en_train',
+        metadata={'help': 'Specify the name of dataset to be used.'},
+    )
+    max_seq_length: Optional[int] = field(
+        default=80,
+        metadata={
+            'help': (
+                'The maximum total input sequence length after tokenization. Sequences longer '
+                'than this will be truncated, sequences shorter will be padded.'
+            )
+        },
+    )
+    force_image_size: Optional[int] = field(
+        default=224,
+        metadata={'help': 'Specify the image size for training models.'},
+    )
+    pad_to_max_length: bool = field(
+        default=False,
+        metadata={
+            'help': (
+                'Whether to pad all samples to model maximum sentence length. '
+                'If False, will pad the samples dynamically when batching to the maximum length in the batch. More '
+                'efficient on GPU but very bad for TPU.'
+            )
+        },
+    )
+def main():
+    # Parse input arguments
+    # See all possible arguments in src/transformers/training_args.py
+    # If use DeepSpeed zero3, init_dist must before HfArgumentParser
+    launcher = os.environ.get('LAUNCHER', 'slurm')
+    init_dist(launcher=launcher, backend='nccl')
+    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
+    if len(sys.argv) == 2 and sys.argv[1].endswith('.json'):
+        # If we pass only one argument to the script, and it's the path to a json file,
+        # let's parse it to get our arguments.
+        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
+    else:
+        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+    # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
+    # information sent is the one passed as arguments along with your Python/PyTorch versions.
+    # send_example_telemetry('finetune Flickr30K', model_args, data_args)
+    # Setup logging
+    logging.basicConfig(
+        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
+        datefmt='%m/%d/%Y %H:%M:%S',
+        handlers=[logging.StreamHandler(sys.stdout)],
+    )
+    if training_args.should_log:
+        # The default of training_args.log_level is passive, so we set log level at info here to have that default.
+        transformers.utils.logging.set_verbosity_info()
+    log_level = training_args.get_process_log_level()
+    logger.setLevel(log_level)
+    set_verbosity(log_level)
+    enable_default_handler()
+    enable_explicit_format()
+    # Log on each process the small summary:
+    logger.warning(
+        f'Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}'
+        + f'distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}'
+    )
+    logger.info(f'Training/evaluation parameters {training_args}')
+    # Detecting last checkpoint and eventually continue from last checkpoint.
+    last_checkpoint = None
+    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
+        last_checkpoint = get_last_checkpoint(training_args.output_dir)
+        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
+            raise ValueError(
+                f'Output directory ({training_args.output_dir}) already exists and is not empty. '
+                'Use --overwrite_output_dir to overcome.'
+            )
+        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
+            logger.info(
+                f'Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change '
+                'the `--output_dir` or add `--overwrite_output_dir` to train from scratch.'
+            )
+    # Set seed before initializing model.
+    set_seed(training_args.seed)
+    # Load pretrained model, tokenizer, and image processor
+    tokenizer = LlamaTokenizer.from_pretrained(
+        model_args.model_name_or_path,
+        add_eos_token=True
+    )
+    if 'flickr' in data_args.dataset_name:
+        train_dataset = FlickrDataset(metas=ds_collections[data_args.dataset_name],
+                                      tokenizer=tokenizer, data_args=data_args)
+    elif 'coco' in data_args.dataset_name:
+        train_dataset = COCODataset(metas=ds_collections[data_args.dataset_name],
+                                    tokenizer=tokenizer, data_args=data_args)
+    config = InternVLConfig.from_pretrained(model_args.model_name_or_path)
+    config.vision_config.drop_path_rate = model_args.drop_path_rate
+    model = InternVLModel.from_pretrained(
+        model_args.model_name_or_path,
+        # ignore_mismatched_sizes=True,
+        config=config
+    )
+    if data_args.force_image_size != 224:
+        model.config.force_image_size = data_args.force_image_size
+        model.vision_model.resize_pos_embeddings(old_size=224, new_size=data_args.force_image_size, patch_size=14)
+    model.config.use_cache = False
+    model.config.qllama_config.use_cache = False
+    model.qllama.gradient_checkpointing = True
+    model.qllama.model.gradient_checkpointing = True
+    model.vision_model.gradient_checkpointing = True
+    model.vision_model.encoder.gradient_checkpointing = True
+    def _freeze_params(module):
+        for param in module.parameters():
+            param.requires_grad = False
+    if model_args.freeze_model:
+        _freeze_params(model)
+    if model_args.freeze_vision_model:
+        model.vision_model = model.vision_model.eval()
+        _freeze_params(model.vision_model)
+    if model_args.freeze_qllama:
+        model.qllama = model.qllama.eval()
+        _freeze_params(model.qllama)
+    if model_args.use_backbone_lora:
+        model.wrap_backbone_lora(r=model_args.use_backbone_lora, lora_alpha=model_args.use_backbone_lora * 2)
+        model.config.use_backbone_lora = model_args.use_backbone_lora
+    if model_args.use_qllama_lora:
+        model.wrap_qllama_lora(r=model_args.use_qllama_lora, lora_alpha=model_args.use_backbone_lora * 2)
+        model.config.use_qllama_lora = model_args.use_qllama_lora
+    if model_args.unfreeze_crossattn:
+        for name, param in model.qllama.named_parameters():
+            if 'cross_attn' in name:
+                param.requires_grad = True
+    if model_args.unfreeze_qllama_head:
+        model.qllama.lm_head.weight.requires_grad = True
+        model.text_projection.requires_grad = True
+    # print trainable parameters
+    if dist.get_rank() == 0:
+        for name, param in model.named_parameters():
+            print(name, param.requires_grad)
+    # set seed for torch dataloaders
+    set_seed(training_args.seed)
+    # Initialize our Trainer
+    if model_args.use_custom_trainer:
+        replace_create_optimizer()
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=train_dataset if training_args.do_train else None,
+        eval_dataset=None,
+        tokenizer=tokenizer,
+        data_collator=default_data_collator,
+    )
+    # Training
+    if training_args.do_train:
+        checkpoint = None
+        if training_args.resume_from_checkpoint is not None:
+            checkpoint = training_args.resume_from_checkpoint
+        elif last_checkpoint is not None:
+            checkpoint = last_checkpoint
+        train_result = trainer.train(resume_from_checkpoint=checkpoint)
+        trainer.save_model()  # Saves the tokenizer too for easy upload
+        metrics = train_result.metrics
+        metrics['train_samples'] = len(train_dataset)
+        trainer.log_metrics('train', metrics)
+        trainer.save_metrics('train', metrics)
+        trainer.save_state()
+if __name__ == '__main__':
+    main()

InternVL/internvl_g/internvl/train/trainer_monkey_patch.py ADDED Viewed

	@@ -0,0 +1,150 @@

+import json
+import os
+import torch
+import torch.nn as nn
+import transformers
+from transformers import Trainer, logging
+from transformers.trainer import is_sagemaker_mp_enabled
+logger = logging.get_logger(__name__)
+def get_num_layer_for_vit_and_qllama(var_name, vit_num_max_layer, llama_num_max_layer):
+    if var_name in ('query_tokens', 'logit_scale',):
+        return 0
+    if var_name.startswith('clip_projector.'):
+        return vit_num_max_layer
+    if var_name.startswith('clip_projector2.') or var_name.startswith('itm_head.') or \
+            var_name == 'text_projection':
+        return llama_num_max_layer
+    if var_name.startswith('vision_model.'):
+        if 'embeddings.' in var_name:
+            return 0
+        if 'layers.' in var_name:
+            var_name = var_name.split('layers.')[-1]
+            layer_id = int(var_name.split('.')[0])
+            return layer_id + 1
+    if var_name.startswith('qllama.'):
+        if 'embed_tokens' in var_name:
+            return 0
+        if 'layers.' in var_name:
+            var_name = var_name.split('layers.')[-1]
+            layer_id = int(var_name.split('.')[0])
+            return layer_id + 1
+        else:
+            return llama_num_max_layer
+    return 0
+def param_classification(name):
+    if name in ['query_tokens', 'text_projection', 'logit_scale']:
+        return 'qllama'
+    elif name.startswith('vision_model.'):
+        return 'vit'
+    elif name.startswith('qllama.'):
+        return 'qllama'
+    elif name.startswith('clip_projector.'):
+        return 'vit'
+    elif name.startswith('clip_projector2.'):
+        return 'qllama'
+    elif name.startswith('itm_head.'):
+        return 'qllama'
+    else:
+        return 'other'
+def create_optimizer(self):
+    """
+    Setup the optimizer.
+    We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the
+    Trainer's init through `optimizers`, or subclass and override this method in a subclass.
+    """
+    opt_model = self.model_wrapped if is_sagemaker_mp_enabled() else self.model
+    parameter_groups = {}
+    try:  # for stage2 model
+        vit_num_layers = opt_model.config.vision_config.num_hidden_layers + 2
+        qllama_num_layers = opt_model.config.qllama_config.num_hidden_layers + 2
+    except:  # for stage3 model
+        vit_num_layers = opt_model.qllama.config.vision_config.num_hidden_layers + 2
+        qllama_num_layers = opt_model.qllama.config.qllama_config.num_hidden_layers + 2
+    print('vit_num_layers:', vit_num_layers)
+    print('qllama_num_layers:', qllama_num_layers)
+    vit_layer_decay_rate = float(os.getenv('VIT_LAYER_DECAY_RATE', 1.0))
+    qllama_layer_decay_rate = float(os.getenv('QLLAMA_LAYER_DECAY_RATE', 1.0))
+    print('vit_layer_decay_rate:', vit_layer_decay_rate)
+    print('qllama_layer_decay_rate:', qllama_layer_decay_rate)
+    for name, param in opt_model.named_parameters():
+        if not param.requires_grad:
+            continue  # frozen weights
+        if len(param.shape) == 1 or name.endswith('.bias'):
+            group_name = 'no_decay'
+            this_weight_decay = 0.
+        else:
+            group_name = 'decay'
+            this_weight_decay = self.args.weight_decay
+        cls = param_classification(name)
+        layer_id = get_num_layer_for_vit_and_qllama(name, vit_num_layers, qllama_num_layers)
+        group_name = '%s_layer_%d_%s' % (cls, layer_id, group_name)
+        if group_name not in parameter_groups:
+            if cls == 'vit':
+                scale = vit_layer_decay_rate ** (vit_num_layers - layer_id - 1)
+            else:
+                scale = qllama_layer_decay_rate ** (qllama_num_layers - layer_id - 1)
+            scale = min(1.0, scale)
+            parameter_groups[group_name] = {
+                'weight_decay': this_weight_decay,
+                'params': [],
+                'param_names': [],
+                'lr_scale': scale,
+                'group_name': group_name,
+                'lr': scale * self.args.learning_rate,
+            }
+        parameter_groups[group_name]['params'].append(param)
+        parameter_groups[group_name]['param_names'].append(name)
+        rank = torch.distributed.get_rank()
+        if rank == 0:
+            to_display = {}
+            for key in parameter_groups:
+                to_display[key] = {
+                    'param_names': parameter_groups[key]['param_names'],
+                    'lr_scale': parameter_groups[key]['lr_scale'],
+                    'lr': parameter_groups[key]['lr'],
+                    'weight_decay': parameter_groups[key]['weight_decay'],
+                }
+            print('Param groups = %s' % json.dumps(to_display, indent=2))
+    optimizer_grouped_parameters = list(parameter_groups.values())
+    optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(self.args)
+    self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs)
+    if optimizer_cls.__name__ == 'Adam8bit':
+        import bitsandbytes
+        manager = bitsandbytes.optim.GlobalOptimManager.get_instance()
+        skipped = 0
+        for module in opt_model.modules():
+            if isinstance(module, nn.Embedding):
+                skipped += sum({p.data_ptr(): p.numel() for p in module.parameters()}.values())
+                logger.info(f'skipped {module}: {skipped / 2 ** 20}M params')
+                manager.register_module_override(module, 'weight', {'optim_bits': 32})
+                logger.debug(f'bitsandbytes: will optimize {module} in fp32')
+        logger.info(f'skipped: {skipped / 2 ** 20}M params')
+    if is_sagemaker_mp_enabled():
+        import smdistributed.modelparallel.torch as smp
+        self.optimizer = smp.DistributedOptimizer(self.optimizer)
+    return self.optimizer
+def replace_create_optimizer():
+    print('Replace original create_optimizer with custom create_optimizer')
+    transformers.Trainer.create_optimizer = create_optimizer

InternVL/internvl_g/shell/finetune/internvl_stage2_finetune_coco_364_bs1024_ep5.sh ADDED Viewed

	@@ -0,0 +1,58 @@

+set -x
+export VIT_LAYER_DECAY_RATE=0.9
+export QLLAMA_LAYER_DECAY_RATE=0.9
+PARTITION=${PARTITION:-"VC2"}
+GPUS=${GPUS:-32}
+GPUS_PER_NODE=${GPUS_PER_NODE:-8}
+QUOTA_TYPE=${QUOTA_TYPE:-"reserved"}
+NODES=$((GPUS / GPUS_PER_NODE))
+CPUS_PER_TASK=${CPUS_PER_TASK:-10}
+SRUN_ARGS=${SRUN_ARGS:-""}
+export PYTHONPATH="${PYTHONPATH}:$(pwd)"
+# number of gpus: 32
+# batch size per gpu: 32
+# gradient accumulation steps: 1
+# total batch size: 1024
+# epoch: 5
+srun -p ${PARTITION} \
+  --gres=gpu:${GPUS_PER_NODE} \
+  --nodes=${NODES} \
+  --ntasks=${GPUS} \
+  --ntasks-per-node=${GPUS_PER_NODE} \
+  --cpus-per-task=${CPUS_PER_TASK} \
+  --kill-on-bad-exit=1 \
+  --quotatype=${QUOTA_TYPE} \
+  ${SRUN_ARGS} \
+  python -u internvl/train/internvl_stage2_finetune.py \
+  --dataset_name 'coco_karpathy_train' \
+  --model_name_or_path "./pretrained/InternVL-14B-224px" \
+  --output_dir "./work_dirs/internvl_stage2_finetune_coco_364_bs1024_ep5" \
+  --overwrite_output_dir True \
+  --force_image_size 364 \
+  --drop_path_rate 0.3 \
+  --use_custom_trainer \
+  --dataloader_num_workers 2 \
+  --pad_to_max_length True \
+  --bf16 True \
+  --num_train_epochs 5 \
+  --per_device_train_batch_size 32 \
+  --gradient_accumulation_steps 1 \
+  --evaluation_strategy "no" \
+  --save_strategy "steps" \
+  --save_steps 100 \
+  --save_total_limit 5 \
+  --learning_rate 1e-6 \
+  --weight_decay 0.05 \
+  --warmup_steps 100 \
+  --lr_scheduler_type "cosine" \
+  --logging_steps 1 \
+  --max_seq_length 80 \
+  --do_train True \
+  --optim adamw_torch \
+  --deepspeed "zero_stage1_config.json" \
+  --report_to "tensorboard"

InternVL/internvl_g/shell/finetune/internvl_stage2_finetune_flickr_364_bs1024_ep10.sh ADDED Viewed

	@@ -0,0 +1,58 @@

+set -x
+export VIT_LAYER_DECAY_RATE=0.9
+export QLLAMA_LAYER_DECAY_RATE=0.9
+PARTITION=${PARTITION:-"VC2"}
+GPUS=${GPUS:-32}
+GPUS_PER_NODE=${GPUS_PER_NODE:-8}
+QUOTA_TYPE=${QUOTA_TYPE:-"reserved"}
+NODES=$((GPUS / GPUS_PER_NODE))
+CPUS_PER_TASK=${CPUS_PER_TASK:-10}
+SRUN_ARGS=${SRUN_ARGS:-""}
+export PYTHONPATH="${PYTHONPATH}:$(pwd)"
+# number of gpus: 32
+# batch size per gpu: 32
+# gradient accumulation steps: 1
+# total batch size: 1024
+# epoch: 10
+srun -p ${PARTITION} \
+  --gres=gpu:${GPUS_PER_NODE} \
+  --nodes=${NODES} \
+  --ntasks=${GPUS} \
+  --ntasks-per-node=${GPUS_PER_NODE} \
+  --cpus-per-task=${CPUS_PER_TASK} \
+  --kill-on-bad-exit=1 \
+  --quotatype=${QUOTA_TYPE} \
+  ${SRUN_ARGS} \
+  python -u internvl/train/internvl_stage2_finetune.py \
+  --dataset_name 'flickr30k_en_train' \
+  --model_name_or_path "./pretrained/InternVL-14B-224px" \
+  --output_dir "./work_dirs/internvl_stage2_finetune_flickr_364_bs1024_ep10" \
+  --overwrite_output_dir True \
+  --force_image_size 364 \
+  --drop_path_rate 0.3 \
+  --use_custom_trainer \
+  --dataloader_num_workers 2 \
+  --pad_to_max_length True \
+  --bf16 True \
+  --num_train_epochs 10 \
+  --per_device_train_batch_size 32 \
+  --gradient_accumulation_steps 1 \
+  --evaluation_strategy "no" \
+  --save_strategy "steps" \
+  --save_steps 100 \
+  --save_total_limit 5 \
+  --learning_rate 1e-6 \
+  --weight_decay 0.05 \
+  --warmup_steps 100 \
+  --lr_scheduler_type "cosine" \
+  --logging_steps 1 \
+  --max_seq_length 80 \
+  --do_train True \
+  --optim adamw_torch \
+  --deepspeed "zero_stage1_config.json" \
+  --report_to "tensorboard"

InternVL/internvl_g/shell/finetune/internvl_stage2_finetune_flickrcn_364_bs1024_ep10.sh ADDED Viewed

	@@ -0,0 +1,58 @@

+set -x
+export VIT_LAYER_DECAY_RATE=0.9
+export QLLAMA_LAYER_DECAY_RATE=0.9
+PARTITION=${PARTITION:-"VC2"}
+GPUS=${GPUS:-32}
+GPUS_PER_NODE=${GPUS_PER_NODE:-8}
+QUOTA_TYPE=${QUOTA_TYPE:-"reserved"}
+NODES=$((GPUS / GPUS_PER_NODE))
+CPUS_PER_TASK=${CPUS_PER_TASK:-10}
+SRUN_ARGS=${SRUN_ARGS:-""}
+export PYTHONPATH="${PYTHONPATH}:$(pwd)"
+# number of gpus: 32
+# batch size per gpu: 32
+# gradient accumulation steps: 1
+# total batch size: 1024
+# epoch: 10
+srun -p ${PARTITION} \
+  --gres=gpu:${GPUS_PER_NODE} \
+  --nodes=${NODES} \
+  --ntasks=${GPUS} \
+  --ntasks-per-node=${GPUS_PER_NODE} \
+  --cpus-per-task=${CPUS_PER_TASK} \
+  --kill-on-bad-exit=1 \
+  --quotatype=${QUOTA_TYPE} \
+  ${SRUN_ARGS} \
+  python -u internvl/train/internvl_stage2_finetune.py \
+  --dataset_name 'flickr30k_cn_train' \
+  --model_name_or_path "./pretrained/InternVL-14B-224px" \
+  --output_dir "./work_dirs/internvl_stage2_finetune_flickrcn_364_bs1024_ep10" \
+  --overwrite_output_dir True \
+  --force_image_size 364 \
+  --drop_path_rate 0.3 \
+  --use_custom_trainer \
+  --dataloader_num_workers 2 \
+  --pad_to_max_length True \
+  --bf16 True \
+  --num_train_epochs 10 \
+  --per_device_train_batch_size 32 \
+  --gradient_accumulation_steps 1 \
+  --evaluation_strategy "no" \
+  --save_strategy "steps" \
+  --save_steps 100 \
+  --save_total_limit 5 \
+  --learning_rate 1e-6 \
+  --weight_decay 0.05 \
+  --warmup_steps 100 \
+  --lr_scheduler_type "cosine" \
+  --logging_steps 1 \
+  --max_seq_length 80 \
+  --do_train True \
+  --optim adamw_torch \
+  --deepspeed "zero_stage1_config.json" \
+  --report_to "tensorboard"

InternVL/internvl_g/shell/head_finetune/internvl_stage2_finetune_coco_224_bs1024_ep5_head_4gpu.sh ADDED Viewed

	@@ -0,0 +1,59 @@

+# Single-node torchrun launcher: head-only InternVL stage-2 fine-tuning on
+# 'coco_karpathy_train' at image size 224 for 5 epochs.
+# GPUS and BATCH_SIZE may be overridden from the environment.
+set -x
+GPUS=${GPUS:-4}
+BATCH_SIZE=${BATCH_SIZE:-32}
+# Add the current directory to PYTHONPATH; the training script below is
+# addressed relative to it.
+export PYTHONPATH="${PYTHONPATH}:$(pwd)"
+export MASTER_PORT=34229
+export TF_CPP_MIN_LOG_LEVEL=3
+# Read by the training script to choose how init_dist sets up distributed training.
+export LAUNCHER=pytorch
+# NOTE(review): the directory name says 364 although --force_image_size is 224
+# and the script name says 224 -- confirm which is intended.
+OUTPUT_DIR='work_dirs/internvl_stage2_finetune_coco_364_bs1024_ep5_head_4gpu'
+if [ ! -d "$OUTPUT_DIR" ]; then
+  mkdir -p "$OUTPUT_DIR"
+fi
+# number of gpus: 4 (default GPUS)
+# batch size per gpu: 32 (default BATCH_SIZE)
+# gradient accumulation steps: 1
+# total batch size: 128 with these defaults (4 x 32 x 1), not the 1024 in the script name
+# epoch: 5
+# The freeze flags turn off all gradients; --unfreeze_qllama_head then
+# re-enables them for the QLLaMA lm_head weight and text_projection.
+torchrun \
+  --nnodes=1 \
+  --node_rank=0 \
+  --master_addr=127.0.0.1 \
+  --nproc_per_node=${GPUS} \
+  --master_port=${MASTER_PORT} \
+  internvl/train/internvl_stage2_finetune.py \
+  --dataset_name 'coco_karpathy_train' \
+  --model_name_or_path "./pretrained/InternVL-14B-224px" \
+  --output_dir ${OUTPUT_DIR} \
+  --overwrite_output_dir True \
+  --freeze_model \
+  --freeze_vision_model \
+  --freeze_qllama \
+  --unfreeze_qllama_head \
+  --force_image_size 224 \
+  --drop_path_rate 0.0 \
+  --dataloader_num_workers 2 \
+  --pad_to_max_length True \
+  --bf16 True \
+  --num_train_epochs 5 \
+  --per_device_train_batch_size ${BATCH_SIZE} \
+  --gradient_accumulation_steps 1 \
+  --evaluation_strategy "no" \
+  --save_strategy "steps" \
+  --save_steps 100 \
+  --save_total_limit 5 \
+  --learning_rate 1e-6 \
+  --weight_decay 0.05 \
+  --warmup_steps 100 \
+  --lr_scheduler_type "cosine" \
+  --logging_steps 1 \
+  --max_seq_length 80 \
+  --do_train True \
+  --optim adamw_torch \
+  --deepspeed "zero_stage3_config.json" \
+  --report_to "tensorboard"

InternVL/internvl_g/shell/head_finetune/internvl_stage2_finetune_flickr_224_bs1024_ep10_head_4gpu.sh ADDED Viewed

	@@ -0,0 +1,59 @@

+# Single-node torchrun launcher: head-only InternVL stage-2 fine-tuning on
+# 'flickr30k_en_train' at image size 224 for 10 epochs.
+# GPUS and BATCH_SIZE may be overridden from the environment.
+set -x
+GPUS=${GPUS:-4}
+BATCH_SIZE=${BATCH_SIZE:-32}
+# Add the current directory to PYTHONPATH; the training script below is
+# addressed relative to it.
+export PYTHONPATH="${PYTHONPATH}:$(pwd)"
+export MASTER_PORT=34229
+export TF_CPP_MIN_LOG_LEVEL=3
+# Read by the training script to choose how init_dist sets up distributed training.
+export LAUNCHER=pytorch
+# NOTE(review): the directory name says 364 although --force_image_size is 224
+# and the script name says 224 -- confirm which is intended.
+OUTPUT_DIR='work_dirs/internvl_stage2_finetune_flickr_364_bs1024_ep10_head_4gpu'
+if [ ! -d "$OUTPUT_DIR" ]; then
+  mkdir -p "$OUTPUT_DIR"
+fi
+# number of gpus: 4 (default GPUS)
+# batch size per gpu: 32 (default BATCH_SIZE)
+# gradient accumulation steps: 1
+# total batch size: 128 with these defaults (4 x 32 x 1), not the 1024 in the script name
+# epoch: 10
+# The freeze flags turn off all gradients; --unfreeze_qllama_head then
+# re-enables them for the QLLaMA lm_head weight and text_projection.
+torchrun \
+  --nnodes=1 \
+  --node_rank=0 \
+  --master_addr=127.0.0.1 \
+  --nproc_per_node=${GPUS} \
+  --master_port=${MASTER_PORT} \
+  internvl/train/internvl_stage2_finetune.py \
+  --dataset_name 'flickr30k_en_train' \
+  --model_name_or_path "./pretrained/InternVL-14B-224px" \
+  --output_dir ${OUTPUT_DIR} \
+  --overwrite_output_dir True \
+  --freeze_model \
+  --freeze_vision_model \
+  --freeze_qllama \
+  --unfreeze_qllama_head \
+  --force_image_size 224 \
+  --drop_path_rate 0.0 \
+  --dataloader_num_workers 2 \
+  --pad_to_max_length True \
+  --bf16 True \
+  --num_train_epochs 10 \
+  --per_device_train_batch_size ${BATCH_SIZE} \
+  --gradient_accumulation_steps 1 \
+  --evaluation_strategy "no" \
+  --save_strategy "steps" \
+  --save_steps 100 \
+  --save_total_limit 5 \
+  --learning_rate 1e-6 \
+  --weight_decay 0.05 \
+  --warmup_steps 100 \
+  --lr_scheduler_type "cosine" \
+  --logging_steps 1 \
+  --max_seq_length 80 \
+  --do_train True \
+  --optim adamw_torch \
+  --deepspeed "zero_stage3_config.json" \
+  --report_to "tensorboard"

InternVL/internvl_g/shell/head_finetune/internvl_stage2_finetune_flickrcn_224_bs1024_ep10_head_4gpu.sh ADDED Viewed

	@@ -0,0 +1,59 @@

+# Single-node torchrun launcher: head-only InternVL stage-2 fine-tuning on
+# 'flickr30k_cn_train' at image size 224 for 10 epochs.
+# GPUS and BATCH_SIZE may be overridden from the environment.
+set -x
+GPUS=${GPUS:-4}
+BATCH_SIZE=${BATCH_SIZE:-32}
+# Add the current directory to PYTHONPATH; the training script below is
+# addressed relative to it.
+export PYTHONPATH="${PYTHONPATH}:$(pwd)"
+export MASTER_PORT=34229
+export TF_CPP_MIN_LOG_LEVEL=3
+# Read by the training script to choose how init_dist sets up distributed training.
+export LAUNCHER=pytorch
+# NOTE(review): the directory name says 364 although --force_image_size is 224
+# and the script name says 224 -- confirm which is intended.
+OUTPUT_DIR='work_dirs/internvl_stage2_finetune_flickrcn_364_bs1024_ep10_head_4gpu'
+if [ ! -d "$OUTPUT_DIR" ]; then
+  mkdir -p "$OUTPUT_DIR"
+fi
+# number of gpus: 4 (default GPUS)
+# batch size per gpu: 32 (default BATCH_SIZE)
+# gradient accumulation steps: 1
+# total batch size: 128 with these defaults (4 x 32 x 1), not the 1024 in the script name
+# epoch: 10
+# The freeze flags turn off all gradients; --unfreeze_qllama_head then
+# re-enables them for the QLLaMA lm_head weight and text_projection.
+torchrun \
+  --nnodes=1 \
+  --node_rank=0 \
+  --master_addr=127.0.0.1 \
+  --nproc_per_node=${GPUS} \
+  --master_port=${MASTER_PORT} \
+  internvl/train/internvl_stage2_finetune.py \
+  --dataset_name 'flickr30k_cn_train' \
+  --model_name_or_path "./pretrained/InternVL-14B-224px" \
+  --output_dir ${OUTPUT_DIR} \
+  --overwrite_output_dir True \
+  --freeze_model \
+  --freeze_vision_model \
+  --freeze_qllama \
+  --unfreeze_qllama_head \
+  --force_image_size 224 \
+  --drop_path_rate 0.0 \
+  --dataloader_num_workers 2 \
+  --pad_to_max_length True \
+  --bf16 True \
+  --num_train_epochs 10 \
+  --per_device_train_batch_size ${BATCH_SIZE} \
+  --gradient_accumulation_steps 1 \
+  --evaluation_strategy "no" \
+  --save_strategy "steps" \
+  --save_steps 100 \
+  --save_total_limit 5 \
+  --learning_rate 1e-6 \
+  --weight_decay 0.05 \
+  --warmup_steps 100 \
+  --lr_scheduler_type "cosine" \
+  --logging_steps 1 \
+  --max_seq_length 80 \
+  --do_train True \
+  --optim adamw_torch \
+  --deepspeed "zero_stage3_config.json" \
+  --report_to "tensorboard"

InternVL/internvl_g/shell/lora_finetune/internvl_stage2_finetune_coco_224_bs1024_ep5_lora16_4gpu.sh ADDED Viewed

	@@ -0,0 +1,61 @@

+# Single-node torchrun launcher: LoRA (rank 16) InternVL stage-2 fine-tuning
+# on 'coco_karpathy_train' at image size 224 for 5 epochs.
+# GPUS and BATCH_SIZE may be overridden from the environment.
+set -x
+GPUS=${GPUS:-4}
+BATCH_SIZE=${BATCH_SIZE:-32}
+# Add the current directory to PYTHONPATH; the training script below is
+# addressed relative to it.
+export PYTHONPATH="${PYTHONPATH}:$(pwd)"
+export MASTER_PORT=34229
+export TF_CPP_MIN_LOG_LEVEL=3
+# Read by the training script to choose how init_dist sets up distributed training.
+export LAUNCHER=pytorch
+# NOTE(review): the directory name says 364 although --force_image_size is 224
+# and the script name says 224 -- confirm which is intended.
+OUTPUT_DIR='work_dirs/internvl_stage2_finetune_coco_364_bs1024_ep5_lora_4gpu'
+if [ ! -d "$OUTPUT_DIR" ]; then
+  mkdir -p "$OUTPUT_DIR"
+fi
+# number of gpus: 4 (default GPUS)
+# batch size per gpu: 32 (default BATCH_SIZE)
+# gradient accumulation steps: 1
+# total batch size: 128 with these defaults (4 x 32 x 1), not the 1024 in the script name
+# epoch: 5
+# The freeze flags turn off all gradients; LoRA of rank 16 is then applied to
+# both the vision backbone and the QLLaMA, and --unfreeze_qllama_head
+# re-enables gradients for the QLLaMA lm_head weight and text_projection.
+torchrun \
+  --nnodes=1 \
+  --node_rank=0 \
+  --master_addr=127.0.0.1 \
+  --nproc_per_node=${GPUS} \
+  --master_port=${MASTER_PORT} \
+  internvl/train/internvl_stage2_finetune.py \
+  --dataset_name 'coco_karpathy_train' \
+  --model_name_or_path "./pretrained/InternVL-14B-224px" \
+  --output_dir ${OUTPUT_DIR} \
+  --overwrite_output_dir True \
+  --freeze_model \
+  --freeze_vision_model \
+  --freeze_qllama \
+  --unfreeze_qllama_head \
+  --use_backbone_lora 16 \
+  --use_qllama_lora 16 \
+  --force_image_size 224 \
+  --drop_path_rate 0.0 \
+  --dataloader_num_workers 2 \
+  --pad_to_max_length True \
+  --bf16 True \
+  --num_train_epochs 5 \
+  --per_device_train_batch_size ${BATCH_SIZE} \
+  --gradient_accumulation_steps 1 \
+  --evaluation_strategy "no" \
+  --save_strategy "steps" \
+  --save_steps 100 \
+  --save_total_limit 5 \
+  --learning_rate 1e-6 \
+  --weight_decay 0.05 \
+  --warmup_steps 100 \
+  --lr_scheduler_type "cosine" \
+  --logging_steps 1 \
+  --max_seq_length 80 \
+  --do_train True \
+  --optim adamw_torch \
+  --deepspeed "zero_stage3_config.json" \
+  --report_to "tensorboard"

InternVL/internvl_g/shell/lora_finetune/internvl_stage2_finetune_flickr_224_bs1024_ep10_lora16_4gpu.sh ADDED Viewed

	@@ -0,0 +1,61 @@

+# Single-node torchrun launcher: LoRA (rank 16) InternVL stage-2 fine-tuning
+# on 'flickr30k_en_train' at image size 224 for 10 epochs.
+# GPUS and BATCH_SIZE may be overridden from the environment.
+set -x
+GPUS=${GPUS:-4}
+BATCH_SIZE=${BATCH_SIZE:-32}
+# Add the current directory to PYTHONPATH; the training script below is
+# addressed relative to it.
+export PYTHONPATH="${PYTHONPATH}:$(pwd)"
+export MASTER_PORT=34229
+export TF_CPP_MIN_LOG_LEVEL=3
+# Read by the training script to choose how init_dist sets up distributed training.
+export LAUNCHER=pytorch
+# NOTE(review): the directory name says 364 although --force_image_size is 224
+# and the script name says 224 -- confirm which is intended.
+OUTPUT_DIR='work_dirs/internvl_stage2_finetune_flickr_364_bs1024_ep10_lora_4gpu'
+if [ ! -d "$OUTPUT_DIR" ]; then
+  mkdir -p "$OUTPUT_DIR"
+fi
+# number of gpus: 4 (default GPUS)
+# batch size per gpu: 32 (default BATCH_SIZE)
+# gradient accumulation steps: 1
+# total batch size: 128 with these defaults (4 x 32 x 1), not the 1024 in the script name
+# epoch: 10
+# The freeze flags turn off all gradients; LoRA of rank 16 is then applied to
+# both the vision backbone and the QLLaMA, and --unfreeze_qllama_head
+# re-enables gradients for the QLLaMA lm_head weight and text_projection.
+torchrun \
+  --nnodes=1 \
+  --node_rank=0 \
+  --master_addr=127.0.0.1 \
+  --nproc_per_node=${GPUS} \
+  --master_port=${MASTER_PORT} \
+  internvl/train/internvl_stage2_finetune.py \
+  --dataset_name 'flickr30k_en_train' \
+  --model_name_or_path "./pretrained/InternVL-14B-224px" \
+  --output_dir ${OUTPUT_DIR} \
+  --overwrite_output_dir True \
+  --freeze_model \
+  --freeze_vision_model \
+  --freeze_qllama \
+  --unfreeze_qllama_head \
+  --use_backbone_lora 16 \
+  --use_qllama_lora 16 \
+  --force_image_size 224 \
+  --drop_path_rate 0.0 \
+  --dataloader_num_workers 2 \
+  --pad_to_max_length True \
+  --bf16 True \
+  --num_train_epochs 10 \
+  --per_device_train_batch_size ${BATCH_SIZE} \
+  --gradient_accumulation_steps 1 \
+  --evaluation_strategy "no" \
+  --save_strategy "steps" \
+  --save_steps 100 \
+  --save_total_limit 5 \
+  --learning_rate 1e-6 \
+  --weight_decay 0.05 \
+  --warmup_steps 100 \
+  --lr_scheduler_type "cosine" \
+  --logging_steps 1 \
+  --max_seq_length 80 \
+  --do_train True \
+  --optim adamw_torch \
+  --deepspeed "zero_stage3_config.json" \
+  --report_to "tensorboard"

InternVL/internvl_g/shell/lora_finetune/internvl_stage2_finetune_flickrcn_224_bs1024_ep10_lora16_4gpu.sh ADDED Viewed

	@@ -0,0 +1,61 @@

+set -x
+GPUS=${GPUS:-4}
+BATCH_SIZE=${BATCH_SIZE:-32}
+export PYTHONPATH="${PYTHONPATH}:$(pwd)"
+export MASTER_PORT=34229
+export TF_CPP_MIN_LOG_LEVEL=3
+export LAUNCHER=pytorch
+OUTPUT_DIR='work_dirs/internvl_stage2_finetune_flickrcn_364_bs1024_ep10_lora_4gpu'
+if [ ! -d "$OUTPUT_DIR" ]; then
+  mkdir -p "$OUTPUT_DIR"
+fi
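+# number of gpus: 32 (reference setup; this script defaults to GPUS=4)
+# batch size per gpu: 32
+# gradient accumulation steps: 1
+# total batch size: 1024 (32 gpus x 32 x 1; with the default GPUS=4 it is 4 x 32 x 1 = 128)
+# epoch: 10
+# NOTE(review): OUTPUT_DIR above says "364" but --force_image_size below is 224 - confirm the directory name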
+torchrun \
+  --nnodes=1 \
+  --node_rank=0 \
+  --master_addr=127.0.0.1 \
+  --nproc_per_node=${GPUS} \
+  --master_port=${MASTER_PORT} \
+  internvl/train/internvl_stage2_finetune.py \
+  --dataset_name 'flickr30k_cn_train' \
+  --model_name_or_path "./pretrained/InternVL-14B-224px" \
+  --output_dir ${OUTPUT_DIR} \
+  --overwrite_output_dir True \
+  --freeze_model \
+  --freeze_vision_model \
+  --freeze_qllama \
+  --unfreeze_qllama_head \
+  --use_backbone_lora 16 \
+  --use_qllama_lora 16 \
+  --force_image_size 224 \
+  --drop_path_rate 0.0 \
+  --dataloader_num_workers 2 \
+  --pad_to_max_length True \
+  --bf16 True \
+  --num_train_epochs 10 \
+  --per_device_train_batch_size ${BATCH_SIZE} \
+  --gradient_accumulation_steps 1 \
+  --evaluation_strategy "no" \
+  --save_strategy "steps" \
+  --save_steps 100 \
+  --save_total_limit 5 \
+  --learning_rate 1e-6 \
+  --weight_decay 0.05 \
+  --warmup_steps 100 \
+  --lr_scheduler_type "cosine" \
+  --logging_steps 1 \
+  --max_seq_length 80 \
+  --do_train True \
+  --optim adamw_torch \
+  --deepspeed "zero_stage3_config.json" \
+  --report_to "tensorboard"

InternVL/segmentation/configs/_base_/datasets/ade20k_504x504.py ADDED Viewed

	@@ -0,0 +1,56 @@

+# dataset settings: ADE20K ('ADE20KDataset') with 504x504 training crops
+dataset_type = 'ADE20KDataset'
+data_root = 'data/ade/ADEChallengeData2016'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)  # unpacked into the 'Normalize' steps below
+crop_size = (504, 504)  # 504 = 36 * 14, i.e. a multiple of the size_divisor=14 used in test_pipeline
+train_pipeline = [  # transform configs referenced by data['train']
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', reduce_zero_label=True),
+    dict(type='Resize', img_scale=(2016, 504), ratio_range=(0.5, 2.0)),
+    dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PhotoMetricDistortion'),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_semantic_seg']),
+]
+test_pipeline = [  # transform configs shared by data['val'] and data['test']
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(2016, 504),
+        # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
+        flip=False,
+        transforms=[
+            dict(type='SETR_Resize', keep_ratio=True,
+                 crop_size=crop_size, setr_multi_scale=True),
+            dict(type='ResizeToMultiple', size_divisor=14),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    samples_per_gpu=4,
+    workers_per_gpu=4,
+    train=dict(
+        type=dataset_type,
+        data_root=data_root,
+        img_dir='images/training',
+        ann_dir='annotations/training',
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        data_root=data_root,
+        img_dir='images/validation',
+        ann_dir='annotations/validation',
+        pipeline=test_pipeline),
+    test=dict(  # test reuses the validation images and annotations
+        type=dataset_type,
+        data_root=data_root,
+        img_dir='images/validation',
+        ann_dir='annotations/validation',
+        pipeline=test_pipeline))

InternVL/segmentation/configs/_base_/datasets/ade20k_504x504_1of16.py ADDED Viewed

	@@ -0,0 +1,56 @@

+# dataset settings: ADE20K ('ADE20KDataset') with 504x504 training crops and a reduced training subset
+dataset_type = 'ADE20KDataset'
+data_root = 'data/ade/ADEChallengeData2016'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)  # unpacked into the 'Normalize' steps below
+crop_size = (504, 504)  # 504 = 36 * 14, i.e. a multiple of the size_divisor=14 used in test_pipeline
+train_pipeline = [  # transform configs referenced by data['train']
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', reduce_zero_label=True),
+    dict(type='Resize', img_scale=(2016, 504), ratio_range=(0.5, 2.0)),
+    dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PhotoMetricDistortion'),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_semantic_seg']),
+]
+test_pipeline = [  # transform configs shared by data['val'] and data['test']
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(2016, 504),
+        # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='ResizeToMultiple', size_divisor=14),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    samples_per_gpu=4,
+    workers_per_gpu=4,
+    train=dict(
+        type=dataset_type,
+        data_root=data_root,
+        img_dir='images/training',
+        ann_dir='annotations/training',
+        max_image_num=20210 // 16,  # = 1263; presumably limits training to 1/16 of the 20210 ADE20K training images - confirm against the dataset class
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        data_root=data_root,
+        img_dir='images/validation',
+        ann_dir='annotations/validation',
+        pipeline=test_pipeline),
+    test=dict(  # test reuses the validation images and annotations
+        type=dataset_type,
+        data_root=data_root,
+        img_dir='images/validation',
+        ann_dir='annotations/validation',
+        pipeline=test_pipeline))

InternVL/segmentation/configs/_base_/datasets/cityscapes_1024x1024.py ADDED Viewed

	@@ -0,0 +1,35 @@

+_base_ = './cityscapes.py'  # base config this file builds on; only the pipelines are replaced in `data` below
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)  # unpacked into the 'Normalize' steps below
+crop_size = (1024, 1024)  # used by RandomCrop and Pad in train_pipeline
+train_pipeline = [  # transform configs referenced by data['train']
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations'),
+    dict(type='Resize', img_scale=(2048, 1024), ratio_range=(0.5, 2.0)),
+    dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PhotoMetricDistortion'),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_semantic_seg']),
+]
+test_pipeline = [  # transform configs shared by data['val'] and data['test']
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(2048, 1024),
+        # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+data = dict(
+    train=dict(pipeline=train_pipeline),
+    val=dict(pipeline=test_pipeline),
+    test=dict(pipeline=test_pipeline))

InternVL/segmentation/configs/_base_/models/apcnet_r50-d8.py ADDED Viewed

	@@ -0,0 +1,44 @@

+# model settings: EncoderDecoder with a ResNetV1c-50 backbone, an APCHead decode head and an FCNHead auxiliary head
+norm_cfg = dict(type='SyncBN', requires_grad=True)  # normalization config passed to the backbone and the auxiliary head
+model = dict(
+    type='EncoderDecoder',
+    pretrained='open-mmlab://resnet50_v1c',
+    backbone=dict(
+        type='ResNetV1c',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        dilations=(1, 1, 2, 4),
+        strides=(1, 2, 1, 1),
+        norm_cfg=norm_cfg,
+        norm_eval=False,
+        style='pytorch',
+        contract_dilation=True),
+    decode_head=dict(
+        type='APCHead',
+        in_channels=2048,
+        in_index=3,
+        channels=512,
+        pool_scales=(1, 2, 3, 6),
+        dropout_ratio=0.1,
+        num_classes=19,
+        norm_cfg=dict(type='SyncBN', requires_grad=True),  # literal with the same values as the module-level norm_cfg
+        align_corners=False,
+        loss_decode=dict(
+            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+    auxiliary_head=dict(
+        type='FCNHead',
+        in_channels=1024,
+        in_index=2,
+        channels=256,
+        num_convs=1,
+        concat_input=False,
+        dropout_ratio=0.1,
+        num_classes=19,
+        norm_cfg=norm_cfg,
+        align_corners=False,
+        loss_decode=dict(
+            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),  # auxiliary loss weighted 0.4 vs 1.0 for the decode head
+    # model training and testing settings
+    train_cfg=dict(),
+    test_cfg=dict(mode='whole'))

InternVL/segmentation/configs/_base_/models/bisenetv1_r18-d32.py ADDED Viewed

	@@ -0,0 +1,68 @@

+# model settings: EncoderDecoder with a BiSeNetV1 backbone (wrapping a ResNet-18), an FCNHead decode head and two FCNHead auxiliary heads
+norm_cfg = dict(type='SyncBN', requires_grad=True)  # normalization config shared by the backbone and all heads
+model = dict(
+    type='EncoderDecoder',
+    backbone=dict(
+        type='BiSeNetV1',
+        in_channels=3,
+        context_channels=(128, 256, 512),
+        spatial_channels=(64, 64, 64, 128),
+        out_indices=(0, 1, 2),
+        out_channels=256,
+        backbone_cfg=dict(  # inner ResNet-18 config handed to BiSeNetV1
+            type='ResNet',
+            in_channels=3,
+            depth=18,
+            num_stages=4,
+            out_indices=(0, 1, 2, 3),
+            dilations=(1, 1, 1, 1),
+            strides=(1, 2, 2, 2),
+            norm_cfg=norm_cfg,
+            norm_eval=False,
+            style='pytorch',
+            contract_dilation=True),
+        norm_cfg=norm_cfg,
+        align_corners=False,
+        init_cfg=None),
+    decode_head=dict(
+        type='FCNHead',
+        in_channels=256,
+        in_index=0,
+        channels=256,
+        num_convs=1,
+        concat_input=False,
+        dropout_ratio=0.1,
+        num_classes=19,
+        norm_cfg=norm_cfg,
+        align_corners=False,
+        loss_decode=dict(
+            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+    auxiliary_head=[  # two auxiliary heads, reading backbone outputs 1 and 2 respectively
+        dict(
+            type='FCNHead',
+            in_channels=128,
+            channels=64,
+            num_convs=1,
+            num_classes=19,
+            in_index=1,
+            norm_cfg=norm_cfg,
+            concat_input=False,
+            align_corners=False,
+            loss_decode=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+        dict(
+            type='FCNHead',
+            in_channels=128,
+            channels=64,
+            num_convs=1,
+            num_classes=19,
+            in_index=2,
+            norm_cfg=norm_cfg,
+            concat_input=False,
+            align_corners=False,
+            loss_decode=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+    ],
+    # model training and testing settings
+    train_cfg=dict(),
+    test_cfg=dict(mode='whole'))

InternVL/segmentation/configs/_base_/models/danet_r50-d8.py ADDED Viewed

	@@ -0,0 +1,44 @@

+# model settings: EncoderDecoder with a ResNetV1c-50 backbone, a DAHead decode head and an FCNHead auxiliary head
+norm_cfg = dict(type='SyncBN', requires_grad=True)  # normalization config shared by the backbone and both heads
+model = dict(
+    type='EncoderDecoder',
+    pretrained='open-mmlab://resnet50_v1c',
+    backbone=dict(
+        type='ResNetV1c',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        dilations=(1, 1, 2, 4),
+        strides=(1, 2, 1, 1),
+        norm_cfg=norm_cfg,
+        norm_eval=False,
+        style='pytorch',
+        contract_dilation=True),
+    decode_head=dict(
+        type='DAHead',
+        in_channels=2048,
+        in_index=3,
+        channels=512,
+        pam_channels=64,
+        dropout_ratio=0.1,
+        num_classes=19,
+        norm_cfg=norm_cfg,
+        align_corners=False,
+        loss_decode=dict(
+            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+    auxiliary_head=dict(
+        type='FCNHead',
+        in_channels=1024,
+        in_index=2,
+        channels=256,
+        num_convs=1,
+        concat_input=False,
+        dropout_ratio=0.1,
+        num_classes=19,
+        norm_cfg=norm_cfg,
+        align_corners=False,
+        loss_decode=dict(
+            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),  # auxiliary loss weighted 0.4 vs 1.0 for the decode head
+    # model training and testing settings
+    train_cfg=dict(),
+    test_cfg=dict(mode='whole'))

InternVL/segmentation/configs/_base_/models/deeplabv3plus_r50-d8.py ADDED Viewed

	@@ -0,0 +1,46 @@

+# model settings: EncoderDecoder with a ResNetV1c-50 backbone, a DepthwiseSeparableASPPHead decode head and an FCNHead auxiliary head
+norm_cfg = dict(type='SyncBN', requires_grad=True)  # normalization config shared by the backbone and both heads
+model = dict(
+    type='EncoderDecoder',
+    pretrained='open-mmlab://resnet50_v1c',
+    backbone=dict(
+        type='ResNetV1c',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        dilations=(1, 1, 2, 4),
+        strides=(1, 2, 1, 1),
+        norm_cfg=norm_cfg,
+        norm_eval=False,
+        style='pytorch',
+        contract_dilation=True),
+    decode_head=dict(
+        type='DepthwiseSeparableASPPHead',
+        in_channels=2048,
+        in_index=3,
+        channels=512,
+        dilations=(1, 12, 24, 36),
+        c1_in_channels=256,
+        c1_channels=48,
+        dropout_ratio=0.1,
+        num_classes=19,
+        norm_cfg=norm_cfg,
+        align_corners=False,
+        loss_decode=dict(
+            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+    auxiliary_head=dict(
+        type='FCNHead',
+        in_channels=1024,
+        in_index=2,
+        channels=256,
+        num_convs=1,
+        concat_input=False,
+        dropout_ratio=0.1,
+        num_classes=19,
+        norm_cfg=norm_cfg,
+        align_corners=False,
+        loss_decode=dict(
+            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),  # auxiliary loss weighted 0.4 vs 1.0 for the decode head
+    # model training and testing settings
+    train_cfg=dict(),
+    test_cfg=dict(mode='whole'))

InternVL/segmentation/configs/_base_/models/dmnet_r50-d8.py ADDED Viewed

	@@ -0,0 +1,44 @@

+# model settings: EncoderDecoder with a ResNetV1c-50 backbone, a DMHead decode head and an FCNHead auxiliary head
+norm_cfg = dict(type='SyncBN', requires_grad=True)  # normalization config passed to the backbone and the auxiliary head
+model = dict(
+    type='EncoderDecoder',
+    pretrained='open-mmlab://resnet50_v1c',
+    backbone=dict(
+        type='ResNetV1c',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        dilations=(1, 1, 2, 4),
+        strides=(1, 2, 1, 1),
+        norm_cfg=norm_cfg,
+        norm_eval=False,
+        style='pytorch',
+        contract_dilation=True),
+    decode_head=dict(
+        type='DMHead',
+        in_channels=2048,
+        in_index=3,
+        channels=512,
+        filter_sizes=(1, 3, 5, 7),
+        dropout_ratio=0.1,
+        num_classes=19,
+        norm_cfg=dict(type='SyncBN', requires_grad=True),  # literal with the same values as the module-level norm_cfg
+        align_corners=False,
+        loss_decode=dict(
+            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+    auxiliary_head=dict(
+        type='FCNHead',
+        in_channels=1024,
+        in_index=2,
+        channels=256,
+        num_convs=1,
+        concat_input=False,
+        dropout_ratio=0.1,
+        num_classes=19,
+        norm_cfg=norm_cfg,
+        align_corners=False,
+        loss_decode=dict(
+            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),  # auxiliary loss weighted 0.4 vs 1.0 for the decode head
+    # model training and testing settings
+    train_cfg=dict(),
+    test_cfg=dict(mode='whole'))

InternVL/segmentation/configs/_base_/models/encnet_r50-d8.py ADDED Viewed

	@@ -0,0 +1,48 @@

+# model settings: EncoderDecoder with a ResNetV1c-50 backbone, an EncHead decode head and an FCNHead auxiliary head
+norm_cfg = dict(type='SyncBN', requires_grad=True)  # normalization config shared by the backbone and both heads
+model = dict(
+    type='EncoderDecoder',
+    pretrained='open-mmlab://resnet50_v1c',
+    backbone=dict(
+        type='ResNetV1c',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        dilations=(1, 1, 2, 4),
+        strides=(1, 2, 1, 1),
+        norm_cfg=norm_cfg,
+        norm_eval=False,
+        style='pytorch',
+        contract_dilation=True),
+    decode_head=dict(
+        type='EncHead',
+        in_channels=[512, 1024, 2048],
+        in_index=(1, 2, 3),
+        channels=512,
+        num_codes=32,
+        use_se_loss=True,
+        add_lateral=False,
+        dropout_ratio=0.1,
+        num_classes=19,
+        norm_cfg=norm_cfg,
+        align_corners=False,
+        loss_decode=dict(
+            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+        loss_se_decode=dict(  # second loss config, sigmoid-based, weight 0.2 (configured alongside use_se_loss=True)
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.2)),
+    auxiliary_head=dict(
+        type='FCNHead',
+        in_channels=1024,
+        in_index=2,
+        channels=256,
+        num_convs=1,
+        concat_input=False,
+        dropout_ratio=0.1,
+        num_classes=19,
+        norm_cfg=norm_cfg,
+        align_corners=False,
+        loss_decode=dict(
+            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),  # auxiliary loss weighted 0.4 vs 1.0 for the decode head
+    # model training and testing settings
+    train_cfg=dict(),
+    test_cfg=dict(mode='whole'))

InternVL/segmentation/configs/_base_/models/erfnet_fcn.py ADDED Viewed

	@@ -0,0 +1,32 @@

+# model settings: EncoderDecoder with an ERFNet backbone and a single FCNHead decode head (no auxiliary head, no pretrained weights)
+norm_cfg = dict(type='SyncBN', requires_grad=True)  # normalization config passed to the decode head
+model = dict(
+    type='EncoderDecoder',
+    pretrained=None,
+    backbone=dict(
+        type='ERFNet',
+        in_channels=3,
+        enc_downsample_channels=(16, 64, 128),
+        enc_stage_non_bottlenecks=(5, 8),
+        enc_non_bottleneck_dilations=(2, 4, 8, 16),
+        enc_non_bottleneck_channels=(64, 128),
+        dec_upsample_channels=(64, 16),
+        dec_stages_non_bottleneck=(2, 2),
+        dec_non_bottleneck_channels=(64, 16),
+        dropout_ratio=0.1,
+        init_cfg=None),
+    decode_head=dict(
+        type='FCNHead',
+        in_channels=16,  # equals the last entry of dec_upsample_channels / dec_non_bottleneck_channels above
+        channels=128,
+        num_convs=1,
+        concat_input=False,
+        dropout_ratio=0.1,
+        num_classes=19,
+        norm_cfg=norm_cfg,
+        align_corners=False,
+        loss_decode=dict(
+            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+    # model training and testing settings
+    train_cfg=dict(),
+    test_cfg=dict(mode='whole'))

InternVL/segmentation/configs/_base_/models/fastfcn_r50-d32_jpu_psp.py ADDED Viewed

	@@ -0,0 +1,53 @@

+# model settings: EncoderDecoder with a ResNetV1c-50 backbone, a JPU neck, a PSPHead decode head and an FCNHead auxiliary head
+norm_cfg = dict(type='SyncBN', requires_grad=True)  # normalization config shared by the backbone, neck and both heads
+model = dict(
+    type='EncoderDecoder',
+    pretrained='open-mmlab://resnet50_v1c',
+    backbone=dict(
+        type='ResNetV1c',
+        depth=50,
+        num_stages=4,
+        dilations=(1, 1, 2, 4),
+        strides=(1, 2, 2, 2),
+        out_indices=(1, 2, 3),  # three outputs, matching the three in_channels entries of the neck
+        norm_cfg=norm_cfg,
+        norm_eval=False,
+        style='pytorch',
+        contract_dilation=True),
+    neck=dict(
+        type='JPU',
+        in_channels=(512, 1024, 2048),
+        mid_channels=512,
+        start_level=0,
+        end_level=-1,
+        dilations=(1, 2, 4, 8),
+        align_corners=False,
+        norm_cfg=norm_cfg),
+    decode_head=dict(
+        type='PSPHead',
+        in_channels=2048,
+        in_index=2,
+        channels=512,
+        pool_scales=(1, 2, 3, 6),
+        dropout_ratio=0.1,
+        num_classes=19,
+        norm_cfg=norm_cfg,
+        align_corners=False,
+        loss_decode=dict(
+            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+    auxiliary_head=dict(
+        type='FCNHead',
+        in_channels=1024,
+        in_index=1,
+        channels=256,
+        num_convs=1,
+        concat_input=False,
+        dropout_ratio=0.1,
+        num_classes=19,
+        norm_cfg=norm_cfg,
+        align_corners=False,
+        loss_decode=dict(
+            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),  # auxiliary loss weighted 0.4 vs 1.0 for the decode head
+    # model training and testing settings
+    train_cfg=dict(),
+    test_cfg=dict(mode='whole'))

InternVL/segmentation/configs/_base_/models/fcn_hr18.py ADDED Viewed

	@@ -0,0 +1,52 @@

+# model settings: EncoderDecoder with an HRNet (w18) backbone and a single FCNHead decode head over all four branches
+norm_cfg = dict(type='SyncBN', requires_grad=True)  # normalization config shared by the backbone and the decode head
+model = dict(
+    type='EncoderDecoder',
+    pretrained='open-mmlab://msra/hrnetv2_w18',
+    backbone=dict(
+        type='HRNet',
+        norm_cfg=norm_cfg,
+        norm_eval=False,
+        extra=dict(  # per-stage layout: each stage adds one branch (1 -> 2 -> 3 -> 4)
+            stage1=dict(
+                num_modules=1,
+                num_branches=1,
+                block='BOTTLENECK',
+                num_blocks=(4, ),
+                num_channels=(64, )),
+            stage2=dict(
+                num_modules=1,
+                num_branches=2,
+                block='BASIC',
+                num_blocks=(4, 4),
+                num_channels=(18, 36)),
+            stage3=dict(
+                num_modules=4,
+                num_branches=3,
+                block='BASIC',
+                num_blocks=(4, 4, 4),
+                num_channels=(18, 36, 72)),
+            stage4=dict(
+                num_modules=3,
+                num_branches=4,
+                block='BASIC',
+                num_blocks=(4, 4, 4, 4),
+                num_channels=(18, 36, 72, 144)))),
+    decode_head=dict(
+        type='FCNHead',
+        in_channels=[18, 36, 72, 144],  # same values as stage4 num_channels above
+        in_index=(0, 1, 2, 3),
+        channels=sum([18, 36, 72, 144]),  # = 270, the sum of the four input channel counts
+        input_transform='resize_concat',
+        kernel_size=1,
+        num_convs=1,
+        concat_input=False,
+        dropout_ratio=-1,  # presumably a non-positive ratio disables dropout - confirm against the head implementation
+        num_classes=19,
+        norm_cfg=norm_cfg,
+        align_corners=False,
+        loss_decode=dict(
+            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+    # model training and testing settings
+    train_cfg=dict(),
+    test_cfg=dict(mode='whole'))

InternVL/segmentation/configs/_base_/models/fpn_r50.py ADDED Viewed

	@@ -0,0 +1,36 @@

+# model settings: EncoderDecoder with a ResNetV1c-50 backbone, an FPN neck and an FPNHead decode head (no auxiliary head)
+norm_cfg = dict(type='SyncBN', requires_grad=True)  # normalization config shared by the backbone and the decode head
+model = dict(
+    type='EncoderDecoder',
+    pretrained='open-mmlab://resnet50_v1c',
+    backbone=dict(
+        type='ResNetV1c',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        dilations=(1, 1, 1, 1),
+        strides=(1, 2, 2, 2),
+        norm_cfg=norm_cfg,
+        norm_eval=False,
+        style='pytorch',
+        contract_dilation=True),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=4),
+    decode_head=dict(
+        type='FPNHead',
+        in_channels=[256, 256, 256, 256],  # four inputs of 256 channels, matching the neck's out_channels=256 and num_outs=4
+        in_index=[0, 1, 2, 3],
+        feature_strides=[4, 8, 16, 32],
+        channels=128,
+        dropout_ratio=0.1,
+        num_classes=19,
+        norm_cfg=norm_cfg,
+        align_corners=False,
+        loss_decode=dict(
+            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+    # model training and testing settings
+    train_cfg=dict(),
+    test_cfg=dict(mode='whole'))

InternVL/segmentation/configs/_base_/models/isanet_r50-d8.py ADDED Viewed

	@@ -0,0 +1,45 @@

+# model settings: EncoderDecoder with a ResNetV1c-50 backbone, an ISAHead decode head and an FCNHead auxiliary head
+norm_cfg = dict(type='SyncBN', requires_grad=True)  # normalization config shared by the backbone and both heads
+model = dict(
+    type='EncoderDecoder',
+    pretrained='open-mmlab://resnet50_v1c',
+    backbone=dict(
+        type='ResNetV1c',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        dilations=(1, 1, 2, 4),
+        strides=(1, 2, 1, 1),
+        norm_cfg=norm_cfg,
+        norm_eval=False,
+        style='pytorch',
+        contract_dilation=True),
+    decode_head=dict(
+        type='ISAHead',
+        in_channels=2048,
+        in_index=3,
+        channels=512,
+        isa_channels=256,
+        down_factor=(8, 8),
+        dropout_ratio=0.1,
+        num_classes=19,
+        norm_cfg=norm_cfg,
+        align_corners=False,
+        loss_decode=dict(
+            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+    auxiliary_head=dict(
+        type='FCNHead',
+        in_channels=1024,
+        in_index=2,
+        channels=256,
+        num_convs=1,
+        concat_input=False,
+        dropout_ratio=0.1,
+        num_classes=19,
+        norm_cfg=norm_cfg,
+        align_corners=False,
+        loss_decode=dict(
+            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),  # auxiliary loss weighted 0.4 vs 1.0 for the decode head
+    # model training and testing settings
+    train_cfg=dict(),
+    test_cfg=dict(mode='whole'))

InternVL/segmentation/configs/_base_/models/lraspp_m-v3-d8.py ADDED Viewed

	@@ -0,0 +1,25 @@

+# model settings: EncoderDecoder with a MobileNetV3 (arch='large') backbone and an LRASPPHead decode head
+norm_cfg = dict(type='SyncBN', eps=0.001, requires_grad=True)  # SyncBN with an explicit eps of 0.001, shared by backbone and head
+model = dict(
+    type='EncoderDecoder',
+    backbone=dict(
+        type='MobileNetV3',
+        arch='large',
+        out_indices=(1, 3, 16),  # three outputs, matching the three in_channels entries of the decode head
+        norm_cfg=norm_cfg),
+    decode_head=dict(
+        type='LRASPPHead',
+        in_channels=(16, 24, 960),
+        in_index=(0, 1, 2),
+        channels=128,
+        input_transform='multiple_select',
+        dropout_ratio=0.1,
+        num_classes=19,
+        norm_cfg=norm_cfg,
+        act_cfg=dict(type='ReLU'),
+        align_corners=False,
+        loss_decode=dict(
+            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+    # model training and testing settings
+    train_cfg=dict(),
+    test_cfg=dict(mode='whole'))

InternVL/segmentation/configs/_base_/models/pointrend_r50.py ADDED Viewed

	@@ -0,0 +1,56 @@

+# model settings: two-stage CascadeEncoderDecoder with a ResNetV1c-50 backbone, an FPN neck and FPNHead + PointHead decode heads
+norm_cfg = dict(type='SyncBN', requires_grad=True)  # normalization config passed to the backbone and the FPNHead
+model = dict(
+    type='CascadeEncoderDecoder',
+    num_stages=2,  # equals the number of entries in the decode_head list below
+    pretrained='open-mmlab://resnet50_v1c',
+    backbone=dict(
+        type='ResNetV1c',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        dilations=(1, 1, 1, 1),
+        strides=(1, 2, 2, 2),
+        norm_cfg=norm_cfg,
+        norm_eval=False,
+        style='pytorch',
+        contract_dilation=True),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=4),
+    decode_head=[
+        dict(
+            type='FPNHead',
+            in_channels=[256, 256, 256, 256],
+            in_index=[0, 1, 2, 3],
+            feature_strides=[4, 8, 16, 32],
+            channels=128,
+            dropout_ratio=-1,  # presumably a non-positive ratio disables dropout - confirm against the head implementation
+            num_classes=19,
+            norm_cfg=norm_cfg,
+            align_corners=False,
+            loss_decode=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+        dict(
+            type='PointHead',
+            in_channels=[256],
+            in_index=[0],
+            channels=256,
+            num_fcs=3,
+            coarse_pred_each_layer=True,
+            dropout_ratio=-1,
+            num_classes=19,
+            align_corners=False,
+            loss_decode=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0))
+    ],
+    # model training and testing settings
+    train_cfg=dict(
+        num_points=2048, oversample_ratio=3, importance_sample_ratio=0.75),
+    test_cfg=dict(
+        mode='whole',
+        subdivision_steps=2,
+        subdivision_num_points=8196,  # NOTE(review): 8196 is not a power of two (8192 may have been intended) - confirm before changing
+        scale_factor=2))

InternVL/segmentation/configs/_base_/models/pspnet_unet_s5-d16.py ADDED Viewed

	@@ -0,0 +1,50 @@

+# model settings: EncoderDecoder with a 5-stage UNet backbone, a PSPHead decode head and an FCNHead auxiliary head (2 classes)
+norm_cfg = dict(type='SyncBN', requires_grad=True)  # normalization config shared by the backbone and both heads
+model = dict(
+    type='EncoderDecoder',
+    pretrained=None,
+    backbone=dict(
+        type='UNet',
+        in_channels=3,
+        base_channels=64,
+        num_stages=5,
+        strides=(1, 1, 1, 1, 1),
+        enc_num_convs=(2, 2, 2, 2, 2),
+        dec_num_convs=(2, 2, 2, 2),
+        downsamples=(True, True, True, True),
+        enc_dilations=(1, 1, 1, 1, 1),
+        dec_dilations=(1, 1, 1, 1),
+        with_cp=False,
+        conv_cfg=None,
+        norm_cfg=norm_cfg,
+        act_cfg=dict(type='ReLU'),
+        upsample_cfg=dict(type='InterpConv'),
+        norm_eval=False),
+    decode_head=dict(
+        type='PSPHead',
+        in_channels=64,
+        in_index=4,
+        channels=16,
+        pool_scales=(1, 2, 3, 6),
+        dropout_ratio=0.1,
+        num_classes=2,
+        norm_cfg=norm_cfg,
+        align_corners=False,
+        loss_decode=dict(
+            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+    auxiliary_head=dict(
+        type='FCNHead',
+        in_channels=128,
+        in_index=3,
+        channels=64,
+        num_convs=1,
+        concat_input=False,
+        dropout_ratio=0.1,
+        num_classes=2,
+        norm_cfg=norm_cfg,
+        align_corners=False,
+        loss_decode=dict(
+            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),  # auxiliary loss weighted 0.4 vs 1.0 for the decode head
+    # model training and testing settings
+    train_cfg=dict(),
+    test_cfg=dict(mode='slide', crop_size=256, stride=170))  # 'slide' mode with crop 256 and stride 170, unlike the 'whole' mode used by the other model configs in this commit

InternVL/segmentation/configs/_base_/models/upernet_r50.py ADDED Viewed

	@@ -0,0 +1,44 @@

+# model settings: EncoderDecoder with a ResNetV1c-50 backbone, a UPerHead decode head and an FCNHead auxiliary head
+norm_cfg = dict(type='SyncBN', requires_grad=True)  # normalization config shared by the backbone and both heads
+model = dict(
+    type='EncoderDecoder',
+    pretrained='open-mmlab://resnet50_v1c',
+    backbone=dict(
+        type='ResNetV1c',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        dilations=(1, 1, 1, 1),
+        strides=(1, 2, 2, 2),
+        norm_cfg=norm_cfg,
+        norm_eval=False,
+        style='pytorch',
+        contract_dilation=True),
+    decode_head=dict(
+        type='UPerHead',
+        in_channels=[256, 512, 1024, 2048],  # one entry per backbone output listed in in_index
+        in_index=[0, 1, 2, 3],
+        pool_scales=(1, 2, 3, 6),
+        channels=512,
+        dropout_ratio=0.1,
+        num_classes=19,
+        norm_cfg=norm_cfg,
+        align_corners=False,
+        loss_decode=dict(
+            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+    auxiliary_head=dict(
+        type='FCNHead',
+        in_channels=1024,
+        in_index=2,
+        channels=256,
+        num_convs=1,
+        concat_input=False,
+        dropout_ratio=0.1,
+        num_classes=19,
+        norm_cfg=norm_cfg,
+        align_corners=False,
+        loss_decode=dict(
+            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),  # auxiliary loss weighted 0.4 vs 1.0 for the decode head
+    # model training and testing settings
+    train_cfg=dict(),
+    test_cfg=dict(mode='whole'))

InternVL/segmentation/configs/_base_/schedules/schedule_10k.py ADDED Viewed

	@@ -0,0 +1,9 @@

+# optimizer: SGD with lr 0.01, momentum 0.9 and weight decay 0.0005
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
+optimizer_config = dict()
+# learning policy: 'poly' with power 0.9 and a floor of 1e-4, configured per iteration (by_epoch=False)
+lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
+# runtime settings: 10000 iterations, checkpoint and mIoU evaluation every 1000 iterations
+runner = dict(type='IterBasedRunner', max_iters=10000)
+checkpoint_config = dict(by_epoch=False, interval=1000)
+evaluation = dict(interval=1000, metric='mIoU', pre_eval=True)

InternVL/segmentation/configs/_base_/schedules/schedule_160k.py ADDED Viewed

	@@ -0,0 +1,9 @@

+# optimizer: SGD with lr 0.01, momentum 0.9 and weight decay 0.0005
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
+optimizer_config = dict()
+# learning policy: 'poly' with power 0.9 and a floor of 1e-4, configured per iteration (by_epoch=False)
+lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
+# runtime settings: 160000 iterations, checkpoint and mIoU evaluation every 16000 iterations
+runner = dict(type='IterBasedRunner', max_iters=160000)
+checkpoint_config = dict(by_epoch=False, interval=16000)
+evaluation = dict(interval=16000, metric='mIoU', pre_eval=True)

InternVL/segmentation/configs/_base_/schedules/schedule_20k.py ADDED Viewed

	@@ -0,0 +1,9 @@

+# optimizer: SGD with lr 0.01, momentum 0.9 and weight decay 0.0005
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
+optimizer_config = dict()
+# learning policy: 'poly' with power 0.9 and a floor of 1e-4, configured per iteration (by_epoch=False)
+lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
+# runtime settings: 20000 iterations, checkpoint and mIoU evaluation every 2000 iterations
+runner = dict(type='IterBasedRunner', max_iters=20000)
+checkpoint_config = dict(by_epoch=False, interval=2000)
+evaluation = dict(interval=2000, metric='mIoU', pre_eval=True)

InternVL/segmentation/configs/_base_/schedules/schedule_320k.py ADDED Viewed

	@@ -0,0 +1,9 @@

+# optimizer: SGD, base lr 0.01, momentum 0.9, weight decay 5e-4
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
+optimizer_config = dict()
+# learning policy: 'poly' with power 0.9 and min_lr 1e-4 (by_epoch=False)
+lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
+# runtime settings: max_iters=320000; checkpoint and evaluation interval 32000
+runner = dict(type='IterBasedRunner', max_iters=320000)
+checkpoint_config = dict(by_epoch=False, interval=32000)
+evaluation = dict(interval=32000, metric='mIoU', pre_eval=True)

InternVL/segmentation/configs/_base_/schedules/schedule_40k.py ADDED Viewed

	@@ -0,0 +1,9 @@

+# optimizer: SGD, base lr 0.01, momentum 0.9, weight decay 5e-4
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
+optimizer_config = dict()
+# learning policy: 'poly' with power 0.9 and min_lr 1e-4 (by_epoch=False)
+lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
+# runtime settings: max_iters=40000; checkpoint and evaluation interval 4000
+runner = dict(type='IterBasedRunner', max_iters=40000)
+checkpoint_config = dict(by_epoch=False, interval=4000)
+evaluation = dict(interval=4000, metric='mIoU', pre_eval=True)

InternVL/segmentation/configs/_base_/schedules/schedule_5k.py ADDED Viewed

	@@ -0,0 +1,9 @@

+# optimizer: SGD, base lr 0.01, momentum 0.9, weight decay 5e-4
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
+optimizer_config = dict()
+# learning policy: 'poly' with power 0.9 and min_lr 1e-4 (by_epoch=False)
+lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
+# runtime settings: max_iters=5000; checkpoint and evaluation interval 1000
+runner = dict(type='IterBasedRunner', max_iters=5000)
+checkpoint_config = dict(by_epoch=False, interval=1000)
+evaluation = dict(interval=1000, metric='mIoU', pre_eval=True)

InternVL/segmentation/configs/_base_/schedules/schedule_80k.py ADDED Viewed

	@@ -0,0 +1,9 @@

+# optimizer: SGD, base lr 0.01, momentum 0.9, weight decay 5e-4
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
+optimizer_config = dict()
+# learning policy: 'poly' with power 0.9 and min_lr 1e-4 (by_epoch=False)
+lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
+# runtime settings: max_iters=80000; checkpoint and evaluation interval 8000
+runner = dict(type='IterBasedRunner', max_iters=80000)
+checkpoint_config = dict(by_epoch=False, interval=8000)
+evaluation = dict(interval=8000, metric='mIoU', pre_eval=True)

InternVL/segmentation/configs/intern_vit_6b/few_shot/linear_intern_vit_6b_504_10k_ade20k_bs16_lr4e-5_1of8.py ADDED Viewed

	@@ -0,0 +1,72 @@

+# --------------------------------------------------------
+# InternVL
+# Copyright (c) 2023 OpenGVLab
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+_base_ = [
+    '../../_base_/models/segmenter_vit-b16_mask.py',
+    '../../_base_/datasets/ade20k_504x504_1of8.py',
+    '../../_base_/default_runtime.py',
+    '../../_base_/schedules/schedule_10k.py'
+]
+deepspeed = False  # selects which checkpoint_config branch below is used
+deepspeed_config = 'zero_configs/adam_zero1_bf16.json'
+pretrained = './pretrained/intern_vit_6b_224px.pth'
+model = dict(
+    pretrained=None,  # the checkpoint path is passed via backbone.pretrained below
+    backbone=dict(
+        _delete_=True,  # replace the backbone inherited from _base_ instead of merging into it
+        type='InternViT6B',
+        pretrain_size=224,
+        img_size=504,
+        patch_size=14,
+        embed_dim=3200,
+        depth=48,
+        num_heads=25,
+        mlp_ratio=4.,
+        qkv_bias=False,
+        drop_path_rate=0.4,
+        init_values=0.1,
+        with_cp=True,
+        use_flash_attn=True,
+        qk_normalization=True,
+        layerscale_force_fp32=False,
+        freeze_vit=False,
+        out_indices=[47],  # depth=48, so 47 is the final block if indices are 0-based
+        pretrained=pretrained),
+    decode_head=dict(
+        _delete_=True,  # replace the decode head inherited from _base_ instead of merging into it
+        type='FCNHead',
+        in_channels=3200,  # equals backbone embed_dim
+        channels=3200,
+        num_convs=0,  # presumably the "linear" head of the file name — TODO confirm
+        dropout_ratio=0.0,
+        concat_input=False,
+        num_classes=150,
+        with_norm=True,
+        loss_decode=dict(
+            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+    test_cfg=dict(mode='slide', crop_size=(504, 504), stride=(322, 322))  # crop_size equals backbone img_size
+)
+optimizer = dict(_delete_=True, type='AdamW', lr=4e-5, betas=(0.9, 0.999), weight_decay=0.05,
+                 constructor='CustomLayerDecayOptimizerConstructor',
+                 paramwise_cfg=dict(num_layers=48, layer_decay_rate=0.95))  # num_layers equals backbone depth
+lr_config = dict(_delete_=True, policy='poly',
+                 warmup='linear',
+                 warmup_iters=200,  # 2% of max_iters=10000 in schedule_10k.py
+                 warmup_ratio=1e-6,
+                 power=1.0, min_lr=0.0, by_epoch=False)
+# By default, models are trained on 8 GPUs with 2 images per GPU (total batch size 16, the "bs16" in the file name)
+data = dict(samples_per_gpu=2)
+runner = dict(type='IterBasedRunner')  # max_iters is not set here; schedule_10k.py in _base_ defines max_iters=10000
+if deepspeed:
+    checkpoint_config = dict(deepspeed=deepspeed, by_epoch=False, interval=1000, max_keep_ckpts=2)
+else:
+    checkpoint_config = dict(by_epoch=False, interval=1000, max_keep_ckpts=2)
+evaluation = dict(interval=1000, metric='mIoU', save_best='auto')
+custom_hooks = [
+    dict(
+        type='ToBFloat16Hook',
+        priority=49),
+]

InternVL/segmentation/configs/intern_vit_6b/few_shot/linear_intern_vit_6b_504_20k_ade20k_bs16_lr4e-5_1of4.py ADDED Viewed

	@@ -0,0 +1,72 @@

+# --------------------------------------------------------
+# InternVL
+# Copyright (c) 2023 OpenGVLab
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+_base_ = [
+    '../../_base_/models/segmenter_vit-b16_mask.py',
+    '../../_base_/datasets/ade20k_504x504_1of4.py',
+    '../../_base_/default_runtime.py',
+    '../../_base_/schedules/schedule_20k.py'
+]
+deepspeed = False  # selects which checkpoint_config branch below is used
+deepspeed_config = 'zero_configs/adam_zero1_bf16.json'
+pretrained = './pretrained/intern_vit_6b_224px.pth'
+model = dict(
+    pretrained=None,  # the checkpoint path is passed via backbone.pretrained below
+    backbone=dict(
+        _delete_=True,  # replace the backbone inherited from _base_ instead of merging into it
+        type='InternViT6B',
+        pretrain_size=224,
+        img_size=504,
+        patch_size=14,
+        embed_dim=3200,
+        depth=48,
+        num_heads=25,
+        mlp_ratio=4.,
+        qkv_bias=False,
+        drop_path_rate=0.4,
+        init_values=0.1,
+        with_cp=True,
+        use_flash_attn=True,
+        qk_normalization=True,
+        layerscale_force_fp32=False,
+        freeze_vit=False,
+        out_indices=[47],  # depth=48, so 47 is the final block if indices are 0-based
+        pretrained=pretrained),
+    decode_head=dict(
+        _delete_=True,  # replace the decode head inherited from _base_ instead of merging into it
+        type='FCNHead',
+        in_channels=3200,  # equals backbone embed_dim
+        channels=3200,
+        num_convs=0,  # presumably the "linear" head of the file name — TODO confirm
+        dropout_ratio=0.0,
+        concat_input=False,
+        num_classes=150,
+        with_norm=True,
+        loss_decode=dict(
+            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+    test_cfg=dict(mode='slide', crop_size=(504, 504), stride=(322, 322))  # crop_size equals backbone img_size
+)
+optimizer = dict(_delete_=True, type='AdamW', lr=4e-5, betas=(0.9, 0.999), weight_decay=0.05,
+                 constructor='CustomLayerDecayOptimizerConstructor',
+                 paramwise_cfg=dict(num_layers=48, layer_decay_rate=0.95))  # num_layers equals backbone depth
+lr_config = dict(_delete_=True, policy='poly',
+                 warmup='linear',
+                 warmup_iters=400,  # 2% of max_iters=20000 in schedule_20k.py
+                 warmup_ratio=1e-6,
+                 power=1.0, min_lr=0.0, by_epoch=False)
+# By default, models are trained on 8 GPUs with 2 images per GPU (total batch size 16, the "bs16" in the file name)
+data = dict(samples_per_gpu=2)
+runner = dict(type='IterBasedRunner')  # max_iters is not set here; schedule_20k.py in _base_ defines max_iters=20000
+if deepspeed:
+    checkpoint_config = dict(deepspeed=deepspeed, by_epoch=False, interval=1000, max_keep_ckpts=2)
+else:
+    checkpoint_config = dict(by_epoch=False, interval=1000, max_keep_ckpts=2)  # interval differs from the 2000 in schedule_20k.py
+evaluation = dict(interval=1000, metric='mIoU', save_best='auto')  # interval differs from the 2000 in schedule_20k.py
+custom_hooks = [
+    dict(
+        type='ToBFloat16Hook',
+        priority=49),
+]