tuandunghcmut committed
Commit 74c960e · verified
1 Parent(s): f40d9b1

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50)
  1. EAGLE/eagle/model/language_model/__init__.py +0 -0
  2. EAGLE/eagle/model/language_model/eagle_llama.py +173 -0
  3. EAGLE/eagle/model/multimodal_encoder/__init__.py +0 -0
  4. EAGLE/eagle/model/multimodal_encoder/clip_encoder.py +89 -0
  5. EAGLE/eagle/model/multimodal_encoder/convnext_encoder.py +141 -0
  6. EAGLE/eagle/model/multimodal_encoder/hr_clip_encoder.py +175 -0
  7. EAGLE/eagle/model/multimodal_encoder/pix2struct_encoder.py +146 -0
  8. EAGLE/eagle/model/multimodal_encoder/vision_models/__init__.py +0 -0
  9. EAGLE/eagle/model/multimodal_encoder/vision_models/convnext.py +1108 -0
  10. EAGLE/eagle/model/multimodal_encoder/vision_models/eva_vit.py +1235 -0
  11. EAGLE/eagle/model/multimodal_projector/__init__.py +0 -0
  12. EAGLE/eagle/model/multimodal_projector/builder.py +50 -0
  13. EAGLE/lmms_eval/api/__init__.py +0 -0
  14. EAGLE/lmms_eval/api/filter.py +53 -0
  15. EAGLE/lmms_eval/api/instance.py +29 -0
  16. EAGLE/lmms_eval/api/metrics.py +431 -0
  17. EAGLE/lmms_eval/api/model.py +203 -0
  18. EAGLE/lmms_eval/api/registry.py +139 -0
  19. EAGLE/lmms_eval/api/samplers.py +94 -0
  20. EAGLE/lmms_eval/api/task.py +1118 -0
  21. EAGLE/lmms_eval/filters/__init__.py +44 -0
  22. EAGLE/lmms_eval/filters/decontamination.py +23 -0
  23. EAGLE/lmms_eval/filters/extraction.py +60 -0
  24. EAGLE/lmms_eval/filters/selection.py +48 -0
  25. EAGLE/lmms_eval/filters/transformation.py +48 -0
  26. EAGLE/lmms_eval/models/__init__.py +16 -0
  27. EAGLE/lmms_eval/models/eagle.py +415 -0
  28. EAGLE/lmms_eval/models/gpt4v.py +129 -0
  29. EAGLE/lmms_eval/tasks/__init__.py +155 -0
  30. EAGLE/lmms_eval/tasks/_task_utils/file_utils.py +8 -0
  31. EAGLE/lmms_eval/tasks/_task_utils/gpt_eval_utils.py +0 -0
  32. EAGLE/lmms_eval/tasks/_task_utils/vqa_eval_metric.py +213 -0
  33. EAGLE/lmms_eval/tasks/cmmmu/_cmmmu.yaml +4 -0
  34. EAGLE/lmms_eval/tasks/cmmmu/_default_template_cmmmu_yaml +8 -0
  35. EAGLE/lmms_eval/tasks/cmmmu/cmmmu_test.yaml +12 -0
  36. EAGLE/lmms_eval/tasks/cmmmu/cmmmu_val.yaml +15 -0
  37. EAGLE/lmms_eval/tasks/cmmmu/utils.py +421 -0
  38. EAGLE/lmms_eval/tasks/gqa/gqa.yaml +32 -0
  39. EAGLE/lmms_eval/tasks/gqa/utils.py +23 -0
  40. EAGLE/lmms_eval/tasks/llava-in-the-wild/llava-in-the-wild.yaml +39 -0
  41. EAGLE/lmms_eval/tasks/llava-in-the-wild/rule.json +11 -0
  42. EAGLE/lmms_eval/tasks/llava-in-the-wild/utils.py +197 -0
  43. EAGLE/lmms_eval/tasks/mmbench/_default_template_mmbench_cn_yaml +22 -0
  44. EAGLE/lmms_eval/tasks/mmbench/_default_template_mmbench_en_yaml +25 -0
  45. EAGLE/lmms_eval/tasks/mmbench/cc_utils.py +109 -0
  46. EAGLE/lmms_eval/tasks/mmbench/cn_utils.py +127 -0
  47. EAGLE/lmms_eval/tasks/mmbench/en_utils.py +126 -0
  48. EAGLE/lmms_eval/tasks/mmbench/mmbench.yaml +11 -0
  49. EAGLE/lmms_eval/tasks/mmbench/mmbench_cc.yaml +34 -0
  50. EAGLE/lmms_eval/tasks/mmbench/mmbench_cn.yaml +9 -0
EAGLE/eagle/model/language_model/__init__.py ADDED
File without changes
EAGLE/eagle/model/language_model/eagle_llama.py ADDED
@@ -0,0 +1,173 @@
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # Copyright 2023 Haotian Liu
16
+ #
17
+ # Licensed under the Apache License, Version 2.0 (the "License");
18
+ # you may not use this file except in compliance with the License.
19
+ # You may obtain a copy of the License at
20
+ #
21
+ # http://www.apache.org/licenses/LICENSE-2.0
22
+ #
23
+ # Unless required by applicable law or agreed to in writing, software
24
+ # distributed under the License is distributed on an "AS IS" BASIS,
25
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
26
+ # See the License for the specific language governing permissions and
27
+ # limitations under the License.
28
+
29
+
30
+ from typing import List, Optional, Tuple, Union
31
+
32
+ import torch
33
+ import torch.nn as nn
34
+
35
+ from transformers import AutoConfig, AutoModelForCausalLM, \
36
+ LlamaConfig, LlamaModel, LlamaForCausalLM
37
+
38
+ from transformers.modeling_outputs import CausalLMOutputWithPast
39
+ from transformers.generation.utils import GenerateOutput
40
+
41
+ from ..eagle_arch import EagleMetaModel, EagleMetaForCausalLM
42
+
43
+
44
+ class EagleConfig(LlamaConfig):
45
+ model_type = "eagle_llama"
46
+
47
+
48
+ class EagleLlamaModel(EagleMetaModel, LlamaModel):
49
+ config_class = EagleConfig
50
+
51
+ def __init__(self, config: LlamaConfig):
52
+ super(EagleLlamaModel, self).__init__(config)
53
+
54
+
55
+ class EagleLlamaForCausalLM(LlamaForCausalLM, EagleMetaForCausalLM):
56
+ config_class = EagleConfig
57
+
58
+ def __init__(self, config):
59
+ super(LlamaForCausalLM, self).__init__(config)
60
+ self.model = EagleLlamaModel(config)
61
+ self.pretraining_tp = config.pretraining_tp
62
+ self.vocab_size = config.vocab_size
63
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
64
+
65
+ # Initialize weights and apply final processing
66
+ self.post_init()
67
+
68
+ def get_model(self):
69
+ return self.model
70
+
71
+ def forward(
72
+ self,
73
+ input_ids: torch.LongTensor = None,
74
+ attention_mask: Optional[torch.Tensor] = None,
75
+ position_ids: Optional[torch.LongTensor] = None,
76
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
77
+ inputs_embeds: Optional[torch.FloatTensor] = None,
78
+ labels: Optional[torch.LongTensor] = None,
79
+ use_cache: Optional[bool] = None,
80
+ output_attentions: Optional[bool] = None,
81
+ output_hidden_states: Optional[bool] = None,
82
+ images: Optional[torch.FloatTensor] = None,
83
+ image_sizes: Optional[List[List[int]]] = None,
84
+ return_dict: Optional[bool] = None,
85
+ **kwargs  # for llama3: newer transformers versions additionally pass a `cache_position` argument
86
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
87
+
88
+ if inputs_embeds is None:
89
+ (
90
+ input_ids,
91
+ position_ids,
92
+ attention_mask,
93
+ past_key_values,
94
+ inputs_embeds,
95
+ labels
96
+ ) = self.prepare_inputs_labels_for_multimodal(
97
+ input_ids,
98
+ position_ids,
99
+ attention_mask,
100
+ past_key_values,
101
+ labels,
102
+ images,
103
+ image_sizes
104
+ )
105
+
106
+ return super().forward(
107
+ input_ids=input_ids,
108
+ attention_mask=attention_mask,
109
+ position_ids=position_ids,
110
+ past_key_values=past_key_values,
111
+ inputs_embeds=inputs_embeds,
112
+ labels=labels,
113
+ use_cache=use_cache,
114
+ output_attentions=output_attentions,
115
+ output_hidden_states=output_hidden_states,
116
+ return_dict=return_dict
117
+ )
118
+
119
+ @torch.no_grad()
120
+ def generate(
121
+ self,
122
+ inputs: Optional[torch.Tensor] = None,
123
+ images: Optional[torch.Tensor] = None,
124
+ image_sizes: Optional[torch.Tensor] = None,
125
+ **kwargs,
126
+ ) -> Union[GenerateOutput, torch.LongTensor]:
127
+ position_ids = kwargs.pop("position_ids", None)
128
+ attention_mask = kwargs.pop("attention_mask", None)
129
+ if "inputs_embeds" in kwargs:
130
+ raise NotImplementedError("`inputs_embeds` is not supported")
131
+
132
+ if images is not None:
133
+ (
134
+ inputs,
135
+ position_ids,
136
+ attention_mask,
137
+ _,
138
+ inputs_embeds,
139
+ _
140
+ ) = self.prepare_inputs_labels_for_multimodal(
141
+ inputs,
142
+ position_ids,
143
+ attention_mask,
144
+ None,
145
+ None,
146
+ images,
147
+ image_sizes=image_sizes
148
+ )
149
+ else:
150
+ inputs_embeds = self.get_model().embed_tokens(inputs)
151
+
152
+ return super().generate(
153
+ position_ids=position_ids,
154
+ attention_mask=attention_mask,
155
+ inputs_embeds=inputs_embeds,
156
+ **kwargs
157
+ )
158
+
159
+ def prepare_inputs_for_generation(self, input_ids, past_key_values=None,
160
+ inputs_embeds=None, **kwargs):
161
+ images = kwargs.pop("images", None)
162
+ image_sizes = kwargs.pop("image_sizes", None)
163
+ inputs = super().prepare_inputs_for_generation(
164
+ input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
165
+ )
166
+ if images is not None:
167
+ inputs['images'] = images
168
+ if image_sizes is not None:
169
+ inputs['image_sizes'] = image_sizes
170
+ return inputs
171
+
172
+ AutoConfig.register("eagle_llama", EagleConfig)
173
+ AutoModelForCausalLM.register(EagleConfig, EagleLlamaForCausalLM)
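Since eagle_llama.py registers the `eagle_llama` architecture with `AutoConfig`/`AutoModelForCausalLM`, a checkpoint carrying this config can be loaded through the standard auto classes. A minimal usage sketch (the checkpoint path, image size, and dummy tensor below are placeholders, not part of this commit):

```python
# Hedged sketch: load a registered "eagle_llama" checkpoint via the auto classes.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Importing the module above runs the AutoConfig/AutoModelForCausalLM registration.
from eagle.model.language_model.eagle_llama import EagleLlamaForCausalLM  # noqa: F401

path = "path/to/eagle-llama-checkpoint"  # placeholder
tokenizer = AutoTokenizer.from_pretrained(path)
model = AutoModelForCausalLM.from_pretrained(path)

input_ids = tokenizer("Describe the image.", return_tensors="pt").input_ids
images = torch.randn(1, 3, 448, 448)  # placeholder pixel values

# EagleLlamaForCausalLM.generate folds `images` into inputs_embeds via
# prepare_inputs_labels_for_multimodal before delegating to the base generate.
output_ids = model.generate(input_ids, images=images, max_new_tokens=32)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```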
EAGLE/eagle/model/multimodal_encoder/__init__.py ADDED
File without changes
EAGLE/eagle/model/multimodal_encoder/clip_encoder.py ADDED
@@ -0,0 +1,89 @@
1
+ # This file is modified from https://github.com/haotian-liu/LLaVA/
2
+ import torch
3
+ import torch.nn as nn
4
+
5
+ from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig
6
+
7
+
8
+ class CLIPVisionTower(nn.Module):
9
+ def __init__(self, vision_tower, args, delay_load=False):
10
+ super().__init__()
11
+
12
+ self.is_loaded = False
13
+
14
+ self.vision_tower_name = vision_tower
15
+ self.select_layer = args.mm_vision_select_layer
16
+ self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
17
+
18
+ if not delay_load:
19
+ self.load_model()
20
+ elif getattr(args, 'unfreeze_mm_vision_tower', False):
21
+ self.load_model()
22
+ else:
23
+ self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name)
24
+
25
+ def load_model(self, device_map=None):
26
+ if self.is_loaded:
27
+ print('{} is already loaded, `load_model` called again, skipping.'.format(self.vision_tower_name))
28
+ return
29
+
30
+ self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name)
31
+ self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name, device_map=device_map)
32
+ self.vision_tower.requires_grad_(False)
33
+
34
+ self.is_loaded = True
35
+
36
+ def feature_select(self, image_forward_outs):
37
+ image_features = image_forward_outs.hidden_states[self.select_layer]
38
+ if self.select_feature == 'patch':
39
+ image_features = image_features[:, 1:]
40
+ elif self.select_feature == 'cls_patch':
41
+ image_features = image_features
42
+ else:
43
+ raise ValueError(f'Unexpected select feature: {self.select_feature}')
44
+ return image_features
45
+
46
+ @torch.no_grad()
47
+ def forward(self, images):
48
+ if type(images) is list:
49
+ image_features = []
50
+ for image in images:
51
+ image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True)
52
+ image_feature = self.feature_select(image_forward_out).to(image.dtype)
53
+ image_features.append(image_feature)
54
+ else:
55
+ image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
56
+ image_features = self.feature_select(image_forward_outs).to(images.dtype)
57
+
58
+ return image_features
59
+
60
+ @property
61
+ def dummy_feature(self):
62
+ return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
63
+
64
+ @property
65
+ def dtype(self):
66
+ return self.vision_tower.dtype
67
+
68
+ @property
69
+ def device(self):
70
+ return self.vision_tower.device
71
+
72
+ @property
73
+ def config(self):
74
+ if self.is_loaded:
75
+ return self.vision_tower.config
76
+ else:
77
+ return self.cfg_only
78
+
79
+ @property
80
+ def hidden_size(self):
81
+ return self.config.hidden_size
82
+
83
+ @property
84
+ def num_patches_per_side(self):
85
+ return self.config.image_size // self.config.patch_size
86
+
87
+ @property
88
+ def num_patches(self):
89
+ return (self.config.image_size // self.config.patch_size) ** 2
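The tower is constructed from a Hugging Face CLIP checkpoint name plus an args object supplying `mm_vision_select_layer` and, optionally, `mm_vision_select_feature`. A rough instantiation sketch (the checkpoint name and layer index are illustrative, not taken from this commit):

```python
# Sketch only: instantiate the CLIPVisionTower defined above and run a batch through it.
from types import SimpleNamespace

import torch

from eagle.model.multimodal_encoder.clip_encoder import CLIPVisionTower

args = SimpleNamespace(mm_vision_select_layer=-2, mm_vision_select_feature="patch")
tower = CLIPVisionTower("openai/clip-vit-large-patch14-336", args)  # loads immediately

pixels = torch.randn(2, 3, 336, 336).to(device=tower.device, dtype=tower.dtype)
features = tower(pixels)  # hidden_states[-2] with the CLS token dropped
print(features.shape, tower.num_patches)  # (2, 576, 1024); 576 patches for a 336px/14 model
```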
EAGLE/eagle/model/multimodal_encoder/convnext_encoder.py ADDED
@@ -0,0 +1,141 @@
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+ # This file is modified from https://github.com/luogen1996/LLaVA-HR
17
+
18
+ import torch
19
+ import torch.nn as nn
20
+ from transformers import CLIPImageProcessor
21
+ from .vision_models.convnext import convnext_xxlarge
22
+ from torch.utils.checkpoint import checkpoint
23
+
24
+ cfg={
25
+ "crop_size": 256,
26
+ "do_center_crop": True,
27
+ "do_normalize": True,
28
+ "do_resize": True,
29
+ "feature_extractor_type": "CLIPFeatureExtractor",
30
+ "image_mean": [
31
+ 0.48145466,
32
+ 0.4578275,
33
+ 0.40821073
34
+ ],
35
+ "image_std": [
36
+ 0.26862954,
37
+ 0.26130258,
38
+ 0.27577711
39
+ ],
40
+ "resample": 3,
41
+ "size": 256
42
+ }
43
+
44
+ class ConvNextVisionTower(nn.Module):
45
+ def __init__(self, vision_tower, args, delay_load=False):
46
+ super().__init__()
47
+
48
+ self.is_loaded = False
49
+ self.freeze_vision=args.freeze_vision
50
+ self.input_image_size=args.input_image_size
51
+ self.vision_tower_name = vision_tower
52
+ self.select_layer = -1 # hardcode
53
+ self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
54
+
55
+ self.load_model()
56
+
57
+ def load_model(self):
58
+ self.image_processor = CLIPImageProcessor(**cfg)
59
+ if 'xxlarge' in self.vision_tower_name:
60
+ self.vision_tower = convnext_xxlarge(self.vision_tower_name)
61
+ setattr(self.vision_tower, 'hidden_size', 3072)
62
+ else:
63
+ raise NotImplementedError
64
+
65
+ if self.freeze_vision:
66
+ self.vision_tower.requires_grad_(False)
67
+
68
+ # Hardcode
69
+ for s in self.vision_tower.stages:
70
+ s.grad_checkpointing = True
71
+
72
+ if self.input_image_size is not None:
73
+ self.image_processor.size=self.input_image_size
74
+ self.image_processor.crop_size={
75
+ 'height':self.input_image_size,
76
+ 'width': self.input_image_size
77
+ }
78
+
79
+ self.is_loaded = True
80
+
81
+ def feature_select(self, image_forward_outs):
82
+ image_features = image_forward_outs[self.select_layer]
83
+ return image_features
84
+
85
+ def forward_features(self, x):
86
+ x = self.vision_tower.stem(x)
87
+ image_forward_out=[]
88
+ for blk in self.vision_tower.stages:
89
+ x = blk(x)
90
+ b,c,h,w=x.shape
91
+ image_forward_out.append(x.view(b,c,-1).transpose(1,2))
92
+ return image_forward_out
93
+
94
+ def forward(self, images):
95
+ if self.freeze_vision:
96
+ with torch.no_grad():
97
+ image_features = self._forward_images(images)
98
+ else:
99
+ image_features = self._forward_images(images)
100
+
101
+ return image_features
102
+
103
+ def _forward_images(self, images):
104
+
105
+ image_forward_outs = self.forward_features(images.to(device=self.device, dtype=self.dtype))
106
+ image_features = self.feature_select(image_forward_outs)
107
+
108
+ return image_features
109
+
110
+ @property
111
+ def dummy_feature(self):
112
+ return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
113
+
114
+ @property
115
+ def dtype(self):
116
+ return next(self.vision_tower.parameters()).dtype
117
+
118
+ @property
119
+ def device(self):
120
+ return next(self.vision_tower.parameters()).device
121
+
122
+ @property
123
+ def config(self):
124
+ raise NotImplementedError
125
+ pass
126
+
127
+ @property
128
+ def num_attention_heads(self):
129
+ # as constant
130
+ return 16
131
+ @property
132
+ def num_layers(self):
133
+ # as constant
134
+ return 4
135
+ @property
136
+ def hidden_size(self):
137
+ return self.vision_tower.hidden_size
138
+
139
+ @property
140
+ def num_patches(self):
141
+ return (cfg['image_size'] // self.patch_embed.patch_size[0]) ** 2
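ConvNextVisionTower flattens each stage's feature map to `(B, H*W, C)` and, with the hard-coded `select_layer = -1`, returns only the final stage (stride 32 after the stride-4 stem and three stride-2 stages). A shape-oriented sketch, assuming a 1024-pixel input and an illustrative checkpoint name containing `xxlarge` so the branch above is taken:

```python
# Sketch (assumed checkpoint name and input size): ConvNeXt tower output shape.
from types import SimpleNamespace

import torch

from eagle.model.multimodal_encoder.convnext_encoder import ConvNextVisionTower

args = SimpleNamespace(freeze_vision=True, input_image_size=1024,
                       mm_vision_select_feature="patch")
tower = ConvNextVisionTower("convnext_xxlarge.clip_laion2b_soup", args)

x = torch.randn(1, 3, 1024, 1024).to(device=tower.device, dtype=tower.dtype)
features = tower(x)  # last stage at stride 32 -> (1, 32 * 32, 3072)
print(features.shape)
```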
EAGLE/eagle/model/multimodal_encoder/hr_clip_encoder.py ADDED
@@ -0,0 +1,175 @@
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+ # Mostly copy-paste from LLaVA-HR
17
+ # https://github.com/luogen1996/LLaVA-HR
18
+
19
+ import torch
20
+ import torch.nn as nn
21
+ from torch.utils.checkpoint import checkpoint
22
+
23
+ from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig
24
+
25
+ import math
26
+ import torch
27
+ import torch.nn.functional as F
28
+ from typing import List, Optional
29
+
30
+
31
+ def forward_embeddings(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
32
+ batch_size = pixel_values.shape[0]
33
+ target_dtype = self.patch_embedding.weight.dtype
34
+ patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid]
35
+ patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
36
+
37
+ class_embeds = self.class_embedding.expand(batch_size, 1, -1)
38
+ embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
39
+ position_embeddings = self.position_embedding(self.position_ids)
40
+
41
+ if position_embeddings.shape[1]!=embeddings.shape[1]:
42
+ position_embeddings=resample_pos_embed(position_embeddings,embeddings.shape[1])
43
+
44
+ embeddings = embeddings + position_embeddings
45
+ return embeddings
46
+
47
+
48
+ def resample_pos_embed(
49
+ posemb,
50
+ new_size: int,
51
+ num_prefix_tokens: int = 1,
52
+ interpolation: str = 'bicubic',
53
+ antialias: bool = True,
54
+ verbose: bool = False,
55
+ ):
56
+ new_size=[int(math.sqrt(new_size-num_prefix_tokens)),int(math.sqrt(new_size-num_prefix_tokens))]
57
+ num_pos_tokens = posemb.shape[1] - num_prefix_tokens
58
+ old_size = int(math.sqrt(num_pos_tokens))
59
+ bs=posemb.shape[0]
60
+
61
+ if num_prefix_tokens:
62
+ posemb_prefix, posemb = posemb[:,:num_prefix_tokens], posemb[:,num_prefix_tokens:]
63
+ else:
64
+ posemb_prefix, posemb = None, posemb
65
+
66
+ # do the interpolation
67
+ embed_dim = posemb.shape[-1]
68
+ orig_dtype = posemb.dtype
69
+ posemb = posemb.float() # interpolate needs float32
70
+ posemb = posemb.reshape(bs, old_size, old_size, -1).permute(0, 3, 1, 2)
71
+ posemb = F.interpolate(posemb, size=new_size, mode=interpolation, antialias=antialias)
72
+ posemb = posemb.permute(0, 2, 3, 1).reshape(bs, -1, embed_dim)
73
+ posemb = posemb.to(dtype=orig_dtype)
74
+
75
+ # add back extra (class, etc) prefix tokens
76
+ if posemb_prefix is not None:
77
+ posemb = torch.cat([posemb_prefix, posemb],1)
78
+
79
+ if not torch.jit.is_scripting() and verbose:
80
+ print(f'Resized position embedding: {old_size} to {new_size}.')
81
+
82
+ return posemb
83
+
84
+ class HRCLIPVisionTower(nn.Module):
85
+ def __init__(self, vision_tower, args, delay_load=False):
86
+ super().__init__()
87
+
88
+ self.is_loaded = False
89
+ self.freeze_vision=args.freeze_vision
90
+ self.input_image_size=args.input_image_size
91
+ self.vision_tower_name = vision_tower
92
+ self.select_layer = args.mm_vision_select_layer
93
+ self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
94
+
95
+ if not delay_load:
96
+ self.load_model()
97
+ else:
98
+ self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name)
99
+
100
+
101
+ def load_model(self):
102
+ self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name)
103
+ self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name)
104
+ # checkpointing for clip
105
+ self.vision_tower.vision_model.encoder.gradient_checkpointing =True
106
+
107
+ if self.freeze_vision:
108
+ self.vision_tower.requires_grad_(False)
109
+
110
+ cls_=self.vision_tower.vision_model.embeddings
111
+ bound_method = forward_embeddings.__get__(cls_, cls_.__class__)
112
+ setattr(cls_, 'forward', bound_method)
113
+
114
+ if self.input_image_size is not None:
115
+ self.image_processor.size=self.input_image_size
116
+ self.image_processor.crop_size={
117
+ 'height':self.input_image_size,
118
+ 'width': self.input_image_size
119
+ }
120
+
121
+ self.is_loaded = True
122
+
123
+ def forward(self, x):
124
+ # 448 image input
125
+ blks = self.vision_tower.vision_model.encoder.layers
126
+ x = self.vision_tower.vision_model.embeddings(x)
127
+ x = self.vision_tower.vision_model.pre_layrnorm(x[:, 1:])
128
+
129
+ # inference of fast branch
130
+ for blk in blks:
131
+ if self.training:
132
+ x=checkpoint(
133
+ blk.__call__,
134
+ x,
135
+ None,
136
+ None
137
+ )[0]
138
+ else:
139
+ x = blk(x, None, None)[0]
140
+
141
+ return x
142
+
143
+ @property
144
+ def dummy_feature(self):
145
+ return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
146
+
147
+ @property
148
+ def dtype(self):
149
+ return self.vision_tower.dtype
150
+
151
+ @property
152
+ def device(self):
153
+ return self.vision_tower.device
154
+
155
+
156
+ @property
157
+ def num_attention_heads(self):
158
+ return self.config.num_attention_heads
159
+ @property
160
+ def num_layers(self):
161
+ return self.config.num_hidden_layers
162
+ @property
163
+ def config(self):
164
+ if self.is_loaded:
165
+ return self.vision_tower.config
166
+ else:
167
+ return self.cfg_only
168
+
169
+ @property
170
+ def hidden_size(self):
171
+ return self.config.hidden_size
172
+
173
+ @property
174
+ def num_patches(self):
175
+ return (self.config.image_size // self.config.patch_size) ** 2
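The helper `resample_pos_embed` bicubically interpolates the CLIP position embeddings whenever the token count implied by the input resolution differs from the pretrained grid (e.g. feeding 448-pixel crops to a 336-pixel CLIP). A small stand-alone sketch of that resampling (grid sizes chosen for illustration):

```python
# Sketch: resample CLIP position embeddings from a 24x24 grid (336px / patch 14)
# to a 32x32 grid (448px / patch 14), as forward_embeddings does above.
import torch

from eagle.model.multimodal_encoder.hr_clip_encoder import resample_pos_embed

posemb = torch.randn(1, 1 + 24 * 24, 1024)      # [CLS] + 576 patch positions
resampled = resample_pos_embed(posemb, new_size=1 + 32 * 32)
print(resampled.shape)                           # torch.Size([1, 1025, 1024])
```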
EAGLE/eagle/model/multimodal_encoder/pix2struct_encoder.py ADDED
@@ -0,0 +1,146 @@
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ #
15
+ # SPDX-License-Identifier: Apache-2.0
16
+
17
+ import re
18
+ from PIL import Image
19
+ import torch
20
+ import torch.nn as nn
21
+ from transformers import AutoModel, CLIPImageProcessor
22
+ from PIL import Image
23
+ import requests
24
+ import torch.nn.functional as F
25
+ from transformers import AutoProcessor, Pix2StructVisionModel, Pix2StructProcessor, Pix2StructForConditionalGeneration
26
+
27
+ cfg={
28
+ "crop_size": 256,
29
+ "do_center_crop": True,
30
+ "do_normalize": True,
31
+ "do_resize": True,
32
+ "feature_extractor_type": "CLIPFeatureExtractor",
33
+ "image_mean": [
34
+ 0.48145466,
35
+ 0.4578275,
36
+ 0.40821073
37
+ ],
38
+ "image_std": [
39
+ 0.26862954,
40
+ 0.26130258,
41
+ 0.27577711
42
+ ],
43
+ "resample": 3,
44
+ "size": 256
45
+ }
46
+
47
+ '''
48
+ Pixel2Struct-Large Model (pretrained version)
49
+ '''
50
+ class Pix2StructLargeVisionTower(nn.Module):
51
+ def __init__(self, vision_tower, args, delay_load=False):
52
+ super().__init__()
53
+
54
+ self.is_loaded = False
55
+ self.vision_tower_name = vision_tower
56
+ self.do_resize = args.do_resize
57
+ self.de_normalize = args.de_normalize # de-normalize the input image and perform preprocessing with pix2struct processor
58
+ self.select_layer = args.mm_vision_select_layer # NOTE: not implemented yet, this parameter has no effect
59
+ self.input_image_size = args.input_image_size
60
+ self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
61
+ self.freeze_vision = args.freeze_vision
62
+
63
+ self.args = args
64
+ if not self.is_loaded:
65
+ self.load_model()
66
+
67
+ def load_model(self):
68
+ if self.is_loaded:
69
+ return
70
+ whole_model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-large")
71
+ self.vision_tower = whole_model.encoder
72
+ self.pix2struct_processor = AutoProcessor.from_pretrained("google/pix2struct-large")
73
+ self.pix2struct_processor.image_processor.is_vqa = False
74
+
75
+ self.image_processor = CLIPImageProcessor(**cfg)
76
+ if self.input_image_size is not None:
77
+ self.image_processor.size=self.input_image_size
78
+ self.image_processor.crop_size={
79
+ 'height':self.input_image_size,
80
+ 'width': self.input_image_size
81
+ }
82
+
83
+ if self.freeze_vision:
84
+ self.vision_tower.requires_grad_(False)
85
+
86
+ self.image_mean = torch.tensor(self.image_processor.image_mean).view(1, 3, 1, 1)
87
+ self.image_std = torch.tensor(self.image_processor.image_std).view(1, 3, 1, 1)
88
+
89
+ self.is_loaded = True
90
+
91
+ def feature_select(self, image_forward_outs):
92
+ image_features = image_forward_outs.hidden_states[self.select_layer] # [bs, n, c], cls at idx=0
93
+ if self.select_feature == 'patch':
94
+ image_features = image_features[:, 1:]
95
+ elif self.select_feature == 'cls_patch':
96
+ image_features = image_features
97
+ else:
98
+ raise ValueError(f'Unexpected select feature: {self.select_feature}')
99
+ return image_features
100
+
101
+ # @torch.no_grad()
102
+ def forward(self, images):
103
+
104
+ if self.de_normalize:
105
+ mean = self.image_mean.clone().view(1, 3, 1, 1).to(dtype=images.dtype, device=images.device)
106
+ std = self.image_std.clone().view(1, 3, 1, 1).to(dtype=images.dtype, device=images.device)
107
+ x = (images * std + mean) * 255.0
108
+ x = self.pix2struct_processor(images=x.float(), return_tensors="pt")
109
+
110
+ image_features = self.vision_tower(**(x.to(device=self.device, dtype=self.dtype))).last_hidden_state
111
+ bs, n, c = image_features.shape
112
+ image_features = image_features[:, :2025, :] # HARD CODE
113
+
114
+ if self.do_resize:
115
+ image_features = image_features.transpose(1,2).reshape(bs, c, 45, 45) # HARD CODE
116
+ image_features = F.interpolate(image_features.float(), size=(32, 32), mode='bilinear', align_corners=True).to(dtype=image_features.dtype) # HARD CODE
117
+ return image_features
118
+ else:
119
+ return image_features
120
+
121
+
122
+ @property
123
+ def dummy_feature(self):
124
+ return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
125
+
126
+ @property
127
+ def dtype(self):
128
+ return next(self.vision_tower.parameters()).dtype
129
+
130
+ @property
131
+ def device(self):
132
+ return next(self.vision_tower.parameters()).device
133
+
134
+ @property
135
+ def config(self):
136
+ return self.vision_tower.config
137
+
138
+ @property
139
+ def hidden_size(self):
140
+ # Hard code
141
+ hidden_dim = 1536
142
+ return hidden_dim
143
+
144
+ @property
145
+ def num_patches(self):
146
+ return self.config['num_patches']
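When `de_normalize` is set, the tower first undoes the CLIP normalization so the raw 0-255 pixels can be re-preprocessed by the Pix2Struct processor; the inverse transform is simply `x * std + mean` scaled by 255. A minimal sketch of that step (mean/std values taken from the cfg dict above; the input tensor is a stand-in):

```python
# Sketch of the de-normalization used in Pix2StructLargeVisionTower.forward.
import torch

image_mean = torch.tensor([0.48145466, 0.4578275, 0.40821073]).view(1, 3, 1, 1)
image_std = torch.tensor([0.26862954, 0.26130258, 0.27577711]).view(1, 3, 1, 1)

normalized = torch.rand(1, 3, 256, 256)                      # CLIP-normalized stand-in
pixels_255 = (normalized * image_std + image_mean) * 255.0   # back to the raw pixel range
```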
EAGLE/eagle/model/multimodal_encoder/vision_models/__init__.py ADDED
File without changes
EAGLE/eagle/model/multimodal_encoder/vision_models/convnext.py ADDED
@@ -0,0 +1,1108 @@
1
+ """ ConvNeXt
2
+
3
+ Papers:
4
+ * `A ConvNet for the 2020s` - https://arxiv.org/pdf/2201.03545.pdf
5
+ @Article{liu2022convnet,
6
+ author = {Zhuang Liu and Hanzi Mao and Chao-Yuan Wu and Christoph Feichtenhofer and Trevor Darrell and Saining Xie},
7
+ title = {A ConvNet for the 2020s},
8
+ journal = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
9
+ year = {2022},
10
+ }
11
+
12
+ * `ConvNeXt-V2 - Co-designing and Scaling ConvNets with Masked Autoencoders` - https://arxiv.org/abs/2301.00808
13
+ @article{Woo2023ConvNeXtV2,
14
+ title={ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders},
15
+ author={Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon and Saining Xie},
16
+ year={2023},
17
+ journal={arXiv preprint arXiv:2301.00808},
18
+ }
19
+
20
+ Original code and weights from:
21
+ * https://github.com/facebookresearch/ConvNeXt, original copyright below
22
+ * https://github.com/facebookresearch/ConvNeXt-V2, original copyright below
23
+
24
+ Model defs atto, femto, pico, nano and _ols / _hnf variants are timm originals.
25
+
26
+ Modifications and additions for timm hacked together by / Copyright 2022, Ross Wightman
27
+ """
28
+ # ConvNeXt
29
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
30
+ # All rights reserved.
31
+ # This source code is licensed under the MIT license
32
+
33
+ # ConvNeXt-V2
34
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
35
+ # All rights reserved.
36
+ # This source code is licensed under the license found in the
37
+ # LICENSE file in the root directory of this source tree (Attribution-NonCommercial 4.0 International (CC BY-NC 4.0))
38
+ # No code was used directly from ConvNeXt-V2, however the weights are CC BY-NC 4.0 so beware if using commercially.
39
+
40
+ from collections import OrderedDict
41
+ from functools import partial
42
+ from typing import Callable, Optional, Tuple, Union
43
+
44
+ import torch
45
+ import torch.nn as nn
46
+
47
+ from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
48
+ from timm.layers import trunc_normal_, AvgPool2dSame, DropPath, Mlp, GlobalResponseNormMlp, \
49
+ LayerNorm2d, LayerNorm, create_conv2d, get_act_layer, make_divisible, to_ntuple
50
+ from timm.layers import NormMlpClassifierHead, ClassifierHead
51
+ from timm.models._builder import build_model_with_cfg
52
+ from timm.models._manipulate import named_apply, checkpoint_seq
53
+ from timm.models._registry import generate_default_cfgs, register_model, register_model_deprecations
54
+
55
+ __all__ = ['ConvNeXt'] # model_registry will add each entrypoint fn to this
56
+
57
+
58
+ class Downsample(nn.Module):
59
+
60
+ def __init__(self, in_chs, out_chs, stride=1, dilation=1):
61
+ super().__init__()
62
+ avg_stride = stride if dilation == 1 else 1
63
+ if stride > 1 or dilation > 1:
64
+ avg_pool_fn = AvgPool2dSame if avg_stride == 1 and dilation > 1 else nn.AvgPool2d
65
+ self.pool = avg_pool_fn(2, avg_stride, ceil_mode=True, count_include_pad=False)
66
+ else:
67
+ self.pool = nn.Identity()
68
+
69
+ if in_chs != out_chs:
70
+ self.conv = create_conv2d(in_chs, out_chs, 1, stride=1)
71
+ else:
72
+ self.conv = nn.Identity()
73
+
74
+ def forward(self, x):
75
+ x = self.pool(x)
76
+ x = self.conv(x)
77
+ return x
78
+
79
+
80
+ class ConvNeXtBlock(nn.Module):
81
+ """ ConvNeXt Block
82
+ There are two equivalent implementations:
83
+ (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)
84
+ (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
85
+
86
+ Unlike the official impl, this one allows choice of 1 or 2, 1x1 conv can be faster with appropriate
87
+ choice of LayerNorm impl, however as model size increases the tradeoffs appear to change and nn.Linear
88
+ is a better choice. This was observed with PyTorch 1.10 on 3090 GPU, it could change over time & w/ different HW.
89
+ """
90
+
91
+ def __init__(
92
+ self,
93
+ in_chs: int,
94
+ out_chs: Optional[int] = None,
95
+ kernel_size: int = 7,
96
+ stride: int = 1,
97
+ dilation: Union[int, Tuple[int, int]] = (1, 1),
98
+ mlp_ratio: float = 4,
99
+ conv_mlp: bool = False,
100
+ conv_bias: bool = True,
101
+ use_grn: bool = False,
102
+ ls_init_value: Optional[float] = 1e-6,
103
+ act_layer: Union[str, Callable] = 'gelu',
104
+ norm_layer: Optional[Callable] = None,
105
+ drop_path: float = 0.,
106
+ ):
107
+ """
108
+
109
+ Args:
110
+ in_chs: Block input channels.
111
+ out_chs: Block output channels (same as in_chs if None).
112
+ kernel_size: Depthwise convolution kernel size.
113
+ stride: Stride of depthwise convolution.
114
+ dilation: Tuple specifying input and output dilation of block.
115
+ mlp_ratio: MLP expansion ratio.
116
+ conv_mlp: Use 1x1 convolutions for MLP and a NCHW compatible norm layer if True.
117
+ conv_bias: Apply bias for all convolution (linear) layers.
118
+ use_grn: Use GlobalResponseNorm in MLP (from ConvNeXt-V2)
119
+ ls_init_value: Layer-scale init values, layer-scale applied if not None.
120
+ act_layer: Activation layer.
121
+ norm_layer: Normalization layer (defaults to LN if not specified).
122
+ drop_path: Stochastic depth probability.
123
+ """
124
+ super().__init__()
125
+ out_chs = out_chs or in_chs
126
+ dilation = to_ntuple(2)(dilation)
127
+ act_layer = get_act_layer(act_layer)
128
+ if not norm_layer:
129
+ norm_layer = LayerNorm2d if conv_mlp else LayerNorm
130
+ mlp_layer = partial(GlobalResponseNormMlp if use_grn else Mlp, use_conv=conv_mlp)
131
+ self.use_conv_mlp = conv_mlp
132
+ self.conv_dw = create_conv2d(
133
+ in_chs,
134
+ out_chs,
135
+ kernel_size=kernel_size,
136
+ stride=stride,
137
+ dilation=dilation[0],
138
+ depthwise=True,
139
+ bias=conv_bias,
140
+ )
141
+ self.norm = norm_layer(out_chs)
142
+ self.mlp = mlp_layer(out_chs, int(mlp_ratio * out_chs), act_layer=act_layer)
143
+ self.weight = nn.Parameter(ls_init_value * torch.ones(out_chs)) if ls_init_value is not None else None
144
+ if in_chs != out_chs or stride != 1 or dilation[0] != dilation[1]:
145
+ self.shortcut = Downsample(in_chs, out_chs, stride=stride, dilation=dilation[0])
146
+ else:
147
+ self.shortcut = nn.Identity()
148
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
149
+
150
+ def forward(self, x):
151
+ shortcut = x
152
+ x = self.conv_dw(x)
153
+ if self.use_conv_mlp:
154
+ x = self.norm(x)
155
+ x = self.mlp(x)
156
+ else:
157
+ x = x.permute(0, 2, 3, 1)
158
+ x = self.norm(x)
159
+ x = self.mlp(x)
160
+ x = x.permute(0, 3, 1, 2)
161
+ if self.weight is not None:
162
+ x = x.mul(self.weight.reshape(1, -1, 1, 1))
163
+
164
+ x = self.drop_path(x) + self.shortcut(shortcut)
165
+ return x
166
+
167
+
168
+ class ConvNeXtStage(nn.Module):
169
+
170
+ def __init__(
171
+ self,
172
+ in_chs,
173
+ out_chs,
174
+ kernel_size=7,
175
+ stride=2,
176
+ depth=2,
177
+ dilation=(1, 1),
178
+ drop_path_rates=None,
179
+ ls_init_value=1.0,
180
+ conv_mlp=False,
181
+ conv_bias=True,
182
+ use_grn=False,
183
+ act_layer='gelu',
184
+ norm_layer=None,
185
+ norm_layer_cl=None
186
+ ):
187
+ super().__init__()
188
+ self.grad_checkpointing = False
189
+
190
+ if in_chs != out_chs or stride > 1 or dilation[0] != dilation[1]:
191
+ ds_ks = 2 if stride > 1 or dilation[0] != dilation[1] else 1
192
+ pad = 'same' if dilation[1] > 1 else 0 # same padding needed if dilation used
193
+ self.downsample = nn.Sequential(
194
+ norm_layer(in_chs),
195
+ create_conv2d(
196
+ in_chs,
197
+ out_chs,
198
+ kernel_size=ds_ks,
199
+ stride=stride,
200
+ dilation=dilation[0],
201
+ padding=pad,
202
+ bias=conv_bias,
203
+ ),
204
+ )
205
+ in_chs = out_chs
206
+ else:
207
+ self.downsample = nn.Identity()
208
+
209
+ drop_path_rates = drop_path_rates or [0.] * depth
210
+ stage_blocks = []
211
+ for i in range(depth):
212
+ stage_blocks.append(ConvNeXtBlock(
213
+ in_chs=in_chs,
214
+ out_chs=out_chs,
215
+ kernel_size=kernel_size,
216
+ dilation=dilation[1],
217
+ drop_path=drop_path_rates[i],
218
+ ls_init_value=ls_init_value,
219
+ conv_mlp=conv_mlp,
220
+ conv_bias=conv_bias,
221
+ use_grn=use_grn,
222
+ act_layer=act_layer,
223
+ norm_layer=norm_layer if conv_mlp else norm_layer_cl,
224
+ ))
225
+ in_chs = out_chs
226
+ self.blocks = nn.Sequential(*stage_blocks)
227
+
228
+ def forward(self, x):
229
+ x = self.downsample(x)
230
+ if self.grad_checkpointing and not torch.jit.is_scripting():
231
+ x = checkpoint_seq(self.blocks, x)
232
+ else:
233
+ x = self.blocks(x)
234
+ return x
235
+
236
+
237
+ class ConvNeXt(nn.Module):
238
+ r""" ConvNeXt
239
+ A PyTorch impl of : `A ConvNet for the 2020s` - https://arxiv.org/pdf/2201.03545.pdf
240
+ """
241
+
242
+ def __init__(
243
+ self,
244
+ in_chans: int = 3,
245
+ num_classes: int = 1000,
246
+ global_pool: str = 'avg',
247
+ output_stride: int = 32,
248
+ depths: Tuple[int, ...] = (3, 3, 9, 3),
249
+ dims: Tuple[int, ...] = (96, 192, 384, 768),
250
+ kernel_sizes: Union[int, Tuple[int, ...]] = 7,
251
+ ls_init_value: Optional[float] = 1e-6,
252
+ stem_type: str = 'patch',
253
+ patch_size: int = 4,
254
+ head_init_scale: float = 1.,
255
+ head_norm_first: bool = False,
256
+ head_hidden_size: Optional[int] = None,
257
+ conv_mlp: bool = False,
258
+ conv_bias: bool = True,
259
+ use_grn: bool = False,
260
+ act_layer: Union[str, Callable] = 'gelu',
261
+ norm_layer: Optional[Union[str, Callable]] = None,
262
+ norm_eps: Optional[float] = None,
263
+ drop_rate: float = 0.,
264
+ drop_path_rate: float = 0.,
265
+ ):
266
+ """
267
+ Args:
268
+ in_chans: Number of input image channels.
269
+ num_classes: Number of classes for classification head.
270
+ global_pool: Global pooling type.
271
+ output_stride: Output stride of network, one of (8, 16, 32).
272
+ depths: Number of blocks at each stage.
273
+ dims: Feature dimension at each stage.
274
+ kernel_sizes: Depthwise convolution kernel-sizes for each stage.
275
+ ls_init_value: Init value for Layer Scale, disabled if None.
276
+ stem_type: Type of stem.
277
+ patch_size: Stem patch size for patch stem.
278
+ head_init_scale: Init scaling value for classifier weights and biases.
279
+ head_norm_first: Apply normalization before global pool + head.
280
+ head_hidden_size: Size of MLP hidden layer in head if not None and head_norm_first == False.
281
+ conv_mlp: Use 1x1 conv in MLP, improves speed for small networks w/ chan last.
282
+ conv_bias: Use bias layers w/ all convolutions.
283
+ use_grn: Use Global Response Norm (ConvNeXt-V2) in MLP.
284
+ act_layer: Activation layer type.
285
+ norm_layer: Normalization layer type.
286
+ drop_rate: Head pre-classifier dropout rate.
287
+ drop_path_rate: Stochastic depth drop rate.
288
+ """
289
+ super().__init__()
290
+ assert output_stride in (8, 16, 32)
291
+ kernel_sizes = to_ntuple(4)(kernel_sizes)
292
+ if norm_layer is None:
293
+ norm_layer = LayerNorm2d
294
+ norm_layer_cl = norm_layer if conv_mlp else LayerNorm
295
+ if norm_eps is not None:
296
+ norm_layer = partial(norm_layer, eps=norm_eps)
297
+ norm_layer_cl = partial(norm_layer_cl, eps=norm_eps)
298
+ else:
299
+ assert conv_mlp,\
300
+ 'If a norm_layer is specified, conv MLP must be used so all norm expect rank-4, channels-first input'
301
+ norm_layer_cl = norm_layer
302
+ if norm_eps is not None:
303
+ norm_layer_cl = partial(norm_layer_cl, eps=norm_eps)
304
+
305
+ self.num_classes = num_classes
306
+ self.drop_rate = drop_rate
307
+ self.feature_info = []
308
+
309
+ assert stem_type in ('patch', 'overlap', 'overlap_tiered')
310
+ if stem_type == 'patch':
311
+ # NOTE: this stem is a minimal form of ViT PatchEmbed, as used in SwinTransformer w/ patch_size = 4
312
+ self.stem = nn.Sequential(
313
+ nn.Conv2d(in_chans, dims[0], kernel_size=patch_size, stride=patch_size, bias=conv_bias),
314
+ norm_layer(dims[0]),
315
+ )
316
+ stem_stride = patch_size
317
+ else:
318
+ mid_chs = make_divisible(dims[0] // 2) if 'tiered' in stem_type else dims[0]
319
+ self.stem = nn.Sequential(
320
+ nn.Conv2d(in_chans, mid_chs, kernel_size=3, stride=2, padding=1, bias=conv_bias),
321
+ nn.Conv2d(mid_chs, dims[0], kernel_size=3, stride=2, padding=1, bias=conv_bias),
322
+ norm_layer(dims[0]),
323
+ )
324
+ stem_stride = 4
325
+
326
+ self.stages = nn.Sequential()
327
+ dp_rates = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)]
328
+ stages = []
329
+ prev_chs = dims[0]
330
+ curr_stride = stem_stride
331
+ dilation = 1
332
+ # 4 feature resolution stages, each consisting of multiple residual blocks
333
+ for i in range(4):
334
+ stride = 2 if curr_stride == 2 or i > 0 else 1
335
+ if curr_stride >= output_stride and stride > 1:
336
+ dilation *= stride
337
+ stride = 1
338
+ curr_stride *= stride
339
+ first_dilation = 1 if dilation in (1, 2) else 2
340
+ out_chs = dims[i]
341
+ stages.append(ConvNeXtStage(
342
+ prev_chs,
343
+ out_chs,
344
+ kernel_size=kernel_sizes[i],
345
+ stride=stride,
346
+ dilation=(first_dilation, dilation),
347
+ depth=depths[i],
348
+ drop_path_rates=dp_rates[i],
349
+ ls_init_value=ls_init_value,
350
+ conv_mlp=conv_mlp,
351
+ conv_bias=conv_bias,
352
+ use_grn=use_grn,
353
+ act_layer=act_layer,
354
+ norm_layer=norm_layer,
355
+ norm_layer_cl=norm_layer_cl,
356
+ ))
357
+ prev_chs = out_chs
358
+ # NOTE feature_info use currently assumes stage 0 == stride 1, rest are stride 2
359
+ self.feature_info += [dict(num_chs=prev_chs, reduction=curr_stride, module=f'stages.{i}')]
360
+ self.stages = nn.Sequential(*stages)
361
+ self.num_features = prev_chs
362
+
363
+ # if head_norm_first == true, norm -> global pool -> fc ordering, like most other nets
364
+ # otherwise pool -> norm -> fc, the default ConvNeXt ordering (pretrained FB weights)
365
+ if head_norm_first:
366
+ assert not head_hidden_size
367
+ self.norm_pre = norm_layer(self.num_features)
368
+ self.head = ClassifierHead(
369
+ self.num_features,
370
+ num_classes,
371
+ pool_type=global_pool,
372
+ drop_rate=self.drop_rate,
373
+ )
374
+ else:
375
+ self.norm_pre = nn.Identity()
376
+ self.head = NormMlpClassifierHead(
377
+ self.num_features,
378
+ num_classes,
379
+ hidden_size=head_hidden_size,
380
+ pool_type=global_pool,
381
+ drop_rate=self.drop_rate,
382
+ norm_layer=norm_layer,
383
+ act_layer='gelu',
384
+ )
385
+ named_apply(partial(_init_weights, head_init_scale=head_init_scale), self)
386
+
387
+ @torch.jit.ignore
388
+ def group_matcher(self, coarse=False):
389
+ return dict(
390
+ stem=r'^stem',
391
+ blocks=r'^stages\.(\d+)' if coarse else [
392
+ (r'^stages\.(\d+)\.downsample', (0,)), # blocks
393
+ (r'^stages\.(\d+)\.blocks\.(\d+)', None),
394
+ (r'^norm_pre', (99999,))
395
+ ]
396
+ )
397
+
398
+ @torch.jit.ignore
399
+ def set_grad_checkpointing(self, enable=True):
400
+ for s in self.stages:
401
+ s.grad_checkpointing = enable
402
+
403
+ @torch.jit.ignore
404
+ def get_classifier(self):
405
+ return self.head.fc
406
+
407
+ def reset_classifier(self, num_classes=0, global_pool=None):
408
+ self.head.reset(num_classes, global_pool)
409
+
410
+ def forward_features(self, x):
411
+ x = self.stem(x)
412
+ x = self.stages(x)
413
+ x = self.norm_pre(x)
414
+ return x
415
+
416
+ def forward_head(self, x, pre_logits: bool = False):
417
+ return self.head(x, pre_logits=True) if pre_logits else self.head(x)
418
+
419
+ def forward(self, x):
420
+ x = self.forward_features(x)
421
+ x = self.forward_head(x)
422
+ return x
423
+
424
+
425
+ def _init_weights(module, name=None, head_init_scale=1.0):
426
+ if isinstance(module, nn.Conv2d):
427
+ trunc_normal_(module.weight, std=.02)
428
+ if module.bias is not None:
429
+ nn.init.zeros_(module.bias)
430
+ elif isinstance(module, nn.Linear):
431
+ trunc_normal_(module.weight, std=.02)
432
+ nn.init.zeros_(module.bias)
433
+ if name and 'head.' in name:
434
+ module.weight.data.mul_(head_init_scale)
435
+ module.bias.data.mul_(head_init_scale)
436
+
437
+
438
+ def checkpoint_filter_fn(state_dict, model):
439
+ """ Remap FB checkpoints -> timm """
440
+ if 'head.norm.weight' in state_dict or 'norm_pre.weight' in state_dict:
441
+ out_dict={}
442
+ out_dict = {k.replace('gamma', 'weight'): v for k, v in state_dict.items()}
443
+ return out_dict # non-FB checkpoint
444
+ if 'model' in state_dict:
445
+ state_dict = state_dict['model']
446
+
447
+ out_dict = {}
448
+ if 'visual.trunk.stem.0.weight' in state_dict:
449
+ out_dict = {k.replace('visual.trunk.', '').replace('gamma', 'weight'): v for k, v in state_dict.items() if
450
+ k.startswith('visual.trunk.')}
451
+
452
+ if 'visual.head.proj.weight' in state_dict:
453
+ out_dict['head.fc.weight'] = state_dict['visual.head.proj.weight']
454
+ out_dict['head.fc.bias'] = torch.zeros(state_dict['visual.head.proj.weight'].shape[0])
455
+ elif 'visual.head.mlp.fc1.weight' in state_dict:
456
+ out_dict['head.pre_logits.fc.weight'] = state_dict['visual.head.mlp.fc1.weight']
457
+ out_dict['head.pre_logits.fc.bias'] = state_dict['visual.head.mlp.fc1.bias']
458
+ out_dict['head.fc.weight'] = state_dict['visual.head.mlp.fc2.weight']
459
+ out_dict['head.fc.bias'] = torch.zeros(state_dict['visual.head.mlp.fc2.weight'].shape[0])
460
+ return out_dict
461
+
462
+ import re
463
+ for k, v in state_dict.items():
464
+ k = k.replace('downsample_layers.0.', 'stem.')
465
+ k = re.sub(r'stages.([0-9]+).([0-9]+)', r'stages.\1.blocks.\2', k)
466
+ k = re.sub(r'downsample_layers.([0-9]+).([0-9]+)', r'stages.\1.downsample.\2', k)
467
+ k = k.replace('dwconv', 'conv_dw')
468
+ k = k.replace('pwconv', 'mlp.fc')
469
+ if 'grn' in k:
470
+ k = k.replace('grn.beta', 'mlp.grn.bias')
471
+ k = k.replace('grn.gamma', 'mlp.grn.weight')
472
+ v = v.reshape(v.shape[-1])
473
+ k = k.replace('head.', 'head.fc.')
474
+ if k.startswith('norm.'):
475
+ k = k.replace('norm', 'head.norm')
476
+ if v.ndim == 2 and 'head' not in k:
477
+ model_shape = model.state_dict()[k].shape
478
+ v = v.reshape(model_shape)
479
+ k=k.replace('gamma','weight')
480
+ out_dict[k] = v
481
+
482
+ return out_dict
483
+
484
+
485
+ def _create_convnext(variant, pretrained=False, **kwargs):
486
+ if kwargs.get('pretrained_cfg', '') == 'fcmae':
487
+ # NOTE fcmae pretrained weights have no classifier or final norm-layer (`head.norm`)
488
+ # This is workaround loading with num_classes=0 w/o removing norm-layer.
489
+ kwargs.setdefault('pretrained_strict', False)
490
+
491
+ model = build_model_with_cfg(
492
+ ConvNeXt, variant, pretrained,
493
+ pretrained_filter_fn=checkpoint_filter_fn,
494
+ feature_cfg=dict(out_indices=(0, 1, 2, 3), flatten_sequential=True),
495
+ **kwargs)
496
+ return model
497
+
498
+
499
+ def _cfg(url='', **kwargs):
500
+ return {
501
+ 'url': url,
502
+ 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7),
503
+ 'crop_pct': 0.875, 'interpolation': 'bicubic',
504
+ 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
505
+ 'first_conv': 'stem.0', 'classifier': 'head.fc',
506
+ **kwargs
507
+ }
508
+
509
+
510
+ def _cfgv2(url='', **kwargs):
511
+ return {
512
+ 'url': url,
513
+ 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7),
514
+ 'crop_pct': 0.875, 'interpolation': 'bicubic',
515
+ 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
516
+ 'first_conv': 'stem.0', 'classifier': 'head.fc',
517
+ 'license': 'cc-by-nc-4.0', 'paper_ids': 'arXiv:2301.00808',
518
+ 'paper_name': 'ConvNeXt-V2: Co-designing and Scaling ConvNets with Masked Autoencoders',
519
+ 'origin_url': 'https://github.com/facebookresearch/ConvNeXt-V2',
520
+ **kwargs
521
+ }
522
+
523
+
524
+ default_cfgs = generate_default_cfgs({
525
+ # timm specific variants
526
+ 'convnext_tiny.in12k_ft_in1k': _cfg(
527
+ hf_hub_id='timm/',
528
+ crop_pct=0.95, test_input_size=(3, 288, 288), test_crop_pct=1.0),
529
+ 'convnext_small.in12k_ft_in1k': _cfg(
530
+ hf_hub_id='timm/',
531
+ crop_pct=0.95, test_input_size=(3, 288, 288), test_crop_pct=1.0),
532
+
533
+ 'convnext_atto.d2_in1k': _cfg(
534
+ url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_atto_d2-01bb0f51.pth',
535
+ hf_hub_id='timm/',
536
+ test_input_size=(3, 288, 288), test_crop_pct=0.95),
537
+ 'convnext_atto_ols.a2_in1k': _cfg(
538
+ url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_atto_ols_a2-78d1c8f3.pth',
539
+ hf_hub_id='timm/',
540
+ test_input_size=(3, 288, 288), test_crop_pct=0.95),
541
+ 'convnext_femto.d1_in1k': _cfg(
542
+ url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_femto_d1-d71d5b4c.pth',
543
+ hf_hub_id='timm/',
544
+ test_input_size=(3, 288, 288), test_crop_pct=0.95),
545
+ 'convnext_femto_ols.d1_in1k': _cfg(
546
+ url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_femto_ols_d1-246bf2ed.pth',
547
+ hf_hub_id='timm/',
548
+ test_input_size=(3, 288, 288), test_crop_pct=0.95),
549
+ 'convnext_pico.d1_in1k': _cfg(
550
+ url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_pico_d1-10ad7f0d.pth',
551
+ hf_hub_id='timm/',
552
+ test_input_size=(3, 288, 288), test_crop_pct=0.95),
553
+ 'convnext_pico_ols.d1_in1k': _cfg(
554
+ url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_pico_ols_d1-611f0ca7.pth',
555
+ hf_hub_id='timm/',
556
+ crop_pct=0.95, test_input_size=(3, 288, 288), test_crop_pct=1.0),
557
+ 'convnext_nano.in12k_ft_in1k': _cfg(
558
+ hf_hub_id='timm/',
559
+ crop_pct=0.95, test_input_size=(3, 288, 288), test_crop_pct=1.0),
560
+ 'convnext_nano.d1h_in1k': _cfg(
561
+ url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_nano_d1h-7eb4bdea.pth',
562
+ hf_hub_id='timm/',
563
+ crop_pct=0.95, test_input_size=(3, 288, 288), test_crop_pct=1.0),
564
+ 'convnext_nano_ols.d1h_in1k': _cfg(
565
+ url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_nano_ols_d1h-ae424a9a.pth',
566
+ hf_hub_id='timm/',
567
+ crop_pct=0.95, test_input_size=(3, 288, 288), test_crop_pct=1.0),
568
+ 'convnext_tiny_hnf.a2h_in1k': _cfg(
569
+ url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_tiny_hnf_a2h-ab7e9df2.pth',
570
+ hf_hub_id='timm/',
571
+ crop_pct=0.95, test_input_size=(3, 288, 288), test_crop_pct=1.0),
572
+
573
+ 'convnext_tiny.in12k_ft_in1k_384': _cfg(
574
+ hf_hub_id='timm/',
575
+ input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'),
576
+ 'convnext_small.in12k_ft_in1k_384': _cfg(
577
+ hf_hub_id='timm/',
578
+ input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'),
579
+
580
+ 'convnext_nano.in12k': _cfg(
581
+ hf_hub_id='timm/',
582
+ crop_pct=0.95, num_classes=11821),
583
+ 'convnext_tiny.in12k': _cfg(
584
+ hf_hub_id='timm/',
585
+ crop_pct=0.95, num_classes=11821),
586
+ 'convnext_small.in12k': _cfg(
587
+ hf_hub_id='timm/',
588
+ crop_pct=0.95, num_classes=11821),
589
+
590
+ 'convnext_tiny.fb_in22k_ft_in1k': _cfg(
591
+ url='https://dl.fbaipublicfiles.com/convnext/convnext_tiny_22k_1k_224.pth',
592
+ hf_hub_id='timm/',
593
+ test_input_size=(3, 288, 288), test_crop_pct=1.0),
594
+ 'convnext_small.fb_in22k_ft_in1k': _cfg(
595
+ url='https://dl.fbaipublicfiles.com/convnext/convnext_small_22k_1k_224.pth',
596
+ hf_hub_id='timm/',
597
+ test_input_size=(3, 288, 288), test_crop_pct=1.0),
598
+ 'convnext_base.fb_in22k_ft_in1k': _cfg(
599
+ url='https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_1k_224.pth',
600
+ hf_hub_id='timm/',
601
+ test_input_size=(3, 288, 288), test_crop_pct=1.0),
602
+ 'convnext_large.fb_in22k_ft_in1k': _cfg(
603
+ url='https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_1k_224.pth',
604
+ hf_hub_id='timm/',
605
+ test_input_size=(3, 288, 288), test_crop_pct=1.0),
606
+ 'convnext_xlarge.fb_in22k_ft_in1k': _cfg(
607
+ url='https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_1k_224_ema.pth',
608
+ hf_hub_id='timm/',
609
+ test_input_size=(3, 288, 288), test_crop_pct=1.0),
610
+
611
+ 'convnext_tiny.fb_in1k': _cfg(
612
+ url="https://dl.fbaipublicfiles.com/convnext/convnext_tiny_1k_224_ema.pth",
613
+ hf_hub_id='timm/',
614
+ test_input_size=(3, 288, 288), test_crop_pct=1.0),
615
+ 'convnext_small.fb_in1k': _cfg(
616
+ url="https://dl.fbaipublicfiles.com/convnext/convnext_small_1k_224_ema.pth",
617
+ hf_hub_id='timm/',
618
+ test_input_size=(3, 288, 288), test_crop_pct=1.0),
619
+ 'convnext_base.fb_in1k': _cfg(
620
+ url="https://dl.fbaipublicfiles.com/convnext/convnext_base_1k_224_ema.pth",
621
+ hf_hub_id='timm/',
622
+ test_input_size=(3, 288, 288), test_crop_pct=1.0),
623
+ 'convnext_large.fb_in1k': _cfg(
624
+ url="https://dl.fbaipublicfiles.com/convnext/convnext_large_1k_224_ema.pth",
625
+ hf_hub_id='timm/',
626
+ test_input_size=(3, 288, 288), test_crop_pct=1.0),
627
+
628
+ 'convnext_tiny.fb_in22k_ft_in1k_384': _cfg(
629
+ url='https://dl.fbaipublicfiles.com/convnext/convnext_tiny_22k_1k_384.pth',
630
+ hf_hub_id='timm/',
631
+ input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'),
632
+ 'convnext_small.fb_in22k_ft_in1k_384': _cfg(
633
+ url='https://dl.fbaipublicfiles.com/convnext/convnext_small_22k_1k_384.pth',
634
+ hf_hub_id='timm/',
635
+ input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'),
636
+ 'convnext_base.fb_in22k_ft_in1k_384': _cfg(
637
+ url='https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_1k_384.pth',
638
+ hf_hub_id='timm/',
639
+ input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'),
640
+ 'convnext_large.fb_in22k_ft_in1k_384': _cfg(
641
+ url='https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_1k_384.pth',
642
+ hf_hub_id='timm/',
643
+ input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'),
644
+ 'convnext_xlarge.fb_in22k_ft_in1k_384': _cfg(
645
+ url='https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_1k_384_ema.pth',
646
+ hf_hub_id='timm/',
647
+ input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'),
648
+
649
+ 'convnext_tiny.fb_in22k': _cfg(
650
+ url="https://dl.fbaipublicfiles.com/convnext/convnext_tiny_22k_224.pth",
651
+ hf_hub_id='timm/',
652
+ num_classes=21841),
653
+ 'convnext_small.fb_in22k': _cfg(
654
+ url="https://dl.fbaipublicfiles.com/convnext/convnext_small_22k_224.pth",
655
+ hf_hub_id='timm/',
656
+ num_classes=21841),
657
+ 'convnext_base.fb_in22k': _cfg(
658
+ url="https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_224.pth",
659
+ hf_hub_id='timm/',
660
+ num_classes=21841),
661
+ 'convnext_large.fb_in22k': _cfg(
662
+ url="https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_224.pth",
663
+ hf_hub_id='timm/',
664
+ num_classes=21841),
665
+ 'convnext_xlarge.fb_in22k': _cfg(
666
+ url="https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_224.pth",
667
+ hf_hub_id='timm/',
668
+ num_classes=21841),
669
+
670
+ 'convnextv2_nano.fcmae_ft_in22k_in1k': _cfgv2(
671
+ url='https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_nano_22k_224_ema.pt',
672
+ hf_hub_id='timm/',
673
+ test_input_size=(3, 288, 288), test_crop_pct=1.0),
674
+ 'convnextv2_nano.fcmae_ft_in22k_in1k_384': _cfgv2(
675
+ url='https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_nano_22k_384_ema.pt',
676
+ hf_hub_id='timm/',
677
+ input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'),
678
+ 'convnextv2_tiny.fcmae_ft_in22k_in1k': _cfgv2(
679
+ url="https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_tiny_22k_224_ema.pt",
680
+ hf_hub_id='timm/',
681
+ test_input_size=(3, 288, 288), test_crop_pct=1.0),
682
+ 'convnextv2_tiny.fcmae_ft_in22k_in1k_384': _cfgv2(
683
+ url="https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_tiny_22k_384_ema.pt",
684
+ hf_hub_id='timm/',
685
+ input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'),
686
+ 'convnextv2_base.fcmae_ft_in22k_in1k': _cfgv2(
687
+ url="https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_base_22k_224_ema.pt",
688
+ hf_hub_id='timm/',
689
+ test_input_size=(3, 288, 288), test_crop_pct=1.0),
690
+ 'convnextv2_base.fcmae_ft_in22k_in1k_384': _cfgv2(
691
+ url="https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_base_22k_384_ema.pt",
692
+ hf_hub_id='timm/',
693
+ input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'),
694
+ 'convnextv2_large.fcmae_ft_in22k_in1k': _cfgv2(
695
+ url="https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_large_22k_224_ema.pt",
696
+ hf_hub_id='timm/',
697
+ test_input_size=(3, 288, 288), test_crop_pct=1.0),
698
+ 'convnextv2_large.fcmae_ft_in22k_in1k_384': _cfgv2(
699
+ url="https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_large_22k_384_ema.pt",
700
+ hf_hub_id='timm/',
701
+ input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'),
702
+ 'convnextv2_huge.fcmae_ft_in22k_in1k_384': _cfgv2(
703
+ url="https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_huge_22k_384_ema.pt",
704
+ hf_hub_id='timm/',
705
+ input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'),
706
+ 'convnextv2_huge.fcmae_ft_in22k_in1k_512': _cfgv2(
707
+ url="https://dl.fbaipublicfiles.com/convnext/convnextv2/im22k/convnextv2_huge_22k_512_ema.pt",
708
+ hf_hub_id='timm/',
709
+ input_size=(3, 512, 512), pool_size=(15, 15), crop_pct=1.0, crop_mode='squash'),
710
+
711
+ 'convnextv2_atto.fcmae_ft_in1k': _cfgv2(
712
+ url='https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_atto_1k_224_ema.pt',
713
+ hf_hub_id='timm/',
714
+ test_input_size=(3, 288, 288), test_crop_pct=0.95),
715
+ 'convnextv2_femto.fcmae_ft_in1k': _cfgv2(
716
+ url='https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_femto_1k_224_ema.pt',
717
+ hf_hub_id='timm/',
718
+ test_input_size=(3, 288, 288), test_crop_pct=0.95),
719
+ 'convnextv2_pico.fcmae_ft_in1k': _cfgv2(
720
+ url='https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_pico_1k_224_ema.pt',
721
+ hf_hub_id='timm/',
722
+ test_input_size=(3, 288, 288), test_crop_pct=0.95),
723
+ 'convnextv2_nano.fcmae_ft_in1k': _cfgv2(
724
+ url='https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_nano_1k_224_ema.pt',
725
+ hf_hub_id='timm/',
726
+ test_input_size=(3, 288, 288), test_crop_pct=1.0),
727
+ 'convnextv2_tiny.fcmae_ft_in1k': _cfgv2(
728
+ url="https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_tiny_1k_224_ema.pt",
729
+ hf_hub_id='timm/',
730
+ test_input_size=(3, 288, 288), test_crop_pct=1.0),
731
+ 'convnextv2_base.fcmae_ft_in1k': _cfgv2(
732
+ url="https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_base_1k_224_ema.pt",
733
+ hf_hub_id='timm/',
734
+ test_input_size=(3, 288, 288), test_crop_pct=1.0),
735
+ 'convnextv2_large.fcmae_ft_in1k': _cfgv2(
736
+ url="https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_large_1k_224_ema.pt",
737
+ hf_hub_id='timm/',
738
+ test_input_size=(3, 288, 288), test_crop_pct=1.0),
739
+ 'convnextv2_huge.fcmae_ft_in1k': _cfgv2(
740
+ url="https://dl.fbaipublicfiles.com/convnext/convnextv2/im1k/convnextv2_huge_1k_224_ema.pt",
741
+ hf_hub_id='timm/',
742
+ test_input_size=(3, 288, 288), test_crop_pct=1.0),
743
+
744
+ 'convnextv2_atto.fcmae': _cfgv2(
745
+ url='https://dl.fbaipublicfiles.com/convnext/convnextv2/pt_only/convnextv2_atto_1k_224_fcmae.pt',
746
+ hf_hub_id='timm/',
747
+ num_classes=0),
748
+ 'convnextv2_femto.fcmae': _cfgv2(
749
+ url='https://dl.fbaipublicfiles.com/convnext/convnextv2/pt_only/convnextv2_femto_1k_224_fcmae.pt',
750
+ hf_hub_id='timm/',
751
+ num_classes=0),
752
+ 'convnextv2_pico.fcmae': _cfgv2(
753
+ url='https://dl.fbaipublicfiles.com/convnext/convnextv2/pt_only/convnextv2_pico_1k_224_fcmae.pt',
754
+ hf_hub_id='timm/',
755
+ num_classes=0),
756
+ 'convnextv2_nano.fcmae': _cfgv2(
757
+ url='https://dl.fbaipublicfiles.com/convnext/convnextv2/pt_only/convnextv2_nano_1k_224_fcmae.pt',
758
+ hf_hub_id='timm/',
759
+ num_classes=0),
760
+ 'convnextv2_tiny.fcmae': _cfgv2(
761
+ url="https://dl.fbaipublicfiles.com/convnext/convnextv2/pt_only/convnextv2_tiny_1k_224_fcmae.pt",
762
+ hf_hub_id='timm/',
763
+ num_classes=0),
764
+ 'convnextv2_base.fcmae': _cfgv2(
765
+ url="https://dl.fbaipublicfiles.com/convnext/convnextv2/pt_only/convnextv2_base_1k_224_fcmae.pt",
766
+ hf_hub_id='timm/',
767
+ num_classes=0),
768
+ 'convnextv2_large.fcmae': _cfgv2(
769
+ url="https://dl.fbaipublicfiles.com/convnext/convnextv2/pt_only/convnextv2_large_1k_224_fcmae.pt",
770
+ hf_hub_id='timm/',
771
+ num_classes=0),
772
+ 'convnextv2_huge.fcmae': _cfgv2(
773
+ url="https://dl.fbaipublicfiles.com/convnext/convnextv2/pt_only/convnextv2_huge_1k_224_fcmae.pt",
774
+ hf_hub_id='timm/',
775
+ num_classes=0),
776
+
777
+ 'convnextv2_small.untrained': _cfg(),
778
+
779
+ # CLIP weights, fine-tuned on in1k or in12k + in1k
780
+ 'convnext_base.clip_laion2b_augreg_ft_in12k_in1k': _cfg(
781
+ hf_hub_id='timm/',
782
+ mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
783
+ input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0),
784
+ 'convnext_base.clip_laion2b_augreg_ft_in12k_in1k_384': _cfg(
785
+ hf_hub_id='timm/',
786
+ mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
787
+ input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'),
788
+ 'convnext_large_mlp.clip_laion2b_soup_ft_in12k_in1k_320': _cfg(
789
+ hf_hub_id='timm/',
790
+ mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
791
+ input_size=(3, 320, 320), pool_size=(10, 10), crop_pct=1.0),
792
+ 'convnext_large_mlp.clip_laion2b_soup_ft_in12k_in1k_384': _cfg(
793
+ hf_hub_id='timm/',
794
+ mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
795
+ input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'),
796
+
797
+ 'convnext_base.clip_laion2b_augreg_ft_in1k': _cfg(
798
+ hf_hub_id='timm/',
799
+ mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
800
+ input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0),
801
+ 'convnext_base.clip_laiona_augreg_ft_in1k_384': _cfg(
802
+ hf_hub_id='timm/',
803
+ mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
804
+ input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0),
805
+ 'convnext_large_mlp.clip_laion2b_augreg_ft_in1k': _cfg(
806
+ hf_hub_id='timm/',
807
+ mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
808
+ input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0
809
+ ),
810
+ 'convnext_large_mlp.clip_laion2b_augreg_ft_in1k_384': _cfg(
811
+ hf_hub_id='timm/',
812
+ mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
813
+ input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'
814
+ ),
815
+ 'convnext_xxlarge.clip_laion2b_soup_ft_in1k': _cfg(
816
+ hf_hub_id='timm/',
817
+ mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
818
+ input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0),
819
+
820
+ 'convnext_base.clip_laion2b_augreg_ft_in12k': _cfg(
821
+ hf_hub_id='timm/',
822
+ mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821,
823
+ input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0),
824
+ 'convnext_large_mlp.clip_laion2b_soup_ft_in12k_320': _cfg(
825
+ hf_hub_id='timm/',
826
+ mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821,
827
+ input_size=(3, 320, 320), pool_size=(10, 10), crop_pct=1.0),
828
+ 'convnext_large_mlp.clip_laion2b_augreg_ft_in12k_384': _cfg(
829
+ hf_hub_id='timm/',
830
+ mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821,
831
+ input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'),
832
+ 'convnext_large_mlp.clip_laion2b_soup_ft_in12k_384': _cfg(
833
+ hf_hub_id='timm/',
834
+ mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821,
835
+ input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0, crop_mode='squash'),
836
+ 'convnext_xxlarge.clip_laion2b_soup_ft_in12k': _cfg(
837
+ hf_hub_id='timm/',
838
+ mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=11821,
839
+ input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0),
840
+
841
+ # CLIP original image tower weights
842
+ 'convnext_base.clip_laion2b': _cfg(
843
+ hf_hub_id='laion/CLIP-convnext_base_w-laion2B-s13B-b82K',
844
+ hf_hub_filename='open_clip_pytorch_model.bin',
845
+ mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
846
+ input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0, num_classes=640),
847
+ 'convnext_base.clip_laion2b_augreg': _cfg(
848
+ hf_hub_id='laion/CLIP-convnext_base_w-laion2B-s13B-b82K-augreg',
849
+ hf_hub_filename='open_clip_pytorch_model.bin',
850
+ mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
851
+ input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0, num_classes=640),
852
+ 'convnext_base.clip_laiona': _cfg(
853
+ hf_hub_id='laion/CLIP-convnext_base_w-laion_aesthetic-s13B-b82K',
854
+ hf_hub_filename='open_clip_pytorch_model.bin',
855
+ mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
856
+ input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0, num_classes=640),
857
+ 'convnext_base.clip_laiona_320': _cfg(
858
+ hf_hub_id='laion/CLIP-convnext_base_w_320-laion_aesthetic-s13B-b82K',
859
+ hf_hub_filename='open_clip_pytorch_model.bin',
860
+ mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
861
+ input_size=(3, 320, 320), pool_size=(10, 10), crop_pct=1.0, num_classes=640),
862
+ 'convnext_base.clip_laiona_augreg_320': _cfg(
863
+ hf_hub_id='laion/CLIP-convnext_base_w_320-laion_aesthetic-s13B-b82K-augreg',
864
+ hf_hub_filename='open_clip_pytorch_model.bin',
865
+ mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
866
+ input_size=(3, 320, 320), pool_size=(10, 10), crop_pct=1.0, num_classes=640),
867
+ 'convnext_large_mlp.clip_laion2b_augreg': _cfg(
868
+ hf_hub_id='laion/CLIP-convnext_large_d.laion2B-s26B-b102K-augreg',
869
+ hf_hub_filename='open_clip_pytorch_model.bin',
870
+ mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
871
+ input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0, num_classes=768),
872
+ 'convnext_large_mlp.clip_laion2b_ft_320': _cfg(
873
+ hf_hub_id='laion/CLIP-convnext_large_d_320.laion2B-s29B-b131K-ft',
874
+ hf_hub_filename='open_clip_pytorch_model.bin',
875
+ mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
876
+ input_size=(3, 320, 320), pool_size=(10, 10), crop_pct=1.0, num_classes=768),
877
+ 'convnext_large_mlp.clip_laion2b_ft_soup_320': _cfg(
878
+ hf_hub_id='laion/CLIP-convnext_large_d_320.laion2B-s29B-b131K-ft-soup',
879
+ hf_hub_filename='open_clip_pytorch_model.bin',
880
+ mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
881
+ input_size=(3, 320, 320), pool_size=(10, 10), crop_pct=1.0, num_classes=768),
882
+ 'convnext_xxlarge.clip_laion2b_soup': _cfg(
883
+ hf_hub_id='laion/CLIP-convnext_xxlarge-laion2B-s34B-b82K-augreg-soup',
884
+ hf_hub_filename='open_clip_pytorch_model.bin',
885
+ mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
886
+ input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0, num_classes=1024),
887
+ 'convnext_xxlarge.clip_laion2b_rewind': _cfg(
888
+ hf_hub_id='laion/CLIP-convnext_xxlarge-laion2B-s34B-b82K-augreg-rewind',
889
+ hf_hub_filename='open_clip_pytorch_model.bin',
890
+ mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD,
891
+ input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0, num_classes=1024),
892
+ })
893
+
894
+
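+ # Each tag above (architecture.pretraining) pairs a checkpoint (`url` / `hf_hub_id`) with its
+ # preprocessing: `input_size`/`test_input_size` are train/eval resolutions, `crop_pct`/`crop_mode`
+ # control eval cropping, and `num_classes=0` marks head-less FCMAE (feature-only) weights.
+ # Usage sketch (assumes the @register_model registrations below are enabled, as in upstream timm):
+ # model = timm.create_model('convnext_large.fb_in22k_ft_in1k_384', pretrained=True)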
895
+ # @register_model
896
+ # def convnext_atto(pretrained=False, **kwargs) -> ConvNeXt:
897
+ # # timm femto variant (NOTE: still tweaking depths, will vary between 3-4M param, current is 3.7M)
898
+ # model_args = dict(depths=(2, 2, 6, 2), dims=(40, 80, 160, 320), conv_mlp=True)
899
+ # model = _create_convnext('convnext_atto', pretrained=pretrained, **dict(model_args, **kwargs))
900
+ # return model
901
+
902
+
903
+ # @register_model
904
+ # def convnext_atto_ols(pretrained=False, **kwargs) -> ConvNeXt:
905
+ # # timm femto variant with overlapping 3x3 conv stem, wider than non-ols femto above, current param count 3.7M
906
+ # model_args = dict(depths=(2, 2, 6, 2), dims=(40, 80, 160, 320), conv_mlp=True, stem_type='overlap_tiered')
907
+ # model = _create_convnext('convnext_atto_ols', pretrained=pretrained, **dict(model_args, **kwargs))
908
+ # return model
909
+
910
+
911
+ # @register_model
912
+ # def convnext_femto(pretrained=False, **kwargs) -> ConvNeXt:
913
+ # # timm femto variant
914
+ # model_args = dict(depths=(2, 2, 6, 2), dims=(48, 96, 192, 384), conv_mlp=True)
915
+ # model = _create_convnext('convnext_femto', pretrained=pretrained, **dict(model_args, **kwargs))
916
+ # return model
917
+
918
+
919
+ # @register_model
920
+ # def convnext_femto_ols(pretrained=False, **kwargs) -> ConvNeXt:
921
+ # # timm femto variant
922
+ # model_args = dict(depths=(2, 2, 6, 2), dims=(48, 96, 192, 384), conv_mlp=True, stem_type='overlap_tiered')
923
+ # model = _create_convnext('convnext_femto_ols', pretrained=pretrained, **dict(model_args, **kwargs))
924
+ # return model
925
+
926
+
927
+ # @register_model
928
+ # def convnext_pico(pretrained=False, **kwargs) -> ConvNeXt:
929
+ # # timm pico variant
930
+ # model_args = dict(depths=(2, 2, 6, 2), dims=(64, 128, 256, 512), conv_mlp=True)
931
+ # model = _create_convnext('convnext_pico', pretrained=pretrained, **dict(model_args, **kwargs))
932
+ # return model
933
+
934
+
935
+ # @register_model
936
+ # def convnext_pico_ols(pretrained=False, **kwargs) -> ConvNeXt:
937
+ # # timm nano variant with overlapping 3x3 conv stem
938
+ # model_args = dict(depths=(2, 2, 6, 2), dims=(64, 128, 256, 512), conv_mlp=True, stem_type='overlap_tiered')
939
+ # model = _create_convnext('convnext_pico_ols', pretrained=pretrained, **dict(model_args, **kwargs))
940
+ # return model
941
+
942
+
943
+ # @register_model
944
+ # def convnext_nano(pretrained=False, **kwargs) -> ConvNeXt:
945
+ # # timm nano variant with standard stem and head
946
+ # model_args = dict(depths=(2, 2, 8, 2), dims=(80, 160, 320, 640), conv_mlp=True)
947
+ # model = _create_convnext('convnext_nano', pretrained=pretrained, **dict(model_args, **kwargs))
948
+ # return model
949
+
950
+
951
+ # @register_model
952
+ # def convnext_nano_ols(pretrained=False, **kwargs) -> ConvNeXt:
953
+ # # experimental nano variant with overlapping conv stem
954
+ # model_args = dict(depths=(2, 2, 8, 2), dims=(80, 160, 320, 640), conv_mlp=True, stem_type='overlap')
955
+ # model = _create_convnext('convnext_nano_ols', pretrained=pretrained, **dict(model_args, **kwargs))
956
+ # return model
957
+
958
+
959
+ # @register_model
960
+ # def convnext_tiny_hnf(pretrained=False, **kwargs) -> ConvNeXt:
961
+ # # experimental tiny variant with norm before pooling in head (head norm first)
962
+ # model_args = dict(depths=(3, 3, 9, 3), dims=(96, 192, 384, 768), head_norm_first=True, conv_mlp=True)
963
+ # model = _create_convnext('convnext_tiny_hnf', pretrained=pretrained, **dict(model_args, **kwargs))
964
+ # return model
965
+
966
+
967
+ # @register_model
968
+ # def convnext_tiny(pretrained=False, **kwargs) -> ConvNeXt:
969
+ # model_args = dict(depths=(3, 3, 9, 3), dims=(96, 192, 384, 768))
970
+ # model = _create_convnext('convnext_tiny', pretrained=pretrained, **dict(model_args, **kwargs))
971
+ # return model
972
+
973
+
974
+ # @register_model
975
+ # def convnext_small(pretrained=False, **kwargs) -> ConvNeXt:
976
+ # model_args = dict(depths=[3, 3, 27, 3], dims=[96, 192, 384, 768])
977
+ # model = _create_convnext('convnext_small', pretrained=pretrained, **dict(model_args, **kwargs))
978
+ # return model
979
+
980
+ # @register_model
981
+ # def convnext_base_clip(pretrained='', **kwargs) -> ConvNeXt:
982
+ # model_args = dict(depths=[3, 3, 27, 3], dims=[128, 256, 512, 1024])
983
+ # model = _create_convnext(pretrained, pretrained=True, **dict(model_args, **kwargs))
984
+ # return model
985
+
986
+ # @register_model
987
+ # def convnext_base(pretrained=False, **kwargs) -> ConvNeXt:
988
+ # model_args = dict(depths=[3, 3, 27, 3], dims=[128, 256, 512, 1024])
989
+ # model = _create_convnext('convnext_base', pretrained=pretrained, **dict(model_args, **kwargs))
990
+ # return model
991
+
992
+
993
+ # @register_model
994
+ # def convnext_large(pretrained=False, **kwargs) -> ConvNeXt:
995
+ # model_args = dict(depths=[3, 3, 27, 3], dims=[192, 384, 768, 1536])
996
+ # model = _create_convnext('convnext_large', pretrained=pretrained, **dict(model_args, **kwargs))
997
+ # return model
998
+
999
+
1000
+ # @register_model
1001
+ # def convnext_large_mlp(pretrained=False, **kwargs) -> ConvNeXt:
1002
+ # model_args = dict(depths=[3, 3, 27, 3], dims=[192, 384, 768, 1536], head_hidden_size=1536)
1003
+ # model = _create_convnext('convnext_large_mlp', pretrained=pretrained, **dict(model_args, **kwargs))
1004
+ # return model
1005
+
1006
+
1007
+ # @register_model
1008
+ # def convnext_xlarge(pretrained=False, **kwargs) -> ConvNeXt:
1009
+ # model_args = dict(depths=[3, 3, 27, 3], dims=[256, 512, 1024, 2048])
1010
+ # model = _create_convnext('convnext_xlarge', pretrained=pretrained, **dict(model_args, **kwargs))
1011
+ # return model
1012
+
1013
+
1014
+ # @register_model
1015
+ def convnext_xxlarge(pretrained=False, **kwargs) -> ConvNeXt:
1016
+ model_args = dict(depths=[3, 4, 30, 3], dims=[384, 768, 1536, 3072], norm_eps=kwargs.pop('norm_eps', 1e-5))
1017
+ model = _create_convnext('convnext_xxlarge', pretrained=pretrained, **dict(model_args, **kwargs))
1018
+ return model
1019
+
1020
+
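+ # Usage sketch for the variant kept active above (assumes the timm-style ConvNeXt API defined
+ # earlier in this file):
+ # backbone = convnext_xxlarge(pretrained=False)  # depths (3, 4, 30, 3), dims 384 -> 3072
+ # feats = backbone.forward_features(torch.randn(1, 3, 256, 256))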
1021
+ # @register_model
1022
+ # def convnextv2_atto(pretrained=False, **kwargs) -> ConvNeXt:
1023
+ # # timm femto variant (NOTE: still tweaking depths, will vary between 3-4M param, current is 3.7M)
1024
+ # model_args = dict(
1025
+ # depths=(2, 2, 6, 2), dims=(40, 80, 160, 320), use_grn=True, ls_init_value=None, conv_mlp=True)
1026
+ # model = _create_convnext('convnextv2_atto', pretrained=pretrained, **dict(model_args, **kwargs))
1027
+ # return model
1028
+
1029
+
1030
+ # @register_model
1031
+ # def convnextv2_femto(pretrained=False, **kwargs) -> ConvNeXt:
1032
+ # # timm femto variant
1033
+ # model_args = dict(
1034
+ # depths=(2, 2, 6, 2), dims=(48, 96, 192, 384), use_grn=True, ls_init_value=None, conv_mlp=True)
1035
+ # model = _create_convnext('convnextv2_femto', pretrained=pretrained, **dict(model_args, **kwargs))
1036
+ # return model
1037
+
1038
+
1039
+ # @register_model
1040
+ # def convnextv2_pico(pretrained=False, **kwargs) -> ConvNeXt:
1041
+ # # timm pico variant
1042
+ # model_args = dict(
1043
+ # depths=(2, 2, 6, 2), dims=(64, 128, 256, 512), use_grn=True, ls_init_value=None, conv_mlp=True)
1044
+ # model = _create_convnext('convnextv2_pico', pretrained=pretrained, **dict(model_args, **kwargs))
1045
+ # return model
1046
+
1047
+
1048
+ # @register_model
1049
+ # def convnextv2_nano(pretrained=False, **kwargs) -> ConvNeXt:
1050
+ # # timm nano variant with standard stem and head
1051
+ # model_args = dict(
1052
+ # depths=(2, 2, 8, 2), dims=(80, 160, 320, 640), use_grn=True, ls_init_value=None, conv_mlp=True)
1053
+ # model = _create_convnext('convnextv2_nano', pretrained=pretrained, **dict(model_args, **kwargs))
1054
+ # return model
1055
+
1056
+
1057
+ # @register_model
1058
+ # def convnextv2_tiny(pretrained=False, **kwargs) -> ConvNeXt:
1059
+ # model_args = dict(depths=(3, 3, 9, 3), dims=(96, 192, 384, 768), use_grn=True, ls_init_value=None)
1060
+ # model = _create_convnext('convnextv2_tiny', pretrained=pretrained, **dict(model_args, **kwargs))
1061
+ # return model
1062
+
1063
+
1064
+ # @register_model
1065
+ # def convnextv2_small(pretrained=False, **kwargs) -> ConvNeXt:
1066
+ # model_args = dict(depths=[3, 3, 27, 3], dims=[96, 192, 384, 768], use_grn=True, ls_init_value=None)
1067
+ # model = _create_convnext('convnextv2_small', pretrained=pretrained, **dict(model_args, **kwargs))
1068
+ # return model
1069
+
1070
+
1071
+ # @register_model
1072
+ # def convnextv2_base(pretrained=False, **kwargs) -> ConvNeXt:
1073
+ # model_args = dict(depths=[3, 3, 27, 3], dims=[128, 256, 512, 1024], use_grn=True, ls_init_value=None)
1074
+ # model = _create_convnext('convnextv2_base', pretrained=pretrained, **dict(model_args, **kwargs))
1075
+ # return model
1076
+
1077
+
1078
+ # @register_model
1079
+ # def convnextv2_large(pretrained=False, **kwargs) -> ConvNeXt:
1080
+ # model_args = dict(depths=[3, 3, 27, 3], dims=[192, 384, 768, 1536], use_grn=True, ls_init_value=None)
1081
+ # model = _create_convnext('convnextv2_large', pretrained=pretrained, **dict(model_args, **kwargs))
1082
+ # return model
1083
+
1084
+
1085
+ # @register_model
1086
+ # def convnextv2_huge(pretrained=False, **kwargs) -> ConvNeXt:
1087
+ # model_args = dict(depths=[3, 3, 27, 3], dims=[352, 704, 1408, 2816], use_grn=True, ls_init_value=None)
1088
+ # model = _create_convnext('convnextv2_huge', pretrained=pretrained, **dict(model_args, **kwargs))
1089
+ # return model
1090
+
1091
+
1092
+ # register_model_deprecations(__name__, {
1093
+ # 'convnext_tiny_in22ft1k': 'convnext_tiny.fb_in22k_ft_in1k',
1094
+ # 'convnext_small_in22ft1k': 'convnext_small.fb_in22k_ft_in1k',
1095
+ # 'convnext_base_in22ft1k': 'convnext_base.fb_in22k_ft_in1k',
1096
+ # 'convnext_large_in22ft1k': 'convnext_large.fb_in22k_ft_in1k',
1097
+ # 'convnext_xlarge_in22ft1k': 'convnext_xlarge.fb_in22k_ft_in1k',
1098
+ # 'convnext_tiny_384_in22ft1k': 'convnext_tiny.fb_in22k_ft_in1k_384',
1099
+ # 'convnext_small_384_in22ft1k': 'convnext_small.fb_in22k_ft_in1k_384',
1100
+ # 'convnext_base_384_in22ft1k': 'convnext_base.fb_in22k_ft_in1k_384',
1101
+ # 'convnext_large_384_in22ft1k': 'convnext_large.fb_in22k_ft_in1k_384',
1102
+ # 'convnext_xlarge_384_in22ft1k': 'convnext_xlarge.fb_in22k_ft_in1k_384',
1103
+ # 'convnext_tiny_in22k': 'convnext_tiny.fb_in22k',
1104
+ # 'convnext_small_in22k': 'convnext_small.fb_in22k',
1105
+ # 'convnext_base_in22k': 'convnext_base.fb_in22k',
1106
+ # 'convnext_large_in22k': 'convnext_large.fb_in22k',
1107
+ # 'convnext_xlarge_in22k': 'convnext_xlarge.fb_in22k',
1108
+ # })
EAGLE/eagle/model/multimodal_encoder/vision_models/eva_vit.py ADDED
@@ -0,0 +1,1235 @@
1
+ # Copyright 2024 NVIDIA CORPORATION & AFFILIATES
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # This file is modified from https://github.com/baaivision/EVA
16
+
17
+
18
+ import os
19
+ import fvcore.nn.weight_init as weight_init
20
+ import torch
21
+ import torch.nn as nn
22
+ import torch.nn.functional as F
23
+ import math
24
+ import numpy as np
25
+ import logging
26
+ from functools import partial
27
+ from scipy import interpolate
28
+ from math import pi
29
+ from einops import rearrange, repeat
30
+ import warnings
31
+ from PIL import Image
32
+ import torch.utils.checkpoint as cp
33
+ from transformers import CLIPImageProcessor
34
+ # from ..utils.attention import FlashAttention, FlashMHA
35
+ # try:
36
+ # import xformers.ops as xops
37
+ # except:
38
+ # pass
39
+
40
+ logger = logging.getLogger(__name__)
41
+ BatchNorm2d = torch.nn.BatchNorm2d
42
+
43
+ class Conv2d(torch.nn.Conv2d):
44
+ """
45
+ A wrapper around :class:`torch.nn.Conv2d` to support empty inputs and more features.
46
+ """
47
+
48
+ def __init__(self, *args, **kwargs):
49
+ """
50
+ Extra keyword arguments supported in addition to those in `torch.nn.Conv2d`:
51
+ Args:
52
+ norm (nn.Module, optional): a normalization layer
53
+ activation (callable(Tensor) -> Tensor): a callable activation function
54
+ It assumes that norm layer is used before activation.
55
+ """
56
+ norm = kwargs.pop("norm", None)
57
+ activation = kwargs.pop("activation", None)
58
+ super().__init__(*args, **kwargs)
59
+
60
+ self.norm = norm
61
+ self.activation = activation
62
+
63
+ def forward(self, x):
64
+ # torchscript does not support SyncBatchNorm yet
65
+ # https://github.com/pytorch/pytorch/issues/40507
66
+ # and we skip these codes in torchscript since:
67
+ # 1. currently we only support torchscript in evaluation mode
68
+ # 2. features needed by exporting module to torchscript are added in PyTorch 1.6 or
69
+ # later version, `Conv2d` in these PyTorch versions has already supported empty inputs.
70
+ if not torch.jit.is_scripting():
71
+ with warnings.catch_warnings(record=True):
72
+ if x.numel() == 0 and self.training:
73
+ # https://github.com/pytorch/pytorch/issues/12013
74
+ assert not isinstance(
75
+ self.norm, torch.nn.SyncBatchNorm
76
+ ), "SyncBatchNorm does not support empty inputs!"
77
+
78
+ x = F.conv2d(
79
+ x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups
80
+ )
81
+ if self.norm is not None:
82
+ x = self.norm(x)
83
+ if self.activation is not None:
84
+ x = self.activation(x)
85
+ return x
86
+
87
+
88
+ def window_partition(x, window_size):
89
+ """
90
+ Partition into non-overlapping windows with padding if needed.
91
+ Args:
92
+ x (tensor): input tokens with [B, H, W, C].
93
+ window_size (int): window size.
94
+ Returns:
95
+ windows: windows after partition with [B * num_windows, window_size, window_size, C].
96
+ (Hp, Wp): padded height and width before partition
97
+ """
98
+ B, H, W, C = x.shape
99
+
100
+ pad_h = (window_size - H % window_size) % window_size
101
+ pad_w = (window_size - W % window_size) % window_size
102
+ if pad_h > 0 or pad_w > 0:
103
+ x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h))
104
+ Hp, Wp = H + pad_h, W + pad_w
105
+
106
+ x = x.view(B, Hp // window_size, window_size, Wp // window_size, window_size, C)
107
+ windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
108
+ return windows, (Hp, Wp)
109
+
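+ # Shape sketch: for x of shape (2, 50, 50, 256) and window_size=16, H and W are padded to
+ # Hp = Wp = 64, giving windows of shape (2 * 4 * 4, 16, 16, 256) and pad_hw = (64, 64);
+ # window_unpartition below inverts this exactly:
+ # w, pad_hw = window_partition(x, 16)
+ # assert torch.equal(window_unpartition(w, 16, pad_hw, (50, 50)), x)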
110
+
111
+ def window_unpartition(windows, window_size, pad_hw, hw):
112
+ """
113
+ Window unpartition into original sequences and remove padding.
114
+ Args:
115
+ windows (tensor): input tokens with [B * num_windows, window_size, window_size, C].
116
+ window_size (int): window size.
117
+ pad_hw (Tuple): padded height and width (Hp, Wp).
118
+ hw (Tuple): original height and width (H, W) before padding.
119
+ Returns:
120
+ x: unpartitioned sequences with [B, H, W, C].
121
+ """
122
+ Hp, Wp = pad_hw
123
+ H, W = hw
124
+ B = windows.shape[0] // (Hp * Wp // window_size // window_size)
125
+ x = windows.view(B, Hp // window_size, Wp // window_size, window_size, window_size, -1)
126
+ x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1)
127
+
128
+ if Hp > H or Wp > W:
129
+ x = x[:, :H, :W, :].contiguous()
130
+ return x
131
+
132
+
133
+ def get_rel_pos(q_size, k_size, rel_pos):
134
+ """
135
+ Get relative positional embeddings according to the relative positions of
136
+ query and key sizes.
137
+ Args:
138
+ q_size (int): size of query q.
139
+ k_size (int): size of key k.
140
+ rel_pos (Tensor): relative position embeddings (L, C).
141
+ Returns:
142
+ Extracted positional embeddings according to relative positions.
143
+ """
144
+ max_rel_dist = int(2 * max(q_size, k_size) - 1)
145
+ use_log_interpolation = True
146
+
147
+ # Interpolate rel pos if needed.
148
+ if rel_pos.shape[0] != max_rel_dist:
149
+ if not use_log_interpolation:
150
+ # Interpolate rel pos.
151
+ rel_pos_resized = F.interpolate(
152
+ rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1),
153
+ size=max_rel_dist,
154
+ mode="linear",
155
+ )
156
+ rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0)
157
+ else:
158
+ src_size = rel_pos.shape[0]
159
+ dst_size = max_rel_dist
160
+
161
+ # q = 1.13492
162
+ q = 1.0903078
163
+ dis = []
164
+
165
+ cur = 1
166
+ for i in range(src_size // 2):
167
+ dis.append(cur)
168
+ cur += q ** (i + 1)
169
+
170
+ r_ids = [-_ for _ in reversed(dis)]
171
+ x = r_ids + [0] + dis
172
+ t = dst_size // 2.0
173
+ dx = np.arange(-t, t + 0.1, 1.0)
174
+ all_rel_pos_bias = []
175
+ for i in range(rel_pos.shape[1]):
176
+ z = rel_pos[:, i].view(src_size).cpu().float().numpy()
177
+ f = interpolate.interp1d(x, z, kind='cubic', fill_value="extrapolate")
178
+ all_rel_pos_bias.append(
179
+ torch.Tensor(f(dx)).contiguous().view(-1, 1).to(rel_pos.device))
180
+ rel_pos_resized = torch.cat(all_rel_pos_bias, dim=-1)
181
+ else:
182
+ rel_pos_resized = rel_pos
183
+
184
+ # Scale the coords with short length if shapes for q and k are different.
185
+ q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0)
186
+ k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0)
187
+ relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0)
188
+
189
+ return rel_pos_resized[relative_coords.long()]
190
+
191
+
192
+ def add_decomposed_rel_pos(attn, q, rel_pos_h, rel_pos_w, q_size, k_size):
193
+ """
194
+ Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
195
+ https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py # noqa B950
196
+ Args:
197
+ attn (Tensor): attention map.
198
+ q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C).
199
+ rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis.
200
+ rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis.
201
+ q_size (Tuple): spatial sequence size of query q with (q_h, q_w).
202
+ k_size (Tuple): spatial sequence size of key k with (k_h, k_w).
203
+ Returns:
204
+ attn (Tensor): attention map with added relative positional embeddings.
205
+ """
206
+ q_h, q_w = q_size
207
+ k_h, k_w = k_size
208
+ Rh = get_rel_pos(q_h, k_h, rel_pos_h)
209
+ Rw = get_rel_pos(q_w, k_w, rel_pos_w)
210
+
211
+ B, _, dim = q.shape
212
+ r_q = q.reshape(B, q_h, q_w, dim)
213
+ rel_h = torch.einsum("bhwc,hkc->bhwk", r_q, Rh)
214
+ rel_w = torch.einsum("bhwc,wkc->bhwk", r_q, Rw)
215
+
216
+ attn = (
217
+ attn.view(B, q_h, q_w, k_h, k_w) + rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :]
218
+ ).view(B, q_h * q_w, k_h * k_w)
219
+
220
+ return attn
221
+
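+ # Shape sketch: with q_size = k_size = (14, 14) and head dim 64, Rh and Rw are (14, 14, 64),
+ # rel_h / rel_w are (B, 14, 14, 14), and the bias added to attn is the axial sum
+ # rel_h[..., :, None] + rel_w[..., None, :] broadcast over (k_h, k_w), i.e. a decomposed
+ # (height + width) approximation of a full 2-D relative position bias.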
222
+
223
+ def get_abs_pos(abs_pos, has_cls_token, hw):
224
+ """
225
+ Calculate absolute positional embeddings. If needed, resize embeddings and remove cls_token
226
+ dimension for the original embeddings.
227
+ Args:
228
+ abs_pos (Tensor): absolute positional embeddings with (1, num_position, C).
229
+ has_cls_token (bool): If true, has 1 embedding in abs_pos for cls token.
230
+ hw (Tuple): size of input image tokens.
231
+ Returns:
232
+ Absolute positional embeddings after processing with shape (1, H, W, C)
233
+ """
234
+ h, w = hw
235
+ if has_cls_token:
236
+ abs_pos = abs_pos[:, 1:]
237
+ xy_num = abs_pos.shape[1]
238
+ size = int(math.sqrt(xy_num))
239
+ assert size * size == xy_num
240
+
241
+ if size != h or size != w:
242
+ original_datatype = abs_pos.dtype
243
+ new_abs_pos = F.interpolate(
244
+ abs_pos.reshape(1, size, size, -1).permute(0, 3, 1, 2).float(), # bf16 is not implemented
245
+ size=(h, w),
246
+ mode="bicubic",
247
+ align_corners=False,
248
+ ).to(original_datatype)
249
+
250
+ return new_abs_pos.permute(0, 2, 3, 1)
251
+ else:
252
+ return abs_pos.reshape(1, h, w, -1)
253
+
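+ # Example: a 224 / 16 = 14x14 pretraining grid (196 tokens + cls) fed a 1024 / 16 = 64x64 token
+ # grid is bicubically resized from (1, 14, 14, C) to (1, 64, 64, C); the cls-token embedding is
+ # dropped first, since this detection-style backbone keeps only the patch tokens.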
254
+
255
+ class PatchEmbed(nn.Module):
256
+ """
257
+ Image to Patch Embedding.
258
+ """
259
+
260
+ def __init__(
261
+ self, kernel_size=(16, 16), stride=(16, 16), padding=(0, 0), in_chans=3, embed_dim=768
262
+ ):
263
+ """
264
+ Args:
265
+ kernel_size (Tuple): kernel size of the projection layer.
266
+ stride (Tuple): stride of the projection layer.
267
+ padding (Tuple): padding size of the projection layer.
268
+ in_chans (int): Number of input image channels.
269
+ embed_dim (int): Patch embedding dimension.
270
+ """
271
+ super().__init__()
272
+
273
+ self.proj = nn.Conv2d(
274
+ in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding
275
+ )
276
+
277
+ def forward(self, x):
278
+ x = self.proj(x)
279
+ # B C H W -> B H W C
280
+ x = x.permute(0, 2, 3, 1)
281
+ return x
282
+
283
+
284
+ def broadcat(tensors, dim = -1):
285
+ num_tensors = len(tensors)
286
+ shape_lens = set(list(map(lambda t: len(t.shape), tensors)))
287
+ assert len(shape_lens) == 1, 'tensors must all have the same number of dimensions'
288
+ shape_len = list(shape_lens)[0]
289
+ dim = (dim + shape_len) if dim < 0 else dim
290
+ dims = list(zip(*map(lambda t: list(t.shape), tensors)))
291
+ expandable_dims = [(i, val) for i, val in enumerate(dims) if i != dim]
292
+ assert all([*map(lambda t: len(set(t[1])) <= 2, expandable_dims)]), 'invalid dimensions for broadcastable concatenation'
293
+ max_dims = list(map(lambda t: (t[0], max(t[1])), expandable_dims))
294
+ expanded_dims = list(map(lambda t: (t[0], (t[1],) * num_tensors), max_dims))
295
+ expanded_dims.insert(dim, (dim, dims[dim]))
296
+ expandable_shapes = list(zip(*map(lambda t: t[1], expanded_dims)))
297
+ tensors = list(map(lambda t: t[0].expand(*t[1]), zip(tensors, expandable_shapes)))
298
+ return torch.cat(tensors, dim = dim)
299
+
300
+
301
+
302
+ def rotate_half(x):
303
+ x = rearrange(x, '... (d r) -> ... d r', r = 2)
304
+ x1, x2 = x.unbind(dim = -1)
305
+ x = torch.stack((-x2, x1), dim = -1)
306
+ return rearrange(x, '... d r -> ... (d r)')
307
+
308
+
309
+
310
+ class VisionRotaryEmbedding(nn.Module):
311
+ def __init__(
312
+ self,
313
+ dim,
314
+ pt_seq_len,
315
+ ft_seq_len=None,
316
+ custom_freqs = None,
317
+ freqs_for = 'lang',
318
+ theta = 10000,
319
+ max_freq = 10,
320
+ num_freqs = 1,
321
+ ):
322
+ super().__init__()
323
+ if custom_freqs:
324
+ freqs = custom_freqs
325
+ elif freqs_for == 'lang':
326
+ freqs = 1. / (theta ** (torch.arange(0, dim, 2)[:(dim // 2)].float() / dim))
327
+ elif freqs_for == 'pixel':
328
+ freqs = torch.linspace(1., max_freq / 2, dim // 2) * pi
329
+ elif freqs_for == 'constant':
330
+ freqs = torch.ones(num_freqs).float()
331
+ else:
332
+ raise ValueError(f'unknown modality {freqs_for}')
333
+
334
+ if ft_seq_len is None: ft_seq_len = pt_seq_len
335
+ t = torch.arange(ft_seq_len) / ft_seq_len * pt_seq_len
336
+
337
+ freqs_h = torch.einsum('..., f -> ... f', t, freqs)
338
+ freqs_h = repeat(freqs_h, '... n -> ... (n r)', r = 2)
339
+
340
+ freqs_w = torch.einsum('..., f -> ... f', t, freqs)
341
+ freqs_w = repeat(freqs_w, '... n -> ... (n r)', r = 2)
342
+
343
+ freqs = broadcat((freqs_h[:, None, :], freqs_w[None, :, :]), dim = -1)
344
+
345
+ self.register_buffer("freqs_cos", freqs.cos())
346
+ self.register_buffer("freqs_sin", freqs.sin())
347
+
348
+ # print('======== shape of rope freq', self.freqs_cos.shape, '========')
349
+
350
+ def forward(self, t, start_index = 0):
351
+ rot_dim = self.freqs_cos.shape[-1]
352
+ end_index = start_index + rot_dim
353
+ assert rot_dim <= t.shape[-1], f'feature dimension {t.shape[-1]} is not of sufficient size to rotate in all the positions {rot_dim}'
354
+ t_left, t, t_right = t[..., :start_index], t[..., start_index:end_index], t[..., end_index:]
355
+ t = (t * self.freqs_cos) + (rotate_half(t) * self.freqs_sin)
356
+ return torch.cat((t_left, t, t_right), dim = -1)
357
+
358
+
359
+
360
+
361
+ class VisionRotaryEmbeddingFast(nn.Module):
362
+ def __init__(
363
+ self,
364
+ dim,
365
+ pt_seq_len=16,
366
+ ft_seq_len=None,
367
+ custom_freqs = None,
368
+ freqs_for = 'lang',
369
+ theta = 10000,
370
+ max_freq = 10,
371
+ num_freqs = 1,
372
+ ):
373
+ super().__init__()
374
+ if custom_freqs:
375
+ freqs = custom_freqs
376
+ elif freqs_for == 'lang':
377
+ freqs = 1. / (theta ** (torch.arange(0, dim, 2)[:(dim // 2)].float() / dim))
378
+ elif freqs_for == 'pixel':
379
+ freqs = torch.linspace(1., max_freq / 2, dim // 2) * pi
380
+ elif freqs_for == 'constant':
381
+ freqs = torch.ones(num_freqs).float()
382
+ else:
383
+ raise ValueError(f'unknown modality {freqs_for}')
384
+
385
+ if ft_seq_len is None: ft_seq_len = pt_seq_len
386
+ t = torch.arange(ft_seq_len) / ft_seq_len * pt_seq_len
387
+
388
+ freqs = torch.einsum('..., f -> ... f', t, freqs)
389
+ freqs = repeat(freqs, '... n -> ... (n r)', r = 2)
390
+ freqs = broadcat((freqs[:, None, :], freqs[None, :, :]), dim = -1)
391
+
392
+ freqs_cos = freqs.cos().view(-1, freqs.shape[-1])
393
+ freqs_sin = freqs.sin().view(-1, freqs.shape[-1])
394
+
395
+ self.register_buffer("freqs_cos", freqs_cos)
396
+ self.register_buffer("freqs_sin", freqs_sin)
397
+
398
+ # print('======== shape of rope freq', self.freqs_cos.shape, '========')
399
+
400
+ def forward(self, t): return t * self.freqs_cos + rotate_half(t) * self.freqs_sin
401
+
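+ # Rotary sketch: the fast variant precomputes freqs_cos / freqs_sin flattened over the
+ # (ft_seq_len x ft_seq_len) token grid with last dim 2 * `dim`, so constructing it with
+ # dim = head_dim // 2 lets Attention.forward below rotate full q / k of shape
+ # (B, num_heads, H * W, head_dim) via q = rope(q), k = rope(k).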
402
+
403
+ class FrozenBatchNorm2d(nn.Module):
404
+ """
405
+ BatchNorm2d where the batch statistics and the affine parameters are fixed.
406
+ It contains non-trainable buffers called
407
+ "weight" and "bias", "running_mean", "running_var",
408
+ initialized to perform identity transformation.
409
+ The pre-trained backbone models from Caffe2 only contain "weight" and "bias",
410
+ which are computed from the original four parameters of BN.
411
+ The affine transform `x * weight + bias` will perform the equivalent
412
+ computation of `(x - running_mean) / sqrt(running_var) * weight + bias`.
413
+ When loading a backbone model from Caffe2, "running_mean" and "running_var"
414
+ will be left unchanged as identity transformation.
415
+ Other pre-trained backbone models may contain all 4 parameters.
416
+ The forward is implemented by `F.batch_norm(..., training=False)`.
417
+ """
418
+
419
+ _version = 3
420
+
421
+ def __init__(self, num_features, eps=1e-5):
422
+ super().__init__()
423
+ self.num_features = num_features
424
+ self.eps = eps
425
+ self.register_buffer("weight", torch.ones(num_features))
426
+ self.register_buffer("bias", torch.zeros(num_features))
427
+ self.register_buffer("running_mean", torch.zeros(num_features))
428
+ self.register_buffer("running_var", torch.ones(num_features) - eps)
429
+
430
+ def forward(self, x):
431
+ if x.requires_grad:
432
+ # When gradients are needed, F.batch_norm will use extra memory
433
+ # because its backward op computes gradients for weight/bias as well.
434
+ scale = self.weight * (self.running_var + self.eps).rsqrt()
435
+ bias = self.bias - self.running_mean * scale
436
+ scale = scale.reshape(1, -1, 1, 1)
437
+ bias = bias.reshape(1, -1, 1, 1)
438
+ out_dtype = x.dtype # may be half
439
+ return x * scale.to(out_dtype) + bias.to(out_dtype)
440
+ else:
441
+ # When gradients are not needed, F.batch_norm is a single fused op
442
+ # and provides more optimization opportunities.
443
+ return F.batch_norm(
444
+ x,
445
+ self.running_mean,
446
+ self.running_var,
447
+ self.weight,
448
+ self.bias,
449
+ training=False,
450
+ eps=self.eps,
451
+ )
452
+
453
+ def _load_from_state_dict(
454
+ self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
455
+ ):
456
+ version = local_metadata.get("version", None)
457
+
458
+ if version is None or version < 2:
459
+ # No running_mean/var in early versions
460
+ # This will silence the warnings
461
+ if prefix + "running_mean" not in state_dict:
462
+ state_dict[prefix + "running_mean"] = torch.zeros_like(self.running_mean)
463
+ if prefix + "running_var" not in state_dict:
464
+ state_dict[prefix + "running_var"] = torch.ones_like(self.running_var)
465
+
466
+ super()._load_from_state_dict(
467
+ state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
468
+ )
469
+
470
+ def __repr__(self):
471
+ return "FrozenBatchNorm2d(num_features={}, eps={})".format(self.num_features, self.eps)
472
+
473
+ @classmethod
474
+ def convert_frozen_batchnorm(cls, module):
475
+ """
476
+ Convert all BatchNorm/SyncBatchNorm in module into FrozenBatchNorm.
477
+ Args:
478
+ module (torch.nn.Module):
479
+ Returns:
480
+ If module is BatchNorm/SyncBatchNorm, returns a new module.
481
+ Otherwise, in-place convert module and return it.
482
+ Similar to convert_sync_batchnorm in
483
+ https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/batchnorm.py
484
+ """
485
+ bn_module = nn.modules.batchnorm
486
+ bn_module = (bn_module.BatchNorm2d, bn_module.SyncBatchNorm)
487
+ res = module
488
+ if isinstance(module, bn_module):
489
+ res = cls(module.num_features)
490
+ if module.affine:
491
+ res.weight.data = module.weight.data.clone().detach()
492
+ res.bias.data = module.bias.data.clone().detach()
493
+ res.running_mean.data = module.running_mean.data
494
+ res.running_var.data = module.running_var.data
495
+ res.eps = module.eps
496
+ else:
497
+ for name, child in module.named_children():
498
+ new_child = cls.convert_frozen_batchnorm(child)
499
+ if new_child is not child:
500
+ res.add_module(name, new_child)
501
+ return res
502
+
503
+ class LayerNorm(nn.Module):
504
+ """
505
+ A LayerNorm variant, popularized by Transformers, that performs point-wise mean and
506
+ variance normalization over the channel dimension for inputs that have shape
507
+ (batch_size, channels, height, width).
508
+ https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa B950
509
+ """
510
+
511
+ def __init__(self, normalized_shape, eps=1e-6):
512
+ super().__init__()
513
+ self.weight = nn.Parameter(torch.ones(normalized_shape))
514
+ self.bias = nn.Parameter(torch.zeros(normalized_shape))
515
+ self.eps = eps
516
+ self.normalized_shape = (normalized_shape,)
517
+
518
+ def forward(self, x):
519
+ u = x.mean(1, keepdim=True)
520
+ s = (x - u).pow(2).mean(1, keepdim=True)
521
+ x = (x - u) / torch.sqrt(s + self.eps)
522
+ x = self.weight[:, None, None] * x + self.bias[:, None, None]
523
+ return x
524
+
525
+
526
+ class CNNBlockBase(nn.Module):
527
+ """
528
+ A CNN block is assumed to have input channels, output channels and a stride.
529
+ The input and output of `forward()` method must be NCHW tensors.
530
+ The method can perform arbitrary computation but must match the given
531
+ channels and stride specification.
532
+ Attribute:
533
+ in_channels (int):
534
+ out_channels (int):
535
+ stride (int):
536
+ """
537
+
538
+ def __init__(self, in_channels, out_channels, stride):
539
+ """
540
+ The `__init__` method of any subclass should also contain these arguments.
541
+ Args:
542
+ in_channels (int):
543
+ out_channels (int):
544
+ stride (int):
545
+ """
546
+ super().__init__()
547
+ self.in_channels = in_channels
548
+ self.out_channels = out_channels
549
+ self.stride = stride
550
+
551
+ def freeze(self):
552
+ """
553
+ Make this block not trainable.
554
+ This method sets all parameters to `requires_grad=False`,
555
+ and converts all BatchNorm layers to FrozenBatchNorm
556
+ Returns:
557
+ the block itself
558
+ """
559
+ for p in self.parameters():
560
+ p.requires_grad = False
561
+ FrozenBatchNorm2d.convert_frozen_batchnorm(self)
562
+ return self
563
+
564
+ def get_norm(norm, out_channels):
565
+ """
566
+ Args:
567
+ norm (str or callable): either one of BN, SyncBN, FrozenBN, GN;
568
+ or a callable that takes a channel number and returns
569
+ the normalization layer as a nn.Module.
570
+ Returns:
571
+ nn.Module or None: the normalization layer
572
+ """
573
+ if norm is None:
574
+ return None
575
+ if isinstance(norm, str):
576
+ if len(norm) == 0:
577
+ return None
578
+ norm = {
579
+ "BN": BatchNorm2d,
580
+ # Fixed in https://github.com/pytorch/pytorch/pull/36382
581
+ "SyncBN": nn.SyncBatchNorm,
582
+ "FrozenBN": FrozenBatchNorm2d,
583
+ "GN": lambda channels: nn.GroupNorm(32, channels),
584
+ # for debugging:
585
+ "nnSyncBN": nn.SyncBatchNorm,
586
+ "LN": lambda channels: LayerNorm(channels)
587
+ }[norm]
588
+ return norm(out_channels)
589
+
590
+ class DropPath(nn.Module):
591
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
592
+ """
593
+
594
+ def __init__(self, drop_prob=None):
595
+ super(DropPath, self).__init__()
596
+ self.drop_prob = drop_prob
597
+
598
+ def forward(self, x):
599
+ if self.drop_prob == 0. or not self.training:
600
+ return x
601
+ keep_prob = 1 - self.drop_prob
602
+ # work with diff dim tensors, not just 2D ConvNets
603
+ shape = (x.shape[0],) + (1,) * (x.ndim - 1)
604
+ random_tensor = keep_prob + \
605
+ torch.rand(shape, dtype=x.dtype, device=x.device)
606
+ random_tensor.floor_() # binarize
607
+ output = x.div(keep_prob) * random_tensor
608
+ return output
609
+
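+ # DropPath note: each sample's residual branch is kept with probability 1 - drop_prob and the
+ # survivors are rescaled by 1 / keep_prob, so the output matches x in expectation; e.g. with
+ # drop_prob=0.1 a path survives with p=0.9 and surviving activations are multiplied by 1/0.9.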
610
+
611
+
612
+ class SwiGLU(nn.Module):
613
+ def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.SiLU, drop=0.,
614
+ norm_layer=nn.LayerNorm, subln=False
615
+ ):
616
+ super().__init__()
617
+ out_features = out_features or in_features
618
+ hidden_features = hidden_features or in_features
619
+
620
+ self.w1 = nn.Linear(in_features, hidden_features)
621
+ self.w2 = nn.Linear(in_features, hidden_features)
622
+
623
+ self.act = act_layer()
624
+ self.ffn_ln = norm_layer(hidden_features) if subln else nn.Identity()
625
+ self.w3 = nn.Linear(hidden_features, out_features)
626
+
627
+ self.drop = nn.Dropout(drop)
628
+
629
+ def forward(self, x):
630
+ x1 = self.w1(x)
631
+ x2 = self.w2(x)
632
+ hidden = self.act(x1) * x2
633
+ x = self.ffn_ln(hidden)
634
+ x = self.w3(x)
635
+ x = self.drop(x)
636
+ return x
637
+
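+ # SwiGLU note: the block computes w3(ffn_ln(SiLU(w1(x)) * w2(x))) followed by dropout, i.e. a
+ # gated MLP where the SiLU(w1(x)) branch gates w2(x); with subln=True the extra LayerNorm
+ # (ffn_ln) is applied to the gated hidden state before the output projection.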
638
+
639
+ class Attention(nn.Module):
640
+ def __init__(
641
+ self,
642
+ dim,
643
+ num_heads=8,
644
+ qkv_bias=True,
645
+ qk_scale=None,
646
+ attn_head_dim=None,
647
+ norm_layer=nn.LayerNorm,
648
+ rope=None,
649
+ xattn=True,
650
+ subln=False
651
+ ):
652
+ super().__init__()
653
+ self.num_heads = num_heads
654
+ head_dim = dim // num_heads
655
+ if attn_head_dim is not None:
656
+ head_dim = attn_head_dim
657
+ all_head_dim = head_dim * self.num_heads
658
+ self.scale = qk_scale or head_dim ** -0.5
659
+
660
+ self.subln = subln
661
+ self.q_proj = nn.Linear(dim, all_head_dim, bias=False)
662
+ self.k_proj = nn.Linear(dim, all_head_dim, bias=False)
663
+ self.v_proj = nn.Linear(dim, all_head_dim, bias=False)
664
+
665
+ if qkv_bias:
666
+ self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
667
+ self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
668
+ else:
669
+ self.q_bias = None
670
+ self.v_bias = None
671
+
672
+ self.rope = rope
673
+ self.xattn = xattn
674
+ self.proj = nn.Linear(all_head_dim, dim)
675
+ self.inner_attn_ln = norm_layer(all_head_dim) if subln else nn.Identity()
676
+
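+ # NOTE: the xattn=True path below relies on FlashAttention from the import commented out at the
+ # top of this file (`from ..utils.attention import FlashAttention`); without that dependency,
+ # construct Attention with xattn=False to use the plain softmax attention path instead.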
677
+ if self.xattn:
678
+ factory_kwargs = {'device': 'cuda', 'dtype': torch.float16}
679
+ self.inner_attn = FlashAttention(attention_dropout=0.0, **factory_kwargs)
680
+
681
+ def forward(self, x):
682
+ B, H, W, C = x.shape
683
+ x = x.view(B, -1, C)
684
+ N = H * W
685
+
686
+ q = F.linear(input=x, weight=self.q_proj.weight, bias=self.q_bias)
687
+ k = F.linear(input=x, weight=self.k_proj.weight, bias=None)
688
+ v = F.linear(input=x, weight=self.v_proj.weight, bias=self.v_bias)
689
+
690
+ q = q.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3) # B, num_heads, N, C
691
+ k = k.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3)
692
+ v = v.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3)
693
+
694
+ ## rope
695
+ q = self.rope(q).type_as(v)
696
+ k = self.rope(k).type_as(v)
697
+
698
+ if self.xattn:
699
+ q = q.permute(0, 2, 1, 3) # B, num_heads, N, C -> B, N, num_heads, C
700
+ k = k.permute(0, 2, 1, 3)
701
+ v = v.permute(0, 2, 1, 3)
702
+
703
+ kv = torch.stack([k, v], dim=2)
704
+ x, attn_weights = self.inner_attn(q, kv, key_padding_mask=None, causal=False)
705
+ # x = xops.memory_efficient_attention(q, k, v)
706
+ x = x.reshape(B, N, -1)
707
+ x = self.inner_attn_ln(x)
708
+ else:
709
+ q = q * self.scale
710
+ attn = (q @ k.transpose(-2, -1))
711
+ attn = attn.softmax(dim=-1).type_as(x)
712
+ x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
713
+ x = self.inner_attn_ln(x)
714
+
715
+ x = self.proj(x)
716
+ x = x.view(B, H, W, C)
717
+
718
+ return x
719
+
720
+
721
+ class ResBottleneckBlock(CNNBlockBase):
722
+ """
723
+ The standard bottleneck residual block without the last activation layer.
724
+ It contains 3 conv layers with kernels 1x1, 3x3, 1x1.
725
+ """
726
+
727
+ def __init__(
728
+ self,
729
+ in_channels,
730
+ out_channels,
731
+ bottleneck_channels,
732
+ norm="LN",
733
+ act_layer=nn.GELU,
734
+ ):
735
+ """
736
+ Args:
737
+ in_channels (int): Number of input channels.
738
+ out_channels (int): Number of output channels.
739
+ bottleneck_channels (int): number of output channels for the 3x3
740
+ "bottleneck" conv layers.
741
+ norm (str or callable): normalization for all conv layers.
742
+ See :func:`layers.get_norm` for supported format.
743
+ act_layer (callable): activation for all conv layers.
744
+ """
745
+ super().__init__(in_channels, out_channels, 1)
746
+
747
+ self.conv1 = Conv2d(in_channels, bottleneck_channels, 1, bias=False)
748
+ self.norm1 = get_norm(norm, bottleneck_channels)
749
+ self.act1 = act_layer()
750
+
751
+ self.conv2 = Conv2d(
752
+ bottleneck_channels,
753
+ bottleneck_channels,
754
+ 3,
755
+ padding=1,
756
+ bias=False,
757
+ )
758
+ self.norm2 = get_norm(norm, bottleneck_channels)
759
+ self.act2 = act_layer()
760
+
761
+ self.conv3 = Conv2d(bottleneck_channels, out_channels, 1, bias=False)
762
+ self.norm3 = get_norm(norm, out_channels)
763
+
764
+ for layer in [self.conv1, self.conv2, self.conv3]:
765
+ weight_init.c2_msra_fill(layer)
766
+ for layer in [self.norm1, self.norm2]:
767
+ layer.weight.data.fill_(1.0)
768
+ layer.bias.data.zero_()
769
+ # zero init last norm layer.
770
+ self.norm3.weight.data.zero_()
771
+ self.norm3.bias.data.zero_()
772
+
773
+ def forward(self, x):
774
+ out = x
775
+ for layer in self.children():
776
+ out = layer(out)
777
+
778
+ out = x + out
779
+ return out
780
+
781
+
782
+ class Block(nn.Module):
783
+ """Transformer blocks with support of window attention and residual propagation blocks"""
784
+
785
+ def __init__(
786
+ self,
787
+ dim,
788
+ num_heads,
789
+ mlp_ratio=4*2/3,
790
+ qkv_bias=True,
791
+ drop_path=0.0,
792
+ norm_layer=partial(nn.LayerNorm, eps=1e-6),
793
+ window_size=0,
794
+ use_residual_block=False,
795
+ rope=None,
796
+ xattn=True,
797
+ subln=False,
798
+ # with_cp=True,
799
+ ):
800
+ """
801
+ Args:
802
+ dim (int): Number of input channels.
803
+ num_heads (int): Number of attention heads in each ViT block.
804
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
805
+ qkv_bias (bool): If True, add a learnable bias to query, key, value.
806
+ drop_path (float): Stochastic depth rate.
807
+ norm_layer (nn.Module): Normalization layer.
808
+ act_layer (nn.Module): Activation layer.
809
+ use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
810
+ rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
811
+ window_size (int): Window size for window attention blocks. If it equals 0, window
812
+ attention is not used.
813
+ use_residual_block (bool): If True, use a residual block after the MLP block.
814
+ input_size (int or None): Input resolution for calculating the relative positional
815
+ parameter size.
816
+ """
817
+ super().__init__()
818
+ self.norm1 = norm_layer(dim)
819
+ self.attn = Attention(
820
+ dim,
821
+ num_heads=num_heads,
822
+ qkv_bias=qkv_bias,
823
+ rope=rope,
824
+ xattn=xattn,
825
+ subln=subln
826
+ )
827
+
828
+
829
+ # self.with_cp = with_cp
830
+ self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
831
+ self.norm2 = norm_layer(dim)
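+ # SwiGLU FFN uses three projections, so mlp_ratio = 4*2/3 (hidden dim ~ 8/3 * dim) keeps its parameter count on par with a standard 4x two-layer GELU MLP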
832
+ self.mlp = SwiGLU(
833
+ in_features=dim,
834
+ hidden_features=int(dim * mlp_ratio),
835
+ subln=True,
836
+ norm_layer=norm_layer,
837
+ )
838
+
839
+ self.window_size = window_size
840
+
841
+ self.use_residual_block = use_residual_block
842
+ if use_residual_block:
843
+ # Use a residual block with bottleneck channel as dim // 2
844
+ self.residual = ResBottleneckBlock(
845
+ in_channels=dim,
846
+ out_channels=dim,
847
+ bottleneck_channels=dim // 2,
848
+ norm="LN",
849
+ )
850
+
851
+ def _forward(self, x):
852
+ shortcut = x
853
+ x = self.norm1(x)
854
+
855
+ # Window partition
856
+ if self.window_size > 0:
857
+ H, W = x.shape[1], x.shape[2]
858
+ x, pad_hw = window_partition(x, self.window_size)
859
+
860
+ x = self.attn(x)
861
+
862
+ # Reverse window partition
863
+ if self.window_size > 0:
864
+ x = window_unpartition(x, self.window_size, pad_hw, (H, W))
865
+
866
+ x = shortcut + self.drop_path(x)
867
+ x = x + self.drop_path(self.mlp(self.norm2(x)))
868
+
869
+ if self.use_residual_block:
870
+ x = self.residual(x.permute(0, 3, 1, 2)).permute(0, 2, 3, 1)
871
+
872
+ return x
873
+
874
+ def forward(self, x, with_cp=False):
875
+ # if self.with_cp and self.training:
876
+ if with_cp:
877
+ x = cp.checkpoint(self._forward, x)
878
+ else:
879
+ x = self._forward(x)
880
+ return x
881
+
882
+ #@BACKBONES.register_module()
883
+ class EVAViT(nn.Module):
884
+ """
885
+ This module implements Vision Transformer (ViT) backbone in :paper:`vitdet`.
886
+ "Exploring Plain Vision Transformer Backbones for Object Detection",
887
+ https://arxiv.org/abs/2203.16527
888
+ """
889
+
890
+ def __init__(
891
+ self,
892
+ img_size=1024,
893
+ patch_size=16,
894
+ in_chans=3,
895
+ embed_dim=768,
896
+ depth=12,
897
+ num_heads=12,
898
+ mlp_ratio=4*2/3,
899
+ qkv_bias=True,
900
+ drop_path_rate=0.0,
901
+ norm_layer=partial(nn.LayerNorm, eps=1e-6),
902
+ act_layer=nn.GELU,
903
+ use_abs_pos=True,
904
+ use_rel_pos=False,
905
+ # sim_fpn=None,
906
+ rope=True,
907
+ pt_hw_seq_len=16,
908
+ intp_freq=True,
909
+ window_size=0,
910
+ global_window_size=0,
911
+ window_block_indexes=(),
912
+ residual_block_indexes=(),
913
+ pretrain_img_size=224,
914
+ pretrain_use_cls_token=True,
915
+ out_feature="last_feat",
916
+ subln=False,
917
+ xattn=True,
918
+ # with_cp=True,
919
+ frozen=False,
920
+ ):
921
+ """
922
+ Args:
923
+ img_size (int): Input image size.
924
+ patch_size (int): Patch size.
925
+ in_chans (int): Number of input image channels.
926
+ embed_dim (int): Patch embedding dimension.
927
+ depth (int): Depth of ViT.
928
+ num_heads (int): Number of attention heads in each ViT block.
929
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
930
+ qkv_bias (bool): If True, add a learnable bias to query, key, value.
931
+ drop_path_rate (float): Stochastic depth rate.
932
+ norm_layer (nn.Module): Normalization layer.
933
+ act_layer (nn.Module): Activation layer.
934
+ use_abs_pos (bool): If True, use absolute positional embeddings.
935
+ use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
936
+ rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
937
+ window_size (int): Window size for window attention blocks.
938
+ window_block_indexes (list): Indexes for blocks using window attention.
939
+ residual_block_indexes (list): Indexes for blocks using conv propagation.
940
+ use_act_checkpoint (bool): If True, use activation checkpointing.
941
+ pretrain_img_size (int): input image size for pretraining models.
942
+ pretrain_use_cls_token (bool): If True, pretraining models use a class token.
943
+ out_feature (str): name of the feature from the last block.
944
+ """
945
+ super().__init__()
946
+ self.pretrain_use_cls_token = pretrain_use_cls_token
947
+ self.patch_embed = PatchEmbed(
948
+ kernel_size=(patch_size, patch_size),
949
+ stride=(patch_size, patch_size),
950
+ in_chans=in_chans,
951
+ embed_dim=embed_dim,
952
+ )
953
+ self.frozen = frozen
954
+ self.gradient_checkpointing = False
955
+
956
+ if use_abs_pos:
957
+ # Initialize absolute positional embedding with pretrain image size.
958
+ num_patches = (pretrain_img_size // patch_size) * (pretrain_img_size // patch_size)
959
+ num_positions = (num_patches + 1) if pretrain_use_cls_token else num_patches
960
+ self.pos_embed = nn.Parameter(torch.zeros(1, num_positions, embed_dim))
961
+ else:
962
+ self.pos_embed = None
963
+
964
+ half_head_dim = embed_dim // num_heads // 2
965
+ hw_seq_len = img_size // patch_size
966
+
967
+ self.rope_win = VisionRotaryEmbeddingFast(
968
+ dim=half_head_dim,
969
+ pt_seq_len=pt_hw_seq_len,
970
+ ft_seq_len=window_size if intp_freq else None,
971
+ )
972
+ self.rope_glb = VisionRotaryEmbeddingFast(
973
+ dim=half_head_dim,
974
+ pt_seq_len=pt_hw_seq_len,
975
+ ft_seq_len=hw_seq_len if intp_freq else None,
976
+ )
977
+
978
+ # stochastic depth decay rule
979
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]
980
+
981
+ self.blocks = nn.ModuleList()
982
+ for i in range(depth):
983
+ block = Block(
984
+ dim=embed_dim,
985
+ num_heads=num_heads,
986
+ mlp_ratio=mlp_ratio,
987
+ qkv_bias=qkv_bias,
988
+ drop_path=dpr[i],
989
+ norm_layer=norm_layer,
990
+ window_size=window_size if i in window_block_indexes else global_window_size,
991
+ use_residual_block=i in residual_block_indexes,
992
+ rope=self.rope_win if i in window_block_indexes else self.rope_glb,
993
+ xattn=xattn,
994
+ subln=subln,
995
+ # with_cp=with_cp,
996
+ )
997
+
998
+ self.blocks.append(block)
999
+
1000
+ self._out_feature_channels = {out_feature: embed_dim}
1001
+ self._out_feature_strides = {out_feature: patch_size}
1002
+ self._out_features = [out_feature]
1003
+
1004
+ if self.pos_embed is not None:
1005
+ nn.init.normal_(self.pos_embed, std=0.02)
1006
+
1007
+ self._freeze_stages()
1008
+
1009
+ def _freeze_stages(self):
1010
+ if self.frozen:
1011
+ self.eval()
1012
+ for m in self.parameters():
1013
+ m.requires_grad = False
1014
+
1015
+ def forward(self, x):
1016
+ x = self.patch_embed(x)
1017
+ if self.pos_embed is not None:
1018
+ x = x + get_abs_pos(
1019
+ self.pos_embed, self.pretrain_use_cls_token, (x.shape[1], x.shape[2])
1020
+ )
1021
+
1022
+ for blk in self.blocks:
1023
+ x = blk(x, with_cp=self.gradient_checkpointing) # b, h, w, c
1024
+ x = x.permute(0, 3, 1, 2) # b, c, h, w
1025
+
1026
+ return x
1027
+
1028
+
1029
+ class EVAVITVisionTower(nn.Module):
1030
+ def __init__(self, vision_tower, args, delay_load=False):
1031
+ super().__init__()
1032
+
1033
+ self.is_loaded = False
1034
+ self.vision_tower_name = vision_tower
1035
+ self.select_layer = args.mm_vision_select_layer # NOTE: not implemented yet, this parameter has no effect
1036
+ self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
1037
+
1038
+ self.args = args
1039
+ self.vision_tower, vision_tower_config = build_eva_vit(args=args,
1040
+ model_name=vision_tower,
1041
+ image_size=args.input_image_size
1042
+ )
1043
+ self.input_image_size=args.input_image_size
1044
+ self.vision_tower.config = vision_tower_config
1045
+ self.freeze_vision = args.freeze_vision
1046
+
1047
+ if not self.is_loaded:
1048
+ self.load_model()
1049
+
1050
+
1051
+ def load_model(self):
1052
+ if self.is_loaded:
1053
+ return
1054
+
1055
+ # hardcode
1056
+ self.image_processor = CLIPImageProcessor(crop_size={"height": self.args.input_image_size, "width": self.args.input_image_size},
1057
+ size={'shortest_edge': self.args.input_image_size},
1058
+ image_mean=[0.48145466, 0.4578275, 0.40821073],
1059
+ image_std=[0.26862954, 0.26130258, 0.27577711])
1060
+
1061
+ # load weights
1062
+ if self.args.vision_tower_pretrained_from is not None:
1063
+ if not os.path.exists(self.args.vision_tower_pretrained_from):
1064
+ import warnings
1065
+ warnings.warn("The vision tower weights for EVA-02 vision tower does not exists, this will cause problem if you are training the model from scratch!")
1066
+ self.is_loaded = True
1067
+ return
1068
+
1069
+ pretrained_params = torch.load(self.args.vision_tower_pretrained_from)
1070
+ if 'ema_state' in pretrained_params:
1071
+ pretrained_params = pretrained_params['ema_state']
1072
+ elif 'module' in pretrained_params:
1073
+ pretrained_params = pretrained_params['module']
1074
+
1075
+ from collections import OrderedDict
1076
+ new_params = OrderedDict()
1077
+
1078
+ kw = ""
1079
+ if "det" in self.args.vision_tower_pretrained_from.lower():
1080
+ kw = "backbone.net."
1081
+ elif "clip" in self.args.vision_tower_pretrained_from.lower():
1082
+ kw = "visual."
1083
+
1084
+ for k, v in pretrained_params.items():
1085
+ if len(kw) > 0:
1086
+ if kw in k and ("rope" not in k):
1087
+ new_params[k.replace(kw, "")] = v
1088
+ else:
1089
+ if "rope" not in k:
1090
+ new_params[k] = v
1091
+
1092
+ incompatiblekeys = self.vision_tower.load_state_dict(new_params, strict=False)
1093
+ for k in incompatiblekeys[0]:
1094
+ if "rope" not in k:
1095
+ warnings.warn(f"Find incompatible keys {k} in state dict.")
1096
+
1097
+
1098
+ if self.freeze_vision:
1099
+ self.vision_tower.requires_grad_(False)
1100
+
1101
+ self.is_loaded = True
1102
+
1103
+
1104
+ # @torch.no_grad()
1105
+ def forward(self, images):
1106
+ if type(images) is list:
1107
+ image_features = []
1108
+ for image in images:
1109
+ image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0))
1110
+ image_feature = image_forward_out.flatten(2,3).transpose(1,2) # b, n, c
1111
+ image_features.append(image_feature)
+ return image_features  # list inputs: return one (b, n, c) feature tensor per image
1112
+ else:
1113
+ image_forward_out = self.vision_tower(images.to(device=self.device, dtype=self.dtype))
1114
+
1115
+ return image_forward_out
1116
+
1117
+ @property
1118
+ def dummy_feature(self):
1119
+ return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
1120
+
1121
+ @property
1122
+ def dtype(self):
1123
+ return next(self.vision_tower.parameters()).dtype
1124
+
1125
+ @property
1126
+ def device(self):
1127
+ return next(self.vision_tower.parameters()).device
1128
+
1129
+ @property
1130
+ def config(self):
1131
+ return self.vision_tower.config
1132
+
1133
+ @property
1134
+ def hidden_size(self):
1135
+ #return self.config.hidden_size
1136
+ return self.config['hidden_dim']
1137
+
1138
+ @property
1139
+ def num_patches(self):
1140
+ # return (self.config.image_size // self.config.patch_size) ** 2
1141
+ return self.config['num_patches']
1142
+
1143
+
1144
+ def build_eva_vit(args,
1145
+ model_name=None,
1146
+ image_size=224,
1147
+ window_attn=True
1148
+ ):
1149
+
1150
+ if "336" in args.vision_tower_pretrained_from:
1151
+ pretrained_image_size = 336
1152
+ else:
1153
+ pretrained_image_size = 224
1154
+
1155
+ if "clip" in args.vision_tower_pretrained_from.lower():
1156
+ subln = True
1157
+ else:
1158
+ subln = False
1159
+
1160
+ if model_name == 'eva02-l-16':
1161
+ # Shilong suggested using this checkpoint: https://huggingface.co/Yuxin-CV/EVA-02/blob/main/eva02/det/eva02_L_coco_det_sys_o365.pth
1162
+ if window_attn:
1163
+ window_block_indexes = (list(range(0, 2)) + list(range(3, 5)) + list(range(6, 8)) + list(range(9, 11)) + list(range(12, 14)) + list(range(15, 17)) + list(range(18, 20)) + list(range(21, 23)))
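+ # blocks 2, 5, 8, ..., 23 keep global attention; all remaining blocks use 16x16 window attention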
1164
+ else:
1165
+ window_block_indexes = ()
1166
+
1167
+ model = EVAViT(
1168
+ img_size=image_size,
1169
+ patch_size=16,
1170
+ window_size=16,
1171
+ in_chans=3,
1172
+ embed_dim=1024,
1173
+ depth=24,
1174
+ num_heads=16,
1175
+ mlp_ratio=4*2/3,
1176
+ window_block_indexes = window_block_indexes,
1177
+ qkv_bias=True,
1178
+ drop_path_rate=0.0,
1179
+ xattn=False,
1180
+ # with_cp=False,
1181
+ # frozen=True,
1182
+ )
1183
+ # image_size = 224 # HARDCODE
1184
+ eva_config = dict(image_size=image_size,
1185
+ patch_size=16,
1186
+ window_size=16,
1187
+ hidden_dim=1024,
1188
+ depth=24,
1189
+ num_heads=16,
1190
+ window_block_indexes=window_block_indexes,
1191
+ num_patches=image_size ** 2 // 16 ** 2,
1192
+ pretrained_from=args.vision_tower_pretrained_from
1193
+ )
1194
+
1195
+ elif model_name == 'eva02-l-14':
1196
+ # Shilong suggested using this checkpoint: https://huggingface.co/Yuxin-CV/EVA-02/blob/main/eva02/det/eva02_L_coco_det_sys_o365.pth
1197
+ if window_attn:
1198
+ window_block_indexes = (list(range(0, 2)) + list(range(3, 5)) + list(range(6, 8)) + list(range(9, 11)) + list(range(12, 14)) + list(range(15, 17)) + list(range(18, 20)) + list(range(21, 23)))
1199
+ else:
1200
+ window_block_indexes = ()
1201
+
1202
+ model = EVAViT(
1203
+ img_size=image_size,
1204
+ pretrain_img_size=pretrained_image_size,
1205
+ patch_size=14,
1206
+ window_size=16,
1207
+ in_chans=3,
1208
+ embed_dim=1024,
1209
+ depth=24,
1210
+ num_heads=16,
1211
+ mlp_ratio=4*2/3,
1212
+ window_block_indexes = window_block_indexes,
1213
+ qkv_bias=True,
1214
+ drop_path_rate=0.0,
1215
+ xattn=False,
1216
+ # with_cp=False,
1217
+ subln=subln,
1218
+ # frozen=True,
1219
+ )
1220
+ # image_size = 224 # HARDCODE
1221
+ eva_config = dict(image_size=image_size,
1222
+ patch_size=14,
1223
+ window_size=16,
1224
+ hidden_dim=1024,
1225
+ depth=24,
1226
+ num_heads=16,
1227
+ window_block_indexes=window_block_indexes,
1228
+ num_patches=image_size ** 2 // 14 ** 2,
1229
+ pretrained_from=args.vision_tower_pretrained_from
1230
+ )
1231
+
1232
+ else:
1233
+ raise NotImplementedError
1234
+
1235
+ return model, eva_config
EAGLE/eagle/model/multimodal_projector/__init__.py ADDED
File without changes
EAGLE/eagle/model/multimodal_projector/builder.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import re
4
+
5
+ class IdentityMap(nn.Module):
6
+ def __init__(self):
7
+ super().__init__()
8
+
9
+ def forward(self, x, *args, **kwargs):
10
+ return x
11
+
12
+ @property
13
+ def config(self):
14
+ return {"mm_projector_type": 'identity'}
15
+
16
+
17
+ class SimpleResBlock(nn.Module):
18
+ def __init__(self, channels):
19
+ super().__init__()
20
+ self.pre_norm = nn.LayerNorm(channels)
21
+
22
+ self.proj = nn.Sequential(
23
+ nn.Linear(channels, channels),
24
+ nn.GELU(),
25
+ nn.Linear(channels, channels)
26
+ )
27
+ def forward(self, x):
28
+ x = self.pre_norm(x)
29
+ return x + self.proj(x)
30
+
31
+
32
+ def build_vision_projector(config, delay_load=False, fpn_input_dim=[], **kwargs):
33
+ projector_type = getattr(config, 'mm_projector_type', 'linear')
34
+
35
+ if projector_type == 'linear':
36
+ return nn.Linear(config.mm_hidden_size, config.hidden_size)
37
+
38
+ mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type)
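+ # e.g. 'mlp2x_gelu' builds Linear(mm_hidden_size, hidden_size) -> GELU -> Linear(hidden_size, hidden_size)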
39
+ if mlp_gelu_match:
40
+ mlp_depth = int(mlp_gelu_match.group(1))
41
+ modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)]
42
+ for _ in range(1, mlp_depth):
43
+ modules.append(nn.GELU())
44
+ modules.append(nn.Linear(config.hidden_size, config.hidden_size))
45
+ return nn.Sequential(*modules)
46
+
47
+ if projector_type == 'identity':
48
+ return IdentityMap()
49
+
50
+ raise ValueError(f'Unknown projector type: {projector_type}')
EAGLE/lmms_eval/api/__init__.py ADDED
File without changes
EAGLE/lmms_eval/api/filter.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from typing import List
3
+
4
+ from lmms_eval.api.instance import Instance
5
+ from datasets import Dataset
6
+
7
+
8
+ class Filter:
9
+ """
10
+ Filter classes operate on a per-task level.
11
+ They take all model outputs (`instance.resps` for all `task.instances`)
12
+ across all instances of a task, and perform operations.
13
+ In a single run, one can configure any number of separate filters or lists of filters.
14
+
15
+ """
16
+
17
+ def __init__(self, *args, **kwargs) -> None:
18
+ """
19
+ Can define custom behavior here, if an individual instantiation of a Filter class should have state.
20
+ """
21
+
22
+ def apply(self, resps, docs):
23
+ """
24
+ Defines the operation to perform on a list of the `inst.resps` properties of `Instance` objects.
25
+ Should return the list of (filtered) response lists *in the same order as they were input*, e.g.
26
+ if passed [<inst.resps for instance 0>, <inst.resps for instance 1>], it should return
27
+ [<filtered resps for instance 0>, <filtered resps for instance 1>]
28
+ """
29
+ return resps
30
+
31
+
32
+ @dataclass
33
+ class FilterEnsemble:
34
+ """
35
+ FilterEnsemble creates a pipeline applying multiple filters.
36
+ Its intended usage is to stack multiple post-processing steps in order.
37
+ `task.apply_filters` should use a list of FilterEnsemble classes that it stores, to apply each
38
+ pipeline separately.
39
+ """
40
+
41
+ name: str
42
+ filters: List[Filter]
43
+
44
+ def apply(self, instances: List[Instance], docs: List[Dataset]) -> None:
45
+ resps = [inst.resps for inst in instances] # operate just on the model responses
46
+ for f in self.filters:
47
+ # apply filters in sequence
48
+ resps = f.apply(resps, docs)
49
+
50
+ # add the end results after filtering to filtered_requests of their respective source instances.
51
+ # has key `self.name`: each FilterEnsemble applied in a given run should use a different name.
52
+ for inst, resp in zip(instances, resps):
53
+ inst.filtered_resps[self.name] = resp
EAGLE/lmms_eval/api/instance.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass, field
2
+ from typing import Literal, Tuple
3
+
4
+
5
+ @dataclass
6
+ class Instance:
7
+ request_type: Literal["loglikelihood", "generate_until"]
8
+ arguments: tuple
9
+ idx: int
10
+ metadata: Tuple[str, int, int] = field(default_factory=lambda: (None, None, None)) # TODO: better typehints here
11
+ resps: list = field(default_factory=list)
12
+ filtered_resps: dict = field(default_factory=dict)
13
+
14
+ # initialized after init
15
+ task_name: str = None
16
+ doc_id: str = None
17
+ repeats: str = None
18
+ doc: dict = None
19
+
20
+ def __post_init__(self) -> None:
21
+ # unpack metadata field
22
+ self.task_name, self.doc_id, self.repeats = self.metadata
23
+
24
+ @property
25
+ def args(self):
26
+ """
27
+ Returns (string,) where `string` is the string to calculate loglikelihood over
28
+ """
29
+ return self.arguments if isinstance(self.arguments, tuple) else (self.arguments,)
EAGLE/lmms_eval/api/metrics.py ADDED
@@ -0,0 +1,431 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from collections.abc import Iterable
3
+
4
+ import numpy as np
5
+ import sacrebleu
6
+ import sklearn.metrics
7
+ import random
8
+ import evaluate
9
+ import torch
10
+
11
+ from lmms_eval.api.registry import register_metric, register_aggregation
12
+
13
+ import logging
14
+
15
+ eval_logger = logging.getLogger("lmms-eval")
16
+
17
+
18
+ # Register Aggregations First
19
+ @register_aggregation("mean")
20
+ def mean(arr):
21
+ return sum(arr) / len(arr)
22
+
23
+
24
+ @register_aggregation("median")
25
+ def median(arr):
26
+ return arr[len(arr) // 2]
27
+
28
+
29
+ # Certain metrics must be calculated across all documents in a benchmark.
30
+ # We use them as aggregation metrics, paired with no-op passthrough metric fns.
31
+ @register_aggregation("perplexity")
32
+ def perplexity(items):
33
+ # return math.exp(-mean(items))
34
+ items = torch.exp(torch.tensor(items)).tolist()
35
+ return sum(items) / len(items)
36
+
37
+
38
+ @register_aggregation("weighted_perplexity")
39
+ def weighted_perplexity(items):
40
+ return math.exp(-weighted_mean(items))
41
+
42
+
43
+ @register_aggregation("bits_per_byte")
44
+ def bits_per_byte(items):
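+ # converts the (negative) weighted-mean log-likelihood from nats to bits per byte by dividing by ln(2)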
45
+ return -weighted_mean(items) / math.log(2)
46
+
47
+
48
+ @register_aggregation("f1")
49
+ def f1_score(items):
50
+ unzipped_list = list(zip(*items))
51
+ golds = unzipped_list[0]
52
+ preds = unzipped_list[1]
53
+ fscore = sklearn.metrics.f1_score(golds, preds)
54
+
55
+ return np.max(fscore)
56
+
57
+
58
+ @register_aggregation("matthews_corrcoef")
59
+ def matthews_corrcoef(items):
60
+ unzipped_list = list(zip(*items))
61
+ golds = unzipped_list[0]
62
+ preds = unzipped_list[1]
63
+ # print(preds)
64
+ return sklearn.metrics.matthews_corrcoef(golds, preds)
65
+
66
+
67
+ @register_aggregation("bleu")
68
+ def bleu(items):
69
+ """The Bilingual Evaluation Understudy Score, or BLEU for short, is a metric
70
+ for evaluating a generated sentence to a reference sentence. It counts matching
71
+ n-grams in the candidate translation to n-grams in the reference text, where
72
+ 1-gram or unigram would be each token and a bigram comparison would be each
73
+ word pair. The comparison is made regardless of word order
74
+ Source: https://machinelearningmastery.com/calculate-bleu-score-for-text-python/
75
+ Paper: https://www.aclweb.org/anthology/P02-1040/
76
+
77
+ Higher is better
78
+ """
79
+ refs = list(zip(*items))[0]
80
+ preds = list(zip(*items))[1]
81
+ refs, preds = _sacreformat(refs, preds)
82
+ return sacrebleu.corpus_bleu(preds, refs).score
83
+
84
+
85
+ @register_aggregation("chrf")
86
+ def chrf(items):
87
+ """chrF++ is a tool for automatic evaluation of machine translation output
88
+ based on character n-gram precision and recall enhanced with word n-grams.
89
+ Source: https://github.com/m-popovic/chrF
90
+ Paper: https://www.aclweb.org/anthology/W15-3049.pdf
91
+
92
+ Higher is better # TODO I think
93
+ """
94
+ refs = list(zip(*items))[0]
95
+ preds = list(zip(*items))[1]
96
+ refs, preds = _sacreformat(refs, preds)
97
+ return sacrebleu.corpus_chrf(preds, refs).score
98
+
99
+
100
+ @register_aggregation("ter")
101
+ def ter(items):
102
+ """Translation Error Rate is an error metric for machine translation that
103
+ measures the number of edits required to change a system output into one
104
+ of the references
105
+ Source: http://www.cs.umd.edu/~snover/tercom/
106
+ Paper: http://mt-archive.info/AMTA-2006-Snover.pdf
107
+
108
+ Lower is better
109
+ """
110
+ refs = list(zip(*items))[0]
111
+ preds = list(zip(*items))[1]
112
+ refs, preds = _sacreformat(refs, preds)
113
+ return sacrebleu.corpus_ter(preds, refs).score
114
+
115
+
116
+ @register_metric(
117
+ metric="acc",
118
+ higher_is_better=True,
119
+ output_type=["loglikelihood", "multiple_choice"],
120
+ aggregation="mean",
121
+ )
122
+ def acc_fn(items): # This is a passthrough function
123
+ return items
124
+
125
+
126
+ @register_metric(
127
+ metric="acc_norm",
128
+ higher_is_better=True,
129
+ output_type=["loglikelihood", "multiple_choice"],
130
+ aggregation="mean",
131
+ )
132
+ def acc_norm_fn(items): # This is a passthrough function
133
+ return items
134
+
135
+
136
+ @register_metric(
137
+ metric="acc_mutual_info",
138
+ higher_is_better=True,
139
+ output_type="multiple_choice",
140
+ aggregation="mean",
141
+ )
142
+ def acc_mutual_info_fn(items): # This is a passthrough function
143
+ return items
144
+
145
+
146
+ exact_match = evaluate.load("exact_match")
147
+
148
+
149
+ @register_metric(
150
+ metric="exact_match",
151
+ higher_is_better=True,
152
+ output_type="generate_until",
153
+ aggregation="mean",
154
+ )
155
+ def exact_match_fn(**kwargs):
156
+ return exact_match.compute(**kwargs)
157
+
158
+
159
+ @register_metric(
160
+ metric="perplexity",
161
+ higher_is_better=False,
162
+ output_type="loglikelihood",
163
+ aggregation="perplexity",
164
+ )
165
+ def perplexity_fn(items): # This is a passthrough function
166
+ return items
167
+
168
+
169
+ def levenshtein_distance(s1, s2):
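+ # iterative two-row dynamic-programming edit distance (unit cost for insertions, deletions, and substitutions)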
170
+ if len(s1) > len(s2):
171
+ s1, s2 = s2, s1
172
+
173
+ distances = range(len(s1) + 1)
174
+ for i2, c2 in enumerate(s2):
175
+ distances_ = [i2 + 1]
176
+ for i1, c1 in enumerate(s1):
177
+ if c1 == c2:
178
+ distances_.append(distances[i1])
179
+ else:
180
+ distances_.append(1 + min((distances[i1], distances[i1 + 1], distances_[-1])))
181
+ distances = distances_
182
+ return distances[-1]
183
+
184
+
185
+ @register_metric(
186
+ metric="anls",
187
+ higher_is_better=True,
188
+ output_type="generate_until",
189
+ aggregation="mean",
190
+ )
191
+ def anls(
192
+ references,
193
+ predictions,
194
+ thresh_hold=0.5,
195
+ ): # This is a passthrough function
196
+ """https://github.com/QwenLM/Qwen-VL/blob/master/eval_mm/infographicsvqa_eval.py"""
197
+ values = []
198
+ for answer in references:
199
+ # preprocess both the answers - gt and prediction
200
+ gt_answer = " ".join(answer.strip().lower().split())
201
+ det_answer = " ".join(predictions[0].strip().lower().split())
202
+
203
+ # dist = levenshtein_distance(answer.lower(), detObject['answer'].lower())
204
+ dist = levenshtein_distance(gt_answer, det_answer)
205
+ length = max(len(answer.upper()), len(predictions[0].upper()))
206
+ values.append(0.0 if length == 0 else float(dist) / float(length))
207
+
208
+ question_result = 1 - min(values)
209
+
210
+ if question_result < thresh_hold:
211
+ question_result = 0
212
+ return {"anls": question_result}
213
+
214
+
215
+ def pop_stddev(arr):
216
+ mu = mean(arr)
217
+ return math.sqrt(sum([(x - mu) ** 2 for x in arr]) / len(arr))
218
+
219
+
220
+ def sample_stddev(arr):
221
+ mu = mean(arr)
222
+ return math.sqrt(sum([(x - mu) ** 2 for x in arr]) / (len(arr) - 1))
223
+
224
+
225
+ def mean_stderr(arr):
226
+ return sample_stddev(arr) / math.sqrt(len(arr))
227
+
228
+
229
+ @register_metric(
230
+ metric="mcc",
231
+ higher_is_better=True,
232
+ output_type="multiple_choice",
233
+ aggregation="matthews_corrcoef",
234
+ )
235
+ def mcc_fn(items): # This is a passthrough function
236
+ return items
237
+
238
+
239
+ @register_metric(
240
+ metric="f1",
241
+ higher_is_better=True,
242
+ output_type="multiple_choice",
243
+ aggregation="f1",
244
+ )
245
+ def f1_fn(items): # This is a passthrough function
246
+ return items
247
+
248
+
249
+ @register_metric(
250
+ metric="bleu",
251
+ higher_is_better=True,
252
+ output_type="generate_until",
253
+ aggregation="bleu",
254
+ )
255
+ def bleu_fn(items): # This is a passthrough function
256
+ return items
257
+
258
+
259
+ @register_metric(
260
+ metric="chrf",
261
+ higher_is_better=True,
262
+ output_type="generate_until",
263
+ aggregation="chrf",
264
+ )
265
+ def chrf_fn(items): # This is a passthrough function
266
+ return items
267
+
268
+
269
+ @register_metric(
270
+ metric="ter",
271
+ higher_is_better=True,
272
+ output_type="generate_until",
273
+ aggregation="ter",
274
+ )
275
+ def ter_fn(items): # This is a passthrough function
276
+ return items
277
+
278
+
279
+ @register_metric(
280
+ metric="acc_all",
281
+ higher_is_better=True,
282
+ output_type="loglikelihood",
283
+ aggregation="mean",
284
+ )
285
+ def acc_all(items):
286
+ # Only count as correct if all answers are labeled correctly for each question
287
+ question_scoring_dict = {}
288
+ preds = list(zip(*items))[0]
289
+ docs = list(zip(*items))[1]
290
+
291
+ for doc, pred in zip(docs, preds):
292
+ paragraph_id = doc["idx"]["paragraph"]
293
+ question_id = doc["idx"]["question"]
294
+ if (paragraph_id, question_id) not in question_scoring_dict:
295
+ question_scoring_dict[(paragraph_id, question_id)] = []
296
+
297
+ gold_label = doc["label"] == 1
298
+
299
+ question_scoring_dict[(paragraph_id, question_id)].append(gold_label == pred)
300
+ acc = np.mean([int(all(x)) for x in question_scoring_dict.values()])
301
+ return acc
302
+
303
+
304
+ def acc_all_stderr(items):
305
+ # Only count as correct if all answers are labeled correctly for each question
306
+ question_scoring_dict = {}
307
+ preds = list(zip(*items))[0]
308
+ docs = list(zip(*items))[1]
309
+
310
+ for doc, pred in zip(docs, preds):
311
+ question_id = doc["idx"]["question"]
312
+ if question_id not in question_scoring_dict:
313
+ question_scoring_dict[question_id] = []
314
+
315
+ gold_label = doc["label"] == 1
316
+ question_scoring_dict[question_id].append(gold_label == pred)
317
+
318
+ acc = mean_stderr([int(all(x)) for x in question_scoring_dict.values()])
319
+ return acc
320
+
321
+
322
+ def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
323
+ """Compute max metric between prediction and each ground truth."""
324
+ scores_for_ground_truths = []
325
+ for ground_truth in ground_truths:
326
+ score = metric_fn(prediction, ground_truth)
327
+ scores_for_ground_truths.append(score)
328
+ return max(scores_for_ground_truths)
329
+
330
+
331
+ def weighted_mean(items):
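+ # items are (value, weight) pairs, e.g. (loglikelihood, byte/word count) when used for the weighted perplexity aggregation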
332
+ a, b = zip(*items)
333
+ return sum(a) / sum(b)
334
+
335
+
336
+ def is_non_str_iterable(obj):
337
+ return isinstance(obj, Iterable) and not isinstance(obj, str)
338
+
339
+
340
+ def _sacreformat(refs, preds):
341
+ """Format refs and preds for sacrebleu corpus calculation. It is very particular"""
342
+ # Sacrebleu expects (List[str], List[List[str])
343
+ # e.g. sacrebleu.corpus_bleu([pred_t], [[ref1_stream], [ref2_stream], ...])
344
+
345
+ # Note [ref1_stream] is the first reference for each pred.
346
+ # So lists are size N and (M, N) for N preds and M possible refs for each pred
347
+ # This is a different order of dimensions than I would expect
348
+
349
+ # We expect refs to be List[str] or List[List[str]], the outer list corresponding to preds
350
+ # Must become List[List[str]] with the inner list corresponding to preds
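+ # e.g. refs=[["r1a", "r1b"], ["r2a", "r2b"]] (two preds, two refs each) -> [("r1a", "r2a"), ("r1b", "r2b")] after the transpose below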
351
+ if not is_non_str_iterable(refs):
352
+ refs = list(refs)
353
+ if not is_non_str_iterable(refs[0]):
354
+ refs = [[ref] for ref in refs]
355
+ refs = list(zip(*refs))
356
+ # Note: the number of refs in each ref list must match the number of preds
357
+
358
+ # We expect preds to be List[str] or List[List[str]]. Must become List[str]
359
+ if not is_non_str_iterable(preds):
360
+ preds = list(preds)
361
+ if is_non_str_iterable(preds[0]):
362
+ assert len(preds[0]) == 1, f"Pred must be a str, was {preds[0]}"
363
+ preds = [pred[0] for pred in preds]
364
+
365
+ return refs, preds
366
+
367
+
368
+ # stderr stuff
369
+
370
+
371
+ class _bootstrap_internal:
372
+ def __init__(self, f, n) -> None:
373
+ self.f = f
374
+ self.n = n
375
+
376
+ def __call__(self, v):
377
+ i, xs = v
378
+ rnd = random.Random()
379
+ rnd.seed(i)
380
+ res = []
381
+ for _ in range(self.n):
382
+ res.append(self.f(rnd.choices(xs, k=len(xs))))
383
+ return res
384
+
385
+
386
+ def bootstrap_stderr(f, xs, iters):
387
+ import multiprocessing as mp
388
+
389
+ pool = mp.Pool(mp.cpu_count())
390
+ # this gives a biased estimate of the stderr (i.e w/ the mean, it gives something
391
+ # equivalent to stderr calculated without Bessel's correction in the stddev.
392
+ # Unfortunately, I haven't been able to figure out what the right correction is
393
+ # to make the bootstrap unbiased - i considered multiplying by sqrt(n/(n-1)) but
394
+ # that would be ad-hoc and I can't prove that that would actually be an unbiased estimator)
395
+ # Thankfully, shouldn't matter because our samples are pretty big usually anyways
396
+ res = []
397
+ chunk_size = min(1000, iters)
398
+ from tqdm import tqdm
399
+
400
+ print("bootstrapping for stddev:", f.__name__)
401
+ for bootstrap in tqdm(
402
+ pool.imap(
403
+ _bootstrap_internal(f, chunk_size),
404
+ [(i, xs) for i in range(iters // chunk_size)],
405
+ ),
406
+ total=iters // chunk_size,
407
+ ):
408
+ # sample w replacement
409
+ res.extend(bootstrap)
410
+
411
+ pool.close()
412
+ return sample_stddev(res)
413
+
414
+
415
+ def stderr_for_metric(metric, bootstrap_iters):
416
+ bootstrappable = [
417
+ median,
418
+ matthews_corrcoef,
419
+ f1_score,
420
+ perplexity,
421
+ bleu,
422
+ chrf,
423
+ ter,
424
+ ]
425
+
426
+ if metric in bootstrappable:
427
+ return lambda x: bootstrap_stderr(metric, x, iters=bootstrap_iters)
428
+
429
+ stderr = {mean: mean_stderr, acc_all: acc_all_stderr}
430
+
431
+ return stderr.get(metric, None)
EAGLE/lmms_eval/api/model.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import abc
2
+ import os
3
+
4
+ from typing import Union, List, Tuple, Optional, Type, TypeVar
5
+ from sqlitedict import SqliteDict
6
+ import json
7
+ import hashlib
8
+ from lmms_eval.api.instance import Instance
9
+ from tqdm import tqdm
10
+ from lmms_eval import utils
11
+ import logging
12
+
13
+ eval_logger = logging.getLogger("lmms-eval")
14
+
15
+ T = TypeVar("T", bound="lmms")
16
+
17
+
18
+ class lmms(abc.ABC):
19
+ def __init__(self) -> None:
20
+ """Defines the interface that should be implemented by all lmms subclasses.
21
+ lmms subclasses are assumed to take image-text pairs as input and yield strings as output
22
+ (inputs/outputs should be tokenization-agnostic.)
23
+ """
24
+ # set rank and world size to a single process, by default.
25
+ self._rank = 0
26
+ self._world_size = 1
27
+ self.cache_hook = CacheHook(None)
28
+ self.task_dict = {}
29
+
30
+ @abc.abstractmethod
31
+ def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
32
+ """Compute log-likelihood of generating a continuation from a context.
33
+ Downstream tasks should attempt to use loglikelihood instead of other
34
+ LMM calls whenever possible.
35
+
36
+ :param requests: list[Instance]
37
+ A list of Instance objects, with property `args` which returns a tuple (context, continuation).
38
+ `context: str`
39
+ Context string. Implementations of LMM must be able to handle an
40
+ empty context string.
41
+ `continuation: str`
42
+ The continuation over which log likelihood will be calculated. If
43
+ there is a word boundary, the space should be in the continuation.
44
+ For example, context="hello" continuation=" world" is correct.
45
+ 'visual_list: list[dict]'
46
+ Visual input to the model. Can be None.
47
+
48
+ :return: list[tuple[float, bool]]
49
+ A list of pairs (logprob, isgreedy)
50
+ `logprob: float`
51
+ The log probability of `continuation`.
52
+ `isgreedy`:
53
+ Whether `continuation` would be generated by greedy sampling from `context`.
54
+ """
55
+ pass
56
+
57
+ # TODO: Add an optional max length
58
+ @abc.abstractmethod
59
+ def generate_until(self, requests) -> List[str]:
60
+ """Generate greedily until a stopping sequence
61
+
62
+ :param requests: list[Instance]
63
+ A list of Instance objects with property `args` which returns a tuple (context, until).
64
+ context: str
65
+ Context string
66
+ generation_kwargs: dict
67
+ Generation Kwargs
68
+ 'visual_list: list[dict]'
69
+ Visual input to the model. Can be None.
70
+ :return: list[str]
71
+ A list of strings continuation
72
+ continuation: str
73
+ The generated continuation.
74
+ """
75
+ pass
76
+
77
+ @classmethod
78
+ def create_from_arg_string(cls: Type[T], arg_string: str, additional_config: Optional[dict] = None) -> T:
79
+ """
80
+ Creates an instance of the LMM class using the given argument string and additional config.
81
+
82
+ Parameters:
83
+ - arg_string: A string containing arguments in the format key1=value1,key2=value2.
84
+ - additional_config: Optional dictionary containing additional configuration parameters.
85
+
86
+ Returns:
87
+ - Instance of the LMM class.
88
+ """
89
+ additional_config = {} if additional_config is None else additional_config
90
+ args = utils.simple_parse_args_string(arg_string)
91
+ args2 = {k: v for k, v in additional_config.items() if v is not None}
92
+ return cls(**args, **args2)
93
+
94
+ @property
95
+ def rank(self):
96
+ # used in the case of parallelism. Hardcoded to
97
+ # ensure no errors arise using API models which do
98
+ # not support multi-device parallelism nor expect it.
99
+ return self._rank
100
+
101
+ @property
102
+ def world_size(self):
103
+ # used in the case of parallelism. Hardcoded to
104
+ # ensure no errors arise using API models which do
105
+ # not support multi-device parallelism nor expect it.
106
+ return self._world_size
107
+
108
+ def set_cache_hook(self, cache_hook) -> None:
109
+ self.cache_hook = cache_hook
110
+
111
+
112
+ ### SQLite-based caching of LMM responses
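+ # cache keys are the sha256 hex digest of the JSON-serialized (method name, request args)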
113
+ def hash_args(attr, args):
114
+ dat = json.dumps([attr] + list(args))
115
+ return hashlib.sha256(dat.encode("utf-8")).hexdigest()
116
+
117
+
118
+ class CacheHook:
119
+ def __init__(self, cachinglm) -> None:
120
+ if cachinglm is None:
121
+ self.dbdict = None
122
+ return
123
+
124
+ self.dbdict = cachinglm.dbdict
125
+
126
+ def add_partial(self, attr, req, res) -> None:
127
+ if self.dbdict is None:
128
+ return
129
+ hsh = hash_args(attr, req)
130
+ self.dbdict[hsh] = res
131
+
132
+
133
+ class CachingLMM:
134
+ def __init__(self, lm, cache_db) -> None:
135
+ """LMM wrapper that returns cached results if they exist, and uses the underlying LMM if not.
136
+
137
+ :param lm: LMM
138
+ Underlying LMM
139
+ :param cache_db: str
140
+ Path to cache db
141
+ """
142
+ self.lm = lm
143
+ self.cache_db = cache_db
144
+ if os.path.dirname(cache_db):
145
+ os.makedirs(os.path.dirname(cache_db), exist_ok=True)
146
+ self.dbdict = SqliteDict(cache_db, autocommit=True)
147
+
148
+ # add hook to lm
149
+ lm.set_cache_hook(self.get_cache_hook())
150
+
151
+ def __getattr__(self, attr):
152
+ lm_attr = getattr(self.lm, attr)
153
+ if not callable(lm_attr):
154
+ return lm_attr
155
+
156
+ def fn(requests):
157
+ res = []
158
+ remaining_reqs = []
159
+ warned = False
160
+ # figure out which ones are cached and which ones are new
161
+ eval_logger.info(f"Loading '{attr}' responses from cache '{self.cache_db}' where possible...")
162
+ for req in tqdm(requests):
163
+ hsh = hash_args(attr, req.args)
164
+ if attr == "generate_until" and req.args[1].get("do_sample", False):
165
+ # when we are doing non-greedy generation, don't use the cache
166
+ # (else every "randomly sampled" generation would be identical for repeats > 1).
167
+ if not warned:
168
+ eval_logger.warning(f"Arguments to lm.generate_until() '{req.args[1]}' include non-deterministic sampling. Caching will not be performed for such requests.")
169
+ warned = True
170
+ res.append(None)
171
+ remaining_reqs.append(req)
172
+ elif hsh in self.dbdict:
173
+ ob = self.dbdict[hsh]
174
+
175
+ assert ob is not None
176
+
177
+ res.append(ob)
178
+ else:
179
+ res.append(None)
180
+ remaining_reqs.append(req)
181
+
182
+ # actually run the LMM on the requests that do not have cached results
183
+ rem_res = getattr(self.lm, attr)(remaining_reqs)
184
+
185
+ # stick the new ones back into the list and also cache any of the new ones
186
+ resptr = 0
187
+ for req, r in zip(remaining_reqs, rem_res):
188
+ while res[resptr] is not None:
189
+ resptr += 1
190
+
191
+ res[resptr] = r
192
+
193
+ # caching
194
+ hsh = hash_args(attr, req.args)
195
+ self.dbdict[hsh] = r
196
+ self.dbdict.commit()
197
+
198
+ return res
199
+
200
+ return fn
201
+
202
+ def get_cache_hook(self):
203
+ return CacheHook(self)
EAGLE/lmms_eval/api/registry.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from lmms_eval.api.model import lmms
2
+
3
+ import logging
4
+
5
+ eval_logger = logging.getLogger("lmms-eval")
6
+
7
+ MODEL_REGISTRY = {}
8
+
9
+
10
+ def register_model(*names):
11
+ # either pass a list or a single alias.
12
+ # function receives them as a tuple of strings
13
+
14
+ def decorate(cls):
15
+ for name in names:
16
+ assert issubclass(cls, lmms), f"Model '{name}' ({cls.__name__}) must extend lmms class"
17
+
18
+ assert name not in MODEL_REGISTRY, f"Model named '{name}' conflicts with existing model! Please register with a non-conflicting alias instead."
19
+
20
+ MODEL_REGISTRY[name] = cls
21
+ return cls
22
+
23
+ return decorate
24
+
25
+
26
+ def get_model(model_name):
27
+ try:
28
+ return MODEL_REGISTRY[model_name]
29
+ except KeyError:
30
+ raise ValueError(f"Attempted to load model '{model_name}', but no model for this name found! Supported model names: {', '.join(MODEL_REGISTRY.keys())}")
31
+
32
+
33
+ TASK_REGISTRY = {} # Key: task name, Value: task ConfigurableTask class
34
+ GROUP_REGISTRY = {} # Key: group name, Value: list of task names or group names
35
+ ALL_TASKS = set() # Set of all task names and group names
36
+ func2task_index = {} # Key: task ConfigurableTask class, Value: task name
37
+
38
+
39
+ def register_task(name):
40
+ def decorate(fn):
41
+ assert name not in TASK_REGISTRY, f"task named '{name}' conflicts with existing registered task!"
42
+
43
+ TASK_REGISTRY[name] = fn
44
+ ALL_TASKS.add(name)
45
+ func2task_index[fn.__name__] = name
46
+ return fn
47
+
48
+ return decorate
49
+
50
+
51
+ def register_group(name):
52
+ def decorate(fn):
53
+ func_name = func2task_index[fn.__name__]
54
+ if name in GROUP_REGISTRY:
55
+ GROUP_REGISTRY[name].append(func_name)
56
+ else:
57
+ GROUP_REGISTRY[name] = [func_name]
58
+ ALL_TASKS.add(name)
59
+ return fn
60
+
61
+ return decorate
62
+
63
+
64
+ OUTPUT_TYPE_REGISTRY = {}
65
+ METRIC_REGISTRY = {}
66
+ METRIC_AGGREGATION_REGISTRY = {}
67
+ AGGREGATION_REGISTRY = {}
68
+ HIGHER_IS_BETTER_REGISTRY = {}
69
+
70
+ DEFAULT_METRIC_REGISTRY = {
71
+ "loglikelihood": [
72
+ "perplexity",
73
+ "acc",
74
+ ],
75
+ "multiple_choice": ["acc", "acc_norm"],
76
+ "generate_until": ["exact_match"],
77
+ }
78
+
79
+
80
+ def register_metric(**args):
81
+ # TODO: do we want to enforce a certain interface to registered metrics?
82
+ def decorate(fn):
83
+ assert "metric" in args
84
+ name = args["metric"]
85
+
86
+ for key, registry in [
87
+ ("metric", METRIC_REGISTRY),
88
+ ("higher_is_better", HIGHER_IS_BETTER_REGISTRY),
89
+ ("aggregation", METRIC_AGGREGATION_REGISTRY),
90
+ ]:
91
+ if key in args:
92
+ value = args[key]
93
+ assert value not in registry, f"{key} named '{value}' conflicts with existing registered {key}!"
94
+
95
+ if key == "metric":
96
+ registry[name] = fn
97
+ elif key == "aggregation":
98
+ registry[name] = AGGREGATION_REGISTRY[value]
99
+ else:
100
+ registry[name] = value
101
+
102
+ return fn
103
+
104
+ return decorate
105
+
106
+
107
+ def register_aggregation(name):
108
+ def decorate(fn):
109
+ assert name not in AGGREGATION_REGISTRY, f"aggregation named '{name}' conflicts with existing registered aggregation!"
110
+
111
+ AGGREGATION_REGISTRY[name] = fn
112
+ return fn
113
+
114
+ return decorate
115
+
116
+
117
+ def get_aggregation(name):
118
+ try:
119
+ return AGGREGATION_REGISTRY[name]
120
+ except KeyError:
121
+ eval_logger.warning(
122
+ "{} not a registered aggregation metric!".format(name),
123
+ )
124
+
125
+
126
+ def get_metric_aggregation(name):
127
+ try:
128
+ return METRIC_AGGREGATION_REGISTRY[name]
129
+ except KeyError:
130
+ eval_logger.warning(
131
+ "{} metric is not assigned a default aggregation!".format(name),
132
+ )
133
+
134
+
135
+ def is_higher_better(metric_name):
136
+ try:
137
+ return HIGHER_IS_BETTER_REGISTRY[metric_name]
138
+ except KeyError:
139
+ eval_logger.warning(f"higher_is_better not specified for metric '{metric_name}'!")
EAGLE/lmms_eval/api/samplers.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ class ContextSampler:
2
+ def __init__(self, docs, task, fewshot_indices=None, rnd=None) -> None:
3
+ self.rnd = rnd
4
+ assert self.rnd, "must pass rnd to FewShotSampler!"
5
+
6
+ self.task = task
7
+ self.config = task._config
8
+
9
+ self.target_delimiter = self.config.target_delimiter
10
+ self.fewshot_delimiter = self.config.fewshot_delimiter
11
+
12
+ self.doc_to_text = self.task.doc_to_text
13
+ self.doc_to_target = self.task.doc_to_target
14
+ self.doc_to_choice = self.task.doc_to_choice
15
+
16
+ self.docs = docs # HF dataset split, provided by task._fewshot_docs()
17
+ if fewshot_indices: # subset few-shot docs using the provided indices
18
+ self.docs = self.docs.select(fewshot_indices)
19
+
20
+ def get_context(self, doc, num_fewshot):
21
+ # draw an extra fewshot sample if using same split as evaluating on
22
+ n_samples = num_fewshot + 1 if self.config.fewshot_split == self.config.test_split else num_fewshot
23
+
24
+ # draw `n_samples` docs from fewshot_docs
25
+ fewshotex = self.sample(n_samples)
26
+
27
+ # get rid of the doc that's the one we're evaluating, if it's in the fewshot
28
+ # TODO: should we just stop people from using fewshot from same split as evaluating?
29
+ selected_docs = [x for x in fewshotex if x != doc][:num_fewshot]
30
+
31
+ labeled_examples = (
32
+ self.fewshot_delimiter.join(
33
+ [
34
+ # TODO: is separating doc_to_text and doc_to_target by one space always desired?
35
+ (self.doc_to_text(doc) if (self.config.doc_to_choice is None or type(self.doc_to_text(doc)) is str) else self.doc_to_choice(doc)[self.doc_to_text(doc)])
36
+ + self.target_delimiter
37
+ + (
38
+ str(self.doc_to_target(doc)[0])
39
+ if type(self.doc_to_target(doc)) is list
40
+ else self.doc_to_target(doc) if (self.config.doc_to_choice is None or type(self.doc_to_target(doc)) is str) else str(self.doc_to_choice(doc)[self.doc_to_target(doc)])
41
+ )
42
+ for doc in selected_docs
43
+ ]
44
+ )
45
+ + self.fewshot_delimiter
46
+ )
47
+
48
+ return labeled_examples
49
+
50
+ def sample(self, n):
51
+ """
52
+ Draw `n` samples from our fewshot docs. This method should be overridden by subclasses.
53
+ """
54
+
55
+ return self.rnd.sample(self.docs, n)
56
+
57
+
58
+ class FirstNSampler(ContextSampler):
59
+ def sample(self, n) -> None:
60
+ """
61
+ Draw the first `n` samples in order from the specified split.
62
+ Used for tasks with "canonical" ordered fewshot examples, such as MMLU and CMMLU.
63
+ """
64
+ assert n <= len(self.docs), f"Error: number of fewshot samples requested exceeds the {len(self.docs)} that are available."
65
+ return self.docs[:n]
66
+
67
+
68
+ class BalancedSampler(ContextSampler):
69
+ def sample(self, n) -> None:
70
+ """
71
+ TODO: this should return approximately class-balanced samples from our fewshot examples.
72
+ TODO: what order should they be in? maybe random?
73
+ """
74
+
75
+ pass
76
+
77
+
78
+ class ManualSampler(ContextSampler):
79
+ def sample(self, n) -> None:
80
+ """ """
81
+ pass
82
+
83
+
84
+ SAMPLER_REGISTRY = {
85
+ "default": ContextSampler,
86
+ "first_n": FirstNSampler,
87
+ }
88
+
89
+
90
+ def get_sampler(name):
91
+ try:
92
+ return SAMPLER_REGISTRY[name]
93
+ except KeyError:
94
+ raise ValueError(f"Attempted to use contextsampler '{name}', but no sampling strategy for this name found! Supported model names: {', '.join(SAMPLER_REGISTRY.keys())}")
EAGLE/lmms_eval/api/task.py ADDED
@@ -0,0 +1,1118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import abc
2
+ from dataclasses import dataclass, field, asdict
3
+
4
+ import itertools
5
+ import os
6
+ import re
7
+ import ast
8
+ import logging
9
+ import random
10
+ from tqdm import tqdm
11
+
12
+ import datasets
13
+ from datasets import Image, Sequence
14
+ import numpy as np
15
+ from PIL import ImageFile
16
+
17
+ from datasets import DownloadConfig
18
+ from typing import Union, List, Any
19
+ from collections.abc import Callable
20
+ from tenacity import retry, stop_after_attempt, wait_fixed
21
+
22
+ from lmms_eval import utils
23
+ from lmms_eval.api import samplers
24
+ from lmms_eval.api.instance import Instance
25
+
26
+ from lmms_eval.filters import build_filter_ensemble
27
+ from lmms_eval.api.registry import (
28
+ get_aggregation,
29
+ get_metric_aggregation,
30
+ is_higher_better,
31
+ DEFAULT_METRIC_REGISTRY,
32
+ METRIC_REGISTRY,
33
+ OUTPUT_TYPE_REGISTRY,
34
+ AGGREGATION_REGISTRY,
35
+ )
36
+
37
+ ALL_OUTPUT_TYPES = [
38
+ "loglikelihood",
39
+ "multiple_choice",
40
+ "generate_until",
41
+ ]
42
+
43
+
44
+ eval_logger = logging.getLogger("lmms-eval")
45
+
46
+ # HuggingfaceM4/NoCaps contains truncated image in test split
47
+ # Include this inside code block to avoid error
48
+ ImageFile.LOAD_TRUNCATED_IMAGES = True
49
+
50
+
51
+ @dataclass
52
+ class TaskConfig(dict):
53
+ # task naming/registry
54
+ task: str = None
55
+ task_alias: str = None
56
+ group: Union[str, list] = None
57
+ group_alias: Union[str, list] = None
58
+ # HF dataset options.
59
+ # which dataset to use,
60
+ # and what splits for what purpose
61
+ dataset_path: str = None
62
+ dataset_name: str = None
63
+ dataset_kwargs: dict = None
64
+ training_split: str = None
65
+ validation_split: str = None
66
+ test_split: str = None
67
+ fewshot_split: str = None # TODO: assert that this is not None if num_fewshot > 0 (?); assert whether this is the same split as the one being evaluated (?)
68
+ # formatting / prompting options.
69
+ # see docs/advanced_task_guide.md for more info
70
+ process_docs: Callable = None
71
+ doc_to_visual: Union[Callable, str] = None
72
+ doc_to_text: Union[Callable, str] = None
73
+ doc_to_target: Union[Callable, str] = None
74
+ doc_to_choice: Union[Callable, str, dict, list] = None
75
+ process_results: Union[Callable, str] = None
76
+ use_prompt: str = None
77
+ description: str = ""
78
+ target_delimiter: str = " "
79
+ fewshot_delimiter: str = "\n\n"
80
+ fewshot_config: dict = None
81
+ # runtime configuration options
82
+ num_fewshot: int = None
83
+ # scoring options
84
+ metric_list: list = None
85
+ output_type: str = "generate_until"
86
+ generation_kwargs: dict = None
87
+ repeats: int = 1
88
+ filter_list: Union[str, list] = None
89
+ should_decontaminate: bool = False
90
+ doc_to_decontamination_query: str = None
91
+
92
+ metadata: Union[str, list] = None # by default, not used in the code. allows for users to pass arbitrary info to tasks
93
+
94
+ model_specific_prompt_kwargs: dict = None
95
+ model_specific_generation_kwargs: dict = None
96
+ model_specific_target_kwargs: dict = None
97
+
98
+ def __post_init__(self) -> None:
99
+ if self.dataset_path and os.path.exists(os.path.dirname(self.dataset_path)):
100
+ import inspect
101
+ from importlib import import_module
102
+
103
+ self.dataset_path = inspect.getfile(import_module(self.dataset_path))
104
+
105
+ if self.generation_kwargs is not None:
106
+ if self.output_type != "generate_until":
107
+ eval_logger.warning(f"[{self.task}] passed `generation_kwargs`, but not using `output_type: generate_until`!")
108
+ assert self.output_type != "generate_until"
109
+
110
+ if "temperature" in self.generation_kwargs:
111
+ self.generation_kwargs["temperature"] = float(self.generation_kwargs["temperature"])
112
+
113
+ if "until" not in self.generation_kwargs:
114
+ self.generation_kwargs["until"] = [self.fewshot_delimiter]
115
+ else:
116
+ if self.output_type == "generate_until":
117
+ # ensure that we greedily generate in absence of explicit arguments otherwise
118
+ self.generation_kwargs = {
119
+ "until": None if self.fewshot_delimiter is None else [self.fewshot_delimiter],
120
+ "do_sample": False,
121
+ }
122
+
123
+ # TODO: how to make TaskConfigs be de- and re-serializable, even when using the !function constructor?
124
+
125
+ def __getitem__(self, item):
126
+ return getattr(self, item)
127
+
128
+ def __setitem__(self, item, value):
129
+ return setattr(self, item, value)
130
+
131
+ def to_dict(self):
132
+ """dumps the current config as a dictionary object, as a printable format.
133
+ null fields will not be printed.
134
+ Used for dumping results alongside full task configuration
135
+
136
+ :return: dict
137
+ A printable dictionary version of the TaskConfig object.
138
+
139
+ # TODO: should any default value in the TaskConfig not be printed?
140
+ """
141
+ cfg_dict = asdict(self)
142
+ # remove values that are `None`
143
+ for k, v in list(cfg_dict.items()):
144
+ if v is None:
145
+ cfg_dict.pop(k)
146
+ elif isinstance(v, Callable):
147
+ # TODO: this should handle Promptsource template objects as a separate case?
148
+ cfg_dict[k] = str(v)
149
+ return cfg_dict
150
+
151
+
152
+ class Task(abc.ABC):
153
+ """A task represents an entire benchmark including its dataset, problems,
154
+ answers, and evaluation methods. See BoolQ for a simple example implementation
155
+
156
+ A `doc` can be any python object which represents one instance of evaluation.
157
+ This is usually a dictionary e.g.
158
+ {"question": ..., "answer": ...} or
159
+ {"question": ..., question, answer)
160
+ """
161
+
162
+ VERSION = None
163
+
164
+ # The name of the `Task` benchmark as denoted in the HuggingFace datasets Hub
165
+ # or a path to a custom `datasets` loading script.
166
+ DATASET_PATH: str = None
167
+
168
+ # The name of a subset within `DATASET_PATH`.
169
+ DATASET_NAME: str = None
170
+
171
+ OUTPUT_TYPE: str = None
172
+
173
+ def __init__(
174
+ self,
175
+ data_dir=None,
176
+ cache_dir=None,
177
+ download_mode=None,
178
+ config=None,
179
+ ) -> None:
180
+ """
181
+ :param data_dir: str
182
+ Stores the path to a local folder containing the `Task`'s data files.
183
+ Use this to specify the path to manually downloaded data (usually when
184
+ the dataset is not publicly accessible).
185
+ :param cache_dir: str
186
+ The directory to read/write the `Task` dataset. This follows the
187
+ HuggingFace `datasets` API with the default cache directory located at:
188
+ `~/.cache/huggingface/datasets`
189
+ NOTE: You can change the cache location globally for a given process
190
+ to another directory:
191
+ `export HF_DATASETS_CACHE="/path/to/another/directory"`
192
+ :param download_mode: datasets.DownloadMode
193
+ How to treat pre-existing `Task` downloads and data.
194
+ - `datasets.DownloadMode.REUSE_DATASET_IF_EXISTS`
195
+ Reuse download and reuse dataset.
196
+ - `datasets.DownloadMode.REUSE_CACHE_IF_EXISTS`
197
+ Reuse download with fresh dataset.
198
+ - `datasets.DownloadMode.FORCE_REDOWNLOAD`
199
+ Fresh download and fresh dataset.
200
+ """
201
+ self.download(data_dir, cache_dir, download_mode)
202
+ self._training_docs = None
203
+ self._fewshot_docs = None
204
+ self._instances = None
205
+
206
+ self._config = TaskConfig({**config}) if config else TaskConfig()
207
+
208
+ self._filters = [build_filter_ensemble("none", [["take_first", None]])]
209
+
210
+ def download(self, data_dir=None, cache_dir=None, download_mode=None) -> None:
211
+ """Downloads and returns the task dataset.
212
+ Override this method to download the dataset from a custom API.
213
+
214
+ :param data_dir: str
215
+ Stores the path to a local folder containing the `Task`'s data files.
216
+ Use this to specify the path to manually downloaded data (usually when
217
+ the dataset is not publicly accessible).
218
+ :param cache_dir: str
219
+ The directory to read/write the `Task` dataset. This follows the
220
+ HuggingFace `datasets` API with the default cache directory located at:
221
+ `~/.cache/huggingface/datasets`
222
+ NOTE: You can change the cache location globally for a given process
223
+ by setting the shell environment variable, `HF_DATASETS_CACHE`,
224
+ to another directory:
225
+ `export HF_DATASETS_CACHE="/path/to/another/directory"`
226
+ :param download_mode: datasets.DownloadMode
227
+ How to treat pre-existing `Task` downloads and data.
228
+ - `datasets.DownloadMode.REUSE_DATASET_IF_EXISTS`
229
+ Reuse download and reuse dataset.
230
+ - `datasets.DownloadMode.REUSE_CACHE_IF_EXISTS`
231
+ Reuse download with fresh dataset.
232
+ - `datasets.DownloadMode.FORCE_REDOWNLOAD`
233
+ Fresh download and fresh dataset.
234
+ """
235
+ self.dataset = datasets.load_dataset(
236
+ path=self.DATASET_PATH,
237
+ name=self.DATASET_NAME,
238
+ data_dir=data_dir,
239
+ cache_dir=cache_dir,
240
+ download_mode=download_mode,
241
+ )
242
+ self.dataset_no_image = datasets.load_dataset(
243
+ path=self.DATASET_PATH,
244
+ name=self.DATASET_NAME,
245
+ data_dir=data_dir,
246
+ cache_dir=cache_dir,
247
+ download_mode=download_mode,
248
+ )
249
+ for doc_name in self.dataset_no_image:
250
+ remove_cols = []
251
+ features = self.dataset_no_image[doc_name].features
252
+ # If it is an Image instance or a Sequence of Image instance. Remove it
253
+ for feature in features:
254
+ if isinstance(features[feature], Image):
255
+ remove_cols.append(feature)
256
+ elif isinstance(features[feature], Sequence) and isinstance(features[feature].feature, Image):
257
+ remove_cols.append(feature)
258
+ for remove_col in remove_cols:
259
+ self.dataset_no_image[doc_name] = self.dataset_no_image[doc_name].remove_columns(remove_col)
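# For example (assumed schema), a dataset whose Features declare
#     {"image": Image(), "images": Sequence(feature=Image()), "question": Value("string")}
# would have "image" and "images" dropped from `dataset_no_image`, so text-only row access
# later (e.g. in fewshot_context) never has to decode pixels.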
260
+
261
+ @property
262
+ def config(self):
263
+ """Returns the TaskConfig associated with this class."""
264
+ return self._config
265
+
266
+ @abc.abstractmethod
267
+ def has_training_docs(self):
268
+ """Whether the task has a training set"""
269
+ pass
270
+
271
+ @abc.abstractmethod
272
+ def has_validation_docs(self):
273
+ """Whether the task has a validation set"""
274
+ pass
275
+
276
+ @abc.abstractmethod
277
+ def has_test_docs(self):
278
+ """Whether the task has a test set"""
279
+ pass
280
+
281
+ def training_docs(self):
282
+ """
283
+ :return: Iterable[obj]
284
+ An iterable of any object that doc_to_text can handle
285
+ """
286
+ return []
287
+
288
+ def validation_docs(self):
289
+ """
290
+ :return: Iterable[obj]
291
+ An iterable of any object that doc_to_text can handle
292
+ """
293
+ return []
294
+
295
+ def test_docs(self):
296
+ """
297
+ :return: Iterable[obj]
298
+ An iterable of any object that doc_to_text can handle
299
+ """
300
+ return []
301
+
302
+ def fewshot_docs(self):
303
+ """
304
+ :return: Iterable[obj]
305
+ An iterable of any object that doc_to_text can handle
306
+ """
307
+ if self.has_training_docs():
308
+ return self.training_docs()
309
+ elif self.has_validation_docs():
310
+ return self.validation_docs()
311
+ else:
312
+ if self.config.num_fewshot is not None:
313
+ eval_logger.warning("has_training_docs and has_validation_docs are False" ", using test_docs as fewshot_docs but this is not recommended.")
314
+ return self.test_docs()
315
+
316
+ def _process_doc(self, doc):
317
+ """
318
+ Override this to process (detokenize, strip, replace, etc.) individual
319
+ documents. This can be used in a map over documents of a data split.
320
+ E.g. `map(self._process_doc, self.dataset["validation"])`
321
+
322
+ :return: dict
323
+ The processed version of the specified `doc`.
324
+ """
325
+ return doc
326
+
327
+ @property
328
+ def instances(self):
329
+ """After calling `task.build_all_requests()`, tasks
330
+ maintain a list of the dataset instances which will be evaluated.
331
+ """
332
+ return self._instances
333
+
334
+ def fewshot_examples(self, k, rnd):
335
+ if self._training_docs is None:
336
+ self._training_docs = list(self.training_docs())
337
+
338
+ return rnd.sample(self._training_docs, k)
339
+
340
+ def doc_to_decontamination_query(self, doc) -> None:
341
+ print("Override doc_to_decontamination_query with document specific decontamination query.")
342
+ assert False
343
+
344
+ @abc.abstractmethod
345
+ def doc_to_text(self, doc):
346
+ pass
347
+
348
+ @abc.abstractmethod
349
+ def doc_to_target(self, doc):
350
+ pass
351
+
352
+ # @profile
353
+ def build_all_requests(self, limit=None, rank=None, world_size=None) -> None:
354
+ """Build a set of Instances for a task, and store them in task.instances"""
355
+ if self.has_test_docs():
356
+ docs = self.test_docs()
357
+ split = self.config.test_split
358
+ elif self.has_validation_docs():
359
+ docs = self.validation_docs()
360
+ split = self.config.validation_split
361
+ else:
362
+ assert False, f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!"
363
+
364
+ eval_logger.info(f"Building contexts for task {self.CONFIG.task} on rank {rank}...")
365
+ instances = []
366
+ doc_id_iterator = utils.create_iterator([i for i in range(len(docs))], rank, world_size, limit)
367
+ doc_id_iterator, doc_id_iterator_counting = itertools.tee(doc_id_iterator)
368
+ total_docs = sum(1 for _ in doc_id_iterator_counting)
369
+ pbar = tqdm(total=total_docs, desc=f"Building context", disable=(rank != 0))
370
+ for doc_id in doc_id_iterator:
371
+ # sample fewshot context #TODO: need to offset doc_id by rank now!
372
+ fewshot_ctx = self.fewshot_context(doc_id, 0 if self.config.num_fewshot is None else self.config.num_fewshot, self.config.training_split if self.has_training_docs() else split)
373
+
374
+ # TODO: we should override self.config.repeats if doing greedy gen so users don't waste time+compute
375
+ inst = self.construct_requests(doc_id=doc_id, ctx=fewshot_ctx, metadata=(self.config["task"], doc_id, self.config.repeats), split=split)
376
+
377
+ if not isinstance(inst, list):
378
+ inst = [inst]
379
+
380
+ instances.extend(inst)
381
+ pbar.update(1)
382
+
383
+ pbar.close()
384
+ self._instances = instances
385
+ assert len(self._instances) != 0, "task.build_requests() did not find any docs!"
386
+
387
+ @abc.abstractmethod
388
+ def construct_requests(self, doc_id, ctx, **kwargs):
389
+ """Uses RequestFactory to construct Requests and returns an iterable of
390
+ Requests which will be sent to the LMM.
391
+
392
+ :param doc_id: int
393
+ The index of a document within `self.test_docs()` or `self.validation_docs()`,
394
+ whichever is the main split used.
395
+ :param ctx: str
396
+ The context string, generated by fewshot_context. This includes the natural
397
+ language description, as well as the few shot examples, and the question
398
+ part of the document for `doc`.
399
+ :param repeats: int
400
+ TODO: update this docstring
401
+ The number of times each instance in a dataset is inferred on. Defaults to 1,
402
+ can be increased for techniques like majority voting.
403
+ """
404
+ pass
405
+
406
+ @abc.abstractmethod
407
+ def process_results(self, doc, results):
408
+ """Take a single document and the LMM results and evaluates, returning a
409
+ dict where keys are the names of submetrics and values are the values of
410
+ the metric for that one document
411
+
412
+ :param doc:
413
+ The document as returned from training_docs, validation_docs, or test_docs.
414
+ :param results:
415
+ The results of the requests created in construct_requests.
416
+ """
417
+ pass
418
+
419
+ @abc.abstractmethod
420
+ def aggregation(self):
421
+ """
422
+ :returns: {str: [metric_score] -> float}
423
+ A dictionary where keys are the names of submetrics and values are
424
+ functions that aggregate a list of metric scores
425
+ """
426
+ pass
427
+
428
+ @abc.abstractmethod
429
+ def higher_is_better(self):
430
+ """
431
+ :returns: {str: bool}
432
+ A dictionary where keys are the names of submetrics and values are
433
+ whether a higher value of the submetric is better
434
+ """
435
+ pass
436
+
437
+ @classmethod
438
+ def count_bytes(cls, doc):
439
+ """Used for byte-level perplexity metrics in rolling loglikelihood"""
440
+ return len(doc.encode("utf-8"))
441
+
442
+ @utils.positional_deprecated
443
+ def fewshot_context(
444
+ self,
445
+ doc_id,
446
+ num_fewshot,
447
+ split,
448
+ rnd=random.Random(1234),
449
+ description=None,
450
+ ):
451
+ """Returns a fewshot context string that is made up of a prepended description
452
+ (if provided), the `num_fewshot` number of examples, and an appended prompt example.
453
+
454
+ :param doc_id: int
455
+ The document id as returned from training_docs, validation_docs, or test_docs.
456
+ :param num_fewshot: int
457
+ The number of fewshot examples to provide in the returned context string.
458
+ :param split: str
459
+ The split of the document to retrieve from the dataset
460
+ :param rnd: random.Random
461
+ The pseudo-random number generator used to randomly sample examples.
462
+ WARNING: this must not be None; it defaults to a generator seeded with 1234.
463
+ :param description: str
464
+ The task's description that will be prepended to the fewshot examples.
465
+ :returns: str
466
+ The fewshot context.
467
+ """
468
+ assert rnd is not None, "A `random.Random` generator argument must be provided to `rnd`"
469
+
470
+ description = description if description else ""
471
+ doc = self.dataset_no_image[split][doc_id]
472
+
473
+ if num_fewshot == 0:
474
+ labeled_examples = ""
475
+ else:
476
+ # for sets with no training docs, draw from other set *but ensure no overlap with current doc*
477
+ if self.has_training_docs():
478
+ fewshotex = self.fewshot_examples(k=num_fewshot, rnd=rnd)
479
+ else:
480
+ if self._fewshot_docs is None:
481
+ self._fewshot_docs = list(self.validation_docs() if self.has_validation_docs() else self.test_docs())
482
+
483
+ fewshotex = rnd.sample(self._fewshot_docs, num_fewshot + 1)
484
+
485
+ # get rid of the doc that's the one we're evaluating, if it's in the fewshot
486
+ fewshotex = [x for x in fewshotex if x != doc][:num_fewshot]
487
+
488
+ labeled_examples = "\n\n".join([self.doc_to_text(doc) + self.doc_to_target(doc) for doc in fewshotex]) + "\n\n"
489
+
490
+ example = self.doc_to_text(doc)
491
+ return description + labeled_examples + example
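# Shape of the returned context (illustrative doc_to_text/doc_to_target outputs only):
#     "<description>" + "Q: ex1\nA: a1\n\nQ: ex2\nA: a2\n\n" + "Q: current question\nA:"
# i.e. each fewshot example is text+target concatenated, examples are joined with "\n\n",
# and the unanswered query is appended at the end.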
492
+
493
+ def apply_filters(self):
494
+ if hasattr(self, "_filters"):
495
+ for f in self._filters:
496
+ f.apply(self._instances, None)
497
+ else:
498
+ eval_logger.warning("No filter defined, passing through instances")
499
+ return self._instances
500
+
501
+ def dump_config(self) -> dict:
502
+ """Returns a dictionary representing the task's config.
503
+
504
+ :returns: dict
505
+ A dictionary representation of the task's configuration.
506
+ """
507
+ # TODO: this should only return the overrides applied to a non-YAML task's configuration.
508
+ # (num_fewshot)
509
+ return self.config.to_dict()
510
+
511
+
512
+ class ConfigurableTask(Task):
513
+ VERSION = "Yaml"
514
+ OUTPUT_TYPE = None
515
+ CONFIG = None
516
+
517
+ def __init__(self, model_name) -> None: # TODO no super() call here
518
+ # Get pre-configured attributes
519
+ self._config = self.CONFIG
520
+ # different model requires different prompt, we have to take those into account.
521
+
522
+ self.model_name = model_name
523
+ self._prepare_model_specific_config()
524
+
525
+ assert self.config.output_type in ALL_OUTPUT_TYPES
526
+ self.OUTPUT_TYPE = self.config.output_type
527
+
528
+ self.DATASET_PATH = self.config.dataset_path
529
+
530
+ if self.config.dataset_name is not None:
531
+ self.DATASET_NAME = self.config.dataset_name
532
+
533
+ self._prepare_metric_and_aggregation()
534
+
535
+ self.download(self.config.dataset_kwargs)
536
+ self._training_docs = None
537
+ self._fewshot_docs = None
538
+
539
+ if self.config.filter_list is not None:
540
+ self._filters = []
541
+ for filter_config in self.config.filter_list:
542
+ for filter_pipeline in filter_config:
543
+ filter_name = filter_config["name"]
544
+ filter_functions = filter_config["filter"]
545
+ components = []
546
+ for function in filter_functions:
547
+ kwargs = {key: function[key] for key in function if key != "function"}
548
+ components.append([function["function"], kwargs])
549
+ filter_pipeline = build_filter_ensemble(filter_name, components)
550
+ self._filters.append(filter_pipeline)
551
+ else:
552
+ self._filters = [build_filter_ensemble("none", [["take_first", None]])]
553
+ if self.config.fewshot_config is not None:
554
+ self.sampler = samplers.get_sampler(self.config.fewshot_config.get("sampler", "default") if self.config.fewshot_config else "default")(list(self.fewshot_docs()), self, rnd=random.Random(1234))
555
+
556
+ if self.has_test_docs():
557
+ self.task_docs = self.test_docs()
558
+ elif self.has_validation_docs():
559
+ self.task_docs = self.validation_docs()
560
+ else:
561
+ assert False, f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!"
562
+
563
+ # Test One Doc
564
+ self.features = list(self.task_docs.features.keys())
565
+ self.multiple_input = 0
566
+ self.multiple_target = 0
567
+ test_doc = self.task_docs[0]
568
+ test_text = self.doc_to_text(test_doc)
569
+ test_target = self.doc_to_target(test_doc)
570
+
571
+ if self.config.doc_to_choice is not None:
572
+ test_choice = self.doc_to_choice(test_doc)
573
+ if type(test_choice) is not list:
574
+ eval_logger.error("doc_to_choice must return list")
575
+ else:
576
+ num_choice = len(test_choice)
577
+
578
+ if type(test_text) is int:
579
+ self.multiple_input = num_choice
580
+ else:
581
+ test_choice = None
582
+
583
+ if type(test_target) is list:
584
+ self.multiple_target = len(test_target)
585
+ else:
586
+ if (type(test_target) is int) and (test_choice is not None):
587
+ test_target = test_choice[test_target]
588
+ else:
589
+ test_target = str(test_target)
590
+
591
+ if test_choice is not None:
592
+ check_choices = test_choice
593
+ else:
594
+ check_choices = [test_target]
595
+ if self.config.doc_to_choice is not None:
596
+ for choice in check_choices:
597
+ choice_has_whitespace = True if choice[0].isspace() else False
598
+ delimiter_has_whitespace = True if self.config.target_delimiter.rstrip() != self.config.target_delimiter else False
599
+
600
+ if delimiter_has_whitespace and choice_has_whitespace:
601
+ eval_logger.warning(f'Both target_delimiter and target choice: "{choice}" have whitespace')
602
+ elif (not delimiter_has_whitespace) and (not choice_has_whitespace):
603
+ eval_logger.warning(f'Both target_delimiter "{self.config.target_delimiter}" and target choice: "{choice}" do not have whitespace, ignore if the language you are evaluating on does not require/use whitespace')
604
+
605
+ def _prepare_model_specific_config(self):
606
+ self.model_specific_prompt_kwargs = self.config.model_specific_prompt_kwargs
607
+ if self.model_specific_prompt_kwargs is not None:
608
+ if self.model_name in self.model_specific_prompt_kwargs:
609
+ self.model_specific_prompt_kwargs = self.model_specific_prompt_kwargs[self.model_name]
610
+ else:
611
+ self.model_specific_prompt_kwargs = self.model_specific_prompt_kwargs.get("default", None)
612
+
613
+ self.model_specific_target_kwargs = self.config.model_specific_target_kwargs
614
+ if self.model_specific_target_kwargs is not None:
615
+ if self.model_name in self.model_specific_target_kwargs:
616
+ self.model_specific_target_kwargs = self.model_specific_target_kwargs[self.model_name]
617
+ else:
618
+ self.model_specific_target_kwargs = self.model_specific_target_kwargs.get("default", None)
619
+ self.model_specific_generation_kwargs = self.config.model_specific_generation_kwargs
620
+ if self.model_specific_generation_kwargs is not None:
621
+ if self.model_name in self.model_specific_generation_kwargs:
622
+ self.model_specific_generation_kwargs = self.model_specific_generation_kwargs[self.model_name]
623
+ else:
624
+ self.model_specific_generation_kwargs = self.model_specific_generation_kwargs.get("default", {})
625
+
626
+ self.config.generation_kwargs.update(self.model_specific_generation_kwargs)
627
+
628
+ def _prepare_metric_and_aggregation(self):
629
+ self._metric_fn_list = {}
630
+ self._metric_fn_kwargs = {}
631
+ self._aggregation_list = {}
632
+ self._higher_is_better = {}
633
+
634
+ if self.config.metric_list is None:
635
+ # TODO: handle this in TaskConfig.__post_init__ ?
636
+ _metric_list = DEFAULT_METRIC_REGISTRY[self.config.output_type]
637
+
638
+ for metric_name in _metric_list:
639
+ self._metric_fn_list[metric_name] = METRIC_REGISTRY[metric_name]
640
+ self._metric_fn_kwargs[metric_name] = {}
641
+ self._aggregation_list[metric_name] = get_metric_aggregation(metric_name)
642
+ self._higher_is_better[metric_name] = is_higher_better(metric_name)
643
+ else:
644
+ for metric_config in self.config.metric_list:
645
+ assert "metric" in metric_config
646
+ metric_name = metric_config["metric"]
647
+ kwargs = {key: metric_config[key] for key in metric_config if key not in ["metric", "aggregation", "higher_is_better"]}
648
+
649
+ if self.config.process_results is not None:
650
+ self._metric_fn_list[metric_name] = None
651
+ self._metric_fn_kwargs[metric_name] = {}
652
+ elif callable(metric_name):
653
+ metric_fn = metric_name.__call__
654
+ metric_name = metric_name.__name__
655
+ self._metric_fn_list[metric_name] = metric_fn
656
+ self._metric_fn_kwargs[metric_name] = kwargs
657
+ else:
658
+ self._metric_fn_list[metric_name] = METRIC_REGISTRY[metric_name]
659
+ self._metric_fn_kwargs[metric_name] = kwargs
660
+
661
+ if "aggregation" in metric_config:
662
+ agg_name = metric_config["aggregation"]
663
+ if type(agg_name) == str:
664
+ self._aggregation_list[metric_name] = get_aggregation(agg_name)
665
+ elif callable(agg_name):
666
+ self._aggregation_list[metric_name] = metric_config["aggregation"]
667
+ else:
668
+ INV_AGG_REGISTRY = {v: k for k, v in AGGREGATION_REGISTRY.items()}
669
+ metric_agg = get_metric_aggregation(metric_name)
670
+ eval_logger.warning(f"[Task: {self._config.task}] metric {metric_name} is defined, but aggregation is not. " f"using default " f"aggregation={INV_AGG_REGISTRY[metric_agg]}")
671
+ self._aggregation_list[metric_name] = metric_agg
672
+
673
+ if "higher_is_better" in metric_config:
674
+ self._higher_is_better[metric_name] = metric_config["higher_is_better"]
675
+ else:
676
+ eval_logger.warning(f"[Task: {self._config.task}] metric {metric_name} is defined, but higher_is_better is not. " f"using default " f"higher_is_better={is_higher_better(metric_name)}")
677
+ self._higher_is_better[metric_name] = is_higher_better(metric_name)
678
+
679
+ @retry(stop=stop_after_attempt(5), wait=wait_fixed(2))
680
+ def download(self, dataset_kwargs=None) -> None:
681
+ download_config = DownloadConfig()
682
+ download_config.max_retries = dataset_kwargs.get("max_retries", 3) if dataset_kwargs is not None else 3
683
+ download_config.num_proc = dataset_kwargs.get("num_proc", 8) if dataset_kwargs is not None else 8
684
+ self.dataset = datasets.load_dataset(
685
+ path=self.DATASET_PATH,
686
+ name=self.DATASET_NAME,
687
+ download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,
688
+ **dataset_kwargs if dataset_kwargs is not None else {},
689
+ )
690
+ self.dataset_no_image = datasets.load_dataset(
691
+ path=self.DATASET_PATH,
692
+ name=self.DATASET_NAME,
693
+ download_mode=datasets.DownloadMode.REUSE_DATASET_IF_EXISTS,
694
+ **dataset_kwargs if dataset_kwargs is not None else {},
695
+ )
696
+ for doc_name in self.dataset_no_image:
697
+ remove_cols = []
698
+ features = self.dataset_no_image[doc_name].features
699
+ # If it is an Image instance or a Sequence of Image instance. Remove it
700
+ for feature in features:
701
+ if isinstance(features[feature], Image):
702
+ remove_cols.append(feature)
703
+ elif isinstance(features[feature], Sequence) and isinstance(features[feature].feature, Image):
704
+ remove_cols.append(feature)
705
+ for remove_col in remove_cols:
706
+ self.dataset_no_image[doc_name] = self.dataset_no_image[doc_name].remove_columns(remove_col)
707
+
708
+ def has_training_docs(self) -> bool:
709
+ if self.config.training_split is not None:
710
+ return True
711
+ else:
712
+ return False
713
+
714
+ def has_validation_docs(self) -> bool:
715
+ if self.config.validation_split is not None:
716
+ return True
717
+ else:
718
+ return False
719
+
720
+ def has_test_docs(self) -> bool:
721
+ if self.config.test_split is not None:
722
+ return True
723
+ else:
724
+ return False
725
+
726
+ def training_docs(self) -> datasets.Dataset:
727
+ if self.has_training_docs():
728
+ if self.config.process_docs is not None:
729
+ return self.config.process_docs(self.dataset[self.config.training_split])
730
+ return self.dataset[self.config.training_split]
731
+
732
+ def validation_docs(self) -> datasets.Dataset:
733
+ if self.has_validation_docs():
734
+ if self.config.process_docs is not None:
735
+ return self.config.process_docs(self.dataset[self.config.validation_split])
736
+ return self.dataset[self.config.validation_split]
737
+
738
+ def test_docs(self) -> datasets.Dataset:
739
+ if self.has_test_docs():
740
+ if self.config.process_docs is not None:
741
+ return self.config.process_docs(self.dataset[self.config.test_split])
742
+ return self.dataset[self.config.test_split]
743
+
744
+ def fewshot_docs(self):
745
+ if self.config.fewshot_split is not None:
746
+ return self.dataset[self.config.fewshot_split]
747
+ else:
748
+ if (self.config.num_fewshot is not None) and (self.config.num_fewshot > 0):
749
+ eval_logger.warning(f"Task '{self.config.task}': " "num_fewshot > 0 but fewshot_split is None. " "using preconfigured rule.")
750
+ return super().fewshot_docs()
751
+
752
+ @utils.positional_deprecated
753
+ def fewshot_context(self, doc_id, num_fewshot, split):
754
+ """Returns a fewshot context string that is made up of a prepended description
755
+ (if provided), the `num_fewshot` number of examples, and an appended prompt example.
756
+
757
+ :param doc_id: int
758
+ The document id as returned from training_docs, validation_docs, or test_docs.
759
+ :param num_fewshot: int
760
+ The number of fewshot examples to provide in the returned context string.
761
+ :returns: str
762
+ The fewshot context.
763
+ """
764
+ doc = self.dataset_no_image[split][doc_id]
765
+ if num_fewshot == 0:
766
+ # always prepend the (possibly empty) task description
767
+ labeled_examples = self.config.description
768
+ else:
769
+ labeled_examples = self.config.description + self.sampler.get_context(doc, num_fewshot)
770
+ example = self.doc_to_text(doc)
771
+ if type(example) == str:
772
+ return labeled_examples + example
773
+ elif type(example) == list:
774
+ return [labeled_examples + ex for ex in example]
775
+ elif type(example) == int:
776
+ if self.config.doc_to_choice is not None:
777
+ choices = self.doc_to_choice(doc)
778
+ return labeled_examples + choices[example]
779
+ else:
780
+ return labeled_examples + str(example)
781
+
782
+ def apply_filters(self):
783
+ if hasattr(self, "_filters"):
784
+ for f in self._filters:
785
+ f.apply(self._instances, self.task_docs)
786
+ else:
787
+ eval_logger.warning("No filter defined, passing through instances")
788
+ return self._instances
789
+
790
+ def should_decontaminate(self):
791
+ return self.config.should_decontaminate
792
+
793
+ def doc_to_decontamination_query(self, doc):
794
+ if self.config.should_decontaminate:
795
+ if self.config.doc_to_decontamination_query is None:
796
+ return self.doc_to_text(doc)
797
+ else:
798
+ doc_to_decontamination_query = self.config.doc_to_decontamination_query
799
+ if doc_to_decontamination_query in self.features:
800
+ return doc[doc_to_decontamination_query]
801
+ elif callable(doc_to_decontamination_query):
802
+ return doc_to_decontamination_query(doc)
803
+ else:
804
+ return ast.literal_eval(utils.apply_template(self.config.doc_to_decontamination_query, doc))
805
+
806
+ def _process_doc(self, doc):
807
+ """
808
+ Override this to process (detokenize, strip, replace, etc.) individual
809
+ documents. This can be used in a map over documents of a data split.
810
+ E.g. `map(self._process_doc, self.dataset["validation"])`
811
+
812
+ :return: dict
813
+ The processed version of the specified `doc`.
814
+ """
815
+ return doc
816
+
817
+ def doc_to_text(self, doc):
818
+ doc_to_text = self.config.doc_to_text
819
+
820
+ if type(doc_to_text) == int:
821
+ return doc_to_text
822
+ elif type(doc_to_text) == str:
823
+ if doc_to_text in self.features:
824
+ # if self.config.doc_to_choice is not None:
825
+ # return self.doc_to_choice(doc)[doc[doc_to_text]]
826
+ # else:
827
+ return doc[doc_to_text]
828
+ else:
829
+ text_string = utils.apply_template(doc_to_text, doc)
830
+ if text_string.isdigit() and self._config.doc_to_choice is not None:
831
+ return ast.literal_eval(text_string)
832
+ else:
833
+ return text_string
834
+ elif callable(doc_to_text):
835
+ return (
836
+ doc_to_text(doc, self.model_specific_prompt_kwargs)
837
+ if self.model_specific_prompt_kwargs is not None
838
+ else doc_to_text(
839
+ doc,
840
+ )
841
+ )
842
+ # Used when applying a Promptsource template
843
+ elif hasattr(doc_to_text, "apply"):
844
+ applied_prompt = doc_to_text.apply(doc)
845
+ if len(applied_prompt) == 2:
846
+ return applied_prompt[0]
847
+ else:
848
+ eval_logger.warning("Applied prompt returns empty string")
849
+ return self.config.fewshot_delimiter
850
+ else:
851
+ print(type(doc_to_text))
852
+ raise TypeError
853
+
854
+ def doc_to_target(self, doc: dict) -> Union[int, str, list]:
855
+ doc_to_target = self.config.doc_to_target
856
+
857
+ if type(doc_to_target) == int:
858
+ return doc_to_target
859
+ elif type(doc_to_target) == str:
860
+ if doc_to_target in self.features:
861
+ # if self.config.doc_to_choice is not None:
862
+ # return self.doc_to_choice(doc)[doc[doc_to_target]]
863
+ # else:
864
+ return doc[doc_to_target]
865
+ else:
866
+ target_string = utils.apply_template(doc_to_target, doc)
867
+ if target_string.isdigit() and self._config.doc_to_choice is not None:
868
+ return ast.literal_eval(target_string)
869
+ elif len(target_string) >= 2 and (target_string[0] == "[") and (target_string[-1] == "]"):
870
+ try:
871
+ return ast.literal_eval(target_string)
872
+ except (SyntaxError, ValueError):
873
+ return target_string
874
+ else:
875
+ return target_string
876
+ elif type(doc_to_target) == list:
877
+ return doc_to_target
878
+ elif callable(doc_to_target):
879
+ return doc_to_target(doc, self.model_specific_target_kwargs) if self.model_specific_target_kwargs is not None else doc_to_target(doc)
880
+ # Used when applying a Promptsource template
881
+ elif hasattr(doc_to_target, "apply"):
882
+ applied_prompt = doc_to_target.apply(doc)
883
+ if len(applied_prompt) == 2:
884
+ return applied_prompt[1]
885
+ else:
886
+ eval_logger.warning("Applied prompt returns empty string")
887
+ return self.config.fewshot_delimiter
888
+ else:
889
+ raise TypeError
890
+
891
+ def doc_to_visual(self, doc: dict) -> Union[int, str, list]:
892
+ self.config.doc_to_visual
893
+ if type(self.config.doc_to_visual) == str:
894
+ assert self.config.doc_to_visual in self.features
895
+ # Single image. Still return a list for consistency.
896
+ return [doc[self.config.doc_to_visual]]
897
+ else:
898
+ assert callable(self.config.doc_to_visual)
899
+ return self.config.doc_to_visual(doc)
900
+
901
+ def doc_to_choice(self, doc: Any) -> List[str]:
902
+ if self.config.doc_to_choice is None:
903
+ eval_logger.error("doc_to_choice was called but not set in config")
904
+ else:
905
+ doc_to_choice = self.config.doc_to_choice
906
+
907
+ if type(doc_to_choice) == str:
908
+ if doc_to_choice in self.features:
909
+ return doc[doc_to_choice]
910
+ else:
911
+ return ast.literal_eval(utils.apply_template(doc_to_choice, doc))
912
+ elif type(doc_to_choice) == list:
913
+ return doc_to_choice
914
+ elif type(doc_to_choice) == dict:
915
+ return list(doc_to_choice.values())
916
+ elif callable(doc_to_choice):
917
+ return doc_to_choice(doc)
918
+ elif hasattr(doc_to_choice, "get_answer_choices_list"):
919
+ return doc_to_choice.get_answer_choices_list(doc)
920
+ else:
921
+ raise TypeError
922
+
923
+ def construct_requests(self, doc_id: int, ctx: str, **kwargs) -> Union[List[Instance], Instance]:
924
+ split = kwargs.get("split")
925
+ kwargs.pop("split")
926
+ if self.OUTPUT_TYPE == "loglikelihood":
927
+ arguments = (ctx, self.doc_to_target, self.doc_to_visual, doc_id, self.config.task, split)
928
+ elif self.OUTPUT_TYPE == "multiple_choice":
929
+ doc = self.dataset[split][doc_id]
930
+ choices = self.doc_to_choice(doc)
931
+ target_delimiter = self.config.target_delimiter
932
+ if self.multiple_input:
933
+ # If there are multiple inputs, choices are placed in the ctx
934
+ cont = self.doc_to_target(doc)
935
+ arguments = [(ctx, f"{target_delimiter}{cont}", self.doc_to_visual, doc_id, self.config.task, split) for ctx in choices]
936
+ else:
937
+ # Otherwise they are placed in the continuation
938
+ arguments = [(ctx, f"{target_delimiter}{cont}", self.doc_to_visual, doc_id, self.config.task, split) for cont in choices]
939
+ request_list = [
940
+ Instance(
941
+ request_type="loglikelihood",
942
+ # doc=doc,
943
+ arguments=arg,
944
+ idx=i,
945
+ **kwargs,
946
+ )
947
+ for i, arg in enumerate(arguments)
948
+ ]
949
+ # TODO: we should raise a warning telling users this will at most ~2x runtime.
950
+ if "acc_mutual_info" in self._metric_fn_list.keys():
951
+ # if we are calculating multiple choice accuracy
952
+ # using mutual information instead of raw loglikelihood as metric, need unconditional lls.
953
+
954
+ # here mutual info refers to calculating
955
+ # log(P(choice|ctx) / P(choice)) = log(P(choice|ctx)) - log(P(choice))
956
+ # in other words normalizing by subtracting the unconditional logprob of each choice.
957
+ request_list.extend(
958
+ [
959
+ Instance(
960
+ request_type="loglikelihood",
961
+ # doc=doc,
962
+ arguments=("", "{}".format(choice)),
963
+ idx=i,
964
+ **kwargs,
965
+ )
966
+ for i, choice in enumerate(choices)
967
+ ]
968
+ )
969
+ return request_list
970
+
971
+ elif self.OUTPUT_TYPE == "generate_until":
972
+ arguments = (ctx, self.config.generation_kwargs, self.doc_to_visual, doc_id, self.config.task, split)
973
+ return Instance(request_type=self.OUTPUT_TYPE, arguments=arguments, idx=0, **kwargs)
974
+
975
+ def process_results(self, doc, results):
976
+ if callable(self.config.process_results):
977
+ return self.config.process_results(doc, results)
978
+
979
+ result_dict = {}
980
+ use_metric = list(self._metric_fn_list.keys())
981
+ if self.OUTPUT_TYPE == "loglikelihood":
982
+ results = results[0]
983
+ ll, is_greedy = results
984
+ return {
985
+ **({"perplexity": ll} if "perplexity" in use_metric else {}),
986
+ **({"acc": int(is_greedy)} if "acc" in use_metric else {}),
987
+ }
988
+ elif self.OUTPUT_TYPE == "multiple_choice":
989
+ lls, is_greedy = zip(*results)
990
+
991
+ # retrieve choices in List[str] form, to compute choice lengths, etc.
992
+ choices = self.doc_to_choice(doc)
993
+ completion_len = np.array([float(len(i)) for i in choices])
994
+
995
+ if 2 * len(choices) == len(lls) and "acc_mutual_info" in self._metric_fn_list.keys():
996
+ # then we are doing mutual info.
997
+ # this stores the "dryrun" / unconditional answer loglikelihoods
998
+ lls_unconditional = lls[1::2]
999
+ assert len(lls_unconditional) == len(choices)
1000
+ # and this stores our "regular" conditional loglikelihoods
1001
+ lls = lls[::2]
1002
+
1003
+ pred = np.argmax(lls)
1004
+ pred_norm = np.argmax(lls / completion_len)
1005
+
1006
+ if self.multiple_input:
1007
+ gold = self.doc_to_text(doc)
1008
+ else:
1009
+ gold = self.doc_to_target(doc)
1010
+
1011
+ gold_index_error = False
1012
+ if type(gold) is list:
1013
+ gold = [i if i < len(choices) else -100 for i in gold]
1014
+ if -100 in gold:
1015
+ gold_index_error = True
1016
+ else:
1017
+ if type(gold) is int:
1018
+ gold = gold if gold < len(choices) else -100
1019
+ elif type(gold) is str:
1020
+ gold = choices.index(gold) if gold in choices else -100
1021
+
1022
+ if gold == -100:
1023
+ gold_index_error = True
1024
+
1025
+ if gold_index_error:
1026
+ eval_logger.warning(f"Label index was not in within range of available choices," f"Sample:\n\n{doc}\n\n")
1027
+
1028
+ if self.multiple_target:
1029
+ acc = 1.0 if pred in gold else 0.0
1030
+ acc_norm = 1.0 if pred_norm in gold else 0.0
1031
+ exact_match = int(any([is_greedy[i] if i != -100 else 0 for i in gold]))
1032
+ else:
1033
+ acc = 1.0 if pred == gold else 0.0
1034
+ acc_norm = 1.0 if pred_norm == gold else 0.0
1035
+ # TODO: this gets score of 0 on arc_challenge for pythia-70m. need to test that this works properly
1036
+ exact_match = int(is_greedy[gold]) if gold != -100 else 0
1037
+
1038
+ result_dict = {
1039
+ **({"acc": acc} if "acc" in use_metric else {}),
1040
+ **({"f1": (gold, pred)} if "f1" in use_metric else {}),
1041
+ **({"mcc": (gold, pred)} if "mcc" in use_metric else {}),
1042
+ **({"acc_norm": acc_norm} if "acc_norm" in use_metric else {}),
1043
+ **({"exact_match": exact_match} if "exact_match" in use_metric else {}),
1044
+ }
1045
+
1046
+ if "acc_mutual_info" in use_metric:
1047
+ lls_mutual_info = [ll_c - ll_u for ll_c, ll_u in zip(lls, lls_unconditional)]
1048
+ acc_mutual_info = 1.0 if np.argmax(lls_mutual_info) == gold else 0.0
1049
+ result_dict["acc_mutual_info"] = acc_mutual_info
1050
+
1051
+ elif self.OUTPUT_TYPE == "generate_until":
1052
+ gold = self.doc_to_target(doc)
1053
+ result = results[0]
1054
+ if self.config.doc_to_choice is not None:
1055
+ # If you set doc_to_choice,
1056
+ # it assumes that doc_to_target returns a number.
1057
+ choices = self.doc_to_choice(doc)
1058
+ gold = choices[gold]
1059
+ # we expect multiple_targets to be a list.
1060
+ elif self.multiple_target:
1061
+ gold = list(gold)
1062
+ elif type(gold) != type(result):
1063
+ # cast gold to the same type as result
1064
+ gold = type(result)(gold)
1065
+
1066
+ for metric in self._metric_fn_list.keys():
1067
+ if self.multiple_target:
1068
+ # in the case where we have multiple targets,
1069
+ # return true if any are true
1070
+ # TODO: this may break for multiple_target, non zero-or-1 metrics
1071
+ scores = []
1072
+ if not isinstance(gold, list):
1073
+ # sometimes, a multiple_target dataset has exceptions where one doc has only one string answer
1074
+ # print(gold)
1075
+ gold = [gold]
1076
+ for gold_option in gold:
1077
+ try:
1078
+ result_score = self._metric_fn_list[metric](
1079
+ references=[gold_option],
1080
+ predictions=[result],
1081
+ **self._metric_fn_kwargs[metric],
1082
+ )
1083
+ except TypeError: # TODO: this is hacky and I don't want to do it
1084
+ result_score = self._metric_fn_list[metric]([gold_option, result])
1085
+ if isinstance(result_score, dict):
1086
+ # TODO: this handles the case where HF evaluate returns a dict.
1087
+ result_score = result_score[metric]
1088
+ scores.append(result_score)
1089
+ if any(scores):
1090
+ result_score = 1.0
1091
+ else:
1092
+ result_score = 0.0
1093
+ else:
1094
+ try:
1095
+ result_score = self._metric_fn_list[metric](
1096
+ references=[gold],
1097
+ predictions=[result],
1098
+ **self._metric_fn_kwargs[metric],
1099
+ )
1100
+ except TypeError: # needed for now in order to use a different interface between our own metrics and HF Evaluate metrics
1101
+ result_score = self._metric_fn_list[metric]([gold, result])
1102
+ if isinstance(result_score, dict):
1103
+ # TODO: this handles the case where HF evaluate returns a dict.
1104
+ result_score = result_score[metric]
1105
+ result_dict[metric] = result_score
1106
+ else:
1107
+ raise ValueError(
1108
+ f"Passed invalid output_type '{self.OUTPUT_TYPE}' ! Please use one of ",
1109
+ "'loglikelihood','generate_until' or 'multiple_choice'",
1110
+ )
1111
+
1112
+ return result_dict
1113
+
1114
+ def aggregation(self):
1115
+ return self._aggregation_list
1116
+
1117
+ def higher_is_better(self):
1118
+ return self._higher_is_better
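To make the `acc_mutual_info` branch of `process_results` concrete, here is a minimal numeric sketch; the log-likelihood values and the gold index are made up, only the normalization mirrors the code above.

import numpy as np

lls = [-2.1, -0.7, -3.5]                # log P(choice | ctx) from the conditional requests
lls_unconditional = [-1.9, -2.4, -3.0]  # log P(choice) from the extra ("", choice) requests
gold = 1                                # assumed index of the correct choice

# acc_mutual_info scores each choice by log P(choice|ctx) - log P(choice)
lls_mutual_info = [c - u for c, u in zip(lls, lls_unconditional)]
acc_mutual_info = 1.0 if int(np.argmax(lls_mutual_info)) == gold else 0.0  # -> 1.0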
EAGLE/lmms_eval/filters/__init__.py ADDED
@@ -0,0 +1,44 @@
1
+ from lmms_eval.api.filter import FilterEnsemble
2
+ from . import selection
3
+ from . import extraction
4
+ from . import transformation
5
+
6
+
7
+ FILTER_REGISTRY = {
8
+ "take_first": selection.TakeFirstFilter,
9
+ "regex": extraction.RegexFilter,
10
+ "majority_vote": selection.MajorityVoteFilter,
11
+ "take_first_k": selection.TakeKFilter,
12
+ "remove_whitespace": extraction.WhitespaceFilter,
13
+ "lowercase": transformation.LowercaseFilter,
14
+ "uppercase": transformation.UppercaseFilter,
15
+ "map": transformation.MapFilter,
16
+ # TODO: implement this filter. either it should take in an arbitrary "scoring"/reward function
17
+ # that takes an input and returns a scalar and then should select the max reward,
18
+ # or should implement different filters for different ways of handling a reward model's inference.
19
+ # "arg_max": selection.ArgMaxFilter,
20
+ }
21
+
22
+
23
+ def get_filter(filter_name):
24
+ if filter_name in FILTER_REGISTRY:
25
+ return FILTER_REGISTRY[filter_name]
26
+ else:
27
+ return filter_name
28
+
29
+
30
+ def build_filter_ensemble(filter_name, components):
31
+ """
32
+ Create a filtering pipeline.
33
+ """
34
+ filters = []
35
+ for function, kwargs in components:
36
+ if kwargs is None:
37
+ f = get_filter(function)()
38
+ else:
39
+ # create a filter given its name in the registry
40
+ f = get_filter(function)(**kwargs) # TODO: pass kwargs to filters properly
41
+ # add the filter as a pipeline step
42
+ filters.append(f)
43
+
44
+ return FilterEnsemble(name=filter_name, filters=filters)
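A small usage sketch of `build_filter_ensemble`; the pipeline name and regex below are illustrative, but the component format matches what `ConfigurableTask` assembles from a YAML `filter_list` and what the default `[["take_first", None]]` pipeline uses.

from lmms_eval.filters import build_filter_ensemble

# Extract "#### <number>" style answers, then keep only the first response per doc.
ensemble = build_filter_ensemble(
    "strict-match",
    [
        ["regex", {"regex_pattern": r"#### (\-?[0-9\.\,]+)"}],
        ["take_first", None],
    ],
)
# ensemble.apply(instances, docs) then runs both filters, in order, over the model responses.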
EAGLE/lmms_eval/filters/decontamination.py ADDED
@@ -0,0 +1,23 @@
1
+ from lmms_eval.api.filter import Filter
2
+
3
+
4
+ class DecontaminationFilter(Filter):
5
+ """
6
+ A filter which evaluates
7
+ """
8
+
9
+ name = "track_decontamination"
10
+
11
+ def __init__(self, path) -> None:
12
+ """
13
+
14
+ TODO: make sure only ever run one time on the train set (should this be cached as a class var? keyed by value for "path").
15
+ should further cache result on a given (task_name, doc_id)
16
+ """
17
+ self._decontam_results = None
18
+
19
+ def apply(self, resps, docs) -> None:
20
+ """
21
+ Return {"no_contamination", "only_contamination"} keys for the 2 different subsets
22
+ """
23
+ pass
EAGLE/lmms_eval/filters/extraction.py ADDED
@@ -0,0 +1,60 @@
1
+ import re
2
+
3
+ from lmms_eval.api.filter import Filter
4
+
5
+
6
+ class RegexFilter(Filter):
7
+ """ """
8
+
9
+ def __init__(self, regex_pattern: str = r"#### (\-?[0-9\.\,]+)", fallback: str = "[invalid]") -> None:
10
+ """
11
+ pass a string `regex` to run `re.compile(r"regex")` on.
12
+ `fallback` defines the output returned if no matches for the regex are located.
13
+ """
14
+ self.regex_pattern = regex_pattern
15
+ self.regex = re.compile(regex_pattern)
16
+ self.fallback = fallback
17
+
18
+ def apply(self, resps, docs):
19
+ # here, we assume we have a list, in which each element is
20
+ # a list of model responses for some particular input/target pair.
21
+ # so we process each of these (same input/target response sets)
22
+ # independently (and keep them a list.)
23
+ def filter_set(inst):
24
+ filtered = []
25
+ for resp in inst:
26
+ match = self.regex.search(resp)
27
+ if match:
28
+ match = match.group(1).strip()
29
+ else:
30
+ match = self.fallback
31
+ filtered.append(match)
32
+ return filtered
33
+
34
+ # print(resps)
35
+ filtered_resps = list(map(lambda x: filter_set(x), resps))
36
+ # print(filtered_resps)
37
+
38
+ return filtered_resps
39
+
40
+
41
+ class WhitespaceFilter(Filter):
42
+ """ """
43
+
44
+ def __init__(self) -> None:
45
+ pass
46
+
47
+ def apply(self, resps, docs):
48
+ def filter_set(inst):
49
+ filtered_resp = []
50
+ for resp in inst:
51
+ if resp.startswith(" "):
52
+ resp = resp[1:]
53
+
54
+ filtered_resp.append(resp)
55
+
56
+ return filtered_resp
57
+
58
+ filtered_resps = [filter_set(resp) for resp in resps]
59
+
60
+ return filtered_resps
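For reference, a quick sketch of `RegexFilter` on two hypothetical response sets; the strings are made up, the behavior follows the class above.

f = RegexFilter(regex_pattern=r"#### (\-?[0-9\.\,]+)", fallback="[invalid]")
resps = [["The total is 12 apples. #### 12"], ["I am not sure."]]
print(f.apply(resps, docs=None))  # -> [['12'], ['[invalid]']]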
EAGLE/lmms_eval/filters/selection.py ADDED
@@ -0,0 +1,48 @@
1
+ from collections import Counter
2
+
3
+ from lmms_eval.api.filter import Filter
4
+
5
+
6
+ class TakeFirstFilter(Filter):
7
+ def __init__(self) -> None:
8
+ """
9
+ Can define custom behavior here, if an individual instantiation of a Filter class should have state.
10
+ """
11
+
12
+ def apply(self, resps, docs):
13
+ """
14
+ Assuming each entry of `resps` is a list of model responses, we discard all but the first response.
15
+ """
16
+ return map(lambda r: r[0], resps)
17
+
18
+
19
+ class TakeKFilter(Filter):
20
+ def __init__(self, *args, **kwargs) -> None:
21
+ self.k = kwargs.pop("k")
22
+
23
+ super().__init__(*args, **kwargs)
24
+
25
+ def apply(self, resps, docs):
26
+ # check we have at least k responses per doc, else we can't take the first k
27
+ assert len(resps[0]) >= self.k, f"Need at least {self.k} responses per doc to take first {self.k}, but got {len(resps[0])} only! Please increase TaskConfig.repeats ."
28
+ return map(lambda r: r[: self.k], resps)
29
+
30
+
31
+ class MajorityVoteFilter(Filter):
32
+ def __init__(self) -> None:
33
+ """
34
+ Can define custom behavior here, if an individual instantiation of a Filter class should have state.
35
+ """
36
+
37
+ def apply(self, resps, docs):
38
+ """
39
+ Each entry of `resps` is a list of model responses.
40
+ We select the response that occurs most frequently in each entry of `resps`.
41
+ """
42
+
43
+ def select_majority(resp):
44
+ counts = Counter(resp)
45
+ vote = counts.most_common(1)[0][0]
46
+ return vote
47
+
48
+ return map(lambda r: [select_majority(r)], resps)
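A short sketch of `MajorityVoteFilter` on made-up repeated samples; note that `apply` returns a lazy `map`, so it is materialized here for printing.

mv = MajorityVoteFilter()
resps = [["B", "A", "B"], ["C", "C", "D"]]
print(list(mv.apply(resps, docs=None)))  # -> [['B'], ['C']]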
EAGLE/lmms_eval/filters/transformation.py ADDED
@@ -0,0 +1,48 @@
1
+ from lmms_eval.api.filter import Filter
2
+
3
+
4
+ class LowercaseFilter(Filter):
5
+ def __init__(self) -> None:
6
+ pass
7
+
8
+ def apply(self, resps, docs):
9
+ def filter_set(inst):
10
+ return [resp.lower() for resp in inst]
11
+
12
+ return [filter_set(resp) for resp in resps]
13
+
14
+
15
+ class UppercaseFilter(Filter):
16
+ def __init__(self) -> None:
17
+ pass
18
+
19
+ def apply(self, resps, docs):
20
+ def filter_set(inst):
21
+ return [resp.upper() for resp in inst]
22
+
23
+ return [filter_set(resp) for resp in resps]
24
+
25
+
26
+ class MapFilter(Filter):
27
+ def __init__(self, mapping_dict: dict = {}, default_value=None) -> None:
28
+ """
29
+ Initializes the MapFilter with a given mapping dictionary and default value.
30
+
31
+ Args:
32
+ - mapping_dict (dict): A dictionary containing the key-value mappings.
33
+ Default is an empty dictionary.
34
+ - default_value (Any): The value to be returned when a key is not found in the mapping_dict.
35
+ Default is None.
36
+
37
+ Example:
38
+ mapper = MapFilter({'A': 1, 'B': 2}, default_value=0)
39
+ """
40
+ assert isinstance(mapping_dict, dict), "Provided mapping_dict is not a dictionary"
41
+ self.mapping_dict = mapping_dict
42
+ self.default_value = default_value
43
+
44
+ def apply(self, resps, docs):
45
+ def filter_set(inst):
46
+ return [self.mapping_dict.get(resp, self.default_value) for resp in inst]
47
+
48
+ return [filter_set(resp) for resp in resps]
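Extending the docstring example above, a sketch of `MapFilter` over nested responses; the mapping and inputs are illustrative.

mapper = MapFilter({"A": 1, "B": 2}, default_value=0)
resps = [["A", "C"], ["B"]]
print(mapper.apply(resps, docs=None))  # -> [[1, 0], [2]]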
EAGLE/lmms_eval/models/__init__.py ADDED
@@ -0,0 +1,16 @@
1
+ import os
2
+
3
+ AVAILABLE_MODELS = {
4
+ "eagle": "Eagle",
5
+ }
6
+
7
+ for model_name, model_class in AVAILABLE_MODELS.items():
8
+ try:
9
+ exec(f"from .{model_name} import {model_class}")
10
+ except ImportError:
11
+ pass
12
+
13
+
14
+ import hf_transfer
15
+
16
+ os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
EAGLE/lmms_eval/models/eagle.py ADDED
@@ -0,0 +1,415 @@
1
+ import torch
2
+ from PIL import Image
3
+
4
+ torch.backends.cuda.matmul.allow_tf32 = True
5
+
6
+ import logging
7
+ import copy
8
+ from tqdm import tqdm
9
+ from datetime import timedelta
10
+
11
+ from lmms_eval import utils
12
+ from lmms_eval.api.instance import Instance
13
+ from lmms_eval.api.model import lmms
14
+ from lmms_eval.api.registry import register_model
15
+ from lmms_eval.utils import stop_sequences_criteria
16
+
17
+ from accelerate import Accelerator, DistributedType, InitProcessGroupKwargs
18
+ from accelerate.state import AcceleratorState
19
+ from typing import List, Optional, Union, Tuple
20
+ import warnings
21
+
22
+ warnings.filterwarnings("ignore")
23
+
24
+ eval_logger = logging.getLogger("lmms-eval")
25
+
26
+ try:
27
+ from eagle.model.builder import load_pretrained_model
28
+ from eagle.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token
29
+ from eagle.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, IGNORE_INDEX
30
+ from eagle.conversation import conv_templates, SeparatorStyle
31
+ except ImportError:
32
+ eval_logger.error("Please add a symbolic link pointing to the eagle folder of repo ")
33
+
34
+ from transformers.integrations.deepspeed import (
35
+ is_deepspeed_zero3_enabled,
36
+ set_hf_deepspeed_config,
37
+ unset_hf_deepspeed_config,
38
+ )
39
+
40
+ def resize_image_with_aspect_ratio(img, min_size):
41
+ """
42
+ Resize an image while maintaining its aspect ratio.
43
+
44
+ Parameters:
45
+ - image_path: str, path to the input image.
46
+ - min_size: int, the minimum size for the shortest side of the image.
47
+
48
+ Returns:
49
+ - resized_image: PIL.Image object, the resized image.
50
+ """
51
+ # Get the original dimensions of the image
52
+ original_width, original_height = img.size
53
+
54
+ # Determine the aspect ratio
55
+ aspect_ratio = original_width / original_height
56
+
57
+ # Calculate new dimensions based on the shortest side
58
+ if original_width < original_height:
59
+ new_width = min_size
60
+ new_height = int(min_size / aspect_ratio)
61
+ else:
62
+ new_height = min_size
63
+ new_width = int(min_size * aspect_ratio)
64
+
65
+ # Resize the image while maintaining aspect ratio
66
+ resized_image = img.resize((new_width, new_height), Image.LANCZOS)  # Image.ANTIALIAS is deprecated in recent Pillow
67
+
68
+ return resized_image
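# Quick sanity check of the branches above (illustrative numbers only): a 1200x800 image
# resized with min_size=448 keeps its 3:2 aspect ratio and comes back as 672x448:
#     resize_image_with_aspect_ratio(Image.new("RGB", (1200, 800)), 448).size  # -> (672, 448)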
69
+
70
+
71
+ @register_model("eagle")
72
+ class Eagle(lmms):
73
+ """
74
+ EAGLE Model
75
+ """
76
+
77
+ def __init__(
78
+ self,
79
+ pretrained: str = "NVEagle/Eagle-X5-7B",
80
+ truncation: Optional[bool] = True,
81
+ device: Optional[str] = "cuda",
82
+ dtype: Optional[Union[str, torch.dtype]] = "",
83
+ batch_size: Optional[Union[int, str]] = 1,
84
+ trust_remote_code: Optional[bool] = False,
85
+ revision=None,
86
+ use_flash_attention_2=True,
87
+ device_map="",
88
+ conv_template="vicuna_v1",
89
+ use_cache=True,
90
+ truncate_context=False,
91
+ **kwargs,
92
+ ) -> None:
93
+ super().__init__()
94
+ # Do not use kwargs for now
95
+ assert kwargs == {}, f"Unexpected kwargs: {kwargs}"
96
+
97
+ accelerator_kwargs = InitProcessGroupKwargs(timeout=timedelta(weeks=52))
98
+ accelerator = Accelerator(kwargs_handlers=[accelerator_kwargs])
99
+ if accelerator.num_processes > 1 and device_map == "":
100
+ self._device = torch.device(f"cuda:{accelerator.local_process_index}")
101
+ self.device_map = f"cuda:{accelerator.local_process_index}"
102
+ else:
103
+ self._device = torch.device(device)
104
+ self.device_map = device_map
105
+
106
+ self._tokenizer, self._model, self._image_processor, self._max_length = load_pretrained_model(pretrained, None, get_model_name_from_path(pretrained), device_map=self.device_map, use_flash_attention_2=use_flash_attention_2)
107
+ self._config = self._model.config
108
+ self.model.eval()
109
+ self.model.tie_weights()
110
+ self.truncation = truncation
111
+ self.batch_size_per_gpu = int(batch_size)
112
+ self.conv_template = conv_template
113
+ self.use_cache = use_cache
114
+ self.truncate_context = truncate_context
115
+
116
+ if accelerator.num_processes > 1 and device_map == "":
117
+ assert accelerator.distributed_type in [DistributedType.FSDP, DistributedType.MULTI_GPU, DistributedType.DEEPSPEED], "Unsupported distributed type provided. Only DDP and FSDP are supported."
118
+ # If you want to use DistributedType.DEEPSPEED, you have to run accelerate config before using the model
119
+ # Also, you have to select zero stage 0 (equivalent to DDP) in order to make the prepare model works
120
+ # I tried to set different parameters in the kwargs to let default zero 2 stage works, but it didn't work.
121
+ if accelerator.distributed_type == DistributedType.DEEPSPEED:
122
+ kwargs = {
123
+ "train_micro_batch_size_per_gpu": self.batch_size_per_gpu,
124
+ "train_batch_size": self.batch_size_per_gpu * accelerator.num_processes,
125
+ }
126
+ AcceleratorState().deepspeed_plugin.deepspeed_config_process(must_match=True, **kwargs)
127
+ eval_logger.info("Detected that you are using DistributedType.DEEPSPEED. Make sure you run `accelerate config` and set zero stage to 0")
128
+
129
+ if accelerator.distributed_type == DistributedType.FSDP or accelerator.distributed_type == DistributedType.DEEPSPEED:
130
+ self._model = accelerator.prepare(self.model)
131
+ else:
132
+ self._model = accelerator.prepare_model(self.model, evaluation_mode=True)
133
+ self.accelerator = accelerator
134
+ if self.accelerator.is_local_main_process:
135
+ eval_logger.info(f"Using {accelerator.num_processes} devices with data parallelism")
136
+ self._rank = self.accelerator.local_process_index
137
+ self._world_size = self.accelerator.num_processes
138
+ elif accelerator.num_processes == 1 and device_map == "auto":
139
+ eval_logger.info(f"Using {accelerator.num_processes} devices with tensor parallelism")
140
+ self._rank = 0
141
+ self._world_size = 1
142
+ else:
143
+ eval_logger.info(f"Using single device: {self._device}")
144
+ self.model.to(self._device)
145
+ self._rank = 0
146
+ self._world_size = 1
147
+
148
+ @property
149
+ def config(self):
150
+ # return the associated transformers.AutoConfig for the given pretrained model.
151
+ return self._config
152
+
153
+ @property
154
+ def tokenizer(self):
155
+ return self._tokenizer
156
+
157
+ @property
158
+ def model(self):
159
+ # returns the model, unwrapping it if using Accelerate
160
+ if hasattr(self, "accelerator"):
161
+ return self.accelerator.unwrap_model(self._model)
162
+ else:
163
+ return self._model
164
+
165
+ @property
166
+ def eot_token_id(self):
167
+ # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
168
+ return self.tokenizer.eos_token_id
169
+
170
+ @property
171
+ def max_length(self):
172
+ return self._max_length
173
+
174
+ def pad_sequence(self, input_ids, batch_first, padding_value):
175
+ if self.tokenizer.padding_side == "left":
176
+ input_ids = [torch.flip(_input_ids, [0]) for _input_ids in input_ids]
177
+ input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=batch_first, padding_value=padding_value)
178
+ if self.tokenizer.padding_side == "left":
179
+ input_ids = torch.flip(input_ids, [1])
180
+ return input_ids
181
+
182
+ @property
183
+ def batch_size(self):
184
+ return self.batch_size_per_gpu
185
+
186
+ @property
187
+ def device(self):
188
+ return self._device
189
+
190
+ @property
191
+ def rank(self):
192
+ return self._rank
193
+
194
+ @property
195
+ def world_size(self):
196
+ return self._world_size
197
+
198
+ def tok_encode(self, string: str, left_truncate_len=None, add_special_tokens=None) -> List[int]:
199
+ """ """
200
+ add_special_tokens = False if add_special_tokens is None else add_special_tokens
201
+ encoding = self.tokenizer.encode(string, add_special_tokens=add_special_tokens)
202
+ # left-truncate the encoded context to be at most `left_truncate_len` tokens long
203
+ if left_truncate_len:
204
+ encoding = encoding[-left_truncate_len:]
205
+ return encoding
206
+
207
+ def tok_decode(self, tokens):
208
+ return self.tokenizer.decode(tokens)
209
+
210
+ def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
211
+ # TODO
212
+ res = []
213
+ pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding")
214
+
215
+ for contexts, doc_to_target, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]:
216
+ # encode, pad, and truncate contexts for this batch
217
+ if type(doc_to_target) == str:
218
+ continuation = doc_to_target
219
+ else:
220
+ continuation = doc_to_target(self.task_dict[task][split][doc_id])
221
+ visuals = [doc_to_visual(self.task_dict[task][split][doc_id])]
222
+ visuals = self.flatten(visuals)
223
+ if visuals:
224
+ image = process_images(visuals, self._image_processor, self._config)
225
+ if type(image) is list:
226
+ image = [_image.to(dtype=torch.float16, device=self.device) for _image in image]
227
+ else:
228
+ image = image.to(dtype=torch.float16, device=self.device)
229
+ else:
230
+ image = None
231
+
232
+ prompts_input = contexts[0]
233
+
234
+ if image is not None and len(image) != 0 and DEFAULT_IMAGE_TOKEN not in prompts_input:
235
+ """
236
+ Three senarios:
237
+ 1. No image, and there for, no image token should be added.
238
+ 2. image token is already specified in the context, so we don't need to add it.
239
+ 3. image token is not specified in the context and there is image inputs, so we need to add it. In this case, we add the image token at the beginning of the context and add a new line.
240
+ """
241
+ image_tokens = [DEFAULT_IMAGE_TOKEN] * len(visuals)
242
+ image_tokens = " ".join(image_tokens)
243
+ prompts_input = image_tokens + "\n" + contexts[0]
244
+
245
+ conv = conv_templates[self.conv_template].copy()
246
+ conv.append_message(conv.roles[0], prompts_input)
247
+ conv.append_message(conv.roles[1], None)
248
+ prompt = conv.get_prompt()
249
+ pad_token_id = self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id
250
+ contxt_id = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(self.device)
251
+ # Add the answer of the second role
252
+ conv.messages[1][1] = continuation
253
+
254
+ prompt = conv.get_prompt()
255
+ input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(self.device)
256
+ labels = input_ids.clone()
257
+ # Context part no need to calculate for loss
258
+ labels[0, : contxt_id.shape[1]] = -100
259
+ with torch.inference_mode():
260
+ outputs = self.model(input_ids=input_ids, labels=labels, images=image, use_cache=True)
261
+ loss = outputs["loss"]
262
+ # loss = torch.exp(loss)
263
+ logits = outputs["logits"]
264
+ greedy_tokens = logits.argmax(dim=-1)
265
+ cont_toks = input_ids[:, contxt_id.shape[1] :] # [1, seq]
266
+ greedy_tokens = greedy_tokens[:, contxt_id.shape[1] : input_ids.shape[1]] # [1, seq]
267
+ max_equal = (greedy_tokens == cont_toks).all()
268
+ res.append((float(loss.item()), bool(max_equal)))
269
+ pbar.update(1)
270
+ pbar.close()
271
+ return res
272
+
273
+ def flatten(self, input):
274
+ new_list = []
275
+ for i in input:
276
+ for j in i:
277
+ new_list.append(j)
278
+ return new_list
279
+
280
+ def generate_until(self, requests: List[Instance]) -> List[str]:
281
+ res = []
282
+
283
+ def _collate(x):
284
+ # the negative sign on len(toks) sorts descending - this has a few advantages:
285
+ # - time estimates will always be over not underestimates, which is more useful for planning
286
+ # - to know the size of a batch when going through the list, you know the first one is always the batch
287
+ # padded context length. this is useful to simplify the batching logic and more importantly to make
288
+ # automatic adaptive batches much much easier to implement
289
+ # - any OOMs will happen right away rather than near the end
290
+ toks = self.tok_encode(x[0])
291
+ return -len(toks), x[0]
292
+
293
+ # we group requests by their generation_kwargs,
294
+ # so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling
295
+ # in the same batch.
296
+ re_ords = utils.Collator([reg.args for reg in requests], _collate, grouping=True)
297
+ chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None)
298
+ num_iters = len(requests) // self.batch_size if len(requests) % self.batch_size == 0 else len(requests) // self.batch_size + 1
299
+ pbar = tqdm(total=num_iters, disable=(self.rank != 0), desc="Model Responding")
300
+ for chunk in chunks:
301
+ contexts, all_gen_kwargs, doc_to_visual, doc_id, task, split = zip(*chunk)
302
+ task = task[0]
303
+ split = split[0]
304
+ visuals = [doc_to_visual[0](self.task_dict[task][split][ids]) for ids in doc_id]
305
+ visuals = self.flatten(visuals)
306
+ # we assume all gen kwargs in the batch are the same
307
+ # this is safe to assume because the `grouper` object ensures it.
308
+ gen_kwargs = all_gen_kwargs[0]
309
+
310
+ # Set default values for until and max_new_tokens
311
+ until = [self.tok_decode(self.eot_token_id)]
312
+
313
+ # Update values from gen_kwargs if present
314
+ if "until" in gen_kwargs:
315
+ until = gen_kwargs.pop("until")
316
+ if isinstance(until, str):
317
+ until = [until]
318
+ elif not isinstance(until, list):
319
+ raise ValueError(f"Expected `gen_kwargs['until']` to be of type Union[str,list] but got {type(until)}")
320
+
321
+ if "image_aspect_ratio" in gen_kwargs.keys() and "image_aspect_ratio" not in self._config.__dict__:
322
+ # here we should pop it out of gen_kwargs so that it doesn't get passed to the model for next step of generation
323
+ self._config.image_aspect_ratio = gen_kwargs.pop("image_aspect_ratio")
324
+ eval_logger.info(f"Setting image aspect ratio: {self._config.image_aspect_ratio}")
325
+
326
+ if visuals:
327
+ image_tensor = process_images(visuals, self._image_processor, self._config)
328
+ if type(image_tensor) is list:
329
+ image_tensor = [_image.to(dtype=torch.float16, device=self.device) for _image in image_tensor]
330
+ else:
331
+ image_tensor = image_tensor.to(dtype=torch.float16, device=self.device)
332
+ else:
333
+ image_tensor = None
334
+
335
+ # prompts_input = contexts[0]
336
+
337
+ question_input = []
338
+
339
+ for visual, context in zip(visuals, contexts):
340
+ if image_tensor is not None and len(image_tensor) != 0 and DEFAULT_IMAGE_TOKEN not in context:
341
+ """
342
+ Three scenarios:
343
+ 1. No image, and therefore, no image token should be added.
344
+ 2. image token is already specified in the context, so we don't need to add it.
345
+ 3. The image token is not specified in the context but there are image inputs, so we need to add it. In this case, we add the image token at the beginning of the context followed by a new line.
346
+ """
347
+ image_tokens = [DEFAULT_IMAGE_TOKEN] * len(visual) if isinstance(visual, list) else [DEFAULT_IMAGE_TOKEN]
348
+ image_tokens = " ".join(image_tokens)
349
+ question = image_tokens + "\n" + context
350
+ else:
351
+ question = context
352
+
353
+ conv = conv_templates[self.conv_template].copy()
354
+ conv.append_message(conv.roles[0], question)
355
+ conv.append_message(conv.roles[1], None)
356
+ prompt_question = conv.get_prompt()
357
+ question_input.append(prompt_question)
358
+
359
+ # The above for loop has a bug: when there are no visuals (e.g. pure text),
360
+ # the loop body never executes, so question_input ends up empty (because there are no visuals)
361
+ # and scenario 1 above is never handled; the text-only case is handled separately below.
362
+ if len(visuals) == 0:
363
+ for context in contexts:
364
+ question = context
365
+ conv = conv_templates[self.conv_template].copy()
366
+ conv.append_message(conv.roles[0], question)
367
+ conv.append_message(conv.roles[1], None)
368
+ prompt_question = conv.get_prompt()
369
+ question_input.append(prompt_question)
370
+
371
+ # input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(self.device)
372
+ # preconfigure gen_kwargs with defaults
373
+ gen_kwargs["image_sizes"] = [visuals[idx].size for idx in range(len(visuals))]
374
+ if "max_new_tokens" not in gen_kwargs:
375
+ gen_kwargs["max_new_tokens"] = 1024
376
+ if "temperature" not in gen_kwargs:
377
+ gen_kwargs["temperature"] = 0
378
+ if "top_p" not in gen_kwargs:
379
+ gen_kwargs["top_p"] = None
380
+ if "num_beams" not in gen_kwargs:
381
+ gen_kwargs["num_beams"] = 1
382
+
383
+ input_ids_list = [tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt") for prompt in question_input]
384
+ pad_token_ids = self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id
385
+ input_ids = self.pad_sequence(input_ids_list, batch_first=True, padding_value=pad_token_ids).to(self.device)
386
+ attention_masks = input_ids.ne(pad_token_ids).to(self.device)
387
+
388
+ try:
389
+ cont = self.model.generate(
390
+ input_ids,
391
+ attention_mask=attention_masks,
392
+ pad_token_id=pad_token_ids,
393
+ images=image_tensor,
394
+ image_sizes=gen_kwargs["image_sizes"],
395
+ do_sample=True if gen_kwargs["temperature"] > 0 else False,
396
+ temperature=gen_kwargs["temperature"],
397
+ top_p=gen_kwargs["top_p"],
398
+ num_beams=gen_kwargs["num_beams"],
399
+ max_new_tokens=gen_kwargs["max_new_tokens"],
400
+ use_cache=self.use_cache,
401
+ )
402
+ text_outputs = self.tokenizer.batch_decode(cont, skip_special_tokens=True)
403
+ except Exception as e:
404
+ eval_logger.error(f"Error {e} in generating")
405
+ cont = ""
406
+ text_outputs = [""]
407
+
408
+ res.extend(text_outputs)
409
+ self.cache_hook.add_partial("generate_until", (context, gen_kwargs), text_outputs)
410
+ pbar.update(1)
411
+
412
+ res = re_ords.get_original(res)
413
+
414
+ pbar.close()
415
+ return res
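The `_collate` helper above sorts requests by descending token count so that the largest batch runs first: time estimates stay conservative and any OOM surfaces immediately. A minimal standalone sketch of the same ordering, using a whitespace split as a stand-in for `tok_encode`:

def collate_key(context: str):
    toks = context.split()  # stand-in for self.tok_encode(context)
    return -len(toks), context  # negative length puts the longest contexts first

def make_batches(contexts, batch_size):
    ordered = sorted(contexts, key=collate_key)
    return [ordered[i:i + batch_size] for i in range(0, len(ordered), batch_size)]

# make_batches(["a b c d", "a", "a b"], batch_size=2) -> [["a b c d", "a b"], ["a"]]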
EAGLE/lmms_eval/models/gpt4v.py ADDED
@@ -0,0 +1,129 @@
1
+ from io import BytesIO
2
+ from copy import deepcopy
3
+ import os
4
+ import base64
5
+ from typing import List, Tuple
6
+ from tqdm import tqdm
7
+ import requests as url_requests
8
+ import time
9
+ import logging
10
+
11
+ from lmms_eval.api.instance import Instance
12
+ from lmms_eval.api.model import lmms
13
+ from lmms_eval.api.registry import register_model
14
+ from lmms_eval import utils
15
+
16
+ from PIL import Image
17
+
18
+ API_TYPE = os.getenv("API_TYPE", "openai")
19
+ NUM_SECONDS_TO_SLEEP = 5
20
+ eval_logger = logging.getLogger("lmms-eval")
21
+
22
+ if API_TYPE == "openai":
23
+ API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
24
+ API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
25
+ headers = {
26
+ "Authorization": f"Bearer {API_KEY}",
27
+ "Content-Type": "application/json",
28
+ }
29
+ elif API_TYPE == "azure":
30
+ API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
31
+ API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
32
+ headers = {
33
+ "api-key": API_KEY,
34
+ "Content-Type": "application/json",
35
+ }
36
+
37
+
38
+ @register_model("gpt4V")
39
+ class GPT4V(lmms):
40
+ def __init__(self, **kwargs) -> None:
41
+ super().__init__()
42
+ # Manually set an image token for GPT4V so that we can search for it
43
+ # and split the text and image
44
+ # Here we just use the same token as llava for convenience
45
+ self.image_token = "<image>"
46
+
47
+ # Function to encode the image
48
+ def encode_image(self, image: Image):
49
+ output_buffer = BytesIO()
50
+ image.save(output_buffer, format="JPEG")
51
+ byte_data = output_buffer.getvalue()
52
+ base64_str = base64.b64encode(byte_data).decode("utf-8")
53
+ return base64_str
54
+
55
+ def flatten(self, input):
56
+ new_list = []
57
+ for i in input:
58
+ for j in i:
59
+ new_list.append(j)
60
+ return new_list
61
+
62
+ def generate_until(self, requests) -> List[str]:
63
+ res = []
64
+ pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding")
65
+
66
+ for contexts, gen_kwargs, doc_to_visual, doc_id, task, split in [reg.args for reg in requests]:
67
+ # encode, pad, and truncate contexts for this batch
68
+ visuals = [doc_to_visual(self.task_dict[task][split][doc_id])]
69
+ visuals = self.flatten(visuals)
70
+ imgs = []
71
+ for visual in visuals:
72
+ img = self.encode_image(visual)
73
+ imgs.append(img)
74
+
75
+ payload = {"model": "gpt-4-vision-preview", "messages": []}
76
+ response_json = {"role": "user", "content": []}
77
+ # When there is no image token in the context, append the image to the text
78
+ if self.image_token not in contexts:
79
+ payload["messages"].append(deepcopy(response_json))
80
+ payload["messages"][0]["content"].append({"type": "text", "text": contexts})
81
+ for img in imgs:
82
+ payload["messages"][0]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img}"}})
83
+ else:
84
+ contexts = contexts.split(self.image_token)
85
+ for idx, img in enumerate(imgs):
86
+ payload["messages"].append(deepcopy(response_json))
87
+ payload["messages"][idx]["content"].append({"type": "text", "text": contexts[idx]})
88
+ payload["messages"][idx]["content"].append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img}"}})
89
+
90
+ # If n image tokens are in the contexts
91
+ # contexts will be split into n+1 chunks
92
+ # Manually add it into the payload
93
+ payload["messages"].append(deepcopy(response_json))
94
+ payload["messages"][-1]["content"].append({"type": "text", "text": contexts[-1]})
95
+
96
+ if "max_new_tokens" not in gen_kwargs:
97
+ gen_kwargs["max_new_tokens"] = 1024
98
+ if "temperature" not in gen_kwargs:
99
+ gen_kwargs["temperature"] = 0
100
+ if "top_p" not in gen_kwargs:
101
+ gen_kwargs["top_p"] = None
102
+ if "num_beams" not in gen_kwargs:
103
+ gen_kwargs["num_beams"] = 1
104
+
105
+ # payload["max_tokens"] = gen_kwargs["max_new_tokens"]
106
+ # payload["temperature"] = gen_kwargs["temperature"]
107
+
108
+ for attempt in range(5):
109
+ try:
110
+ response = url_requests.post(API_URL, headers=headers, json=payload, timeout=20)
111
+ response_data = response.json()
112
+
113
+ content = response_data["choices"][0]["message"]["content"].strip()
114
+ break # If successful, break out of the loop
115
+
116
+ except Exception as e:
117
+ eval_logger.info(f"Attempt {attempt + 1} failed with error: {str(e)}")
118
+ if attempt < 5 - 1: # If we have retries left, sleep and then continue to next attempt
119
+ time.sleep(NUM_SECONDS_TO_SLEEP)
120
+ else: # If this was the last attempt, log and return empty
121
+ eval_logger.error(f"All 5 attempts failed. Last error message: {str(e)}")
122
+ content = ""
123
+ res.append(content)
124
+ pbar.update(1)
125
+ return res
126
+
127
+ def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
128
+ # TODO
129
+ assert False, "GPT4V does not support loglikelihood"
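The generate_until loop above interleaves text chunks and base64-encoded images whenever the `<image>` token appears in the context. One way to express the same interleaving as a single user message (a sketch, assuming `imgs` holds base64 JPEG strings, one per image token):

def build_user_message(contexts: str, imgs, image_token="<image>"):
    chunks = contexts.split(image_token)  # n image tokens -> n + 1 text chunks
    content = []
    for text, img in zip(chunks, imgs):
        content.append({"type": "text", "text": text})
        content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img}"}})
    content.append({"type": "text", "text": chunks[-1]})  # trailing text after the last image
    return {"role": "user", "content": content}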
EAGLE/lmms_eval/tasks/__init__.py ADDED
@@ -0,0 +1,155 @@
1
+ import os
2
+ from typing import List, Union, Dict
3
+
4
+ from lmms_eval import utils
5
+
6
+ # from lmms_eval import prompts
7
+ from lmms_eval.api.task import TaskConfig, Task, ConfigurableTask
8
+ from lmms_eval.api.registry import (
9
+ register_task,
10
+ register_group,
11
+ TASK_REGISTRY,
12
+ GROUP_REGISTRY,
13
+ ALL_TASKS,
14
+ )
15
+
16
+ import logging
17
+
18
+ eval_logger = logging.getLogger("lmms-eval")
19
+
20
+
21
+ def register_configurable_task(config: Dict[str, str]) -> int:
22
+ SubClass = type(
23
+ config["task"] + "ConfigurableTask",
24
+ (ConfigurableTask,),
25
+ {"CONFIG": TaskConfig(**config)},
26
+ )
27
+
28
+ if "task" in config:
29
+ task_name = "{}".format(config["task"])
30
+ register_task(task_name)(SubClass)
31
+
32
+ if "group" in config:
33
+ if config["group"] == config["task"]:
34
+ raise ValueError("task and group name cannot be the same")
35
+ elif type(config["group"]) == str:
36
+ group_name = [config["group"]]
37
+ else:
38
+ group_name = config["group"]
39
+
40
+ for group in group_name:
41
+ register_group(group)(SubClass)
42
+
43
+ return 0
44
+
45
+
46
+ def register_configurable_group(config: Dict[str, str]) -> int:
47
+ group = config["group"]
48
+ task_list = config["task"]
49
+ task_names = utils.pattern_match(task_list, ALL_TASKS)
50
+ for task in task_names:
51
+ if (task in TASK_REGISTRY) or (task in GROUP_REGISTRY):
52
+ if group in GROUP_REGISTRY:
53
+ GROUP_REGISTRY[group].append(task)
54
+ else:
55
+ GROUP_REGISTRY[group] = [task]
56
+ ALL_TASKS.add(group)
57
+ return 0
58
+
59
+
60
+ def get_task_name_from_config(task_config: Dict[str, str]) -> str:
61
+ if "dataset_name" in task_config:
62
+ return "{dataset_path}_{dataset_name}".format(**task_config)
63
+ else:
64
+ return "{dataset_path}".format(**task_config)
65
+
66
+
67
+ def include_task_folder(task_dir: str, register_task: bool = True) -> None:
68
+ """
69
+ Calling this function walks `task_dir` and registers every task or group defined in a YAML config found there.
70
+ """
71
+ for root, subdirs, file_list in os.walk(task_dir):
72
+ # if (subdirs == [] or subdirs == ["__pycache__"]) and (len(file_list) > 0):
73
+ for f in file_list:
74
+ if f.endswith(".yaml"):
75
+ yaml_path = os.path.join(root, f)
76
+ try:
77
+ config = utils.load_yaml_config(yaml_path)
78
+
79
+ if "task" not in config:
80
+ continue
81
+
82
+ if register_task:
83
+ if type(config["task"]) == str:
84
+ register_configurable_task(config)
85
+ else:
86
+ if type(config["task"]) == list:
87
+ register_configurable_group(config)
88
+
89
+ # Log this silently and show it only when
90
+ # the user defines the appropriate verbosity.
91
+ except ModuleNotFoundError as e:
92
+ eval_logger.debug(f"{yaml_path}: {e}. Config will not be added to registry.")
93
+ except Exception as error:
94
+ import traceback
95
+
96
+ eval_logger.debug(f"Failed to load config in {yaml_path}. Config will not be added to registry\n" f"Error: {error}\n" f"Traceback: {traceback.format_exc()}")
97
+ return 0
98
+
99
+
100
+ def include_path(task_dir):
101
+ include_task_folder(task_dir)
102
+ # Register Benchmarks after all tasks have been added
103
+ include_task_folder(task_dir, register_task=False)
104
+ return 0
105
+
106
+
107
+ def initialize_tasks(verbosity="INFO"):
108
+ eval_logger.setLevel(getattr(logging, f"{verbosity}"))
109
+ task_dir = os.path.dirname(os.path.abspath(__file__)) + "/"
110
+ include_path(task_dir)
111
+
112
+
113
+ def get_task(task_name, model_name):
114
+ try:
115
+ return TASK_REGISTRY[task_name](model_name=model_name)
116
+ except KeyError:
117
+ eval_logger.info("Available tasks:")
118
+ eval_logger.info(list(TASK_REGISTRY) + list(GROUP_REGISTRY))
119
+ raise KeyError(f"Missing task {task_name}")
120
+
121
+
122
+ def get_task_name_from_object(task_object):
123
+ for name, class_ in TASK_REGISTRY.items():
124
+ if class_ is task_object:
125
+ return name
126
+
127
+ # TODO: scrap this
128
+ # this gives a mechanism for non-registered tasks to have a custom name anyways when reporting
129
+ return task_object.EVAL_HARNESS_NAME if hasattr(task_object, "EVAL_HARNESS_NAME") else type(task_object).__name__
130
+
131
+
132
+ # TODO: pass num_fewshot and other cmdline overrides in a better way
133
+ def get_task_dict(task_name_list: List[Union[str, Dict, Task]], model_name: str):
134
+ all_task_dict = {}
135
+
136
+ # Ensure task_name_list is a list to simplify processing
137
+ if not isinstance(task_name_list, list):
138
+ task_name_list = [task_name_list]
139
+
140
+ for task_element in task_name_list:
141
+ if isinstance(task_element, str) and task_element in GROUP_REGISTRY:
142
+ group_name = task_element
143
+ for task_name in GROUP_REGISTRY[task_element]:
144
+ if task_name not in all_task_dict:
145
+ # Recursively get the task dictionary for nested groups
146
+ task_obj = get_task_dict([task_name], model_name)
147
+ # Merge the dictionaries
148
+ all_task_dict.update({task_name: (group_name, task_obj.get(task_name, None))})
149
+ else:
150
+ task_name = task_element if isinstance(task_element, str) else task_element.EVAL_HARNESS_NAME
151
+ if task_name not in all_task_dict:
152
+ task_obj = get_task(task_name=task_name, model_name=model_name)
153
+ all_task_dict[task_name] = task_obj
154
+
155
+ return all_task_dict
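The registry above is populated once at startup and then queried by task name. A hypothetical driver using the functions defined in this module (the model name is illustrative):

from lmms_eval.tasks import initialize_tasks, get_task_dict

initialize_tasks(verbosity="INFO")  # walk the tasks folder and register every YAML config
task_dict = get_task_dict(["gqa"], model_name="eagle")  # "gqa" is one of the bundled tasks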
EAGLE/lmms_eval/tasks/_task_utils/file_utils.py ADDED
@@ -0,0 +1,8 @@
1
+ import os
2
+
3
+
4
+ def generate_submission_file(file_name, args, subpath="submissions"):
5
+ path = os.path.join(args.output_path, subpath)
6
+ os.makedirs(path, exist_ok=True)
7
+ path = os.path.join(path, file_name)
8
+ return os.path.abspath(path)
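A small usage sketch for `generate_submission_file`; the only requirement on `args` is an `output_path` attribute (the `SimpleNamespace` stand-in is illustrative):

from types import SimpleNamespace
from lmms_eval.tasks._task_utils.file_utils import generate_submission_file

args = SimpleNamespace(output_path="./logs")
path = generate_submission_file("cmmmu_test_for_submission.jsonl", args)
# -> absolute path to ./logs/submissions/cmmmu_test_for_submission.jsonl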
EAGLE/lmms_eval/tasks/_task_utils/gpt_eval_utils.py ADDED
File without changes
EAGLE/lmms_eval/tasks/_task_utils/vqa_eval_metric.py ADDED
@@ -0,0 +1,213 @@
1
+ import re
2
+
3
+
4
+ class EvalAIAnswerProcessor:
5
+ """
6
+ Processes an answer similar to Eval AI
7
+ copied from
8
+ https://github.com/facebookresearch/mmf/blob/c46b3b3391275b4181567db80943473a89ab98ab/pythia/tasks/processors.py#L897
9
+ """
10
+
11
+ CONTRACTIONS = {
12
+ "aint": "ain't",
13
+ "arent": "aren't",
14
+ "cant": "can't",
15
+ "couldve": "could've",
16
+ "couldnt": "couldn't",
17
+ "couldn'tve": "couldn't've",
18
+ "couldnt've": "couldn't've",
19
+ "didnt": "didn't",
20
+ "doesnt": "doesn't",
21
+ "dont": "don't",
22
+ "hadnt": "hadn't",
23
+ "hadnt've": "hadn't've",
24
+ "hadn'tve": "hadn't've",
25
+ "hasnt": "hasn't",
26
+ "havent": "haven't",
27
+ "hed": "he'd",
28
+ "hed've": "he'd've",
29
+ "he'dve": "he'd've",
30
+ "hes": "he's",
31
+ "howd": "how'd",
32
+ "howll": "how'll",
33
+ "hows": "how's",
34
+ "Id've": "I'd've",
35
+ "I'dve": "I'd've",
36
+ "Im": "I'm",
37
+ "Ive": "I've",
38
+ "isnt": "isn't",
39
+ "itd": "it'd",
40
+ "itd've": "it'd've",
41
+ "it'dve": "it'd've",
42
+ "itll": "it'll",
43
+ "let's": "let's",
44
+ "maam": "ma'am",
45
+ "mightnt": "mightn't",
46
+ "mightnt've": "mightn't've",
47
+ "mightn'tve": "mightn't've",
48
+ "mightve": "might've",
49
+ "mustnt": "mustn't",
50
+ "mustve": "must've",
51
+ "neednt": "needn't",
52
+ "notve": "not've",
53
+ "oclock": "o'clock",
54
+ "oughtnt": "oughtn't",
55
+ "ow's'at": "'ow's'at",
56
+ "'ows'at": "'ow's'at",
57
+ "'ow'sat": "'ow's'at",
58
+ "shant": "shan't",
59
+ "shed've": "she'd've",
60
+ "she'dve": "she'd've",
61
+ "she's": "she's",
62
+ "shouldve": "should've",
63
+ "shouldnt": "shouldn't",
64
+ "shouldnt've": "shouldn't've",
65
+ "shouldn'tve": "shouldn't've",
66
+ "somebody'd": "somebodyd",
67
+ "somebodyd've": "somebody'd've",
68
+ "somebody'dve": "somebody'd've",
69
+ "somebodyll": "somebody'll",
70
+ "somebodys": "somebody's",
71
+ "someoned": "someone'd",
72
+ "someoned've": "someone'd've",
73
+ "someone'dve": "someone'd've",
74
+ "someonell": "someone'll",
75
+ "someones": "someone's",
76
+ "somethingd": "something'd",
77
+ "somethingd've": "something'd've",
78
+ "something'dve": "something'd've",
79
+ "somethingll": "something'll",
80
+ "thats": "that's",
81
+ "thered": "there'd",
82
+ "thered've": "there'd've",
83
+ "there'dve": "there'd've",
84
+ "therere": "there're",
85
+ "theres": "there's",
86
+ "theyd": "they'd",
87
+ "theyd've": "they'd've",
88
+ "they'dve": "they'd've",
89
+ "theyll": "they'll",
90
+ "theyre": "they're",
91
+ "theyve": "they've",
92
+ "twas": "'twas",
93
+ "wasnt": "wasn't",
94
+ "wed've": "we'd've",
95
+ "we'dve": "we'd've",
96
+ "weve": "we've",
97
+ "werent": "weren't",
98
+ "whatll": "what'll",
99
+ "whatre": "what're",
100
+ "whats": "what's",
101
+ "whatve": "what've",
102
+ "whens": "when's",
103
+ "whered": "where'd",
104
+ "wheres": "where's",
105
+ "whereve": "where've",
106
+ "whod": "who'd",
107
+ "whod've": "who'd've",
108
+ "who'dve": "who'd've",
109
+ "wholl": "who'll",
110
+ "whos": "who's",
111
+ "whove": "who've",
112
+ "whyll": "why'll",
113
+ "whyre": "why're",
114
+ "whys": "why's",
115
+ "wont": "won't",
116
+ "wouldve": "would've",
117
+ "wouldnt": "wouldn't",
118
+ "wouldnt've": "wouldn't've",
119
+ "wouldn'tve": "wouldn't've",
120
+ "yall": "y'all",
121
+ "yall'll": "y'all'll",
122
+ "y'allll": "y'all'll",
123
+ "yall'd've": "y'all'd've",
124
+ "y'alld've": "y'all'd've",
125
+ "y'all'dve": "y'all'd've",
126
+ "youd": "you'd",
127
+ "youd've": "you'd've",
128
+ "you'dve": "you'd've",
129
+ "youll": "you'll",
130
+ "youre": "you're",
131
+ "youve": "you've",
132
+ }
133
+
134
+ NUMBER_MAP = {
135
+ "none": "0",
136
+ "zero": "0",
137
+ "one": "1",
138
+ "two": "2",
139
+ "three": "3",
140
+ "four": "4",
141
+ "five": "5",
142
+ "six": "6",
143
+ "seven": "7",
144
+ "eight": "8",
145
+ "nine": "9",
146
+ "ten": "10",
147
+ }
148
+ ARTICLES = ["a", "an", "the"]
149
+ PERIOD_STRIP = re.compile(r"(?!<=\d)(\.)(?!\d)")
150
+ COMMA_STRIP = re.compile(r"(?<=\d)(\,)+(?=\d)")
151
+ PUNCTUATIONS = [
152
+ ";",
153
+ r"/",
154
+ "[",
155
+ "]",
156
+ '"',
157
+ "{",
158
+ "}",
159
+ "(",
160
+ ")",
161
+ "=",
162
+ "+",
163
+ "\\",
164
+ "_",
165
+ "-",
166
+ ">",
167
+ "<",
168
+ "@",
169
+ "`",
170
+ ",",
171
+ "?",
172
+ "!",
173
+ ]
174
+
175
+ def __init__(self, *args, **kwargs):
176
+ pass
177
+
178
+ def word_tokenize(self, word):
179
+ word = word.lower()
180
+ word = word.replace(",", "").replace("?", "").replace("'s", " 's")
181
+ return word.strip()
182
+
183
+ def process_punctuation(self, in_text):
184
+ out_text = in_text
185
+ for p in self.PUNCTUATIONS:
186
+ if (p + " " in in_text or " " + p in in_text) or (re.search(self.COMMA_STRIP, in_text) is not None):
187
+ out_text = out_text.replace(p, "")
188
+ else:
189
+ out_text = out_text.replace(p, " ")
190
+ out_text = self.PERIOD_STRIP.sub("", out_text, re.UNICODE)
191
+ return out_text
192
+
193
+ def process_digit_article(self, in_text):
194
+ out_text = []
195
+ temp_text = in_text.lower().split()
196
+ for word in temp_text:
197
+ word = self.NUMBER_MAP.setdefault(word, word)
198
+ if word not in self.ARTICLES:
199
+ out_text.append(word)
200
+ else:
201
+ pass
202
+ for word_id, word in enumerate(out_text):
203
+ if word in self.CONTRACTIONS:
204
+ out_text[word_id] = self.CONTRACTIONS[word]
205
+ out_text = " ".join(out_text)
206
+ return out_text
207
+
208
+ def __call__(self, item):
209
+ item = self.word_tokenize(item)
210
+ item = item.replace("\n", " ").replace("\t", " ").strip()
211
+ item = self.process_punctuation(item)
212
+ item = self.process_digit_article(item)
213
+ return item
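For reference, the processor lowercases, strips punctuation and articles, maps number words to digits, and restores missing apostrophes in contractions, e.g.:

processor = EvalAIAnswerProcessor()
processor("Two, small dogs!")  # -> "2 small dogs"
processor("None of them")      # -> "0 of them"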
EAGLE/lmms_eval/tasks/cmmmu/_cmmmu.yaml ADDED
@@ -0,0 +1,4 @@
1
+ group: cmmmu
2
+ task:
3
+ - cmmmu_val
4
+ - cmmmu_test
EAGLE/lmms_eval/tasks/cmmmu/_default_template_cmmmu_yaml ADDED
@@ -0,0 +1,8 @@
1
+ dataset_path: lmms-lab/CMMMU
2
+ output_type: generate_until
3
+ doc_to_visual: !function utils.cmmmu_doc_to_visual
4
+ doc_to_text: !function utils.cmmmu_doc_to_text
5
+ doc_to_target: "answer"
6
+ generation_kwargs:
7
+ max_new_tokens: 16
8
+ image_aspect_ratio: original
EAGLE/lmms_eval/tasks/cmmmu/cmmmu_test.yaml ADDED
@@ -0,0 +1,12 @@
1
+ task: "cmmmu_test"
2
+ test_split: test
3
+ # The return value of process_results will be used by metrics
4
+ process_results: !function utils.cmmmu_process_test_results_for_submission
5
+ # Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
6
+ metric_list:
7
+ - metric: submission
8
+ aggregation: !function utils.cmmmu_test_aggregate_results_for_submission
9
+ higher_is_better: false
10
+ metadata:
11
+ - version: 0.0
12
+ include: _default_template_cmmmu_yaml
EAGLE/lmms_eval/tasks/cmmmu/cmmmu_val.yaml ADDED
@@ -0,0 +1,15 @@
1
+ task: "cmmmu_val"
2
+ test_split: val
3
+ # The return value of process_results will be used by metrics
4
+ process_results: !function utils.cmmmu_process_results
5
+ # Note that the metric name can be either a registered metric function (such as the case for GQA) or a key name returned by process_results
6
+ generation_kwargs:
7
+ max_new_tokens: 16
8
+ image_aspect_ratio: original
9
+ metric_list:
10
+ - metric: cmmmu_acc
11
+ aggregation: !function utils.cmmmu_aggregate_results
12
+ higher_is_better: true
13
+ metadata:
14
+ - version: 0.0
15
+ include: _default_template_cmmmu_yaml
EAGLE/lmms_eval/tasks/cmmmu/utils.py ADDED
@@ -0,0 +1,421 @@
1
+ from collections import defaultdict
2
+ import re
3
+ import random
4
+ import os
5
+ import json
6
+ import logging
7
+ from collections import Counter
8
+ from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
9
+
10
+ eval_logger = logging.getLogger("lmms-eval")
11
+
12
+ PROMPT = {
13
+ "task_instructions": [
14
+ "请回答以下多项选择题,并选出正确选项。这些题目可能包括单选和多选题型。如果所提供的信息不足以确定一个明确的答案,那么请根据可用的数据和你的判断来选择最可能正确的选项。",
15
+ "请回答以下判断题,并根据题目描述和所给的信息来判断问题中陈述的对错。如果信息不完整或不足以作出绝对判断,请运用你的逻辑推理和现有信息来做出最可能的判断。",
16
+ "请回答以下填空题,并根据题目的要求和所提供的信息来给出最恰当的答案。如果信息不足以确切回答,那么请依据现有的数据和你的推理能力来填写最合理的答案。",
17
+ ],
18
+ "multi_choice_example_format": ["问题:{}\n选项:\n{}\n正确答案:\n"],
19
+ "T/F_example_format": ["问题:{}\n正确答案:\n"],
20
+ "short_ans_example_format": ["问题:{}\n正确答案:\n"],
21
+ }
22
+
23
+
24
+ def construct_prompt(sample):
25
+ question = sample["question"]
26
+ task_instructions = PROMPT["task_instructions"]
27
+
28
+ if sample["type"] == "选择":
29
+ formatted_options = ""
30
+ start_chr = "A"
31
+ for i in range(1, 5):
32
+ formatted_options += f"({start_chr}) {sample[f'option{i}']}\n"
33
+ start_chr = chr(ord(start_chr) + 1)
34
+
35
+ current_example_template = PROMPT["multi_choice_example_format"][0]
36
+ current_example = current_example_template.format(question, formatted_options)
37
+ final_input_prompt = task_instructions[0] + "\n\n" + current_example
38
+
39
+ elif sample["type"] == "判断":
40
+ current_example_template = PROMPT["T/F_example_format"][0]
41
+ current_example = current_example_template.format(question)
42
+ final_input_prompt = task_instructions[1] + "\n\n" + current_example
43
+
44
+ else: # For fill in the blanks questions.
45
+ current_example_template = PROMPT["short_ans_example_format"][0]
46
+ current_example = current_example_template.format(question)
47
+ final_input_prompt = task_instructions[2] + "\n\n" + current_example
48
+
49
+ for i in range(1, 6):
50
+ final_input_prompt = final_input_prompt.replace(f'<img="{sample[f"image_{i}_filename"]}">', f"<图片 {i}>")
51
+
52
+ return final_input_prompt
53
+
54
+
55
+ def cmmmu_doc_to_text(doc):
56
+ return construct_prompt(doc)
57
+
58
+
59
+ def cmmmu_doc_to_visual(doc):
60
+ prompt = construct_prompt(doc)
61
+ image_tokens = re.findall(r"<图片 \d+>", prompt)
62
+ # Remove <>, replace spaces with _, and map 图片 -> image (e.g. "<图片 1>" -> "image_1")
63
+ image_tokens = [image_token.strip("<>").replace(" ", "_").replace("图片", "image") for image_token in image_tokens]
64
+ visual = [doc[image_token].convert("RGB") for image_token in image_tokens]
65
+ return visual
66
+
67
+
68
+ def cmmmu_process_results(doc, results):
69
+ pred = results[0]
70
+ if doc["type"] == "选择":
71
+ index2ans, all_choices = get_multi_choice_info([doc[f"option{i}"] for i in range(1, 5)])
72
+ parsed_pred = get_multi_choice_prediction(pred, all_choices, index2ans)
73
+ elif doc["type"] == "判断":
74
+ parsed_pred = get_TF_prediction(pred)
75
+ else:
76
+ parsed_pred = get_fill_blank_prediction(pred, doc["answer"])
77
+ return {"cmmmu_acc": {"id": doc["id"], "subdomain": doc["subcategory"], "question_type": doc["type"], "answer": doc["answer"], "parsed_pred": parsed_pred}}
78
+
79
+
80
+ def cmmmu_aggregate_results(results):
81
+ evaluation_result = {}
82
+ subset_to_eval_samples = defaultdict(list)
83
+ for result in results:
84
+ subset_to_eval_samples[result["subdomain"]].append(result)
85
+ for subset, sub_eval_samples in subset_to_eval_samples.items():
86
+ metric_dict = eval_cmmmu(sub_eval_samples)
87
+ evaluation_result[subset] = metric_dict
88
+
89
+ printable_results = {}
90
+ for domain, in_domain_cats in DOMAIN_CAT2SUB_CAT.items():
91
+ in_domain_cat_results = {}
92
+ for cat_name in in_domain_cats:
93
+ if cat_name in evaluation_result.keys():
94
+ in_domain_cat_results[cat_name] = evaluation_result[cat_name]
95
+ else:
96
+ pass
97
+ in_domain_ins_acc = calculate_ins_level_acc(in_domain_cat_results)
98
+ in_domain_data_num = sum([cat_results["entries_num"] for cat_results in in_domain_cat_results.values()])
99
+ printable_results["Overall-" + domain] = {
100
+ "num": int(in_domain_data_num),
101
+ "acc": round(in_domain_ins_acc, 3),
102
+ }
103
+ # add sub category
104
+ for cat_name, cat_results in in_domain_cat_results.items():
105
+ printable_results[cat_name] = {
106
+ "num": int(cat_results["entries_num"]),
107
+ "acc": round(cat_results["acc"], 3),
108
+ }
109
+ all_ins_acc = calculate_ins_level_acc(evaluation_result)
110
+ printable_results["Overall"] = {
111
+ "num": sum([cat_results["entries_num"] for cat_results in evaluation_result.values()]),
112
+ "acc": round(all_ins_acc, 3),
113
+ }
114
+ print(printable_results)
115
+ return printable_results["Overall"]["acc"]
116
+
117
+
118
+ def cmmmu_process_test_results_for_submission(doc, results):
119
+ response = results[0]
120
+ return {"submission": {"id": doc["id"], "type": doc["type"], "response": response}}
121
+
122
+
123
+ def cmmmu_test_aggregate_results_for_submission(results, args):
124
+ file = generate_submission_file("cmmmu_test_for_submission.jsonl", args)
125
+ with open(file, "w", encoding="utf8") as f:
126
+ for result in results:
127
+ json.dump(result, f, ensure_ascii=False)
128
+ f.write("\n")
129
+ eval_logger.info(f"Submission file saved to {file}")
130
+
131
+
132
+ ##################
133
+ # Helper functions
134
+ ##################
135
+
136
+ DOMAIN_CAT2SUB_CAT = {
137
+ "艺术与设计": ["艺术", "艺术理论", "设计", "音乐"],
138
+ "商业": ["会计", "经济", "金融", "管理", "营销"],
139
+ "科学": ["生物", "化学", "地理", "数学", "物理"],
140
+ "健康与医学": ["基础医学", "临床医学", "诊断学与实验室医学", "制药", "公共卫生"],
141
+ "人文社会科学": ["历史", "文献学", "社会学", "心理学"],
142
+ "技术与工程": ["农业", "建筑学", "计算机科学", "电子学", "能源和电力", "材料", "机械工程"],
143
+ }
144
+
145
+
146
+ def eval_cmmmu(entries):
147
+ correct_cnt = 0
148
+ for entry in entries:
149
+ parsed_pred = entry.get("parsed_pred", "")
150
+ correct = False
151
+ if entry.get("question_type") == "选择":
152
+ if parsed_pred == entry["answer"]:
153
+ correct_cnt += 1
154
+ correct = True
155
+
156
+ elif entry.get("question_type") == "填空":
157
+ norm_answers = normalize_str(entry["answer"], entry["answer"])
158
+
159
+ for pred in parsed_pred:
160
+ # already normalized
161
+ if isinstance(pred, str): # if it's a string, then find if ans in the pred_i
162
+ for norm_ans in norm_answers:
163
+ # only see if the string answer in the string pred
164
+ # print(norm_ans, pred)
165
+ if isinstance(norm_ans, str) and norm_ans in pred:
166
+ if not correct:
167
+ correct_cnt += 1
168
+ correct = True
169
+ break
170
+ else: # it's a number
171
+ if pred in norm_answers:
172
+ if not correct:
173
+ correct_cnt += 1
174
+ correct = True
175
+ break
176
+
177
+ else:
178
+ positive_keywords = ["正确", "对", "准确", "肯定", "对的"]
179
+ negative_keywords = ["不对", "错误", "不正确", "不准确", "不合适", "否定", "错的", "错"]
180
+ ambiguous_keywords = ["对错", "是否正确", "否正确", "或者", "是否", "正确性", "对不"]
181
+
182
+ def judge_similarity(pred_list, positive_keywords, negative_keywords):
183
+ positive_count = 0
184
+ negative_count = 0
185
+
186
+ for pred in pred_list:
187
+ if any(pos_word in pred for pos_word in positive_keywords):
188
+ positive_count += 1
189
+ elif any(neg_word in pred for neg_word in negative_keywords):
190
+ negative_count += 1
191
+
192
+ if positive_count > negative_count:
193
+ return "对"
194
+ elif negative_count > positive_count:
195
+ return "错"
196
+ else:
197
+ return random.choice(["对", "错"])
198
+
199
+ answer = entry["answer"]
200
+ parsed_pred = [word for word in parsed_pred if not any(ambiguous in word for ambiguous in ambiguous_keywords)]
201
+ result = judge_similarity(parsed_pred, positive_keywords, negative_keywords)
202
+ if result == answer:
203
+ correct_cnt += 1
204
+ correct = True
205
+ if correct:
206
+ entry["judge"] = "正确"
207
+ else:
208
+ entry["judge"] = "错误"
209
+
210
+ if len(entries) == 0:
211
+ print("entries_num == 0, please check your file")
212
+ results_count = {"correct_num": 0, "entries_num": 0, "acc": 0}
213
+ else:
214
+ results_count = {"correct_num": correct_cnt, "entries_num": len(entries), "acc": correct_cnt / len(entries)}
215
+
216
+ return results_count
217
+
218
+
219
+ def get_multi_choice_prediction(response, all_choices, index2ans):
220
+ for char in [",", ".", "!", "?", ";", ":", "'"]:
221
+ response = response.strip(char)
222
+ response = " " + response + " " # add space to avoid partial match
223
+
224
+ candidates = []
225
+
226
+ for choice in all_choices: # (A) (B) (C) (D)
227
+ # Add the choice to candidates each time it appears in the response
228
+ candidates.extend([choice for _ in range(response.count(f"({choice})"))])
229
+
230
+ if len(candidates) == 0:
231
+ for choice in all_choices: # A B C D
232
+ # Similarly, add the choice for each occurrence
233
+ candidates.extend([choice for _ in range(response.count(f"{choice}"))])
234
+
235
+ if len(candidates) == 0 and len(response.split()) >= 1:
236
+ for index, ans in index2ans.items():
237
+ # Add index for each occurrence of ans in response
238
+ candidates.extend([index for _ in range(response.count(ans))])
239
+
240
+ # if none of the above yields candidates, fall back to checking whether any option's full text appears in the response
241
+ if len(candidates) == 0 and len(response.split()) >= 1:
242
+ for index, ans in index2ans.items():
243
+ if ans in response:
244
+ candidates.append(index)
245
+ index_ans = False # it's content ans.
246
+
247
+ if len(candidates) == 0: # still not get answer, randomly choose one.
248
+ return random.choice(all_choices)
249
+ # return ''
250
+ else:
251
+ # Count the occurrence of each candidate
252
+ candidate_counts = Counter(candidates)
253
+
254
+ # Select the most frequent candidates
255
+ max_count = max(candidate_counts.values())
256
+ most_frequent_candidates = [c for c in all_choices if candidate_counts.get(c, 0) == max_count]
257
+
258
+ # Combine the most frequent candidates in ABCD order
259
+ return "".join(most_frequent_candidates)
260
+
261
+
262
+ def extract_numbers(string):
263
+ # Pattern for numbers with Chinese commas
264
+ pattern_commas = r"-?\d{1,3}(?:,\d{3})+"
265
+ # Pattern for scientific notation
266
+ pattern_scientific = r"-?\d+(?:\.\d+)?[eE][+-]?\d+"
267
+ # Pattern for simple numbers without Chinese commas
268
+ pattern_simple = r"-?(?:\d+\.\d+|\.\d+|\d+)(?![eE][+-]?\d+)(?!,\d)"
269
+
270
+ # Extract numbers with Chinese commas
271
+ numbers_with_commas = re.findall(pattern_commas, string)
272
+ # Extract numbers in scientific notation
273
+ numbers_scientific = re.findall(pattern_scientific, string)
274
+ # Extract simple numbers without Chinese commas
275
+ numbers_simple = re.findall(pattern_simple, string)
276
+
277
+ # Combine all extracted numbers
278
+ all_numbers = numbers_with_commas + numbers_scientific + numbers_simple
279
+ return all_numbers
280
+
281
+
282
+ def check_is_number(string):
283
+ try:
284
+ float(string.replace(",", ""))
285
+ return True
286
+ except ValueError:
287
+ # check if there's comma inside
288
+ return False
289
+
290
+
291
+ def count_letters(string):
292
+ return sum(c.isalpha() and "a" <= c <= "z" or "A" <= c <= "Z" for c in string)
293
+
294
+
295
+ def normalize_str(string, answer):
296
+ # check if characters in the string
297
+
298
+ # if number, numerize it.
299
+ if string is None:
300
+ return [string]
301
+ string = string.strip()
302
+
303
+ is_number = check_is_number(string)
304
+
305
+ if is_number:
306
+ string = string.replace(",", "")
307
+ string = float(string)
308
+ # leave 2 decimal
309
+ string = round(string, 2)
310
+ return [string]
311
+ else: # it's likely to be a string
312
+ if len(string) > len(answer) + 20 or count_letters(string) > count_letters(answer) + 2:
313
+ return []
314
+ return [string]
315
+
316
+
317
+ def get_fill_blank_prediction(response, answer):
318
+ """get the prediction from the generated response,
319
+ return a list of predicted strings or numbers"""
320
+
321
+ def get_key_subresponses(response):
322
+ key_responses = []
323
+ response = response.strip("。").strip()
324
+ sub_responses = re.split(r"。|\n", response)
325
+ indicators_of_keys = ["是", "为", "所以", "等于", "方案", "选择", "正确答案", "因此", "最后", "答案", "结果"]
326
+ key_responses = []
327
+ for index, resp in enumerate(sub_responses):
328
+ # if last one, accept it's an equation (the entire response can be just one sentence with equation)
329
+ if index == len(sub_responses) - 1:
330
+ indicators_of_keys.extend(["="])
331
+ shortest_key_response = None # the shortest response that may contain the answer (tail part of the response)
332
+ for indicator in indicators_of_keys:
333
+ if indicator in resp:
334
+ if not shortest_key_response:
335
+ shortest_key_response = resp.split(indicator)[-1].strip()
336
+ else:
337
+ if len(resp.split(indicator)[-1].strip()) < len(shortest_key_response):
338
+ shortest_key_response = resp.split(indicator)[-1].strip()
339
+
340
+ if shortest_key_response:
341
+ # and it's not trivial
342
+ if shortest_key_response.strip() not in [":", ",", ".", "!", "?", ";", ":", "'"]:
343
+ key_responses.append(shortest_key_response)
344
+ if len(key_responses) == 0: # did not find any
345
+ return [response]
346
+ return key_responses
347
+
348
+ key_responses = get_key_subresponses(response)
349
+
350
+ pred_list = key_responses.copy() # keep the original string response
351
+ for resp in key_responses:
352
+ pred_list.extend(extract_numbers(resp))
353
+
354
+ tmp_pred_list = []
355
+ for i in range(len(pred_list)):
356
+ tmp_pred_list.extend(normalize_str(pred_list[i], answer))
357
+ pred_list = tmp_pred_list
358
+
359
+ # remove duplicates
360
+ pred_list = list(set(pred_list))
361
+
362
+ return pred_list
363
+
364
+
365
+ def get_TF_prediction(response):
366
+ """get the prediction from the generated response,
367
+ return a list of predicted strings or numbers"""
368
+
369
+ def get_key_subresponses(response):
370
+ key_responses = []
371
+ response = response.strip("。").strip()
372
+ sub_responses = re.split(r"。|\n", response)
373
+ indicators_of_keys = ["是", "为", "所以", "判断", "陈述", "说法", "表达", "答案", "结果"]
374
+ key_responses = []
375
+ for index, resp in enumerate(sub_responses):
376
+ shortest_key_response = None # the shortest response that may contain the answer (tail part of the response)
377
+ for indicator in indicators_of_keys:
378
+ if indicator in resp:
379
+ if not shortest_key_response:
380
+ shortest_key_response = resp.split(indicator)[-1].strip()
381
+ else:
382
+ if len(resp.split(indicator)[-1].strip()) < len(shortest_key_response):
383
+ shortest_key_response = resp.split(indicator)[-1].strip()
384
+
385
+ if shortest_key_response:
386
+ # and it's not trivial
387
+ if shortest_key_response.strip() not in [":", ",", ".", "!", "?", ";", ":", "'"]:
388
+ key_responses.append(shortest_key_response)
389
+ if len(key_responses) == 0: # did not find any
390
+ return [response]
391
+ return key_responses
392
+
393
+ key_responses = get_key_subresponses(response)
394
+
395
+ pred_list = key_responses.copy() # keep the original string response
396
+ # remove duplicates
397
+ pred_list = list(set(pred_list))
398
+
399
+ return pred_list
400
+
401
+
402
+ def get_multi_choice_info(options):
403
+ start_chr = "A"
404
+ all_choices = []
405
+ index2ans = {}
406
+ for i, option in enumerate(options):
407
+ index2ans[chr(ord(start_chr) + i)] = option
408
+ all_choices.append(chr(ord(start_chr) + i))
409
+
410
+ return index2ans, all_choices
411
+
412
+
413
+ def calculate_ins_level_acc(results):
414
+ correct_sum = 0
415
+ entries_sum = 0
416
+ for cat_results in results.values():
417
+ correct_sum += cat_results["correct_num"]
418
+ entries_sum += cat_results["entries_num"]
419
+ if entries_sum == 0:
420
+ return 0
421
+ return correct_sum / entries_sum
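As an illustration of the multiple-choice parsing path (the option strings here are made up):

index2ans, all_choices = get_multi_choice_info(["牛顿", "爱因斯坦", "麦克斯韦", "玻尔"])
# all_choices == ["A", "B", "C", "D"]
get_multi_choice_prediction("正确答案是(B)。", all_choices, index2ans)  # -> "B"
# if no option letter or option text is found, a random choice is returned as a fallback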
EAGLE/lmms_eval/tasks/gqa/gqa.yaml ADDED
@@ -0,0 +1,32 @@
1
+ dataset_path: lmms-lab/GQA
2
+ dataset_name: testdev_balanced_instructions
3
+ dataset_kwargs:
4
+ token: True
5
+ task: "gqa"
6
+ test_split: testdev
7
+ output_type: generate_until
8
+ doc_to_visual: !function utils.gqa_doc_to_visual
9
+ doc_to_text: !function utils.gqa_doc_to_text
10
+ doc_to_target: "answer"
11
+ generation_kwargs:
12
+ max_new_tokens: 16
13
+ temperature: 0
14
+ top_p: 0
15
+ num_beams: 1
16
+ do_sample: false
17
+ metric_list:
18
+ - metric: exact_match
19
+ aggregation: mean
20
+ higher_is_better: true
21
+ ignore_case: true
22
+ ignore_punctuation: true
23
+ metadata:
24
+ - version: 0.0
25
+
26
+ model_specific_prompt_kwargs:
27
+ default:
28
+ pre_prompt: ""
29
+ post_prompt: "\nAnswer the question using a single word or phrase."
30
+ qwen_vl:
31
+ pre_prompt: ""
32
+ post_prompt: " Answer:"
EAGLE/lmms_eval/tasks/gqa/utils.py ADDED
@@ -0,0 +1,23 @@
1
+ from datasets import load_dataset
2
+
3
+ GQA_RAW_IMAGE_DATASET = None
4
+ GQA_ID2IMAGE = None
5
+
6
+
7
+ def gqa_doc_to_visual(doc):
8
+ global GQA_RAW_IMAGE_DATASET
9
+ global GQA_ID2IMAGE
10
+ if GQA_RAW_IMAGE_DATASET is None:
11
+ GQA_RAW_IMAGE_DATASET = load_dataset("lmms-lab/GQA", "testdev_balanced_images", split="testdev", token=True)
12
+ GQA_ID2IMAGE = {}
13
+ for row in GQA_RAW_IMAGE_DATASET:
14
+ GQA_ID2IMAGE[row["id"]] = row["image"].convert("RGB")
15
+ image = GQA_ID2IMAGE[doc["imageId"]]
16
+ return [image]
17
+
18
+
19
+ def gqa_doc_to_text(doc, model_specific_prompt_kwargs):
20
+ question = doc["question"]
21
+ pre_prompt = model_specific_prompt_kwargs["pre_prompt"]
22
+ post_prompt = model_specific_prompt_kwargs["post_prompt"]
23
+ return f"{pre_prompt}{question}{post_prompt}"
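With the `default` prompt kwargs from `gqa.yaml`, `gqa_doc_to_text` simply wraps the question, e.g.:

doc = {"question": "What color is the bus?"}
kwargs = {"pre_prompt": "", "post_prompt": "\nAnswer the question using a single word or phrase."}
gqa_doc_to_text(doc, kwargs)
# -> "What color is the bus?\nAnswer the question using a single word or phrase."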
EAGLE/lmms_eval/tasks/llava-in-the-wild/llava-in-the-wild.yaml ADDED
@@ -0,0 +1,39 @@
1
+ dataset_path: lmms-lab/llava-bench-in-the-wild
2
+ dataset_kwargs:
3
+ token: True
4
+ task: "llava_in_the_wild"
5
+ test_split: train
6
+ output_type: generate_until
7
+ doc_to_visual: !function utils.llava_doc_to_visual
8
+ doc_to_text: !function utils.llava_doc_to_text
9
+ doc_to_target: "gpt_answer"
10
+ generation_kwargs:
11
+ until:
12
+ - "ASSISTANT:"
13
+ image_aspect_ratio: original
14
+ max_new_tokens: 1024
15
+ temperature: 0
16
+ top_p: 0
17
+ num_beams: 1
18
+ do_sample: false
19
+ process_results: !function utils.llava_process_results
20
+ metric_list:
21
+ - metric: gpt_eval_llava_all
22
+ aggregation: !function utils.llava_all_aggregation
23
+ higher_is_better: true
24
+ - metric: gpt_eval_llava_conv
25
+ aggregation: !function utils.llava_conv_aggregation
26
+ higher_is_better: true
27
+ - metric: gpt_eval_llava_detail
28
+ aggregation: !function utils.llava_detail_aggregation
29
+ higher_is_better: true
30
+ - metric: gpt_eval_llava_complex
31
+ aggregation: !function utils.llava_complex_aggregation
32
+ higher_is_better: true
33
+ metadata:
34
+ version: 0.0
35
+ gpt_eval_model_name: "gpt-4-0613"
36
+ model_specific_prompt_kwargs:
37
+ default:
38
+ pre_prompt: ""
39
+ post_prompt: ""
EAGLE/lmms_eval/tasks/llava-in-the-wild/rule.json ADDED
@@ -0,0 +1,11 @@
1
+ {
2
+ "coding": {"role": "Assistant", "prompt": "Your task is to evaluate the coding abilities of the above two assistants. They have been asked to implement a program to solve a given problem. Please review their code submissions, paying close attention to their problem-solving approach, code structure, readability, and the inclusion of helpful comments.\n\nPlease ensure that the assistants' submissions:\n\n1. Correctly implement the given problem statement.\n2. Contain accurate and efficient code.\n3. Include clear and concise comments that explain the code's logic and functionality.\n4. Adhere to proper coding standards and best practices.\n\nOnce you have carefully reviewed both submissions, provide detailed feedback on their strengths and weaknesses, along with any suggestions for improvement. You should first output a single line containing two scores on the scale of 1-10 (1: no code/no sense; 10: perfect) for Assistant 1 and 2, respectively. Then give extra comments starting from the next line."},
3
+ "math": {"role": "Assistant", "prompt": "We would like to request your feedback on the mathematical proficiency of two AI assistants regarding the given user question.\nFirstly, please solve the problem independently, without referring to the answers provided by Assistant 1 and Assistant 2.\nAfterward, please examine the problem-solving process of Assistant 1 and Assistant 2 step-by-step to ensure their correctness, identifying any incorrect steps if present. Your evaluation should take into account not only the answer but also the problem-solving steps.\nFinally, please output a Python tuple containing two numerical scores for Assistant 1 and Assistant 2, ranging from 1 to 10, respectively. If applicable, explain the reasons for any variations in their scores and determine which assistant performed better."},
4
+ "default": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above.\nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."},
5
+ "conv": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with five descriptive sentences describing the same image and the bounding box coordinates of each object in the scene. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."},
6
+ "detail": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with five descriptive sentences describing the same image and the bounding box coordinates of each object in the scene. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."},
7
+ "complex": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with five descriptive sentences describing the same image and the bounding box coordinates of each object in the scene. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."},
8
+ "llava_bench_conv": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with a few sentences describing the image. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."},
9
+ "llava_bench_detail": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with a few sentences describing the image. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."},
10
+ "llava_bench_complex": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with a few sentences describing the image. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."}
11
+ }
EAGLE/lmms_eval/tasks/llava-in-the-wild/utils.py ADDED
@@ -0,0 +1,197 @@
1
+ import json
2
+ import logging
3
+ import os
4
+ import requests
5
+ import numpy as np
6
+ import openai
7
+ from openai import OpenAI
8
+ import time
9
+ import yaml
10
+ from pathlib import Path
11
+ from copy import deepcopy
12
+
13
+ eval_logger = logging.getLogger("lmms-eval")
14
+ NUM_SECONDS_TO_SLEEP = 5
15
+
16
+ LLAVA_W_METRICS = ["gpt_eval_llava_conv", "gpt_eval_llava_detail", "gpt_eval_llava_complex"]
17
+
18
+ rule_dict = json.load(open(os.path.join(os.path.dirname(os.path.abspath(__file__)), "rule.json"), "r"))
19
+
20
+ with open(Path(__file__).parent / "llava-in-the-wild.yaml", "r") as f:
21
+ raw_data = f.readlines()
22
+ safe_data = []
23
+ for i, line in enumerate(raw_data):
24
+ # remove function definition since yaml load cannot handle it
25
+ if "!function" not in line:
26
+ safe_data.append(line)
27
+
28
+ config = yaml.safe_load("".join(safe_data))
29
+
30
+ GPT_EVAL_MODEL_NAME = config["metadata"]["gpt_eval_model_name"]
31
+
32
+ API_TYPE = os.getenv("API_TYPE", "openai")
33
+
34
+ if API_TYPE == "openai":
35
+ API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
36
+ API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
37
+ headers = {
38
+ "Authorization": f"Bearer {API_KEY}",
39
+ "Content-Type": "application/json",
40
+ }
41
+ elif API_TYPE == "azure":
42
+ API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
43
+ API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
44
+ headers = {
45
+ "api-key": API_KEY,
46
+ "Content-Type": "application/json",
47
+ }
48
+
49
+
50
+ def get_eval(content: str, max_tokens: int, retries: int = 5):
51
+ global headers
52
+
53
+ messages = [
54
+ {
55
+ "role": "system",
56
+ "content": "You are a helpful and precise assistant for checking the quality of the answer.",
57
+ },
58
+ {"role": "user", "content": content},
59
+ ]
60
+
61
+ payload = {
62
+ "model": GPT_EVAL_MODEL_NAME,
63
+ "messages": messages,
64
+ "temperature": 0.2,
65
+ "max_tokens": max_tokens,
66
+ }
67
+
68
+ for attempt in range(retries):
69
+ try:
70
+ response = requests.post(API_URL, headers=headers, json=payload, timeout=60)
71
+ response.raise_for_status()
72
+ response_data = response.json()
73
+
74
+ content = response_data["choices"][0]["message"]["content"].strip()
75
+ if content != "":
76
+ return content, response_data["model"]
77
+ break  # response was empty; stop retrying and fall through to the final return
78
+
79
+ except Exception as e:
80
+ eval_logger.info(f"Attempt {attempt + 1} failed with error: {e}")
81
+ if attempt < retries - 1:  # if retries remain, sleep and try again; otherwise log and return below
82
+ time.sleep(NUM_SECONDS_TO_SLEEP)
83
+ else: # If this was the last attempt, log and return empty
84
+ eval_logger.error(f"All {retries} attempts failed. Last error message: {e}")
85
+ return "", ""
86
+ return "", ""
87
+
88
+
89
+ def parse_score(review):
90
+ try:
91
+ score_pair = review.split("\n")[0]
92
+ score_pair = score_pair.replace(",", " ")
93
+ sp = score_pair.split()  # split on any whitespace so "7, 8" and "7 8" both yield two scores
94
+ if len(sp) == 2:
95
+ return [float(sp[0]), float(sp[1])]
96
+ else:
97
+ eval_logger.debug(f"Can not split: {review}. Returning [-1, -1]")
98
+ return [-1, -1]
99
+ except Exception as e:
100
+ eval_logger.debug(f"Error: {e}. Returning [-1, -1]")
101
+ return [-1, -1]
102
+
103
+
104
+ def llava_doc_to_visual(doc):
105
+ return [doc["image"].convert("RGB")]
106
+
107
+
108
+ def llava_doc_to_text(doc, model_specific_prompt_kwargs=None):
109
+ if model_specific_prompt_kwargs is None:
110
+ model_specific_prompt_kwargs = {}
111
+ pre_prompt = model_specific_prompt_kwargs.get("pre_prompt", "")
112
+ post_prompt = model_specific_prompt_kwargs.get("post_prompt", "")
113
+ return f"{pre_prompt}{doc['question']}{post_prompt}"
114
+
115
+
116
+ def llava_process_results(doc, result):
117
+ """
118
+ Args:
119
+ doc: a instance of the eval dataset
120
+ results: [pred]
121
+ Returns:
122
+ a dictionary with key: metric name (in this case coco_bleu), value: metric value
123
+ """
124
+ try:
125
+ question = doc.get("question", "")
126
+ ans1 = doc.get("gpt_answer", "")
127
+ ans2 = result[0] if result else ""
128
+ captions = doc.get("caption", [])
129
+ context = "\n".join(captions) if isinstance(captions, list) else captions
130
+ category = "llava_bench_" + doc.get("category", "")
131
+ rule = rule_dict.get(category, {})
132
+ prompt = rule.get("prompt", "")
133
+ role = rule.get("role", "user")
134
+ content = f"[Context]\n{context}\n\n" f"[Question]\n{question}\n\n" f"[{role} 1]\n{ans1}\n\n[End of {role} 1]\n\n" f"[{role} 2]\n{ans2}\n\n[End of {role} 2]\n\n" f"[System]\n{prompt}\n\n"
135
+
136
+ review, model_name = get_eval(content, 1024)
137
+ scores = parse_score(review)
138
+ except Exception as e:
139
+ eval_logger.error(f"Error for Question ID: {doc.get('question_id', 'Unknown')}: {e}")
140
+ review = "Failed to Get a Proper Review."
141
+ model_name = "Failed Request"
142
+ scores = [-1, -1]
143
+
144
+ metric = f"gpt_eval_llava_{doc.get('category', 'all')}"
145
+ category_review_dict = {"question": question, "ans1": ans1, "ans2": ans2, "context": context, "category": category, "review": review, "scores": scores, "eval_model": model_name, "content": content}
146
+
147
+ non_category_review_dict = deepcopy(category_review_dict)
148
+ non_category_review_dict["scores"] = [-999, -999]
149
+
150
+ data_dict = {}
151
+ for m in LLAVA_W_METRICS:
152
+ if m == metric:
153
+ data_dict[m] = category_review_dict
154
+ else:
155
+ data_dict[m] = non_category_review_dict
156
+ data_dict["gpt_eval_llava_all"] = category_review_dict
157
+
158
+ # return {"gpt_eval_llava_all": review_dict}
159
+ return data_dict
160
+
161
+
162
+ def llava_conv_aggregation(results):
163
+ return llava_aggregation(results, "conv")
164
+
165
+
166
+ def llava_complex_aggregation(results):
167
+ return llava_aggregation(results, "complex")
168
+
169
+
170
+ def llava_detail_aggregation(results):
171
+ return llava_aggregation(results, "detail")
172
+
173
+
174
+ def llava_all_aggregation(results):
175
+ return llava_aggregation(results, "all")
176
+
177
+
178
+ def llava_aggregation(results, category):
179
+ try:
180
+ scores = []
181
+ for result in results:
182
+ if -999 in result["scores"]:
183
+ continue
184
+ scores.append(result["scores"])
185
+
186
+ stats = np.asarray(scores).mean(0).tolist()
187
+ stats = [round(x, 3) for x in stats]
188
+ # gpt4_score_percentage = stats[0] * 10
189
+ # model_score_percentage = stats[1] * 10
190
+ # eval_logger.info(f"Category: {category}")
191
+ # eval_logger.info(f"GPT4 Score: {gpt4_score_percentage:.1f}%")
192
+ # eval_logger.info(f"Model Score: {model_score_percentage:.1f}%")
193
+ # eval_logger.info("=========================")
194
+ return round(stats[1] / stats[0] * 100, 1)
195
+ except Exception as e:
196
+ eval_logger.info(f"Error in llava_aggregation: {e}, and in category: {category}")
197
+ return None
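Note: the value returned by llava_aggregation above is the model's mean review score relative to the GPT-4 reference answer's mean score, expressed as a percentage. A minimal standalone sketch with made-up score pairs (illustrative only, not part of the repository):

import numpy as np

def relative_score(score_pairs):
    # each pair is [reference_score, model_score], as produced by parse_score
    stats = np.asarray(score_pairs).mean(0)
    return round(stats[1] / stats[0] * 100, 1)

# three hypothetical reviews scored (reference, model)
print(relative_score([[9.0, 7.0], [8.0, 8.0], [10.0, 6.0]]))  # -> 77.8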
EAGLE/lmms_eval/tasks/mmbench/_default_template_mmbench_cn_yaml ADDED
@@ -0,0 +1,22 @@
1
+ dataset_path: lmms-lab/MMBench
2
+ dataset_kwargs:
3
+ token: True
4
+ doc_to_target: "answer"
5
+ dataset_name: "cn"
6
+ output_type: generate_until
7
+ doc_to_visual: !function cn_utils.mmbench_doc_to_visual
8
+ doc_to_text: !function cn_utils.mmbench_doc_to_text
9
+ generation_kwargs:
10
+ max_new_tokens: 256
11
+ temperature: 0
12
+ top_p: 0
13
+ num_beams: 1
14
+ do_sample: false
15
+ process_results: !function cn_utils.mmbench_process_results
16
+ model_specific_prompt_kwargs:
17
+ default:
18
+ pre_prompt: ""
19
+ post_prompt: "\n请直接使用所提供的选项字母作为答案回答。"
20
+ model_specific_generation_kwargs:
21
+ llava:
22
+ image_aspect_ratio: original
EAGLE/lmms_eval/tasks/mmbench/_default_template_mmbench_en_yaml ADDED
@@ -0,0 +1,25 @@
1
+ dataset_path: lmms-lab/MMBench
2
+ dataset_kwargs:
3
+ token: True
4
+ doc_to_target: "answer"
5
+ model_specific_prompt_kwargs:
6
+ default:
7
+ pre_prompt: ""
8
+ post_prompt: "\nAnswer with the option's letter from the given choices directly."
9
+ doc_to_visual: !function en_utils.mmbench_doc_to_visual
10
+ doc_to_text: !function en_utils.mmbench_doc_to_text
11
+ doc_to_target: "answer"
12
+ process_results: !function en_utils.mmbench_process_results
13
+ model_specific_generation_kwargs:
14
+ llava:
15
+ image_aspect_ratio: original
16
+ output_type: generate_until
17
+ dataset_name: "en"
18
+ generation_kwargs:
19
+ until:
20
+ - "ASSISTANT:"
21
+ max_new_tokens: 1024
22
+ temperature: 0
23
+ top_p: 0
24
+ num_beams: 1
25
+ do_sample: false
EAGLE/lmms_eval/tasks/mmbench/cc_utils.py ADDED
@@ -0,0 +1,109 @@
1
+ import logging
2
+ import yaml
3
+ import os
4
+ from pathlib import Path
5
+ import pandas as pd
6
+ import json
7
+
8
+ eval_logger = logging.getLogger("lmms-eval")
9
+ from lmms_eval.tasks.mmbench.mmbench_evals import MMBench_Evaluator
10
+ from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
11
+
12
+ with open(Path(__file__).parent / "mmbench.yaml", "r") as f:
13
+ raw_data = f.readlines()
14
+ safe_data = []
15
+ for i, line in enumerate(raw_data):
16
+ # remove function definition since yaml load cannot handle it
17
+ if "!function" not in line:
18
+ safe_data.append(line)
19
+
20
+ config = yaml.safe_load("".join(safe_data))
21
+
22
+ GPT_EVAL_MODEL_NAME = config["metadata"]["gpt_eval_model_name"]
23
+ API_TYPE = os.getenv("API_TYPE", "openai")
24
+
25
+ if API_TYPE == "openai":
26
+ API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
27
+ API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
28
+ elif API_TYPE == "azure":
29
+ API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
30
+ API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
31
+
32
+
33
+ mmbench_evaluator = MMBench_Evaluator(sys_prompt=config["metadata"]["sys_prompt"], API_KEY=API_KEY, API_URL=API_URL, model_version=GPT_EVAL_MODEL_NAME)
34
+
35
+
36
+ def mmbench_doc_to_visual(doc):
37
+ return [doc["image"].convert("RGB")]
38
+
39
+
40
+ def mmbench_cn_cc_doc_to_text(doc, model_specific_prompt_kwargs=None):
41
+ option_candidate = ["A", "B", "C", "D", "E"]
42
+ options_prompt, options_dict = mmbench_evaluator.create_options_prompt(doc, option_candidate)
43
+
44
+ data = {
45
+ # "img": doc["image"],
46
+ "question": doc["question"],
47
+ "answer": doc.get("answer", None),
48
+ "options": options_prompt,
49
+ "category": doc["category"],
50
+ "options_dict": options_dict,
51
+ "index": doc["index"],
52
+ "source": doc["source"],
53
+ }
54
+
55
+ query_prompt = f"{data['question']} {data['options']}"
56
+
57
+ if model_specific_prompt_kwargs:
58
+ query_prompt = f"{query_prompt}\n{model_specific_prompt_kwargs['post_prompt']}"
59
+
60
+ return query_prompt
61
+
62
+
63
+ def mmbench_cn_cc_process_results(doc, results):
64
+ model_response = results[0].strip()
65
+ data = {
66
+ "gpt_eval_score": {
67
+ "index": doc["index"],
68
+ "question": doc["question"],
69
+ "answer": doc["answer"],
70
+ "prediction": model_response,
71
+ "source": doc["source"],
72
+ "category": doc["category"],
73
+ },
74
+ "submission": {
75
+ "index": doc["index"],
76
+ "question": doc["question"],
77
+ "answer": doc["answer"],
78
+ "prediction": model_response,
79
+ "source": doc["source"],
80
+ "category": doc["category"],
81
+ },
82
+ }
83
+ option_candidate = ["A", "B", "C", "D", "E"]
84
+ for c in option_candidate:
85
+ data["submission"][c] = doc.get(c, "nan")
86
+ data["gpt_eval_score"][c] = doc.get(c, "nan")
87
+ return data
88
+
89
+
90
+ def mmbench_cn_cc_aggregate_dev_results_eval(results, args):
91
+ print(f"============= MMBench-CN(CC) Detailed Results =============")
92
+ overall_acc, category_acc, l2_category_acc = mmbench_evaluator.eval_result(results, eval_method="openai")
93
+ file = generate_submission_file("mmbench_cn_cc_results.json", args)
94
+ details_info = {
95
+ "overall_acc": overall_acc,
96
+ "category_acc": category_acc,
97
+ "l2_category_acc": l2_category_acc,
98
+ }
99
+ with open(file, "w") as f:
100
+ json.dump(details_info, f)
101
+ return overall_acc * 100
102
+
103
+
104
+ def mmbench_cn_cc_aggregate_results(results, args):
105
+ df = pd.DataFrame(results)
106
+ file = generate_submission_file("mmbench_cn_cc_results.xlsx", args)
107
+ with pd.ExcelWriter(file) as writer:
108
+ df.to_excel(writer, index=False)
109
+ eval_logger.info(f"Saved results to {file}")
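Note: each element handed to the two aggregation functions above is the per-document dict built by mmbench_cn_cc_process_results; pandas then flattens the list of "submission" dicts into the spreadsheet. A small sketch with dummy rows (illustrative only; the real aggregator writes .xlsx through pd.ExcelWriter):

import pandas as pd

rows = [
    {"index": 0, "question": "示例问题一", "answer": "A", "prediction": "A", "source": "cc", "category": "example", "A": "选项一", "B": "选项二"},
    {"index": 1, "question": "示例问题二", "answer": "B", "prediction": "A", "source": "cc", "category": "example", "A": "选项一", "B": "选项二"},
]
df = pd.DataFrame(rows)
print(df)  # generate_submission_file + pd.ExcelWriter save this frame as mmbench_cn_cc_results.xlsx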
EAGLE/lmms_eval/tasks/mmbench/cn_utils.py ADDED
@@ -0,0 +1,127 @@
1
+ import logging
2
+ import yaml
3
+ import os
4
+ from pathlib import Path
5
+ import pandas as pd
6
+ import json
7
+ from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
8
+
9
+ eval_logger = logging.getLogger("lmms-eval")
10
+ from lmms_eval.tasks.mmbench.mmbench_evals import MMBench_Evaluator
11
+ from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
12
+
13
+ with open(Path(__file__).parent / "mmbench.yaml", "r") as f:
14
+ raw_data = f.readlines()
15
+ safe_data = []
16
+ for i, line in enumerate(raw_data):
17
+ # remove function definition since yaml load cannot handle it
18
+ if "!function" not in line:
19
+ safe_data.append(line)
20
+
21
+ config = yaml.safe_load("".join(safe_data))
22
+
23
+ GPT_EVAL_MODEL_NAME = config["metadata"]["gpt_eval_model_name"]
24
+ API_TYPE = os.getenv("API_TYPE", "openai")
25
+
26
+ if API_TYPE == "openai":
27
+ API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
28
+ API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
29
+ elif API_TYPE == "azure":
30
+ API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
31
+ API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
32
+
33
+
34
+ mmbench_evaluator = MMBench_Evaluator(sys_prompt=config["metadata"]["sys_prompt"], API_KEY=API_KEY, API_URL=API_URL, model_version=GPT_EVAL_MODEL_NAME)
35
+
36
+
37
+ def mmbench_doc_to_visual(doc):
38
+ return [doc["image"].convert("RGB")]
39
+
40
+
41
+ def mmbench_doc_to_text(doc, model_specific_prompt_kwargs=None):
42
+ option_candidate = ["A", "B", "C", "D", "E"]
43
+ options_prompt, options_dict = mmbench_evaluator.create_options_prompt(doc, option_candidate)
44
+
45
+ data = {
46
+ # "img": doc["image"],
47
+ "question": doc["question"],
48
+ "answer": doc.get("answer", None),
49
+ "options": options_prompt,
50
+ "category": doc["category"],
51
+ "L2-category": doc["L2-category"],
52
+ "options_dict": options_dict,
53
+ "index": doc["index"],
54
+ "hint": doc["hint"],
55
+ "source": doc["source"],
56
+ "split": doc["split"],
57
+ }
58
+
59
+ query_prompt = f"{data['hint']} {data['question']} {data['options']}" if pd.notna(data["hint"]) else f"{data['question']} {data['options']}"
60
+
61
+ if model_specific_prompt_kwargs:
62
+ query_prompt = f"{query_prompt}\n{model_specific_prompt_kwargs['post_prompt']}"
63
+
64
+ return query_prompt
65
+
66
+
67
+ def mmbench_process_results(doc, results):
68
+ model_response = results[0].strip()
69
+ data = {
70
+ "gpt_eval_score": {
71
+ "index": doc["index"],
72
+ "question": doc["question"],
73
+ "answer": doc["answer"],
74
+ "prediction": model_response,
75
+ "hint": doc["hint"],
76
+ "source": doc["source"],
77
+ "split": doc["split"],
78
+ "category": doc["category"],
79
+ "L2-category": doc["L2-category"],
80
+ },
81
+ "submission": {
82
+ "index": doc["index"],
83
+ "question": doc["question"],
84
+ "answer": doc["answer"],
85
+ "prediction": model_response,
86
+ "hint": doc["hint"],
87
+ "source": doc["source"],
88
+ "split": doc["split"],
89
+ "category": doc["category"],
90
+ "L2-category": doc["L2-category"],
91
+ },
92
+ }
93
+ option_candidate = ["A", "B", "C", "D", "E"]
94
+ for c in option_candidate:
95
+ data["submission"][c] = doc.get(c, "nan")
96
+ data["gpt_eval_score"][c] = doc.get(c, "nan")
97
+ return data
98
+
99
+
100
+ def mmbench_aggregate_dev_results_eval(results, args):
101
+ print(f"============= MMBench-CN(Dev) Detailed Results =============")
102
+ overall_acc, category_acc, l2_category_acc = mmbench_evaluator.eval_result(results, eval_method="openai")
103
+ file = generate_submission_file("mmbench_cn_dev_results.json", args)
104
+ details_info = {
105
+ "overall_acc": overall_acc,
106
+ "category_acc": category_acc,
107
+ "l2_category_acc": l2_category_acc,
108
+ }
109
+ with open(file, "w") as f:
110
+ json.dump(details_info, f)
111
+ return overall_acc * 100
112
+
113
+
114
+ def mmbench_aggregate_dev_results(results, args):
115
+ df = pd.DataFrame(results)
116
+ excel_write_path = generate_submission_file("mmbench_cn_dev_results.xlsx", args)
117
+ with pd.ExcelWriter(excel_write_path) as writer:
118
+ df.to_excel(writer, index=False)
119
+ eval_logger.info(f"Saved results to {excel_write_path}")
120
+
121
+
122
+ def mmbench_aggregate_test_results(results, args):
123
+ df = pd.DataFrame(results)
124
+ excel_write_path = generate_submission_file("mmbench_cn_test_results.xlsx", args)
125
+ with pd.ExcelWriter(excel_write_path) as writer:
126
+ df.to_excel(writer, index=False)
127
+ eval_logger.info(f"Saved results to {excel_write_path}")
EAGLE/lmms_eval/tasks/mmbench/en_utils.py ADDED
@@ -0,0 +1,126 @@
1
+ import logging
2
+ import yaml
3
+ import os
4
+ from pathlib import Path
5
+ import pandas as pd
6
+ import json
7
+
8
+ eval_logger = logging.getLogger("lmms-eval")
9
+ from lmms_eval.tasks.mmbench.mmbench_evals import MMBench_Evaluator
10
+ from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
11
+
12
+ with open(Path(__file__).parent / "mmbench.yaml", "r") as f:
13
+ raw_data = f.readlines()
14
+ safe_data = []
15
+ for i, line in enumerate(raw_data):
16
+ # remove function definition since yaml load cannot handle it
17
+ if "!function" not in line:
18
+ safe_data.append(line)
19
+
20
+ config = yaml.safe_load("".join(safe_data))
21
+
22
+ GPT_EVAL_MODEL_NAME = config["metadata"]["gpt_eval_model_name"]
23
+ API_TYPE = os.getenv("API_TYPE", "openai")
24
+
25
+ if API_TYPE == "openai":
26
+ API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
27
+ API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
28
+ elif API_TYPE == "azure":
29
+ API_URL = os.getenv("AZURE_ENDPOINT", "https://api.cognitive.microsoft.com/sts/v1.0/issueToken")
30
+ API_KEY = os.getenv("AZURE_API_KEY", "YOUR_API_KEY")
31
+
32
+
33
+ mmbench_evaluator = MMBench_Evaluator(sys_prompt=config["metadata"]["sys_prompt"], API_KEY=API_KEY, API_URL=API_URL, model_version=GPT_EVAL_MODEL_NAME)
34
+
35
+
36
+ def mmbench_doc_to_visual(doc):
37
+ return [doc["image"].convert("RGB")]
38
+
39
+
40
+ def mmbench_doc_to_text(doc, model_specific_prompt_kwargs=None):
41
+ option_candidate = ["A", "B", "C", "D", "E"]
42
+ options_prompt, options_dict = mmbench_evaluator.create_options_prompt(doc, option_candidate)
43
+
44
+ data = {
45
+ # "img": doc["image"],
46
+ "question": doc["question"],
47
+ "answer": doc.get("answer", None),
48
+ "options": options_prompt,
49
+ "category": doc["category"],
50
+ "L2-category": doc["L2-category"],
51
+ "options_dict": options_dict,
52
+ "index": doc["index"],
53
+ "hint": doc["hint"],
54
+ "source": doc["source"],
55
+ "split": doc["split"],
56
+ }
57
+
58
+ query_prompt = f"{data['hint']} {data['question']} {data['options']}" if pd.notna(data["hint"]) and data["hint"] != "nan" else f"{data['question']} {data['options']}"
59
+
60
+ if model_specific_prompt_kwargs:
61
+ query_prompt = f"{query_prompt}\n{model_specific_prompt_kwargs['post_prompt']}"
62
+
63
+ return query_prompt
64
+
65
+
66
+ def mmbench_process_results(doc, results):
67
+ model_response = results[0].strip()
68
+ data = {
69
+ "gpt_eval_score": {
70
+ "index": doc["index"],
71
+ "question": doc["question"],
72
+ "answer": doc["answer"],
73
+ "prediction": model_response,
74
+ "hint": doc["hint"],
75
+ "source": doc["source"],
76
+ "split": doc["split"],
77
+ "category": doc["category"],
78
+ "L2-category": doc["L2-category"],
79
+ },
80
+ "submission": {
81
+ "index": doc["index"],
82
+ "question": doc["question"],
83
+ "answer": doc["answer"],
84
+ "prediction": model_response,
85
+ "hint": doc["hint"],
86
+ "source": doc["source"],
87
+ "split": doc["split"],
88
+ "category": doc["category"],
89
+ "L2-category": doc["L2-category"],
90
+ },
91
+ }
92
+ option_candidate = ["A", "B", "C", "D", "E"]
93
+ for c in option_candidate:
94
+ data["submission"][c] = doc.get(c, "nan")
95
+ data["gpt_eval_score"][c] = doc.get(c, "nan")
96
+ return data
97
+
98
+
99
+ def mmbench_aggregate_dev_results_eval(results, args):
100
+ print(f"============= MMBench-EN(Dev) Detailed Results =============")
101
+ overall_acc, category_acc, l2_category_acc = mmbench_evaluator.eval_result(results, eval_method="openai")
102
+ file = generate_submission_file("mmbench_en_dev_results.json", args)
103
+ details_info = {
104
+ "overall_acc": overall_acc,
105
+ "category_acc": category_acc,
106
+ "l2_category_acc": l2_category_acc,
107
+ }
108
+ with open(file, "w") as f:
109
+ json.dump(details_info, f)
110
+ return overall_acc * 100
111
+
112
+
113
+ def mmbench_aggregate_dev_results_submission(results, args):
114
+ df = pd.DataFrame(results)
115
+ excel_write_path = generate_submission_file("mmbench_en_dev_results.xlsx", args)
116
+ with pd.ExcelWriter(excel_write_path) as writer:
117
+ df.to_excel(writer, index=False)
118
+ eval_logger.info(f"Saved results to {excel_write_path}")
119
+
120
+
121
+ def mmbench_aggregate_test_results(results, args):
122
+ df = pd.DataFrame(results)
123
+ excel_write_path = generate_submission_file("mmbench_en_test_results.xlsx", args)
124
+ with pd.ExcelWriter(excel_write_path) as writer:
125
+ df.to_excel(writer, index=False)
126
+ eval_logger.info(f"Saved results to {excel_write_path}")
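Note: mmbench_doc_to_text above delegates option formatting to MMBench_Evaluator.create_options_prompt (defined in mmbench_evals.py, which is not part of this diff). A rough standalone sketch of the query prompt it assembles, using a hypothetical helper rather than the real evaluator API:

def build_query(doc, post_prompt):
    letters = ["A", "B", "C", "D", "E"]
    options = "\n".join(f"{c}. {doc[c]}" for c in letters if doc.get(c) not in (None, "nan"))
    stem = f"{doc['hint']} {doc['question']}" if doc.get("hint") not in (None, "nan") else doc["question"]
    return f"{stem} {options}\n{post_prompt}"

print(build_query(
    {"question": "What is shown in the image?", "A": "a cat", "B": "a dog", "hint": "nan"},
    "Answer with the option's letter from the given choices directly.",
))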
EAGLE/lmms_eval/tasks/mmbench/mmbench.yaml ADDED
@@ -0,0 +1,11 @@
1
+ group: mmbench
2
+ task:
3
+ - mmbench_en_dev
4
+ - mmbench_en_test
5
+ - mmbench_cn_dev
6
+ - mmbench_cn_test
7
+ - mmbench_cn_cc
8
+ metadata:
9
+ version: 0.0
10
+ sys_prompt: "There are several options:"
11
+ gpt_eval_model_name: "gpt-3.5-turbo-0613"
EAGLE/lmms_eval/tasks/mmbench/mmbench_cc.yaml ADDED
@@ -0,0 +1,34 @@
1
+ dataset_path: lmms-lab/MMBench
2
+ dataset_name: cc
3
+ dataset_kwargs:
4
+ token: True
5
+ task: "mmbench_cn_cc"
6
+ test_split: test
7
+ output_type: generate_until
8
+ doc_to_visual: !function cc_utils.mmbench_doc_to_visual
9
+ doc_to_text: !function cc_utils.mmbench_cn_cc_doc_to_text
10
+ doc_to_target: "answer"
11
+ generation_kwargs:
12
+ max_new_tokens: 256
13
+ temperature: 0
14
+ top_p: 0
15
+ num_beams: 1
16
+ do_sample: false
17
+ process_results: !function cc_utils.mmbench_cn_cc_process_results
18
+ metric_list:
19
+ - metric: gpt_eval_score
20
+ aggregation: !function cc_utils.mmbench_cn_cc_aggregate_dev_results_eval
21
+ higher_is_better: true
22
+ - metric: submission
23
+ aggregation: !function cc_utils.mmbench_cn_cc_aggregate_results
24
+ metadata:
25
+ version: 0.0
26
+ gpt_eval_model_name: "gpt-3.5-turbo-0613"
27
+
28
+ model_specific_prompt_kwargs:
29
+ default:
30
+ pre_prompt: ""
31
+ post_prompt: "\n请直接使用所提供的选项字母作为答案回答。"
32
+ model_specific_generation_kwargs:
33
+ llava:
34
+ image_aspect_ratio: original
EAGLE/lmms_eval/tasks/mmbench/mmbench_cn.yaml ADDED
@@ -0,0 +1,9 @@
1
+ group: mmbench_cn
2
+ task:
3
+ - mmbench_cn_dev
4
+ - mmbench_cn_test
5
+ - mmbench_cn_cc
6
+ metadata:
7
+ version: 0.0
8
+ gpt_eval_model_name: "gpt-3.5-turbo-0613"
9
+ sys_prompt: "有如下几个选项:"