chouss committed
Commit 6029b11 · verified · 1 parent: df44f6a

Uploading folder contents

__pycache__/imagenet_s_test.cpython-39.pyc ADDED
Binary file (4.04 kB)

__pycache__/mask_image_test.cpython-310.pyc ADDED
Binary file (12.1 kB)

__pycache__/mask_image_test.cpython-39.pyc ADDED
Binary file (14.2 kB)
alpha_grit.py ADDED
@@ -0,0 +1,130 @@
+ import json
+ import os
+ import sys
+ import copy
+ import shutil
+ import pickle
+ import random
+
+ import cv2
+ import numpy as np
+ import torch
+ from PIL import Image
+ from pycocotools import mask as maskUtils
+ from torch.utils.data import Dataset
+ from torchvision import transforms
+ from tqdm import tqdm
+
+ PIXEL_MEAN = (0.48145466, 0.4578275, 0.40821073)
+ MASK_FILL = [int(255 * c) for c in PIXEL_MEAN]
+
+ def get_file(url):
+     return  # TODO: return the raw file bytes for `url` from the local directory
+
+ clip_standard_transform = transforms.Compose([
+     transforms.ToTensor(),
+     transforms.Resize((224, 224), interpolation=Image.BICUBIC),
+     transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
+ ])
+
+ hi_clip_standard_transform = transforms.Compose([
+     transforms.ToTensor(),
+     transforms.Resize((336, 336), interpolation=Image.BICUBIC),
+     transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
+ ])
+
+ res_clip_standard_transform = transforms.Compose([
+     transforms.ToTensor(),
+     transforms.Resize((336, 336), interpolation=Image.BICUBIC),
+     transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
+ ])
+
+ mask_transform = transforms.Compose([
+     transforms.ToTensor(),
+     transforms.Resize((224, 224)),
+     transforms.Normalize(0.5, 0.26)
+ ])
+
+ hi_mask_transform = transforms.Compose([
+     transforms.ToTensor(),
+     transforms.Resize((336, 336)),
+     transforms.Normalize(0.5, 0.26)
+ ])
+
+ res_mask_transform = transforms.Compose([
+     transforms.ToTensor(),
+     transforms.Resize((336, 336)),
+     transforms.Normalize(0.5, 0.26)
+ ])
+
+ def crop_center(img, croph, cropw):
+     h, w = img.shape[:2]
+     starth = h // 2 - (croph // 2)
+     startw = w // 2 - (cropw // 2)
+     return img[starth:starth + croph, startw:startw + cropw, :]
+
+ class Alpha_GRIT(Dataset):
+     def __init__(self, ids_file='grit_1m_ids.pkl', root_pth='grit-1m/', common_pair=0.0, hi_res=False, subnum=None):
+         if subnum is not None:
+             self.ids = pickle.load(open(ids_file, 'rb'))[:subnum]
+         else:
+             self.ids = pickle.load(open(ids_file, 'rb'))
+         self.root_pth = root_pth
+         self.with_common_pair_prop = common_pair
+         if hi_res:
+             self.mask_transform = res_mask_transform
+             self.clip_standard_transform = res_clip_standard_transform
+         else:
+             self.mask_transform = mask_transform
+             self.clip_standard_transform = clip_standard_transform
+
+     def __len__(self):
+         return len(self.ids)
+
+     def __getitem__(self, index):
+         ann_id = self.ids[index]
+         ann = json.loads(get_file(self.root_pth + str(ann_id) + '.json'))
+         image_data = get_file(self.root_pth + str(ann_id) + '.jpg')
+         img = np.frombuffer(image_data, dtype=np.uint8)
+         img = cv2.imdecode(img, cv2.IMREAD_COLOR)
+         img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+         ref_exps = ann['ref_exps']
+         # randomly choose a single referring expression and its corresponding mask
+         choice = random.randint(0, len(ref_exps) - 1)
+         ref_exp = ref_exps[choice]
+         text = ann['caption'][int(ref_exp[0]): int(ref_exp[1])]
+         mask = maskUtils.decode(ann['seudo_masks'][choice])
+         if mask.shape != img.shape[:2]:
+             # image/mask shape mismatch: assume the stored image is rotated
+             img = np.rot90(img)
+         rgba = np.concatenate((img, np.expand_dims(mask, axis=-1)), axis=-1)
+         h, w = rgba.shape[:2]
+         choice = 0  # always pad to square; the center-crop branch below is disabled
+         if choice == 0:
+             if max(h, w) == w:
+                 pad = (w - h) // 2
+                 l, r = pad, w - h - pad
+                 rgba = np.pad(rgba, ((l, r), (0, 0), (0, 0)), 'constant', constant_values=0)
+             else:
+                 pad = (h - w) // 2
+                 l, r = pad, h - w - pad
+                 rgba = np.pad(rgba, ((0, 0), (l, r), (0, 0)), 'constant', constant_values=0)
+         else:
+             if min(h, w) == h:
+                 rgba = crop_center(rgba, h, h)
+             else:
+                 rgba = crop_center(rgba, w, w)
+         rgb = rgba[:, :, :-1]
+         mask = rgba[:, :, -1]
+         image_torch = self.clip_standard_transform(rgb)
+
+         choice = random.random()
+         if choice >= self.with_common_pair_prop:
+             mask_torch = self.mask_transform(mask * 255)
+             return image_torch, mask_torch, text
+         else:  # with probability `common_pair`, pair the full caption with an all-ones mask
+             mask_torch = self.mask_transform(np.ones_like(mask) * 255)
+             return image_torch, mask_torch, ann['caption']
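For context, a minimal sketch of how Alpha_GRIT is typically consumed once get_file is implemented; the byte-reading get_file below and the DataLoader settings are assumptions, not part of this commit:

def get_file(url):
    # assumed implementation: `url` is a local path under grit-1m/, returned as raw bytes
    with open(url, 'rb') as f:
        return f.read()

from torch.utils.data import DataLoader
dataset = Alpha_GRIT(ids_file='grit_1m_ids.pkl', root_pth='grit-1m/', common_pair=0.1)
loader = DataLoader(dataset, batch_size=64, shuffle=True, num_workers=8)
images, masks, texts = next(iter(loader))
# images: (64, 3, 224, 224); masks: (64, 1, 224, 224); texts: list of 64 strings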
imagenet_s_test.py ADDED
@@ -0,0 +1,144 @@
+ import json
+ import os
+ import sys
+ import copy
+ import shutil
+ import pickle
+ import random
+
+ import cv2
+ import numpy as np
+ import torch
+ from nltk.corpus import wordnet
+ from PIL import Image
+ from pycocotools import mask as maskUtils
+ from torch.utils.data import Dataset
+ from torchvision import transforms
+ from tqdm import tqdm
+
+ PIXEL_MEAN = (0.48145466, 0.4578275, 0.40821073)
+ MASK_FILL = [int(255 * c) for c in PIXEL_MEAN]
+
+ clip_standard_transform = transforms.Compose([
+     transforms.ToTensor(),
+     transforms.Resize((224, 224), interpolation=Image.BICUBIC),
+     transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
+ ])
+
+ hi_clip_standard_transform = transforms.Compose([
+     transforms.ToTensor(),
+     transforms.Resize((336, 336), interpolation=Image.BICUBIC),
+     transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
+ ])
+
+ res_clip_standard_transform = transforms.Compose([
+     transforms.ToTensor(),
+     transforms.Resize((336, 336), interpolation=Image.BICUBIC),
+     transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
+ ])
+
+ mask_transform = transforms.Compose([
+     transforms.ToTensor(),
+     transforms.Resize((224, 224)),
+     transforms.Normalize(0.5, 0.26)
+ ])
+
+ hi_mask_transform = transforms.Compose([
+     transforms.ToTensor(),
+     transforms.Resize((336, 336)),
+     transforms.Normalize(0.5, 0.26)
+ ])
+
+ res_mask_transform = transforms.Compose([
+     transforms.ToTensor(),
+     transforms.Resize((336, 336)),
+     transforms.Normalize(0.5, 0.26)
+ ])
+
+ def crop_center(img, croph, cropw):
+     h, w = img.shape[:2]
+     starth = h // 2 - (croph // 2)
+     startw = w // 2 - (cropw // 2)
+     return img[starth:starth + croph, startw:startw + cropw, :]
+
+ class Imagenet_S(Dataset):
+     def __init__(self, ann_file='data/imagenet_s/imagenet_919.json', hi_res=False, all_one=False):
+         self.anns = json.load(open(ann_file, 'r'))
+         self.root_pth = 'data/imagenet_s/'
+         cats = []
+         for ann in self.anns:
+             if ann['category_word'] not in cats:
+                 cats.append(ann['category_word'])
+             ann['cat_index'] = cats.index(ann['category_word'])
+         # map each WordNet offset (e.g. 'n01440764') to its first lemma as the class name
+         self.classes = []
+         for cat_word in cats:
+             synset = wordnet.synset_from_pos_and_offset('n', int(cat_word[1:]))
+             synonyms = [x.name() for x in synset.lemmas()]
+             self.classes.append(synonyms[0])
+
+         self.choice = "center_crop"
+         if hi_res:
+             self.mask_transform = res_mask_transform
+             self.clip_standard_transform = res_clip_standard_transform
+         else:
+             self.mask_transform = mask_transform
+             self.clip_standard_transform = clip_standard_transform
+
+         self.all_one = all_one
+
+     def __len__(self):
+         return len(self.anns)
+
+     def __getitem__(self, index):
+         ann = self.anns[index]
+         image = cv2.imread(self.root_pth + ann['image_pth'])
+         image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+
+         mask = maskUtils.decode(ann['mask'])
+         rgba = np.concatenate((image, np.expand_dims(mask, axis=-1)), axis=-1)
+         h, w = rgba.shape[:2]
+
+         if self.choice == "padding":
+             if max(h, w) == w:
+                 pad = (w - h) // 2
+                 l, r = pad, w - h - pad
+                 rgba = np.pad(rgba, ((l, r), (0, 0), (0, 0)), 'constant', constant_values=0)
+             else:
+                 pad = (h - w) // 2
+                 l, r = pad, h - w - pad
+                 rgba = np.pad(rgba, ((0, 0), (l, r), (0, 0)), 'constant', constant_values=0)
+         else:
+             if min(h, w) == h:
+                 rgba = crop_center(rgba, h, h)
+             else:
+                 rgba = crop_center(rgba, w, w)
+         rgb = rgba[:, :, :-1]
+         mask = rgba[:, :, -1]
+         image_torch = self.clip_standard_transform(rgb)
+         # tight bounding box of the mask (computed but currently unused)
+         bi_mask = mask == 1
+         h, w = bi_mask.shape[-2:]
+         in_height = np.max(bi_mask, axis=-1)
+         in_height_coords = np.max(bi_mask, axis=-1) * np.arange(h)
+         b_e = in_height_coords.max()
+         in_height_coords = in_height_coords + h * (~in_height)
+         t_e = in_height_coords.min()
+         in_width = np.max(bi_mask, axis=-2)
+         in_width_coords = np.max(bi_mask, axis=-2) * np.arange(w)
+         r_e = in_width_coords.max()
+         in_width_coords = in_width_coords + w * (~in_width)
+         l_e = in_width_coords.min()
+         if self.all_one:
+             mask_torch = self.mask_transform(np.ones_like(mask) * 255)
+         else:
+             mask_torch = self.mask_transform(mask * 255)
+
+         return image_torch, mask_torch, ann['cat_index']
+
+ if __name__ == "__main__":
+     data = Imagenet_S()
+     for i in tqdm(range(len(data))):
+         data[i]
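A quick shape sanity check for the benchmark above; it only assumes the file's default annotation path and image directory exist locally:

data = Imagenet_S(hi_res=False)
image, mask, label = data[0]
print(image.shape)          # torch.Size([3, 224, 224])
print(mask.shape)           # torch.Size([1, 224, 224])
print(data.classes[label])  # WordNet lemma for the class label (919 classes, per imagenet_919.json)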
mask_image.py ADDED
@@ -0,0 +1,136 @@
+ import json
+ import os
+ import copy
+ import pickle
+ import random
+
+ import cv2
+ import numpy as np
+ import torch
+ from nltk.corpus import wordnet
+ from PIL import Image
+ from PIL import ImageFile
+ from pycocotools import mask as maskUtils
+ from torch.utils.data import Dataset
+ from torchvision import transforms
+ from tqdm import tqdm
+
+ ImageFile.LOAD_TRUNCATED_IMAGES = True
+ Image.MAX_IMAGE_PIXELS = None
+
+ clip_standard_transform = transforms.Compose([
+     transforms.ToTensor(),
+     transforms.Resize((224, 224), interpolation=Image.BICUBIC),
+     transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
+ ])
+ to_tensor = transforms.ToTensor()
+
+ normalize = transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
+
+ mask_transform = transforms.Compose([
+     transforms.ToTensor(),
+     transforms.Resize((224, 224)),
+     transforms.Normalize(0.5, 0.26)
+ ])
+
+ crop_aug = transforms.Compose([
+     transforms.RandomCrop((224 - 32, 224 - 32)),
+     transforms.Resize((224, 224)),
+ ])
+
+ def text_filter(text):
+     # strip boilerplate "white background" phrasing from generated captions
+     text = text.replace(' with a white background', '')
+     text = text.replace(' with white background', '')
+     text = text.replace(' next to a white background', '')
+     text = text.replace(' over a white background', '')
+     text = text.replace(' is cut out of a white background', '')
+     text = text.replace(' across a white background', '')
+     text = text.replace(' on a white background', '')
+     text = text.replace(' sticking out of a white background', '')
+     text = text.replace(' in the middle of a white background', '')
+     text = text.replace(' on white background', '')
+     text = text.replace(' in a white background', '')
+     text = text.replace(' and a white background', '')
+     text = text.replace(' and white background', '')
+     text = text.replace(' in front of a white background', '')
+     text = text.replace(' on top of a white background', '')
+     text = text.replace(' against a white background', '')
+     text = text.replace('a white background with ', '')
+     text = text.replace(' and has a white background', '')
+     text = text.replace('white background', 'background')
+     text = text + '.'
+     return text
+
+ def crop(image: np.ndarray, bbox_xywh: np.ndarray, bi_mask: np.ndarray, scale=1.5):
+     tl_x = int(bbox_xywh[0])
+     tl_y = int(bbox_xywh[1])
+     w = int(bbox_xywh[2]) if int(bbox_xywh[2]) > 0 else 1
+     h = int(bbox_xywh[3]) if int(bbox_xywh[3]) > 0 else 1
+     image_h, image_w = image.shape[:2]
+
+     # expand the box to a square of side r, then scale it, clamped to the image bounds
+     r = max(h, w)
+     tl_x -= (r - w) / 2
+     tl_y -= (r - h) / 2
+     half_scale = (scale - 1.0) / 2
+     w_l = int(tl_x - half_scale * r) if (tl_x - half_scale * r) > 0 else 0
+     w_r = int(tl_x + (1 + half_scale) * r) if (tl_x + (1 + half_scale) * r) < image_w else image_w - 1
+     h_t = int(tl_y - half_scale * r) if (tl_y - half_scale * r) > 0 else 0
+     h_b = int(tl_y + (1 + half_scale) * r) if (tl_y + (1 + half_scale) * r) < image_h else image_h - 1
+
+     return image[h_t: h_b, w_l: w_r, :], bi_mask[h_t: h_b, w_l: w_r]
+
+ def masked_crop(image: np.ndarray, bbox_xywh: np.ndarray, bi_mask: np.ndarray, crop_scale=1.0, masked_color=[255, 255, 255]):
+     # pad generously so the square crop never leaves the array and the bbox shape is maintained
+     image = np.pad(image, ((600, 600), (600, 600), (0, 0)), 'constant', constant_values=255)
+     bi_mask = np.pad(bi_mask, ((600, 600), (600, 600)), "constant", constant_values=0)
+     bbox_xywh[:2] += 600
+     cropped_image, cropped_mask = crop(image, bbox_xywh, bi_mask, crop_scale)
+     cropped_image[np.nonzero(cropped_mask == 0)] = masked_color
+     return cropped_image, cropped_mask
+
+ class ImageNet_Masked(Dataset):
+     def __init__(self, ann_file="M_ImageNet_top_460k.json", masked_color=[255, 255, 255]):
+         self.masked_color = masked_color
+         self.anns_list = json.load(open(ann_file, 'r'))
+         random.shuffle(self.anns_list)
+         self.crop_scale = 1.5
+         self.transform = clip_standard_transform
+         self.res = 224
+         self.blur = 10.0
+
+     def __len__(self):
+         return len(self.anns_list)
+
+     def __getitem__(self, index):
+         cv2.ocl.setUseOpenCL(False)
+         cv2.setNumThreads(0)
+         ann = self.anns_list[index]
+         # TODO: change list to dict key.
+         img_pth = ann[2]
+         mask = ann[3]
+         bbox = ann[4]
+         text = ann[6]
+         image = cv2.imread(img_pth)
+         image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+         bbox_xywh = np.copy(np.array(bbox))
+         binary_mask = maskUtils.decode(mask)
+         cat_word = img_pth.split("/")[3]
+         synset = wordnet.synset_from_pos_and_offset('n', int(cat_word[1:]))
+         synonyms = [x.name() for x in synset.lemmas()]
+         text = text.replace(".", f", probably {synonyms[0]}").replace(" ", "_").replace("/", "_").replace("\\", "_")
+         # overlay the mask in green and save a visualization image
+         image[np.nonzero(binary_mask == 1)] = (0.5 * image[np.nonzero(binary_mask == 1)] + 0.5 * np.array([0, 255, 0])).astype(np.uint8)
+         out_dir, out_name = os.path.split(img_pth.replace("imagenet-21k/images", "visual_train_c"))
+         os.makedirs(out_dir, exist_ok=True)
+         Image.fromarray(image).save(out_dir + f"/{text}_" + out_name)
+
+ if __name__ == "__main__":
+     data = ImageNet_Masked()
+     for i in tqdm(range(len(data))):
+         data[i]
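masked_crop pads both arrays with a 600-pixel border (white image, empty mask) so the scaled square crop around the shifted box stays inside the array, then fills everything outside the mask with masked_color. A self-contained check on synthetic data (the shapes and box here are arbitrary):

import numpy as np
from mask_image import masked_crop  # assumes this file's dependencies are installed

img = np.zeros((100, 120, 3), dtype=np.uint8)   # black 100x120 image
m = np.zeros((100, 120), dtype=np.uint8)
m[30:60, 40:90] = 1                             # rectangular foreground object
bbox = np.array([40, 30, 50, 30])               # x, y, w, h of that object
crop_img, crop_mask = masked_crop(img, bbox, m, crop_scale=1.5)
assert crop_img.shape[:2] == crop_mask.shape    # roughly scale * max(w, h) on a side
assert (crop_img[crop_mask == 0] == 255).all()  # background filled with masked_color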
mask_image_test.py ADDED
@@ -0,0 +1,457 @@
+ import json
+ import os
+ import copy
+ import pickle
+ import random
+ from collections import defaultdict
+
+ import alpha_clip
+ import cv2
+ import numpy as np
+ import torch
+ from lvis import LVIS
+ from PIL import Image
+ from PIL import ImageFile
+ from pycocotools.coco import COCO
+ from pycocotools import mask as maskUtils
+ from torch.utils.data import Dataset
+ from torchvision import transforms
+ from tqdm import tqdm
+
+ ImageFile.LOAD_TRUNCATED_IMAGES = True
+ Image.MAX_IMAGE_PIXELS = None
+
+ try:
+     from torchvision.transforms import InterpolationMode
+     BICUBIC = InterpolationMode.BICUBIC
+ except ImportError:
+     BICUBIC = Image.BICUBIC
+
+ PIXEL_MEAN = (0.48145466, 0.4578275, 0.40821073)
+ MASK_FILL = [int(255 * c) for c in PIXEL_MEAN]
+
+ def _convert_image_to_rgb(image):
+     return image.convert("RGB")
+
+ clip_standard_transform = transforms.Compose([
+     transforms.ToTensor(),
+     transforms.Resize((224, 224), interpolation=BICUBIC),
+     transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
+ ])
+
+ hi_clip_standard_transform = transforms.Compose([
+     transforms.ToTensor(),
+     transforms.Resize((336, 336), interpolation=BICUBIC),
+     transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
+ ])
+
+ mask_transform = transforms.Compose([
+     transforms.ToTensor(),
+     transforms.Resize((224, 224)),
+     transforms.Normalize(0.5, 0.26)
+ ])
+
+ hi_mask_transform = transforms.Compose([
+     transforms.ToTensor(),
+     transforms.Resize((336, 336)),
+     transforms.Normalize(0.5, 0.26)
+ ])
+
+ def crop(image: np.ndarray, bbox_xywh: np.ndarray, bi_mask: np.ndarray, scale=1.5):
+     tl_x = int(bbox_xywh[0])
+     tl_y = int(bbox_xywh[1])
+     w = int(bbox_xywh[2]) if int(bbox_xywh[2]) > 0 else 1
+     h = int(bbox_xywh[3]) if int(bbox_xywh[3]) > 0 else 1
+     image_h, image_w = image.shape[:2]
+
+     # expand the box to a square of side r, then scale it, clamped to the image bounds
+     r = max(h, w)
+     tl_x -= (r - w) / 2
+     tl_y -= (r - h) / 2
+     half_scale = (scale - 1.0) / 2
+     w_l = int(tl_x - half_scale * r) if (tl_x - half_scale * r) > 0 else 0
+     w_r = int(tl_x + (1 + half_scale) * r) if (tl_x + (1 + half_scale) * r) < image_w else image_w - 1
+     h_t = int(tl_y - half_scale * r) if (tl_y - half_scale * r) > 0 else 0
+     h_b = int(tl_y + (1 + half_scale) * r) if (tl_y + (1 + half_scale) * r) < image_h else image_h - 1
+
+     return image[h_t: h_b, w_l: w_r, :], bi_mask[h_t: h_b, w_l: w_r]
+
+ def masked_crop(image: np.ndarray, bbox_xywh: np.ndarray, bi_mask: np.ndarray, crop_scale=1.0, masked_color=[255, 255, 255]):
+     # padding to make sure the bbox shape is maintained
+     image = np.pad(image, ((600, 600), (600, 600), (0, 0)), 'constant', constant_values=255)
+     bi_mask = np.pad(bi_mask, ((600, 600), (600, 600)), "constant", constant_values=0)
+     bbox_xywh[:2] += 600
+     cropped_image, cropped_mask = crop(image, bbox_xywh, bi_mask, crop_scale)
+     # background fill is intentionally disabled in this test variant:
+     # cropped_image[np.nonzero(cropped_mask == 0)] = MASK_FILL
+     return cropped_image, cropped_mask
+
+ class COCO_Masked_Test(Dataset):
+     def __init__(self, ann_file="data/coco/annotations/instances_val2017.json", masked_color=[255, 255, 255], root_directory="data/coco/val2017", hi_res=False):
+         self.masked_color = masked_color
+         self.coco = COCO(annotation_file=ann_file)
+         self.image_directory = root_directory
+         self.crop_scale = 1.5
+         self.anns_list = list(self.coco.anns.keys())
+         self.index2id = [x['id'] for x in self.coco.cats.values()]
+         self.id2index = dict()
+         for i, item in enumerate(self.index2id):
+             self.id2index[item] = i
+         self.class_num = 80
+         self.classes = [x['name'] for x in self.coco.cats.values()]
+
+         if hi_res:
+             self.mask_transform = hi_mask_transform
+             self.clip_standard_transform = hi_clip_standard_transform
+         else:
+             self.mask_transform = mask_transform
+             self.clip_standard_transform = clip_standard_transform
+
+     def __len__(self):
+         return len(self.anns_list)
+
+     def __getitem__(self, index):
+         ann_id = self.anns_list[index]
+         ann = self.coco.anns[ann_id]
+         img_id = ann['image_id']
+         image = np.array(Image.open(os.path.join(self.image_directory, self.coco.imgs[img_id]['file_name'])).convert('RGB'))
+         bbox_xywh = np.copy(np.array(ann['bbox']))
+         binary_mask = self.coco.annToMask(ann)
+         cropped_image, cropped_mask = masked_crop(image, bbox_xywh, binary_mask, crop_scale=self.crop_scale, masked_color=self.masked_color)
+         image = self.clip_standard_transform(cropped_image)
+         mask_torch = self.mask_transform(cropped_mask * 255)
+         return image, mask_torch, self.id2index[ann['category_id']]
+
+ class LVIS_Masked_Test(Dataset):
+     def __init__(self, ann_file="data/lvis/annotations/lvis_v1_val.json", masked_color=[255, 255, 255], hi_res=False):
+         self.masked_color = masked_color
+         self.lvis = LVIS(ann_file)
+         self.crop_scale = 1.5
+         self.anns_list = list(self.lvis.anns.keys())
+         self.index2id = [x['id'] for x in self.lvis.cats.values()]
+         self.id2index = dict()
+         for i, item in enumerate(self.index2id):
+             self.id2index[item] = i
+         self.class_num = 1203
+         self.classes = [x['name'] for x in self.lvis.cats.values()]
+
+         if hi_res:
+             self.mask_transform = hi_mask_transform
+             self.clip_standard_transform = hi_clip_standard_transform
+         else:
+             self.mask_transform = mask_transform
+             self.clip_standard_transform = clip_standard_transform
+
+     def __len__(self):
+         return len(self.anns_list)
+
+     def __getitem__(self, index):
+         ann_id = self.anns_list[index]
+         ann = self.lvis.anns[ann_id]
+         img_id = ann['image_id']
+         image = np.array(Image.open(self.lvis.imgs[img_id]['coco_url'].replace('http://images.cocodataset.org', 'data/coco')).convert('RGB'))
+         binary_mask = self.lvis.ann_to_mask(ann)
+         rgba = np.concatenate((image, np.expand_dims(binary_mask, axis=-1)), axis=-1)
+         h, w = rgba.shape[:2]
+         if max(h, w) == w:
+             pad = (w - h) // 2
+             l, r = pad, w - h - pad
+             rgba = np.pad(rgba, ((l, r), (0, 0), (0, 0)), 'constant', constant_values=0)
+         else:
+             pad = (h - w) // 2
+             l, r = pad, h - w - pad
+             rgba = np.pad(rgba, ((0, 0), (l, r), (0, 0)), 'constant', constant_values=0)
+         rgb = rgba[:, :, :-1]
+         mask = rgba[:, :, -1]
+         image = self.clip_standard_transform(rgb)
+         mask_torch = self.mask_transform(mask * 255)
+         return image, mask_torch, self.id2index[ann['category_id']]
+
+ class RGBD:
+     def __init__(self, annotation_file=None):
+         self.anns, self.imgs, self.answers, self.types = defaultdict(list), dict(), dict(), dict()
+         if annotation_file is not None:
+             with open(annotation_file, 'r') as reader:
+                 datas = json.load(reader)
+             for data in datas:
+                 self.anns[data['id']] = data['captions']
+                 self.imgs[data['id']] = data['image']
+                 self.answers[data['id']] = data['answer']
+                 self.types[data['id']] = data['type']
+
+ class RGBD_Outdoor_Benchmark(Dataset):
+     def __init__(self, root_dir, tasks):
+         self.root_dir = root_dir
+         self.dataset = RGBD(os.path.join(root_dir, tasks))
+         self.image_ids = list(self.dataset.imgs.keys())
+         self.captions = [x for x in self.dataset.anns.values()]
+         self.depth_transform = transforms.Compose([
+             transforms.Resize((224, 224)),
+             transforms.ToTensor(),
+         ])
+         self.transform = clip_standard_transform
+         # hi-res variant: hi_clip_standard_transform and a (336, 336) depth transform
+
+     def __len__(self):
+         return len(self.image_ids)
+
+     def __getitem__(self, idx):
+         if torch.is_tensor(idx):
+             idx = idx.tolist()
+
+         img_ids = self.image_ids[idx]
+         image_path = os.path.join(self.root_dir, 'pic_all', self.dataset.imgs[img_ids])
+         depth_path = os.path.join(self.root_dir, 'pic_depth', self.dataset.imgs[img_ids])
+         image = Image.open(image_path).convert('RGB')
+         depth = Image.open(depth_path).convert('L')
+
+         answer = self.dataset.answers[img_ids]
+
+         if self.transform:
+             image = self.transform(image)
+         if self.depth_transform:
+             depth = self.depth_transform(depth)
+         return image, depth, answer
+
+ class RGBD_Benchmark_Test(Dataset):
+     def __init__(self, root_dir):
+         self.root_dir = root_dir
+         self.dataset = RGBD(os.path.join(root_dir, 'annotations.json'))
+         self.image_ids = list(self.dataset.imgs.keys())
+         self.captions = [x for x in self.dataset.anns.values()]
+         self.transform = clip_standard_transform
+         self.depth_transform = transforms.Compose([
+             transforms.Resize((224, 224)),
+             transforms.ToTensor(),
+         ])
+
+     def __len__(self):
+         return len(self.image_ids)
+
+     def __getitem__(self, idx):
+         if torch.is_tensor(idx):
+             idx = idx.tolist()
+
+         img_ids = self.image_ids[idx]
+         image_path = os.path.join(self.root_dir, 'all_pic', self.dataset.imgs[img_ids])
+         depth_path = os.path.join(self.root_dir, 'depth-new', self.dataset.imgs[img_ids])
+         image = Image.open(image_path).convert('RGB')
+         depth = Image.open(depth_path).convert('L')
+
+         answer = self.dataset.answers[img_ids]
+
+         if self.transform:
+             image = self.transform(image)
+         if self.depth_transform:
+             depth = self.depth_transform(depth)
+         return image, depth, answer
+
+ class RGBD_Benchmark_Test2(Dataset):
+     def __init__(self, root_dir):
+         self.root_dir = root_dir
+         self.dataset = RGBD(os.path.join(root_dir, 'annotations2.json'))
+         self.image_ids = list(self.dataset.imgs.keys())
+         self.captions = [x for x in self.dataset.anns.values()]
+         self.transform = clip_standard_transform
+         self.depth_transform = transforms.Compose([
+             transforms.Resize((224, 224)),
+             transforms.ToTensor(),
+         ])
+
+     def __len__(self):
+         return len(self.image_ids)
+
+     def __getitem__(self, idx):
+         if torch.is_tensor(idx):
+             idx = idx.tolist()
+
+         img_ids = self.image_ids[idx]
+         image_path = os.path.join(self.root_dir, 'all_pic', self.dataset.imgs[img_ids])
+         depth_path = os.path.join(self.root_dir, 'depth-new', self.dataset.imgs[img_ids])
+         image = Image.open(image_path).convert('RGB')
+         depth = Image.open(depth_path).convert('L')
+
+         answer = self.dataset.answers[img_ids]
+
+         if self.transform:
+             image = self.transform(image)
+         if self.depth_transform:
+             depth = self.depth_transform(depth)
+         return image, depth, answer
+
+ class ScanRefer:
+     def __init__(self, annotation_file=None):
+         self.anns, self.imgs, self.answers, self.scene_id = defaultdict(list), dict(), dict(), dict()
+         if annotation_file is not None:
+             with open(annotation_file, 'r') as reader:
+                 datas = json.load(reader)
+             for data in datas:
+                 self.anns[data['unique_id']] = data['descriptions']
+                 self.imgs[data['unique_id']] = data['image']
+                 self.answers[data['unique_id']] = data['answer']
+                 self.scene_id[data['unique_id']] = data['scene_id']
+
+ class ScanRefer_Test(Dataset):
+     def __init__(self, root_dir, model):
+         self.root_dir = root_dir
+         self.dataset = ScanRefer(os.path.join(root_dir, 'scanrefer_annotations_all.json'))
+         self.model = model
+         self.image_ids = list(self.dataset.imgs.keys())
+         self.transform = transforms.Compose([
+             transforms.Resize(224, interpolation=BICUBIC),
+             transforms.CenterCrop(224),
+             _convert_image_to_rgb,
+             transforms.ToTensor(),
+             transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
+         ])
+         self.depth_transform = transforms.Compose([
+             transforms.Resize((224, 224)),
+             transforms.ToTensor(),
+         ])
+
+     def __len__(self):
+         return len(self.image_ids)
+
+     def __getitem__(self, idx):
+         if torch.is_tensor(idx):
+             idx = idx.tolist()
+
+         img_ids = self.image_ids[idx]
+         image_path = os.path.join(self.root_dir, self.dataset.scene_id[img_ids], 'color', self.dataset.imgs[img_ids])
+         depth_path = os.path.join(self.root_dir, self.dataset.scene_id[img_ids], 'depth', self.dataset.imgs[img_ids].split('.')[0] + '.png')
+
+         image = Image.open(image_path).convert('RGB')
+         depth = Image.open(depth_path).convert('L')
+
+         if self.transform:
+             image = self.transform(image)
+         if self.depth_transform:
+             depth = self.depth_transform(depth)
+
+         caption = self.dataset.anns[img_ids]
+         texts = alpha_clip.tokenize(caption).cuda()
+         text_embeddings = self.model.encode_text(texts)
+         text_embeddings /= text_embeddings.norm(dim=-1, keepdim=True)
+
+         answer = self.dataset.answers[img_ids]
+         return image, depth, text_embeddings, answer
+
+ class ScanRefer_Test2(Dataset):
+     def __init__(self, root_dir, model):
+         self.root_dir = root_dir
+         self.dataset = ScanRefer(os.path.join(root_dir, 'annotations_2.json'))
+         self.model = model
+         self.image_ids = list(self.dataset.imgs.keys())
+         self.transform = transforms.Compose([
+             transforms.Resize(224, interpolation=BICUBIC),
+             transforms.CenterCrop(224),
+             _convert_image_to_rgb,
+             transforms.ToTensor(),
+             transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
+         ])
+         self.depth_transform = transforms.Compose([
+             transforms.Resize((224, 224)),
+             transforms.ToTensor(),
+         ])
+
+     def __len__(self):
+         return len(self.image_ids)
+
+     def __getitem__(self, idx):
+         if torch.is_tensor(idx):
+             idx = idx.tolist()
+
+         img_ids = self.image_ids[idx]
+         image_path = os.path.join(self.root_dir, self.dataset.scene_id[img_ids], 'color', self.dataset.imgs[img_ids])
+         depth_path = os.path.join(self.root_dir, self.dataset.scene_id[img_ids], 'depth', self.dataset.imgs[img_ids].split('.')[0] + '.png')
+
+         image = Image.open(image_path).convert('RGB')
+         depth = Image.open(depth_path).convert('L')
+
+         if self.transform:
+             image = self.transform(image)
+         if self.depth_transform:
+             depth = self.depth_transform(depth)
+
+         caption = self.dataset.anns[img_ids]
+         texts = alpha_clip.tokenize(caption).cuda()
+         text_embeddings = self.model.encode_text(texts)
+         text_embeddings /= text_embeddings.norm(dim=-1, keepdim=True)
+
+         answer = self.dataset.answers[img_ids]
+         return image, depth, text_embeddings, answer
+
+ class ScanRefer_Testnr3d(Dataset):
+     def __init__(self, root_dir, model):
+         self.root_dir = root_dir
+         self.dataset = ScanRefer(os.path.join(root_dir, 'nr3d_annotations.json'))
+         self.model = model
+         self.image_ids = list(self.dataset.imgs.keys())
+         self.transform = transforms.Compose([
+             transforms.Resize(224, interpolation=BICUBIC),
+             transforms.CenterCrop(224),
+             _convert_image_to_rgb,
+             transforms.ToTensor(),
+             transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
+         ])
+         self.depth_transform = transforms.Compose([
+             transforms.Resize((224, 224)),
+             transforms.ToTensor(),
+         ])
+
+     def __len__(self):
+         return len(self.image_ids)
+
+     def __getitem__(self, idx):
+         if torch.is_tensor(idx):
+             idx = idx.tolist()
+
+         img_ids = self.image_ids[idx]
+         image_path = os.path.join(self.root_dir, self.dataset.scene_id[img_ids], 'color', self.dataset.imgs[img_ids])
+         depth_path = os.path.join(self.root_dir, self.dataset.scene_id[img_ids], 'depth', self.dataset.imgs[img_ids].split('.')[0] + '.png')
+
+         image = Image.open(image_path).convert('RGB')
+         depth = Image.open(depth_path).convert('L')
+
+         if self.transform:
+             image = self.transform(image)
+         if self.depth_transform:
+             depth = self.depth_transform(depth)
+
+         caption = self.dataset.anns[img_ids]
+         texts = alpha_clip.tokenize(caption).cuda()
+         text_embeddings = self.model.encode_text(texts)
+         text_embeddings /= text_embeddings.norm(dim=-1, keepdim=True)
+
+         answer = self.dataset.answers[img_ids]
+         return image, depth, text_embeddings, answer
+
+ if __name__ == "__main__":
+     data = LVIS_Masked_Test()
+     for i in tqdm(range(len(data))):
+         data[i]
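Finally, a sketch of the zero-shot loop these test sets feed; the `model` handle and its visual(image, mask) signature are assumptions standing in for an Alpha-CLIP-style encoder, and the prompt template is hypothetical:

import torch
from torch.utils.data import DataLoader

data = COCO_Masked_Test()
loader = DataLoader(data, batch_size=128, num_workers=8)
tokens = alpha_clip.tokenize([f"a photo of a {c}" for c in data.classes]).cuda()
with torch.no_grad():
    text_feat = model.encode_text(tokens)                   # `model` is assumed, not defined in this file
    text_feat /= text_feat.norm(dim=-1, keepdim=True)
    correct = total = 0
    for image, mask, label in loader:
        img_feat = model.visual(image.cuda(), mask.cuda())  # assumed (image, alpha-mask) signature
        img_feat /= img_feat.norm(dim=-1, keepdim=True)
        pred = (img_feat @ text_feat.T).argmax(dim=-1).cpu()
        correct += (pred == label).sum().item()
        total += label.numel()
print(f"top-1 accuracy: {correct / total:.4f}")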