Upload folder using huggingface_hub
- ASDA/dataset/__pycache__/data_loader.cpython-39.pyc +0 -0
- ASDA/dataset/__pycache__/data_loader_gref_sbert.cpython-39.pyc +0 -0
- ASDA/dataset/__pycache__/data_loader_rccp.cpython-39.pyc +0 -0
- ASDA/dataset/__pycache__/data_loader_test.cpython-39.pyc +0 -0
- ASDA/dataset/__pycache__/refer.cpython-39.pyc +0 -0
- ASDA/dataset/data.sh +12 -0
- ASDA/dataset/data_loader.py +314 -0
- ASDA/dataset/data_loader_gref_sbert.py +343 -0
- ASDA/dataset/data_loader_rccp.py +279 -0
- ASDA/dataset/data_loader_test.py +315 -0
- ASDA/dataset/data_process.py +225 -0
- ASDA/dataset/datascript.py +56 -0
- ASDA/dataset/refer.py +485 -0
ASDA/dataset/__pycache__/data_loader.cpython-39.pyc
ADDED
Binary file (7.71 kB)

ASDA/dataset/__pycache__/data_loader_gref_sbert.cpython-39.pyc
ADDED
Binary file (8.75 kB)

ASDA/dataset/__pycache__/data_loader_rccp.cpython-39.pyc
ADDED
Binary file (7 kB)

ASDA/dataset/__pycache__/data_loader_test.cpython-39.pyc
ADDED
Binary file (7.55 kB)

ASDA/dataset/__pycache__/refer.cpython-39.pyc
ADDED
Binary file (11.9 kB)
ASDA/dataset/data.sh
ADDED
@@ -0,0 +1,12 @@
#!/bin/bash
# data process
python data_process.py --data_root ../ln_data --output_dir ../ln_data --dataset refcoco --split unc --generate_mask
python data_process.py --data_root ../ln_data --output_dir ../ln_data --dataset refcoco+ --split unc --generate_mask
python data_process.py --data_root ../ln_data --output_dir ../ln_data --dataset refcocog --split google --generate_mask
python data_process.py --data_root ../ln_data --output_dir ../ln_data --dataset refcocog --split umd --generate_mask

# datascript
python datascript.py --dataset refcoco
python datascript.py --dataset refcoco+
python datascript.py --dataset refcocog_google
python datascript.py --dataset refcocog_umd
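The commit itself does not document where these scripts place their output; the layout below is inferred from the path handling in the loaders that follow (im_dir, mask_root, anns_root, and the .pth split files), not stated in the upload:

ln_data/
    images/train2014/                      # COCO train2014 images
    masks/<dataset>/<seg_id>.npy           # per-reference masks from --generate_mask (refcocog uses masks/refcocog_<splitby>/)
    anns/<dataset>/<split>.txt
data/
    <dataset>/<dataset>_<split>.pth        # lists of (img_file, seg_id, bbox, phrase) tuples for refcoco / refcoco+
    refcocog_<splitby>/refcocog_<splitby>_<split>.pth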
ASDA/dataset/data_loader.py
ADDED
@@ -0,0 +1,314 @@
# -*- coding: utf-8 -*-

"""
refcoco, refcoco+ and refcocog referring image detection and segmentation PyTorch dataset.
"""
import sys
import cv2
import torch
import random
import numpy as np
import os.path as osp
import torch.utils.data as data
sys.path.append('.')
import utils
import re

from pytorch_pretrained_bert.tokenization import BertTokenizer
from utils.transforms import letterbox, random_affine, random_copy, random_crop, random_erase
import copy

import clip

sys.modules['utils'] = utils
cv2.setNumThreads(0)

def read_examples(input_line, unique_id):
    """Read a list of `InputExample`s from an input file."""
    examples = []
    # unique_id = 0
    line = input_line  # reader.readline()
    # if not line:
    #     break
    line = line.strip()
    text_a = None
    text_b = None
    m = re.match(r"^(.*) \|\|\| (.*)$", line)
    if m is None:
        text_a = line
    else:
        text_a = m.group(1)  # 'man in black'
        text_b = m.group(2)

    examples.append(
        InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))
    # unique_id += 1
    return examples

def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()

## Bert text encoding
class InputExample(object):
    def __init__(self, unique_id, text_a, text_b):
        self.unique_id = unique_id
        self.text_a = text_a
        self.text_b = text_b

class InputFeatures(object):
    """A single set of features of data."""
    def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids):
        self.unique_id = unique_id
        self.tokens = tokens
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.input_type_ids = input_type_ids

def convert_examples_to_features(examples, seq_length, tokenizer):
    """Loads a data file into a list of `InputBatch`s."""
    features = []
    for (ex_index, example) in enumerate(examples):
        tokens_a = tokenizer.tokenize(example.text_a)  # ['far', 'left', 'vase']

        tokens_b = None
        if example.text_b:
            tokens_b = tokenizer.tokenize(example.text_b)

        if tokens_b:
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > seq_length - 2:
                tokens_a = tokens_a[0:(seq_length - 2)]
        tokens = []
        input_type_ids = []
        tokens.append("[CLS]")
        input_type_ids.append(0)
        for token in tokens_a:
            tokens.append(token)
            input_type_ids.append(0)
        tokens.append("[SEP]")
        input_type_ids.append(0)

        if tokens_b:
            for token in tokens_b:
                tokens.append(token)
                input_type_ids.append(1)
            tokens.append("[SEP]")
            input_type_ids.append(1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        while len(input_ids) < seq_length:
            input_ids.append(0)
            input_mask.append(0)
            input_type_ids.append(0)

        assert len(input_ids) == seq_length
        assert len(input_mask) == seq_length
        assert len(input_type_ids) == seq_length
        features.append(
            InputFeatures(
                unique_id=example.unique_id,
                tokens=tokens,
                input_ids=input_ids,
                input_mask=input_mask,
                input_type_ids=input_type_ids))
    return features

class DatasetNotFoundError(Exception):
    pass

class ReferDataset(data.Dataset):
    SUPPORTED_DATASETS = {
        'refcoco': {
            'splits': ('train', 'val', 'testA', 'testB'),
            'params': {'dataset': 'refcoco', 'split_by': 'unc'}
        },
        'refcoco+': {
            'splits': ('train', 'val', 'testA', 'testB'),
            'params': {'dataset': 'refcoco+', 'split_by': 'unc'}
        },
        'refcocog': {
            'splits': ('train', 'val', 'test'),
            'params': {'dataset': 'refcocog', 'split_by': 'unc'}
        },
        'refcocog_g': {
            'splits': ('train', 'val'),
            'params': {'dataset': 'refcocog', 'split_by': 'google'}
        },
        'refcocog_u': {
            'splits': ('train', 'val', 'test'),
            'params': {'dataset': 'refcocog', 'split_by': 'unc'}
        },
        'grefcoco': {
            'splits': ('train', 'val', 'testA', 'testB'),
            'params': {'dataset': 'grefcoco', 'split_by': 'unc'}
        }
    }

    def __init__(self, data_root, split_root='data', dataset='refcoco', imsize=256, splitby='umd',
                 transform=None, augment=False, split='train', max_query_len=128,
                 bert_model='bert-base-uncased'):
        self.images = []
        self.data_root = data_root
        self.split_root = split_root
        self.dataset = dataset
        self.imsize = imsize
        self.query_len = max_query_len
        self.transform = transform
        self.split = split
        self.tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=True)  # should be true for English
        self.augment = augment

        valid_splits = self.SUPPORTED_DATASETS[self.dataset]['splits']

        if split not in valid_splits:
            raise ValueError(
                'Dataset {0} does not have split {1}'.format(
                    self.dataset, split))

        self.anns_root = osp.join(self.data_root, 'anns', self.dataset, self.split+'.txt')
        if self.dataset == 'refcocog':
            mask_anno_str = '{0}_{1}'.format(self.dataset, splitby)
            self.mask_root = osp.join(self.data_root, 'masks', mask_anno_str)
        else:
            self.mask_root = osp.join(self.data_root, 'masks', self.dataset)

        self.im_dir = osp.join(self.data_root, 'images', 'train2014')


        if self.dataset == 'refcocog':
            dataset_path = osp.join(self.split_root, self.dataset + '_' + splitby)
            splits = [split]
            for split in splits:
                imgset_file = '{0}_{1}_{2}.pth'.format(self.dataset, splitby, split)
                imgset_path = osp.join(dataset_path, imgset_file)
                self.images += torch.load(imgset_path)
        else:
            dataset_path = osp.join(self.split_root, self.dataset)
            splits = [split]
            for split in splits:
                imgset_file = '{0}_{1}.pth'.format(self.dataset, split)
                imgset_path = osp.join(dataset_path, imgset_file)
                self.images += torch.load(imgset_path)

    def exists_dataset(self):
        return osp.exists(osp.join(self.split_root, self.dataset))

    def pull_item(self, idx):
        img_file, seg_id, bbox, phrase = self.images[idx]
        bbox = np.array(bbox, dtype=int)  # x1y1x2y2

        img_path = osp.join(self.im_dir, img_file)
        img = cv2.imread(img_path)  # BGR [512, 640, 3]
        ## duplicate channel if gray image
        if img.shape[-1] > 1:
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # RGB
        else:
            img = np.stack([img] * 3)

        ## seg map
        seg_map = np.load(osp.join(self.mask_root, str(seg_id)+'.npy'))  # [512, 640]
        seg_map = np.array(seg_map).astype(np.float32)
        return img, phrase, bbox, seg_map

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        img, phrase, bbox, seg_map = self.pull_item(idx)
        phrase = phrase.lower()
        if self.augment:
            augment_flip, augment_hsv, augment_affine, augment_crop, augment_copy, augment_erase = \
                True, True, True, False, False, False

        ## seems a bug in torch transformation resize, so separate in advance
        h, w = img.shape[0], img.shape[1]
        # print("img.shape", img.shape)
        if self.augment:
            ## random horizontal flip
            if augment_flip and random.random() > 0.5:
                img = cv2.flip(img, 1)
                seg_map = cv2.flip(seg_map, 1)
                bbox[0], bbox[2] = w-bbox[2]-1, w-bbox[0]-1
                phrase = phrase.replace('right','*&^special^&*').replace('left','right').replace('*&^special^&*','left')

            ## random copy and add left or right
            if augment_copy:
                img, seg_map, phrase, bbox = random_copy(img, seg_map, phrase, bbox)

            ## random erase for occluded
            if augment_erase:
                img, seg_map = random_erase(img, seg_map)

            ## random padding and crop
            if augment_crop:
                img, seg_map = random_crop(img, seg_map, 40, h, w)

            ## random intensity, saturation change
            if augment_hsv:
                fraction = 0.50
                img_hsv = cv2.cvtColor(cv2.cvtColor(img, cv2.COLOR_RGB2BGR), cv2.COLOR_BGR2HSV)
                S = img_hsv[:, :, 1].astype(np.float32)
                V = img_hsv[:, :, 2].astype(np.float32)
                a = (random.random() * 2 - 1) * fraction + 1
                if a > 1:
                    np.clip(S, a_min=0, a_max=255, out=S)
                a = (random.random() * 2 - 1) * fraction + 1
                V *= a
                if a > 1:
                    np.clip(V, a_min=0, a_max=255, out=V)

                img_hsv[:, :, 1] = S.astype(np.uint8)
                img_hsv[:, :, 2] = V.astype(np.uint8)
                img = cv2.cvtColor(cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR), cv2.COLOR_BGR2RGB)

            img, seg_map, ratio, dw, dh = letterbox(img, seg_map, self.imsize)
            bbox[0], bbox[2] = bbox[0]*ratio+dw, bbox[2]*ratio+dw
            bbox[1], bbox[3] = bbox[1]*ratio+dh, bbox[3]*ratio+dh

            ## random affine transformation
            if augment_affine:
                img, seg_map, bbox, M = random_affine(img, seg_map, bbox, \
                    degrees=(-5, 5), translate=(0.10, 0.10), scale=(0.90, 1.10))  # 255 white fill

        else:  ## should be inference, or specified training
            img, _, ratio, dw, dh = letterbox(img, None, self.imsize)
            bbox[0], bbox[2] = bbox[0]*ratio+dw, bbox[2]*ratio+dw
            bbox[1], bbox[3] = bbox[1]*ratio+dh, bbox[3]*ratio+dh

        draw_img = copy.deepcopy(img)
        # Norm, to tensor
        if self.transform is not None:
            img = self.transform(img)

        ## encode phrase to clip input
        word_id = clip.tokenize(phrase, 17, truncate=True)
        word_mask = ~ (word_id == 0)

        if self.augment:  # train
            seg_map = cv2.resize(seg_map, (self.imsize // 2, self.imsize // 2), interpolation=cv2.INTER_NEAREST)  # (208, 208)
            seg_map = np.reshape(seg_map, [1, np.shape(seg_map)[0], np.shape(seg_map)[1]])
            return img, np.array(word_id, dtype=int), np.array(word_mask, dtype=int), \
                np.array(bbox, dtype=np.float32), np.array(seg_map, dtype=np.float32)
        else:
            seg_map = np.reshape(seg_map, [1, np.shape(seg_map)[0], np.shape(seg_map)[1]])
            return img, np.array(word_id, dtype=int), np.array(word_mask, dtype=int), \
                np.array(bbox, dtype=np.float32), np.array(seg_map, dtype=np.float32), np.array(ratio, dtype=np.float32), \
                np.array(dw, dtype=np.float32), np.array(dh, dtype=np.float32), self.images[idx][0], self.images[idx][3], np.array(draw_img, dtype=np.uint8)
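The file above only defines the dataset class; the commit does not show how it is driven. A minimal usage sketch follows: the import path, image size, and normalization values are assumptions for illustration (imsize=416 is chosen so that the half-resolution mask matches the "(208, 208)" comment in the code), not something stated in the upload.

# Minimal usage sketch (assumed module path, imsize, and transform values).
import torch.utils.data as data
from torchvision import transforms
from ASDA.dataset.data_loader import ReferDataset  # hypothetical import path

input_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],   # assumed ImageNet statistics
                         std=[0.229, 0.224, 0.225]),
])

train_set = ReferDataset(data_root='./ln_data', split_root='data', dataset='refcoco',
                         imsize=416, transform=input_transform,
                         augment=True, split='train')
train_loader = data.DataLoader(train_set, batch_size=8, shuffle=True, num_workers=4)

# In augment/train mode __getitem__ returns five items per sample.
for img, word_id, word_mask, bbox, seg_map in train_loader:
    # img: [B, 3, imsize, imsize]; word_id / word_mask: [B, 1, 17];
    # bbox: [B, 4] in x1y1x2y2; seg_map: [B, 1, imsize//2, imsize//2]
    break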
ASDA/dataset/data_loader_gref_sbert.py
ADDED
@@ -0,0 +1,343 @@
# -*- coding: utf-8 -*-

"""
refcoco, refcoco+ and refcocog referring image detection and segmentation PyTorch dataset.
"""
import sys
import cv2
import os
import torch
import json
import random
import numpy as np
import os.path as osp
import torch.utils.data as data
sys.path.append('.')
import utils
import re

# from pytorch_pretrained_bert.tokenization import BertTokenizer
from utils.transforms import letterbox, random_affine, random_copy, random_crop, random_erase
import copy

import clip

sys.modules['utils'] = utils
cv2.setNumThreads(0)

class ReferDataset(data.Dataset):
    SUPPORTED_DATASETS = {
        'refcoco': {
            'splits': ('train', 'val', 'testA', 'testB'),
            'params': {'dataset': 'refcoco', 'split_by': 'unc'}
        },
        'refcoco+': {
            'splits': ('train', 'val', 'testA', 'testB'),
            'params': {'dataset': 'refcoco+', 'split_by': 'unc'}
        },
        'refcocog': {
            'splits': ('train', 'val', 'test'),
            'params': {'dataset': 'refcocog', 'split_by': 'unc'}
        },
        'refcocog_g': {
            'splits': ('train', 'val'),
            'params': {'dataset': 'refcocog', 'split_by': 'google'}
        },
        'refcocog_u': {
            'splits': ('train', 'val', 'test'),
            'params': {'dataset': 'refcocog', 'split_by': 'unc'}
        },
        'grefcoco': {
            'splits': ('train', 'val', 'testA', 'testB'),
            'params': {'dataset': 'grefcoco', 'split_by': 'unc'}
        }
    }


    def _load_multi_obj_ref_ids(self):
        # Load multi-object reference IDs based on configurations
        if not self.exclude_multiobj and not self.exclude_position:
            return None
        elif self.exclude_position:
            multiobj_path = os.path.join(self.ROOT, 'multiobj_ov2_nopos.txt')
        elif self.exclude_multiobj:
            multiobj_path = os.path.join(self.ROOT, 'multiobj_ov3.txt')
        with open(multiobj_path, 'r') as f:
            return [int(line.strip()) for line in f.readlines()]

    def _load_metadata(self):
        # Load metadata for hard positive verb phrases, hard negative queries
        # we set refined file as default option
        hardpos_path = '/data2/projects/seunghoon/VerbRIS/CrossVLT/hardpos_verdict_gref_v4.json'
        with open(hardpos_path, 'r', encoding='utf-8') as f:
            hardpos_json = json.load(f)
        return hardpos_json

    def __init__(self, data_root, split_root='data', dataset='refcoco', imsize=256, splitby='umd',
                 transform=None, augment=False, split='train', max_query_len=128, metric_learning=None):
        images_tmp = []
        self.data_root = data_root
        self.split_root = split_root
        self.dataset = dataset
        self.imsize = imsize
        self.query_len = max_query_len
        self.transform = transform
        self.word_len = 17
        self.emb_size = 384
        self.split = split
        self.augment = augment

        valid_splits = self.SUPPORTED_DATASETS[self.dataset]['splits']

        if split not in valid_splits:
            raise ValueError(
                'Dataset {0} does not have split {1}'.format(
                    self.dataset, split))

        self.anns_root = osp.join(self.data_root, 'anns', self.dataset, self.split+'.txt')
        if self.dataset == 'refcocog':
            mask_anno_str = '{0}_{1}'.format(self.dataset, splitby)
            self.mask_root = osp.join(self.data_root, 'masks', mask_anno_str)
        else:
            self.mask_root = osp.join(self.data_root, 'masks', self.dataset)

        self.im_dir = osp.join(self.data_root, 'images', 'train2014')


        if self.dataset == 'refcocog':
            dataset_path = osp.join(self.split_root, self.dataset + '_' + splitby)
            splits = [split]
            for split in splits:
                imgset_file = '{0}_{1}_{2}.pth'.format(self.dataset, splitby, split)
                imgset_path = osp.join(dataset_path, imgset_file)
                images_tmp += torch.load(imgset_path)

            # metric learning options
            self.ROOT = '/data2/projects/seunghoon/VerbRIS/VerbCentric_CY/'
            self.all_hp_root = "/data2/dataset/RefCOCO/refcocog/SBERT_gref_umd"
            # self.exclude_position = args.exclude_pos
            self.exclude_position = True
            self.exclude_multiobj = True
            self.metric_learning = metric_learning

            # self.metric_mode = args.metric_mode
            self.hp_selection = 'strict'

            # meta data loading
            if self.metric_learning and self.split == 'train':
                self.multi_obj_ref_ids = self._load_multi_obj_ref_ids()
                self.hardpos_meta = self._load_metadata()

                # make new self.images file with sentence idx and total sent num (per ref_id)
                from collections import defaultdict
                ref_sentence_counts = defaultdict(int)
                for item in images_tmp:
                    ref_sentence_counts[item[1]] += 1

                self.images = []
                ref_sentence_indices = defaultdict(int)
                for item in images_tmp:
                    img_name, seg_id, box, sentence = item
                    sent_index = ref_sentence_indices[seg_id]
                    total_sentences = ref_sentence_counts[seg_id]
                    self.images.append((img_name, seg_id, box, sentence, sent_index, total_sentences))
                    ref_sentence_indices[seg_id] += 1

            else:
                self.images = images_tmp
                self.multi_obj_ref_ids = None
                self.hardpos_meta = None

        else:
            dataset_path = osp.join(self.split_root, self.dataset)
            splits = [split]
            for split in splits:
                imgset_file = '{0}_{1}.pth'.format(self.dataset, split)
                imgset_path = osp.join(dataset_path, imgset_file)
                self.images += torch.load(imgset_path)

    def exists_dataset(self):
        return osp.exists(osp.join(self.split_root, self.dataset))

    def _get_hardpos_verb(self, seg_id, sent_idx):
        """
        Handle the logic for selecting hard positive verb phrases during metric learning.
        Returns the sentence, raw_verb, and tokenized verb if applicable.
        """
        # If the object appears multiple times, no hard positive is used
        if seg_id in self.multi_obj_ref_ids:
            verb_embed = torch.zeros(self.emb_size, dtype=torch.float32)
            return '', verb_embed

        # Extract metadata for hard positives if present
        hardpos_dict = self.hardpos_meta.get(str(seg_id), {})
        if self.hp_selection == 'strict':
            sent_id_list = list(hardpos_dict.keys())
            cur_sent_id = sent_id_list[sent_idx]
            cur_hardpos = hardpos_dict.get(cur_sent_id, {}).get('phrases', [])

            if cur_hardpos:
                # Assign a hard positive verb phrase if available
                rand_index = random.randint(0, len(cur_hardpos) - 1)
                raw_verb = cur_hardpos[rand_index]
                verb_embed = torch.from_numpy(self._get_hardpos_embed(seg_id, cur_sent_id, rand_index))
                # print("Positive phrase : ", raw_verb)
                return raw_verb, verb_embed

        verb_embed = torch.zeros(self.emb_size, dtype=torch.float32)
        return '', verb_embed


    def _get_hardpos_embed(self, seg_id, sent_id, rand_index):
        emb_folder = os.path.join(self.all_hp_root, str(seg_id))
        emb_files = sorted([f for f in os.listdir(emb_folder) if f.startswith(f"hp_{sent_id}_") and f.endswith(".npy")])
        selected_emb_file = os.path.join(emb_folder, emb_files[rand_index])

        return np.load(selected_emb_file)


    def pull_item(self, idx):
        # if metric learning and in train mode
        if self.metric_learning and self.augment:
            # sent_idx refers to index of sent among sent_num-1
            img_file, seg_id, bbox, phrase, sent_idx, sent_num = self.images[idx]
        else:
            img_file, seg_id, bbox, phrase = self.images[idx]
        bbox = np.array(bbox, dtype=int)  # x1y1x2y2

        img_path = osp.join(self.im_dir, img_file)
        img = cv2.imread(img_path)  # BGR [512, 640, 3]
        ## duplicate channel if gray image
        if img.shape[-1] > 1:
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # RGB
        else:
            img = np.stack([img] * 3)

        ## seg map
        seg_map = np.load(osp.join(self.mask_root, str(seg_id)+'.npy'))  # [512, 640]
        seg_map = np.array(seg_map).astype(np.float32)

        if self.metric_learning and self.split == 'train':
            return img, phrase, bbox, seg_map, seg_id, sent_idx
        else:
            return img, phrase, bbox, seg_map, seg_id

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        if self.metric_learning and self.augment:
            img, phrase, bbox, seg_map, seg_id, sent_idx = self.pull_item(idx)
        else:
            img, phrase, bbox, seg_map, seg_id = self.pull_item(idx)

        phrase = phrase.lower()
        if self.augment:
            augment_flip, augment_hsv, augment_affine, augment_crop, augment_copy, augment_erase = \
                True, True, True, False, False, False

        ## seems a bug in torch transformation resize, so separate in advance
        h, w = img.shape[0], img.shape[1]
        # print("img.shape", img.shape)
        if self.augment:
            ## random horizontal flip
            if augment_flip and random.random() > 0.5:
                img = cv2.flip(img, 1)
                seg_map = cv2.flip(seg_map, 1)
                bbox[0], bbox[2] = w-bbox[2]-1, w-bbox[0]-1
                phrase = phrase.replace('right','*&^special^&*').replace('left','right').replace('*&^special^&*','left')

            ## random copy and add left or right
            if augment_copy:
                img, seg_map, phrase, bbox = random_copy(img, seg_map, phrase, bbox)

            ## random erase for occluded
            if augment_erase:
                img, seg_map = random_erase(img, seg_map)

            ## random padding and crop
            if augment_crop:
                img, seg_map = random_crop(img, seg_map, 40, h, w)

            ## random intensity, saturation change
            if augment_hsv:
                fraction = 0.50
                img_hsv = cv2.cvtColor(cv2.cvtColor(img, cv2.COLOR_RGB2BGR), cv2.COLOR_BGR2HSV)
                S = img_hsv[:, :, 1].astype(np.float32)
                V = img_hsv[:, :, 2].astype(np.float32)
                a = (random.random() * 2 - 1) * fraction + 1
                if a > 1:
                    np.clip(S, a_min=0, a_max=255, out=S)
                a = (random.random() * 2 - 1) * fraction + 1
                V *= a
                if a > 1:
                    np.clip(V, a_min=0, a_max=255, out=V)

                img_hsv[:, :, 1] = S.astype(np.uint8)
                img_hsv[:, :, 2] = V.astype(np.uint8)
                img = cv2.cvtColor(cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR), cv2.COLOR_BGR2RGB)

            img, seg_map, ratio, dw, dh = letterbox(img, seg_map, self.imsize)
            bbox[0], bbox[2] = bbox[0]*ratio+dw, bbox[2]*ratio+dw
            bbox[1], bbox[3] = bbox[1]*ratio+dh, bbox[3]*ratio+dh

            ## random affine transformation
            if augment_affine:
                img, seg_map, bbox, M = random_affine(img, seg_map, bbox, \
                    degrees=(-5, 5), translate=(0.10, 0.10), scale=(0.90, 1.10))  # 255 white fill

        else:  ## should be inference, or specified training
            img, _, ratio, dw, dh = letterbox(img, None, self.imsize)
            bbox[0], bbox[2] = bbox[0]*ratio+dw, bbox[2]*ratio+dw
            bbox[1], bbox[3] = bbox[1]*ratio+dh, bbox[3]*ratio+dh

        draw_img = copy.deepcopy(img)
        # Norm, to tensor
        if self.transform is not None:
            img = self.transform(img)


        ## encode phrase to clip input
        word_id = clip.tokenize(phrase, 17, truncate=True)
        word_mask = ~ (word_id == 0)

        orig_word_id = np.array(word_id, dtype=int)
        orig_word_mask = np.array(word_mask, dtype=int)

        # Get hardpos verb phrase
        if self.metric_learning and self.augment:
            raw_hardpos, hardpos_emb = self._get_hardpos_verb(seg_id, sent_idx)
            pos_type = 'nopos'
            if raw_hardpos:
                pos_type = 'hardpos'
                hardpos_id = clip.tokenize(raw_hardpos, self.word_len, truncate=True)
            else:
                # Empty phrase → Create a zero tensor matching shape of tokenized input
                hardpos_id = np.zeros((1, self.word_len), dtype=int)

            # Masking
            hardpos_mask = hardpos_id != 0  # Mask should be boolean

            hp_word_id = np.array(hardpos_id, dtype=int)
            hp_word_mask = np.array(hardpos_mask, dtype=int)

        if self.augment:  # train
            seg_map = cv2.resize(seg_map, (self.imsize // 2, self.imsize // 2), interpolation=cv2.INTER_NEAREST)  # (208, 208)
            seg_map = np.reshape(seg_map, [1, np.shape(seg_map)[0], np.shape(seg_map)[1]])
            if self.metric_learning:
                params = {
                    'hp_word_id': hp_word_id,
                    'hp_word_mask': hp_word_mask,
                    'hardpos_emb': hardpos_emb.unsqueeze(0),
                    'pos_type': pos_type
                }
                return img, orig_word_id, orig_word_mask, np.array(bbox, dtype=np.float32), \
                    np.array(seg_map, dtype=np.float32), params
            else:
                return img, orig_word_id, orig_word_mask, \
                    np.array(bbox, dtype=np.float32), np.array(seg_map, dtype=np.float32)
        else:
            seg_map = np.reshape(seg_map, [1, np.shape(seg_map)[0], np.shape(seg_map)[1]])
            return img, orig_word_id, orig_word_mask, \
                np.array(bbox, dtype=np.float32), np.array(seg_map, dtype=np.float32), np.array(ratio, dtype=np.float32), \
                np.array(dw, dtype=np.float32), np.array(dh, dtype=np.float32), self.images[idx][0], self.images[idx][3], np.array(draw_img, dtype=np.uint8)
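With metric_learning enabled in train mode, __getitem__ above returns an extra params dict per sample. A sketch of how a training loop might unpack the collated batch follows; the loader construction is assumed to mirror the earlier example (dataset='refcocog', splitby='umd', metric_learning=True, augment=True), and the variable names are illustrative only.

# Sketch of consuming the metric-learning batch from data_loader_gref_sbert.ReferDataset.
for img, word_id, word_mask, bbox, seg_map, params in train_loader:
    hp_word_id = params['hp_word_id']      # CLIP-tokenized hard-positive phrase, [B, 1, 17]
    hp_word_mask = params['hp_word_mask']  # non-zero-token mask for that phrase
    hardpos_emb = params['hardpos_emb']    # precomputed SBERT embedding, [B, 1, 384]
    pos_type = params['pos_type']          # list of 'hardpos' / 'nopos' flags, one per sample
    # Samples flagged 'nopos' carry zero tensors and can be masked out of any
    # contrastive / metric objective built on top of these fields.
    break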
ASDA/dataset/data_loader_rccp.py
ADDED
@@ -0,0 +1,279 @@
# -*- coding: utf-8 -*-

"""
refcoco, refcoco+ and refcocog referring image detection and segmentation PyTorch dataset.
"""
import sys
import cv2
import os
import torch
import json
import random
import numpy as np
import os.path as osp
import torch.utils.data as data
sys.path.append('.')
import utils
import re

# from pytorch_pretrained_bert.tokenization import BertTokenizer
from utils.transforms import letterbox, random_affine, random_copy, random_crop, random_erase
import copy

import clip

sys.modules['utils'] = utils
cv2.setNumThreads(0)

class ReferDataset(data.Dataset):
    SUPPORTED_DATASETS = {
        'refcoco': {
            'splits': ('train', 'val', 'testA', 'testB'),
            'params': {'dataset': 'refcoco', 'split_by': 'unc'}
        },
        'refcoco+': {
            'splits': ('train', 'val', 'testA', 'testB'),
            'params': {'dataset': 'refcoco+', 'split_by': 'unc'}
        },
        'refcocog': {
            'splits': ('train', 'val', 'test'),
            'params': {'dataset': 'refcocog', 'split_by': 'unc'}
        },
        'refcocog_g': {
            'splits': ('train', 'val'),
            'params': {'dataset': 'refcocog', 'split_by': 'google'}
        },
        'refcocog_u': {
            'splits': ('train', 'val', 'test'),
            'params': {'dataset': 'refcocog', 'split_by': 'unc'}
        },
        'grefcoco': {
            'splits': ('train', 'val', 'testA', 'testB'),
            'params': {'dataset': 'grefcoco', 'split_by': 'unc'}
        }
    }


    def __init__(self, data_root, split_root='data', dataset='refcoco', imsize=256, splitby='umd',
                 transform=None, augment=False, split='train', max_query_len=128, metric_learning=None):
        images_tmp = []
        self.data_root = data_root
        self.split_root = split_root
        self.dataset = dataset
        self.imsize = imsize
        self.query_len = max_query_len
        self.transform = transform
        self.word_len = 17
        self.emb_size = 384
        self.split = split
        self.augment = augment

        valid_splits = self.SUPPORTED_DATASETS[self.dataset]['splits']

        if split not in valid_splits:
            raise ValueError(
                'Dataset {0} does not have split {1}'.format(
                    self.dataset, split))

        self.anns_root = osp.join(self.data_root, 'anns', self.dataset, self.split+'.txt')
        if self.dataset == 'refcocog':
            mask_anno_str = '{0}_{1}'.format(self.dataset, splitby)
            self.mask_root = osp.join(self.data_root, 'masks', mask_anno_str)
        else:
            self.mask_root = osp.join(self.data_root, 'masks', self.dataset)

        self.im_dir = osp.join(self.data_root, 'images', 'train2014')

        # if self.dataset in ['refcoco', 'refcoco+']
        dataset_path = osp.join(self.split_root, self.dataset)
        splits = [split]
        for split in splits:
            imgset_file = '{0}_{1}.pth'.format(self.dataset, split)
            imgset_path = osp.join(dataset_path, imgset_file)
            images_tmp += torch.load(imgset_path)

        # hardpos related
        self.ROOT = '/data2/dataset/RefCOCO/VRIS'
        if self.dataset == 'refcoco':
            self.all_hp_root = '/data2/dataset/RefCOCO/refcoco/SBERT_rcc_unc'
        elif self.dataset == 'refcoco+':
            self.all_hp_root = '/data2/dataset/RefCOCO/refcoco+/SBERT_rccp_unc'

        self.metric_learning = metric_learning
        if self.metric_learning:
            self.exclude_position = True
            self.exclude_multiobj = True
            self.hp_selection = 'strict'
            self.multi_obj_ref_ids = None
            self.hardpos_meta = None

            # make new self.images file with sentence idx and total sent num (per ref_id)
            from collections import defaultdict
            ref_sentence_counts = defaultdict(int)
            for item in images_tmp:
                ref_sentence_counts[item[1]] += 1

            if self.split == 'train':
                images = []
                ref_sentence_indices = defaultdict(int)
                for item in images_tmp:
                    img_name, seg_id, box, sentence = item
                    sent_index = ref_sentence_indices[seg_id]
                    total_sentences = ref_sentence_counts[seg_id]
                    images.append((img_name, seg_id, box, sentence, sent_index, total_sentences))
                    ref_sentence_indices[seg_id] += 1
                self.images = images
            else:
                self.images = images_tmp
        else:
            self.images = images_tmp

    def exists_dataset(self):
        return osp.exists(osp.join(self.split_root, self.dataset))

    def _get_hardpos_verb_rcc(self, seg_id, sent_idx):
        emb_folder = os.path.join(self.all_hp_root, str(seg_id))
        emb_files = sorted([f for f in os.listdir(emb_folder) if f.startswith(f"hp_") and f.endswith(".npy")])
        if self.hp_selection == 'strict':
            # choose only corresponding (selected) sentence embedding
            emb_file = emb_files[sent_idx]
        else:
            # choose any sentence embedding
            emb_files = sorted([f for f in os.listdir(emb_folder) if f.startswith(f"hp_") and f.endswith(".npy")])
            emb_file = random.choice(emb_files)
        selected_emb = np.load(os.path.join(emb_folder, emb_file))
        verb_embed = torch.from_numpy(selected_emb)
        return verb_embed


    def pull_item(self, idx):
        # if metric learning and in train mode
        if self.metric_learning and self.augment:
            # sent_idx refers to index of sent among sent_num-1
            img_file, seg_id, bbox, phrase, sent_idx, sent_num = self.images[idx]
        else:
            img_file, seg_id, bbox, phrase = self.images[idx]
        bbox = np.array(bbox, dtype=int)  # x1y1x2y2

        img_path = osp.join(self.im_dir, img_file)
        img = cv2.imread(img_path)  # BGR [512, 640, 3]
        ## duplicate channel if gray image
        if img.shape[-1] > 1:
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # RGB
        else:
            img = np.stack([img] * 3)

        ## seg map
        seg_map = np.load(osp.join(self.mask_root, str(seg_id)+'.npy'))  # [512, 640]
        seg_map = np.array(seg_map).astype(np.float32)

        if self.metric_learning and self.split == 'train':
            return img, phrase, bbox, seg_map, seg_id, sent_idx
        else:
            return img, phrase, bbox, seg_map, seg_id

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        if self.metric_learning and self.augment:
            img, phrase, bbox, seg_map, seg_id, sent_idx = self.pull_item(idx)
        else:
            img, phrase, bbox, seg_map, seg_id = self.pull_item(idx)

        phrase = phrase.lower()
        if self.augment:
            augment_flip, augment_hsv, augment_affine, augment_crop, augment_copy, augment_erase = \
                True, True, True, False, False, False

        ## seems a bug in torch transformation resize, so separate in advance
        h, w = img.shape[0], img.shape[1]
        # print("img.shape", img.shape)
        if self.augment:
            ## random horizontal flip
            if augment_flip and random.random() > 0.5:
                img = cv2.flip(img, 1)
                seg_map = cv2.flip(seg_map, 1)
                bbox[0], bbox[2] = w-bbox[2]-1, w-bbox[0]-1
                phrase = phrase.replace('right','*&^special^&*').replace('left','right').replace('*&^special^&*','left')

            ## random copy and add left or right
            if augment_copy:
                img, seg_map, phrase, bbox = random_copy(img, seg_map, phrase, bbox)

            ## random erase for occluded
            if augment_erase:
                img, seg_map = random_erase(img, seg_map)

            ## random padding and crop
            if augment_crop:
                img, seg_map = random_crop(img, seg_map, 40, h, w)

            ## random intensity, saturation change
            if augment_hsv:
                fraction = 0.50
                img_hsv = cv2.cvtColor(cv2.cvtColor(img, cv2.COLOR_RGB2BGR), cv2.COLOR_BGR2HSV)
                S = img_hsv[:, :, 1].astype(np.float32)
                V = img_hsv[:, :, 2].astype(np.float32)
                a = (random.random() * 2 - 1) * fraction + 1
                if a > 1:
                    np.clip(S, a_min=0, a_max=255, out=S)
                a = (random.random() * 2 - 1) * fraction + 1
                V *= a
                if a > 1:
                    np.clip(V, a_min=0, a_max=255, out=V)

                img_hsv[:, :, 1] = S.astype(np.uint8)
                img_hsv[:, :, 2] = V.astype(np.uint8)
                img = cv2.cvtColor(cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR), cv2.COLOR_BGR2RGB)

            img, seg_map, ratio, dw, dh = letterbox(img, seg_map, self.imsize)
            bbox[0], bbox[2] = bbox[0]*ratio+dw, bbox[2]*ratio+dw
            bbox[1], bbox[3] = bbox[1]*ratio+dh, bbox[3]*ratio+dh

            ## random affine transformation
            if augment_affine:
                img, seg_map, bbox, M = random_affine(img, seg_map, bbox, \
                    degrees=(-5, 5), translate=(0.10, 0.10), scale=(0.90, 1.10))  # 255 white fill

        else:  ## should be inference, or specified training
            img, _, ratio, dw, dh = letterbox(img, None, self.imsize)
            bbox[0], bbox[2] = bbox[0]*ratio+dw, bbox[2]*ratio+dw
            bbox[1], bbox[3] = bbox[1]*ratio+dh, bbox[3]*ratio+dh

        draw_img = copy.deepcopy(img)
        # Norm, to tensor
        if self.transform is not None:
            img = self.transform(img)


        ## encode phrase to clip input
        word_id = clip.tokenize(phrase, 17, truncate=True)
        word_mask = ~ (word_id == 0)

        orig_word_id = np.array(word_id, dtype=int)
        orig_word_mask = np.array(word_mask, dtype=int)

        # Get hardpos verb phrase
        if self.metric_learning and self.augment:
            original_emb = self._get_hardpos_verb_rcc(seg_id, sent_idx)

        if self.augment:  # train
            seg_map = cv2.resize(seg_map, (self.imsize // 2, self.imsize // 2), interpolation=cv2.INTER_NEAREST)  # (208, 208)
            seg_map = np.reshape(seg_map, [1, np.shape(seg_map)[0], np.shape(seg_map)[1]])
            if self.metric_learning:
                params = {
                    'seg_id': seg_id,
                    'sent': phrase,
                    'hardpos_emb': original_emb.unsqueeze(0)
                }
                return img, orig_word_id, orig_word_mask, np.array(bbox, dtype=np.float32), \
                    np.array(seg_map, dtype=np.float32), params
            else:
                return img, orig_word_id, orig_word_mask, \
                    np.array(bbox, dtype=np.float32), np.array(seg_map, dtype=np.float32)
        else:
            seg_map = np.reshape(seg_map, [1, np.shape(seg_map)[0], np.shape(seg_map)[1]])
            return img, orig_word_id, orig_word_mask, \
                np.array(bbox, dtype=np.float32), np.array(seg_map, dtype=np.float32), np.array(ratio, dtype=np.float32), \
                np.array(dw, dtype=np.float32), np.array(dh, dtype=np.float32), self.images[idx][0], self.images[idx][3], np.array(draw_img, dtype=np.uint8)
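This loader reads precomputed hard-positive sentence embeddings (hp_*.npy, 384-dim, one folder per seg_id) from all_hp_root, but the script that produces them is not part of this upload. The following is a hypothetical sketch of how such files could be generated; the sentence-transformers model name is an assumption (any 384-dim encoder matches self.emb_size = 384), and the function and paths are illustrative only.

# Hypothetical sketch (not part of this commit) of producing the hp_*.npy files
# that _get_hardpos_verb_rcc reads.
import os
import numpy as np
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')  # assumed 384-dim SBERT encoder

def save_hardpos_embeddings(out_root, seg_id, phrases):
    """Write one hp_<i>.npy per hard-positive phrase under <out_root>/<seg_id>/."""
    folder = os.path.join(out_root, str(seg_id))
    os.makedirs(folder, exist_ok=True)
    for i, phrase in enumerate(phrases):
        emb = model.encode(phrase)  # np.ndarray of shape (384,)
        np.save(os.path.join(folder, f'hp_{i}.npy'), emb.astype(np.float32))

# e.g. save_hardpos_embeddings('/data2/dataset/RefCOCO/refcoco/SBERT_rcc_unc',
#                              12345, ['man running', 'person jogging'])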
ASDA/dataset/data_loader_test.py
ADDED
@@ -0,0 +1,315 @@
# -*- coding: utf-8 -*-

"""
refcoco, refcoco+ and refcocog referring image detection and segmentation PyTorch dataset.
"""
import sys
import cv2
import torch
import random
import numpy as np
import os.path as osp
import torch.utils.data as data
sys.path.append('.')
import utils
import re

from pytorch_pretrained_bert.tokenization import BertTokenizer
from utils.transforms import letterbox, random_affine, random_copy, random_crop, random_erase
import copy

import clip

sys.modules['utils'] = utils
cv2.setNumThreads(0)

def read_examples(input_line, unique_id):
    """Read a list of `InputExample`s from an input file."""
    examples = []
    # unique_id = 0
    line = input_line  # reader.readline()
    # if not line:
    #     break
    line = line.strip()
    text_a = None
    text_b = None
    m = re.match(r"^(.*) \|\|\| (.*)$", line)
    if m is None:
        text_a = line
    else:
        text_a = m.group(1)  # 'man in black'
        text_b = m.group(2)

    examples.append(
        InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))
    # unique_id += 1
    return examples

def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()

## Bert text encoding
class InputExample(object):
    def __init__(self, unique_id, text_a, text_b):
        self.unique_id = unique_id
        self.text_a = text_a
        self.text_b = text_b

class InputFeatures(object):
    """A single set of features of data."""
    def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids):
        self.unique_id = unique_id
        self.tokens = tokens
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.input_type_ids = input_type_ids

def convert_examples_to_features(examples, seq_length, tokenizer):
    """Loads a data file into a list of `InputBatch`s."""
    features = []
    for (ex_index, example) in enumerate(examples):
        tokens_a = tokenizer.tokenize(example.text_a)  # ['far', 'left', 'vase']

        tokens_b = None
        if example.text_b:
            tokens_b = tokenizer.tokenize(example.text_b)

        if tokens_b:
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > seq_length - 2:
                tokens_a = tokens_a[0:(seq_length - 2)]
        tokens = []
        input_type_ids = []
        tokens.append("[CLS]")
        input_type_ids.append(0)
        for token in tokens_a:
            tokens.append(token)
            input_type_ids.append(0)
        tokens.append("[SEP]")
        input_type_ids.append(0)

        if tokens_b:
            for token in tokens_b:
                tokens.append(token)
                input_type_ids.append(1)
            tokens.append("[SEP]")
            input_type_ids.append(1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        while len(input_ids) < seq_length:
            input_ids.append(0)
            input_mask.append(0)
            input_type_ids.append(0)

        assert len(input_ids) == seq_length
        assert len(input_mask) == seq_length
        assert len(input_type_ids) == seq_length
        features.append(
            InputFeatures(
                unique_id=example.unique_id,
                tokens=tokens,
                input_ids=input_ids,
                input_mask=input_mask,
                input_type_ids=input_type_ids))
    return features

class DatasetNotFoundError(Exception):
    pass

class ReferDataset(data.Dataset):
    SUPPORTED_DATASETS = {
        'refcoco': {
            'splits': ('train', 'val', 'testA', 'testB'),
            'params': {'dataset': 'refcoco', 'split_by': 'unc'}
        },
        'refcoco+': {
            'splits': ('train', 'val', 'testA', 'testB'),
            'params': {'dataset': 'refcoco+', 'split_by': 'unc'}
        },
        'refcocog': {
            'splits': ('train', 'val', 'test'),
            'params': {'dataset': 'refcocog', 'split_by': 'umd'}
        },
        'refcocog_g': {
            'splits': ('train', 'val'),
            'params': {'dataset': 'refcocog', 'split_by': 'google'}
        },
        'refcocog_u': {
            'splits': ('train', 'val', 'test'),
            'params': {'dataset': 'refcocog', 'split_by': 'umd'}
        },
        'grefcoco': {
            'splits': ('train', 'val', 'testA', 'testB'),
            'params': {'dataset': 'grefcoco', 'split_by': 'unc'}
        }
    }

    def __init__(self, data_root, split_root='data', dataset='refcoco', imsize=256, splitby='umd',
                 transform=None, augment=False, split='train', max_query_len=128,
                 bert_model='bert-base-uncased'):
        self.images = []
        self.data_root = data_root
        self.split_root = split_root
        self.dataset = dataset
        self.imsize = imsize
        self.query_len = max_query_len
        self.transform = transform
        self.split = split
        self.tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=True)  # should be true for English
        self.augment = augment

        valid_splits = self.SUPPORTED_DATASETS[self.dataset]['splits']

        if split not in valid_splits:
            raise ValueError(
                'Dataset {0} does not have split {1}'.format(
                    self.dataset, split))

        self.anns_root = osp.join(self.data_root, 'anns', self.dataset, self.split+'.txt')
        if self.dataset == 'refcocog_u':
            dataset = 'refcocog'
            mask_anno_str = '{0}_{1}'.format(dataset, splitby)
            self.mask_root = osp.join(self.data_root, 'masks', mask_anno_str)
        else:
            self.mask_root = osp.join(self.data_root, 'masks', self.dataset)

        self.im_dir = osp.join(self.data_root, 'images', 'train2014')

        if self.dataset == 'refcocog_u':
            dataset = 'refcocog'
            dataset_path = osp.join(self.split_root, dataset + '_' + splitby)
            splits = [split]
            for split in splits:
                imgset_file = '{0}_{1}_{2}.pth'.format(dataset, splitby, split)
                imgset_path = osp.join(dataset_path, imgset_file)
                self.images += torch.load(imgset_path)
        else:
            dataset_path = osp.join(self.split_root, self.dataset)
            splits = [split]
            for split in splits:
                imgset_file = '{0}_{1}.pth'.format(self.dataset, split)
                imgset_path = osp.join(dataset_path, imgset_file)
                self.images += torch.load(imgset_path)

    # def exists_dataset(self):
    #     return osp.exists(osp.join(self.split_root, self.dataset))

    def pull_item(self, idx):
        img_file, seg_id, bbox, phrase = self.images[idx]
        bbox = np.array(bbox, dtype=int)  # x1y1x2y2

        img_path = osp.join(self.im_dir, img_file)
        img = cv2.imread(img_path)  # BGR [512, 640, 3]
        ## duplicate channel if gray image
        if img.shape[-1] > 1:
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # RGB
        else:
            img = np.stack([img] * 3)

        ## seg map
        seg_map = np.load(osp.join(self.mask_root, str(seg_id)+'.npy'))  # [512, 640]
        seg_map = np.array(seg_map).astype(np.float32)
        return img, phrase, bbox, seg_map

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        img, phrase, bbox, seg_map = self.pull_item(idx)
        phrase = phrase.lower()
        if self.augment:
            augment_flip, augment_hsv, augment_affine, augment_crop, augment_copy, augment_erase = \
                True, True, True, False, False, False

        ## seems a bug in torch transformation resize, so separate in advance
        h, w = img.shape[0], img.shape[1]
        # print("img.shape", img.shape)
        if self.augment:
            ## random horizontal flip
            if augment_flip and random.random() > 0.5:
                img = cv2.flip(img, 1)
                seg_map = cv2.flip(seg_map, 1)
                bbox[0], bbox[2] = w-bbox[2]-1, w-bbox[0]-1
                phrase = phrase.replace('right','*&^special^&*').replace('left','right').replace('*&^special^&*','left')

            ## random copy and add left or right
            if augment_copy:
                img, seg_map, phrase, bbox = random_copy(img, seg_map, phrase, bbox)

            ## random erase for occluded
|
| 258 |
+
if augment_erase:
|
| 259 |
+
img, seg_map = random_erase(img, seg_map)
|
| 260 |
+
|
| 261 |
+
## random padding and crop
|
| 262 |
+
if augment_crop:
|
| 263 |
+
img, seg_map = random_crop(img, seg_map, 40, h, w)
|
| 264 |
+
|
| 265 |
+
## random intensity, saturation change
|
| 266 |
+
if augment_hsv:
|
| 267 |
+
fraction = 0.50
|
| 268 |
+
img_hsv = cv2.cvtColor(cv2.cvtColor(img, cv2.COLOR_RGB2BGR), cv2.COLOR_BGR2HSV)
|
| 269 |
+
S = img_hsv[:, :, 1].astype(np.float32)
|
| 270 |
+
V = img_hsv[:, :, 2].astype(np.float32)
|
| 271 |
+
a = (random.random() * 2 - 1) * fraction + 1
|
| 272 |
+
if a > 1:
|
| 273 |
+
np.clip(S, a_min=0, a_max=255, out=S)
|
| 274 |
+
a = (random.random() * 2 - 1) * fraction + 1
|
| 275 |
+
V *= a
|
| 276 |
+
if a > 1:
|
| 277 |
+
np.clip(V, a_min=0, a_max=255, out=V)
|
| 278 |
+
|
| 279 |
+
img_hsv[:, :, 1] = S.astype(np.uint8)
|
| 280 |
+
img_hsv[:, :, 2] = V.astype(np.uint8)
|
| 281 |
+
img = cv2.cvtColor(cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR), cv2.COLOR_BGR2RGB)
|
| 282 |
+
|
| 283 |
+
img, seg_map, ratio, dw, dh = letterbox(img, seg_map, self.imsize)
|
| 284 |
+
bbox[0], bbox[2] = bbox[0]*ratio+dw, bbox[2]*ratio+dw
|
| 285 |
+
bbox[1], bbox[3] = bbox[1]*ratio+dh, bbox[3]*ratio+dh
|
| 286 |
+
|
| 287 |
+
## random affine transformation
|
| 288 |
+
if augment_affine:
|
| 289 |
+
img, seg_map, bbox, M = random_affine(img, seg_map, bbox, \
|
| 290 |
+
degrees=(-5, 5), translate=(0.10, 0.10), scale=(0.90, 1.10)) # 255 white fill
|
| 291 |
+
|
| 292 |
+
else: ## should be inference, or specified training
|
| 293 |
+
img, _, ratio, dw, dh = letterbox(img, None, self.imsize)
|
| 294 |
+
bbox[0], bbox[2] = bbox[0]*ratio+dw, bbox[2]*ratio+dw
|
| 295 |
+
bbox[1], bbox[3] = bbox[1]*ratio+dh, bbox[3]*ratio+dh
|
| 296 |
+
|
| 297 |
+
draw_img = copy.deepcopy(img)
|
| 298 |
+
# Norm, to tensor
|
| 299 |
+
if self.transform is not None:
|
| 300 |
+
img = self.transform(img)
|
| 301 |
+
|
| 302 |
+
## encode phrase to clip input
|
| 303 |
+
word_id = clip.tokenize(phrase, 17, truncate=True)
|
| 304 |
+
word_mask = ~ (word_id == 0)
|
| 305 |
+
|
| 306 |
+
if self.augment: # train
|
| 307 |
+
seg_map = cv2.resize(seg_map, (self.imsize // 2, self.imsize // 2),interpolation=cv2.INTER_NEAREST) # (208, 208)
|
| 308 |
+
seg_map = np.reshape(seg_map, [1, np.shape(seg_map)[0], np.shape(seg_map)[1]])
|
| 309 |
+
return img, np.array(word_id, dtype=int), np.array(word_mask, dtype=int), \
|
| 310 |
+
np.array(bbox, dtype=np.float32), np.array(seg_map, dtype=np.float32)
|
| 311 |
+
else:
|
| 312 |
+
seg_map = np.reshape(seg_map, [1, np.shape(seg_map)[0], np.shape(seg_map)[1]])
|
| 313 |
+
return img, np.array(word_id, dtype=int), np.array(word_mask, dtype=int), \
|
| 314 |
+
np.array(bbox, dtype=np.float32), np.array(seg_map, dtype=np.float32), np.array(ratio, dtype=np.float32), \
|
| 315 |
+
np.array(dw, dtype=np.float32), np.array(dh, dtype=np.float32), self.images[idx][0], self.images[idx][3], np.array(draw_img, dtype=np.uint8)
|
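For orientation, here is a minimal, hypothetical sketch of how ReferDataset might be driven from a training script. It is not part of the uploaded files; the imsize value, the normalization statistics, and the relative paths are assumptions based on data.sh, datascript.py, and the loader defaults above.

# Hypothetical usage sketch (not included in this upload): wrap ReferDataset in a DataLoader.
import torch.utils.data as data
from torchvision import transforms
from data_loader import ReferDataset

# assumed normalization; the training code may use different statistics
input_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

train_set = ReferDataset(data_root='../ln_data', split_root='../data', dataset='refcoco',
                         imsize=416, transform=input_transform, augment=True, split='train')
train_loader = data.DataLoader(train_set, batch_size=8, shuffle=True, num_workers=4)

# each training batch: image tensor, CLIP token ids, token mask, box (x1y1x2y2), half-resolution mask
img, word_id, word_mask, bbox, seg_map = next(iter(train_loader))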
ASDA/dataset/data_process.py
ADDED
@@ -0,0 +1,225 @@
# encoding=utf8
# %matplotlib inline
import numpy as np
import os
from refer import REFER
import os.path as osp
import cv2
import argparse
parser = argparse.ArgumentParser(description='Data preparation')
parser.add_argument('--data_root', type=str)  # contains refclef, refcoco, refcoco+, refcocog and images
parser.add_argument('--output_dir', type=str)
parser.add_argument('--dataset', type=str, choices=['refcoco', 'refcoco+', 'refcocog'], default='refcoco')
parser.add_argument('--split', type=str, default='umd')
parser.add_argument('--generate_mask', action='store_true')
args = parser.parse_args()
# data_root # contains refclef, refcoco, refcoco+, refcocog and images
refer = REFER(args.data_root, args.dataset, args.split)

print('dataset [%s_%s] contains: ' % (args.dataset, args.split))
ref_ids = refer.getRefIds()
image_ids = refer.getImgIds()
print('%s expressions for %s refs in %s images.' % (len(refer.Sents), len(ref_ids), len(image_ids)))

print('\nAmong them:')
if args.dataset == 'refclef':
    if args.split == 'unc':
        splits = ['train', 'val', 'testA', 'testB', 'testC']
    else:
        splits = ['train', 'val', 'test']
elif args.dataset == 'refcoco':
    splits = ['train', 'val', 'testA', 'testB']
elif args.dataset == 'refcoco+':
    splits = ['train', 'val', 'testA', 'testB']
elif args.dataset == 'grefcoco':
    splits = ['train', 'val', 'testA', 'testB']
elif args.dataset == 'refcocog':
    splits = ['train', 'val', 'test']  # we don't have the test split for refcocog right now.


# split data as a type in splits list
for split in splits:
    ref_ids = refer.getRefIds(split=split)
    print('%s refs are in split [%s].' % (len(ref_ids), split))


# show a batch of data with bounding box, cat, sentences
def show_a_batch(batch_size):
    split = 'train'
    # batch_size=32
    ref_ids = refer.getRefIds(split=split)
    print(split + '_size:', len(ref_ids))
    batch_index = list(np.random.choice(len(ref_ids), batch_size))

    # print(refer.Refs)
    ref_id = [ref_ids[i] for i in batch_index]
    refs = [refer.Refs[i] for i in ref_id]
    bboxs = [refer.getRefBox(i) for i in ref_id]
    sentences = [ref['sentences'] for ref in refs]
    image_urls = [refer.loadImgs(image_ids=ref['image_id']) for ref in refs]
    cats = [refer.loadCats(cat_ids=ref['category_id']) for ref in refs]
    # plt.figure()
    # plt.subplot(batch_size)
    grid_width = 2
    grid_height = int(batch_size / grid_width)
    # fig, axs = plt.subplots(grid_height, grid_width, figsize=(grid_width*10, 10*grid_height))
    for i in range(batch_size):
        print('bbox for batch[{}]:'.format(i), bboxs[i])
        print('sentences for batch[{}]:'.format(i))
        for sid, sent in enumerate(sentences[i]):
            print('%s. %s' % (sid + 1, sent['sent']))
        print('cats for batch[{}]:'.format(i), cats[i])

        image_url = image_urls[i][0]
        image = cv2.imread(osp.join(refer.IMAGE_DIR, image_url['file_name']))
        print(image.shape)
        # print(bboxs[i][0])
        cv2.rectangle(image, (int(bboxs[i][0]), int(bboxs[i][1])), (int(bboxs[i][0] + bboxs[i][2]), int(bboxs[i][1] + bboxs[i][3])), 255, 3)
        cv2.putText(image,
                    str(sent['sent']),
                    (20, 20),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    .9, (0, 255, 0), 2)
        os.mkdir('debug_vis')
        cv2.imwrite('./debug_vis/' + image_url['file_name'], image)
        cv2.imwrite('./debug_vis/mask' + image_url['file_name'], refer.getMask(refs[i])['mask'] * 255)
        # ax.imshow(image)
        # plt.show()

def cat_process(cat):
    if cat >= 1 and cat <= 11:
        cat = cat - 1
    elif cat >= 13 and cat <= 25:
        cat = cat - 2
    elif cat >= 27 and cat <= 28:
        cat = cat - 3
    elif cat >= 31 and cat <= 44:
        cat = cat - 5
    elif cat >= 46 and cat <= 65:
        cat = cat - 6
    elif cat == 67:
        cat = cat - 7
    elif cat == 70:
        cat = cat - 9
    elif cat >= 72 and cat <= 82:
        cat = cat - 10
    elif cat >= 84 and cat <= 90:
        cat = cat - 11
    return cat

def bbox_process(bbox, cat, segement_id):
    x_min = int(bbox[0])
    y_min = int(bbox[1])
    x_max = x_min + int(bbox[2])
    y_max = y_min + int(bbox[3])
    box_info = " %d,%d,%d,%d,%d,%d" % (int(x_min), int(y_min), int(x_max), int(y_max), int(cat), int(segement_id))
    return box_info

def prepare_dataset(dataset, splits, output_dir, generate_mask=False):
    # split_type='train'
    # splits=[split_type]
    # batch_size=32
    if dataset == 'refcocog':
        dataset = 'refcocog_' + args.split
    if not os.path.exists(os.path.join(output_dir, 'anns', dataset)):
        os.makedirs(os.path.join(output_dir, 'anns', dataset))
    if not os.path.exists(os.path.join(output_dir, 'masks', dataset)):
        os.makedirs(os.path.join(output_dir, 'masks', dataset))
    for split in splits:
        f = open(os.path.join(output_dir, 'anns', dataset, split + '.txt'), 'w', encoding='utf-8')
        # print(split)
        split_num = 0
        ll = 0
        ref_ids = refer.getRefIds(split=split)
        print(split + '_size:', len(ref_ids))
        for i in ref_ids:
            # ref_id = ref_ids[i]
            refs = refer.Refs[i]
            bboxs = refer.getRefBox(i)
            print("bboxs", bboxs)
            sentences = refs['sentences']
            image_urls = refer.loadImgs(image_ids=refs['image_id'])[0]

            # in grefcoco, category_id is a list
            cat = refs['category_id']
            if type(cat) == list:
                for j in range(len(cat)):
                    cat[j] = cat_process(cat[j])
            else:
                cat = cat_process(cat)

            image_urls = image_urls['file_name']
            if dataset == 'refclef' and image_urls in ['19579.jpg', '17975.jpg', '19575.jpg']:
                continue
            # the box and cat info are not used for RES
            if type(bboxs[0]) == list:
                box_info = bbox_process(bboxs[0], cat[0], i)  # add segment id
            else:
                box_info = bbox_process(bboxs, cat, i)  # add segment id
            f.write(image_urls)
            f.write(box_info)
            # f.write(' '+str(i))
            if generate_mask:
                if dataset == 'grefcoco':
                    np.save(os.path.join(output_dir, 'masks', dataset, str(i) + '.npy'), refer.getMaskByRef(refs, merge=True)['mask'])
                else:
                    np.save(os.path.join(output_dir, 'masks', dataset, str(i) + '.npy'), refer.getMask(refs)['mask'])  # if seg masks are needed, set it!
            for sentence in sentences:
                f.write(' ~ ')
                # print(sentence['sent'].encode('UTF-8'))
                f.write(sentence['sent'])
                if ll < len(sentence['sent']):
                    ll = len(sentence['sent'])
            f.write('\n')
            split_num += 1
        print('split_num:', split_num)
        print('max_len:', ll)
        f.close()

def prepare_sentences_refcoco():
    splits = ['train', 'val']
    # batch_size=32
    f = open('sentences.txt', 'w')
    for split in splits:
        print(split)
        ref_ids = refer.getRefIds(split=split)
        print(split + '_size:', len(ref_ids))
        for i in range(len(ref_ids)):
            refs = refer.Refs[i]
            sentences = refs['sentences']
            for sentence in sentences:
                f.write(sentence['sent'])
                f.write('\n')
    f.close()

def test_length():
    max_len = 0
    word_l_count = np.zeros([50], dtype=np.int)
    with open('./refcocog/train.txt') as f:
        lines = f.readlines()
        for j in range(len(lines)):
            line = lines[j].split()
            stop = len(line)
            for i in range(1, len(line)):
                if (line[i] == '~'):
                    stop = i
                    break
            sentences = []
            sent_stop = stop + 1
            for i in range(stop + 1, len(line)):
                if line[i] == '~':
                    # sentences.append(line[sent_stop:i])
                    # print(len(line[sent_stop:i]))
                    word_l_count[len(line[sent_stop:i])] += 1
                    # if len(line[sent_stop:i])>max_len:
                    #     max_len=len(line[sent_stop:i])
                    sent_stop = i + 1
        for i in range(50):
            if word_l_count[i] > 0:
                print('length:%d' % i, ',count:%d' % word_l_count[i])
        # print('max_len:', max_len)
        # print(len(lines))


prepare_dataset(args.dataset, splits, args.output_dir, args.generate_mask)
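For reference, prepare_dataset writes one line per ref to anns/<dataset>/<split>.txt: the image file name, then "x_min,y_min,x_max,y_max,category,seg_id", then one " ~ "-separated field per referring expression. The concrete values in the line below are invented purely for illustration of that format:

COCO_train2014_000000000123.jpg 103,195,255,321,0,42 ~ woman on the left ~ lady in red jacket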
ASDA/dataset/datascript.py
ADDED
@@ -0,0 +1,56 @@
# generate **.pth
import os
import sys
import torch
sys.path.append('.')

import argparse
parser = argparse.ArgumentParser(description='Data preparation')
parser.add_argument('--dataset', type=str, choices=['refcoco', 'refcoco+', 'refcocog_google', 'refcocog_umd'], default='refcoco')
args = parser.parse_args()

def main(args):
    dataset = args.dataset
    input_txt_list = os.listdir(f'../ln_data/anns/{dataset}')
    if not os.path.exists(f'../data/{dataset}'):
        os.makedirs(f'../data/{dataset}')
    for input_txt in input_txt_list:
        split = input_txt.split('_')[-1].split('.')[0]
        input_txt = os.path.join('../ln_data/anns', dataset, input_txt)
        res = []
        with open(input_txt, encoding='utf-8') as f:
            lines = f.readlines()
            for line in lines:
                line = line.split()
                stop = len(line)
                img_name = line[0]
                for i in range(1, len(line)):
                    if (line[i] == '~'):
                        stop = i
                        break
                box_ = [list(map(int, box.split(','))) for box in line[1:stop]]
                box = box_[0][:4]
                seg_id = box_[0][-1]

                sent_stop = stop + 1
                for i in range(stop + 1, len(line)):
                    if line[i] == '~':
                        des = ''
                        for word in line[sent_stop:i]:
                            des = des + word + ' '
                        sent_stop = i + 1
                        des = des.rstrip(' ')
                        res.append((img_name, seg_id, box, des))
                des = ''
                for word in line[sent_stop:len(line)]:
                    des = des + word + ' '
                des = des.rstrip(' ')
                res.append((img_name, seg_id, box, des))
            # print(res)

        imgset_path = '{0}_{1}.pth'.format(dataset, split)
        images = torch.save(res, os.path.join("../data", dataset, imgset_path))
    print(dataset, " done")

if __name__ == "__main__":
    main(args)
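A quick, hypothetical sanity check (not part of this upload) of the .pth files this script produces; each record is the (img_name, seg_id, box, expression) tuple appended above, and the path below assumes the refcoco train split:

# Hypothetical check: load one split produced by datascript.py and inspect a record.
import torch

records = torch.load('../data/refcoco/refcoco_train.pth')
img_name, seg_id, box, expression = records[0]
print(img_name)    # COCO image file name under images/train2014
print(seg_id)      # ref id, matching masks/<dataset>/<seg_id>.npy from data_process.py
print(box)         # [x_min, y_min, x_max, y_max]
print(expression)  # one referring expression string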
ASDA/dataset/refer.py
ADDED
@@ -0,0 +1,485 @@
__author__ = 'licheng'

"""
This interface provides access to four datasets:
1) refclef
2) refcoco
3) refcoco+
4) refcocog
split by unc and google
The following API functions are defined:
REFER      - REFER api class
getRefIds  - get ref ids that satisfy given filter conditions.
getAnnIds  - get ann ids that satisfy given filter conditions.
getImgIds  - get image ids that satisfy given filter conditions.
getCatIds  - get category ids that satisfy given filter conditions.
loadRefs   - load refs with the specified ref ids.
loadAnns   - load anns with the specified ann ids.
loadImgs   - load images with the specified image ids.
loadCats   - load category names with the specified category ids.
getRefBox  - get ref's bounding box [x, y, w, h] given the ref_id
showRef    - show image, segmentation or box of the referred object with the ref
getMask    - get mask and area of the referred object given ref
showMask   - show mask of the referred object given ref
"""

import sys
import os.path as osp
import os
import json
# import _pickle as pickle
import pickle
import time
import itertools
import skimage.io as io
import matplotlib.pyplot as plt
from matplotlib.collections import PatchCollection
from matplotlib.patches import Polygon, Rectangle
from pprint import pprint
import numpy as np
from pycocotools import mask
import cv2
# from skimage.measure import label, regionprops

class REFER:
    def __init__(self, data_root, dataset='refcoco', splitBy='unc'):
        # provide data_root folder which contains refclef, refcoco, refcoco+ and refcocog
        # also provide dataset name and splitBy information
        # e.g., dataset = 'refcoco', splitBy = 'unc'
        print('loading dataset %s into memory...' % dataset)
        self.ROOT_DIR = osp.abspath(osp.dirname(__file__))
        self.DATA_DIR = osp.join(data_root, dataset)
        if dataset in ['refcoco', 'refcoco+', 'refcocog']:
            self.IMAGE_DIR = osp.join(data_root, 'images/train2014')
        elif dataset == 'refclef':
            self.IMAGE_DIR = osp.join(data_root, 'images/saiapr_tc-12')
        else:
            print('No refer dataset is called [%s]' % dataset)
            sys.exit()

        # load refs from data/dataset/refs(dataset).json
        tic = time.time()
        ref_file = osp.join(self.DATA_DIR, 'refs(' + splitBy + ').p')
        self.data = {}
        self.data['dataset'] = dataset

        self.data['refs'] = pickle.load(open(ref_file, 'rb'), fix_imports=True)

        # load annotations from data/dataset/instances.json
        instances_file = osp.join(self.DATA_DIR, 'instances.json')
        instances = json.load(open(instances_file, 'r'))
        self.data['images'] = instances['images']
        self.data['annotations'] = instances['annotations']
        self.data['categories'] = instances['categories']

        # create index
        self.createIndex()
        print('DONE (t=%.2fs)' % (time.time() - tic))

    def createIndex(self):
        # create sets of mapping
        # 1)  Refs:         {ref_id: ref}
        # 2)  Anns:         {ann_id: ann}
        # 3)  Imgs:         {image_id: image}
        # 4)  Cats:         {category_id: category_name}
        # 5)  Sents:        {sent_id: sent}
        # 6)  imgToRefs:    {image_id: refs}
        # 7)  imgToAnns:    {image_id: anns}
        # 8)  refToAnn:     {ref_id: ann}
        # 9)  annToRef:     {ann_id: ref}
        # 10) catToRefs:    {category_id: refs}
        # 11) sentToRef:    {sent_id: ref}
        # 12) sentToTokens: {sent_id: tokens}
        print('creating index...')
        # fetch info from instances
        Anns, Imgs, Cats, imgToAnns = {}, {}, {}, {}
        for ann in self.data['annotations']:
            Anns[ann['id']] = ann
            imgToAnns[ann['image_id']] = imgToAnns.get(ann['image_id'], []) + [ann]
        for img in self.data['images']:
            Imgs[img['id']] = img
        for cat in self.data['categories']:
            Cats[cat['id']] = cat['name']

        # fetch info from refs
        Refs, imgToRefs, refToAnn, annToRef, catToRefs = {}, {}, {}, {}, {}
        Sents, sentToRef, sentToTokens = {}, {}, {}
        for ref in self.data['refs']:
            # ids
            ref_id = ref['ref_id']
            ann_id = ref['ann_id']
            category_id = ref['category_id']
            image_id = ref['image_id']

            # add mapping related to ref
            Refs[ref_id] = ref
            imgToRefs[image_id] = imgToRefs.get(image_id, []) + [ref]
            catToRefs[category_id] = catToRefs.get(category_id, []) + [ref]
            refToAnn[ref_id] = Anns[ann_id]
            annToRef[ann_id] = ref

            # add mapping of sent
            for sent in ref['sentences']:
                Sents[sent['sent_id']] = sent
                sentToRef[sent['sent_id']] = ref
                sentToTokens[sent['sent_id']] = sent['tokens']

        # create class members
        self.Refs = Refs
        self.Anns = Anns
        self.Imgs = Imgs
        self.Cats = Cats
        self.Sents = Sents
        self.imgToRefs = imgToRefs
        self.imgToAnns = imgToAnns
        self.refToAnn = refToAnn
        self.annToRef = annToRef
        self.catToRefs = catToRefs
        self.sentToRef = sentToRef
        self.sentToTokens = sentToTokens
        print('index created.')

    def getRefIds(self, image_ids=[], cat_ids=[], ref_ids=[], split=''):
        image_ids = image_ids if type(image_ids) == list else [image_ids]
        cat_ids = cat_ids if type(cat_ids) == list else [cat_ids]
        ref_ids = ref_ids if type(ref_ids) == list else [ref_ids]

        if len(image_ids) == len(cat_ids) == len(ref_ids) == len(split) == 0:
            refs = self.data['refs']
        else:
            if not len(image_ids) == 0:
                refs = [self.imgToRefs[image_id] for image_id in image_ids]
            else:
                refs = self.data['refs']
            if not len(cat_ids) == 0:
                refs = [ref for ref in refs if ref['category_id'] in cat_ids]
            if not len(ref_ids) == 0:
                refs = [ref for ref in refs if ref['ref_id'] in ref_ids]
            if not len(split) == 0:
                if split in ['testA', 'testB', 'testC']:
                    refs = [ref for ref in refs if split[-1] in ref['split']]  # we also consider testAB, testBC, ...
                elif split in ['testAB', 'testBC', 'testAC']:
                    refs = [ref for ref in refs if ref['split'] == split]  # rarely used I guess...
                elif split == 'test':
                    refs = [ref for ref in refs if 'test' in ref['split']]
                elif split == 'train' or split == 'val':
                    refs = [ref for ref in refs if ref['split'] == split]
                else:
                    print('No such split [%s]' % split)
                    sys.exit()
        ref_ids = [ref['ref_id'] for ref in refs]
        return ref_ids

    def getAnnIds(self, image_ids=[], cat_ids=[], ref_ids=[]):
        image_ids = image_ids if type(image_ids) == list else [image_ids]
        cat_ids = cat_ids if type(cat_ids) == list else [cat_ids]
        ref_ids = ref_ids if type(ref_ids) == list else [ref_ids]

        if len(image_ids) == len(cat_ids) == len(ref_ids) == 0:
            ann_ids = [ann['id'] for ann in self.data['annotations']]
        else:
            if not len(image_ids) == 0:
                lists = [self.imgToAnns[image_id] for image_id in image_ids if image_id in self.imgToAnns]  # list of [anns]
                anns = list(itertools.chain.from_iterable(lists))
            else:
                anns = self.data['annotations']
            if not len(cat_ids) == 0:
                anns = [ann for ann in anns if ann['category_id'] in cat_ids]
            ann_ids = [ann['id'] for ann in anns]
            if not len(ref_ids) == 0:
                ids = set(ann_ids).intersection(set([self.Refs[ref_id]['ann_id'] for ref_id in ref_ids]))
        return ann_ids

    def getImgIds(self, ref_ids=[]):
        ref_ids = ref_ids if type(ref_ids) == list else [ref_ids]

        if not len(ref_ids) == 0:
            image_ids = list(set([self.Refs[ref_id]['image_id'] for ref_id in ref_ids]))
        else:
            image_ids = self.Imgs.keys()
        return image_ids

    def getCatIds(self):
        return self.Cats.keys()

    def loadRefs(self, ref_ids=[]):
        if type(ref_ids) == list:
            return [self.Refs[ref_id] for ref_id in ref_ids]
        elif type(ref_ids) == int:
            return [self.Refs[ref_ids]]

    def loadAnns(self, ann_ids=[]):
        if type(ann_ids) == list:
            return [self.Anns[ann_id] for ann_id in ann_ids]
        elif type(ann_ids) == int or type(ann_ids) == unicode:
            return [self.Anns[ann_ids]]

    def loadImgs(self, image_ids=[]):
        if type(image_ids) == list:
            return [self.Imgs[image_id] for image_id in image_ids]
        elif type(image_ids) == int:
            return [self.Imgs[image_ids]]

    def loadCats(self, cat_ids=[]):
        if type(cat_ids) == list:
            return [self.Cats[cat_id] for cat_id in cat_ids]
        elif type(cat_ids) == int:
            return [self.Cats[cat_ids]]

    def getRefBox(self, ref_id):
        ref = self.Refs[ref_id]
        ann = self.refToAnn[ref_id]
        return ann['bbox']  # [x, y, w, h]

    def showRef(self, ref, seg_box='seg'):
        ax = plt.gca()
        # show image
        image = self.Imgs[ref['image_id']]
        I = io.imread(osp.join(self.IMAGE_DIR, image['file_name']))
        ax.imshow(I)
        # show refer expression
        for sid, sent in enumerate(ref['sentences']):
            print('%s. %s' % (sid + 1, sent['sent']))
        # show segmentations
        if seg_box == 'seg':
            ann_id = ref['ann_id']
            ann = self.Anns[ann_id]
            polygons = []
            color = []
            c = 'none'
            if type(ann['segmentation'][0]) == list:
                # polygon used for refcoco*
                for seg in ann['segmentation']:
                    poly = np.array(seg).reshape((len(seg)//2, 2))
                    polygons.append(Polygon(poly, True, alpha=0.4))
                    color.append(c)
                p = PatchCollection(polygons, facecolors=color, edgecolors=(1,1,0,0), linewidths=3, alpha=1)
                ax.add_collection(p)  # thick yellow polygon
                p = PatchCollection(polygons, facecolors=color, edgecolors=(1,0,0,0), linewidths=1, alpha=1)
                ax.add_collection(p)  # thin red polygon
            else:
                # mask used for refclef
                rle = ann['segmentation']
                m = mask.decode(rle)
                img = np.ones( (m.shape[0], m.shape[1], 3) )
                color_mask = np.array([2.0,166.0,101.0])/255
                for i in range(3):
                    img[:,:,i] = color_mask[i]
                ax.imshow(np.dstack( (img, m*0.5) ))
        # show bounding-box
        elif seg_box == 'box':
            ann_id = ref['ann_id']
            print(ann_id)
            ann = self.Anns[ann_id]
            bbox = self.getRefBox(ref['ref_id'])
            box_plot = Rectangle((bbox[0], bbox[1]), bbox[2], bbox[3], fill=False, edgecolor='green', linewidth=3)
            ax.add_patch(box_plot)

    def getMask(self, ref):
        # return mask, area and mask-center
        ann = self.refToAnn[ref['ref_id']]
        print(ann)
        image = self.Imgs[ref['image_id']]
        if type(ann['segmentation'][0]) == list:  # polygon
            rle = mask.frPyObjects(ann['segmentation'], image['height'], image['width'])
        else:
            rle = ann['segmentation']

        # for i in range(len(rle['counts'])):
        #     print(rle)
        m = mask.decode(rle)
        m = np.sum(m, axis=2)  # sometimes there are multiple binary maps (corresponding to multiple segs)
        m = m.astype(np.uint8)  # convert to np.uint8
        # compute area
        area = sum(mask.area(rle))  # should be close to ann['area']
        return {'mask': m, 'area': area}
        # # position
        # position_x = np.mean(np.where(m==1)[1]) # [1] means columns (matlab style) -> x (c style)
        # position_y = np.mean(np.where(m==1)[0]) # [0] means rows (matlab style) -> y (c style)
        # # mass position (if there were multiple regions, we use the largest one.)
        # label_m = label(m, connectivity=m.ndim)
        # regions = regionprops(label_m)
        # if len(regions) > 0:
        #     largest_id = np.argmax(np.array([props.filled_area for props in regions]))
        #     largest_props = regions[largest_id]
        #     mass_y, mass_x = largest_props.centroid
        # else:
        #     mass_x, mass_y = position_x, position_y
        # # if centroid is not in mask, we find the closest point to it from mask
        # if m[mass_y, mass_x] != 1:
        #     print 'Finding closes mask point ...'
        #     kernel = np.ones((10, 10),np.uint8)
        #     me = cv2.erode(m, kernel, iterations = 1)
        #     points = zip(np.where(me == 1)[0].tolist(), np.where(me == 1)[1].tolist()) # row, col style
        #     points = np.array(points)
        #     dist = np.sum((points - (mass_y, mass_x))**2, axis=1)
        #     id = np.argsort(dist)[0]
        #     mass_y, mass_x = points[id]
        # # return
        # return {'mask': m, 'area': area, 'position_x': position_x, 'position_y': position_y, 'mass_x': mass_x, 'mass_y': mass_y}
        # # show image and mask
        # I = io.imread(osp.join(self.IMAGE_DIR, image['file_name']))
        # plt.figure()
        # plt.imshow(I)
        # ax = plt.gca()
        # img = np.ones( (m.shape[0], m.shape[1], 3) )
        # color_mask = np.array([2.0,166.0,101.0])/255
        # for i in range(3):
        #     img[:,:,i] = color_mask[i]
        # ax.imshow(np.dstack( (img, m*0.5) ))
        # plt.show()

    def showMask(self, ref):
        M = self.getMask(ref)
        msk = M['mask']
        ax = plt.gca()
        ax.imshow(msk)


if __name__ == '__main__':
    refer = REFER(data_root="/home/ypf/workspace/code/BKINet/ln_data", dataset='refcoco', splitBy='unc')
    save_path = "./visualization/"
    ref_ids = refer.getRefIds()
    print(len(ref_ids))

    print(len(refer.Imgs))
    print(len(refer.imgToRefs))
    print(refer.Cats)

    ref_ids = refer.getRefIds(split='train')
    print('There are %s training referred objects.' % len(ref_ids))

    img_ids = [8936, 52563]
    # ref_ids = refer.getRefIds(image_ids=img_ids)

    # refs = refer.loadRefs(ref_ids)

    def custom_vis1(image, mask_):
        # apply the mask to a blue layer
        # create a blue layer
        blue_layer = np.zeros_like(image)
        blue_layer[:, :, 0] = 255  # in OpenCV the blue channel comes first
        blue_mask = cv2.bitwise_and(blue_layer, blue_layer, mask=mask_)

        # overlay the blue mask onto the original image with some transparency
        alpha = 0.1  # transparency
        cv2.addWeighted(blue_mask, alpha, image, 1 - alpha, 0, image)

    def custom_vis2(image, mask_):
        # create a blue layer
        blue_layer = np.zeros_like(image)
        blue_layer[:, :, 0] = 255  # in OpenCV the blue channel comes first

        # apply the mask to the blue layer
        blue_mask = cv2.bitwise_and(blue_layer, blue_layer, mask=mask_)

        # alpha controls how strongly the mask layer is blended with the original image
        alpha = 0.5  # transparency

        # create a fully transparent layer
        transparent_layer = np.zeros_like(image)

        # apply the blue layer only inside the mask region, using alpha to control transparency
        for i in range(3):  # only the three RGB channels
            transparent_layer[:, :, i] = cv2.addWeighted(blue_mask[:, :, i], alpha, image[:, :, i], 1 - alpha, 0)

        # keep the original image outside the mask region
        transparent_layer[mask_ == 0] = image[mask_ == 0]

        return transparent_layer

    def custom_vis3(image, mask_):
        """
        Directly recolor the masked region of the original image to blue,
        without changing the brightness or color of the other regions.
        """
        image[mask_ != 0] = [255, 0, 0]  # OpenCV uses BGR channel order

    def custom_vis4(image, mask_, alpha=0.4):
        """
        Apply a blue overlay to the original image with the given transparency.
        alpha: overlay transparency, from 0 (fully transparent) to 1 (fully opaque).
        """
        # convert the image from BGR to BGRA to add an alpha channel
        image_rgba = cv2.cvtColor(image, cv2.COLOR_BGR2BGRA)
        # create a fully blue layer of the same size
        blue_mask = np.zeros_like(image_rgba)
        blue_mask[:, :, 0] = 255  # B
        blue_mask[:, :, 3] = 255  # alpha set to opaque

        # apply the transparency inside the mask region
        blue_mask[mask_ != 0, 3] = int(alpha * 255)

        # blend the blue overlay onto the original image
        image_rgba = cv2.addWeighted(image_rgba, 1, blue_mask, alpha, 0)
        return image_rgba


    for i, img_id in enumerate(img_ids):
        ref = refer.imgToRefs[img_id][0]
        print(ref)
        mask_ = refer.getMask(ref)['mask']
        # sentence = ref['sentences'][0]['sent']

        img = refer.Imgs[img_id]
        # I = io.imread(osp.join(refer.IMAGE_DIR, img['file_name']))
        # `image_path` is the path of the original image, `mask_` is a binary array of the same size as the image
        image_path = osp.join(refer.IMAGE_DIR, img['file_name'])
        image = cv2.imread(image_path)
        # mask = np.zeros(image.shape[:2], dtype=np.uint8)  # an actual mask is needed here

        # custom_vis1(image, mask_)

        image = custom_vis2(image, mask_)

        # custom_vis3(image, mask_)

        # image = custom_vis4(image=image, mask_=mask_, alpha=0.4)

        # save the result image to the target path
        image_dir = osp.join(save_path, str(img_id))
        osp.exists(image_dir) or os.makedirs(image_dir)
        # copy the original image
        I = io.imread(osp.join(refer.IMAGE_DIR, img['file_name']))
        io.imsave(osp.join(image_dir, img['file_name']), I)

        cv2.imwrite(osp.join(image_dir, str(img_id) + ".png"), image)

        # save the ref as json
        with open(osp.join(image_dir, str(img_id) + ".json"), "w") as f:
            json.dump(ref, f)


    # i = 0
    # for ref_id in ref_ids:
    #     i += 1
    #     ref = refer.loadRefs(ref_id)[0]
    #     if len(ref['sentences']) < 2:
    #         continue

    #     print(ref)
    #     print('The label is %s.' % refer.Cats[ref['category_id']])
    #     plt.figure()
    #     # refer.getMask(ref)
    #     refer.showMask(ref)

    #     # refer.showRef(ref, seg_box='seg')

    #     plt.show()
    #     if i == 0:
    #         break
    #     # save
    #     plt.savefig('tmp.png')

    # plt.figure()
    # refer.showMask(ref)
    # plt.show()
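Finally, a minimal usage sketch of the REFER interface defined above; the data_root path is a placeholder and the sketch is not part of the uploaded files:

# Minimal REFER usage sketch; assumes ln_data contains refcoco/refs(unc).p and instances.json.
from refer import REFER

refer = REFER(data_root='../ln_data', dataset='refcoco', splitBy='unc')
ref_ids = refer.getRefIds(split='val')        # ref ids filtered by split
ref = refer.loadRefs(ref_ids[0])[0]           # full ref record: sentences, ann_id, image_id, ...
print([s['sent'] for s in ref['sentences']])  # its referring expressions
print(refer.getRefBox(ref['ref_id']))         # [x, y, w, h]
print(refer.getMask(ref)['mask'].shape)       # H x W uint8 mask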