| import os
|
| import random
|
| from collections import OrderedDict
|
|
|
| import torch
|
| import numpy as np
|
|
|
| from lib.train.admin import env_settings
|
| from lib.train.data import jpeg4py_loader
|
| from .base_video_dataset import BaseVideoDataset
|
| from .refer import REFER
|
|
|
|
|
class RefCOCOSeq(BaseVideoDataset):
    """ RefCOCO-style referring-expression dataset, served as single-image "sequences".

    The annotations are loaded through the REFER API (see `.refer`). Every referring expression (ref) whose
    annotation is not marked `iscrowd` becomes one sequence; since the underlying data are still images, each
    sequence has length 1 and the same image is returned for every requested frame id.

    The images come from the COCO dataset:
        Microsoft COCO: Common Objects in Context.
        Tsung-Yi Lin, Michael Maire, Serge J. Belongie, Lubomir D. Bourdev, Ross B. Girshick, James Hays,
        Pietro Perona, Deva Ramanan, Piotr Dollar and C. Lawrence Zitnick
        ECCV, 2014
        https://arxiv.org/pdf/1405.0312.pdf

    Paths built by this class, relative to the dataset root:
        - images/train<version>       directory the image files are read from (e.g. images/train2014)
        - <name>/instances.json       stored in `self.anno_path` (not read by this class itself)
    The root, `name` and `splitBy` are additionally handed to REFER, which locates the referring-expression files.

    Note: COCO images can be downloaded from http://cocodataset.org/#download.
    NOTE(review): the original documentation also asked for the coco pythonAPI
    (https://github.com/cocodataset/cocoapi) - confirm whether `.refer` needs it.
    """

    def __init__(self, root=None, image_loader=jpeg4py_loader, data_fraction=None, split="train", version="2014",
                 name="refcoco", splitBy="google", multi_modal_vision=False, multi_modal_language=False):
        """
        args:
            root - path to the coco dataset. If None, `env_settings().coco_dir` is used.
            image_loader (default_image_loader) - The function to read the images. If installed,
                                                  jpeg4py (https://github.com/ajkxyz/jpeg4py) is used by default.
                                                  Else, opencv's imread is used.
            data_fraction (None) - Fraction of sequences to be used. The sequences are selected randomly. If None,
                                   all the sequences will be used
            split - split name passed to REFER's `getRefIds` (e.g. 'train' or 'val').
            version - version of coco dataset (2014 or 2017); selects the image directory.
            name - name of the referring-expression dataset (passed to REFER as `dataset`), also returned by
                   `get_name`.
            splitBy - split provider passed to REFER.
            multi_modal_vision - If True, each loaded image is concatenated with itself along the last axis.
            multi_modal_language - If True, the sequence info additionally contains the referring sentence
                                   under the key 'nlp'.
        """
        root = env_settings().coco_dir if root is None else root
        super().__init__('RefCOCOSeq', root, image_loader)
        self.split = split
        # The image directory always uses the "train" prefix, independent of `split`.
        self.img_pth = os.path.join(root, 'images/{}{}'.format("train", version))
        self.anno_path = os.path.join(root, '{}/instances.json'.format(name))
        self.dataset_name = name

        self.coco_set = REFER(root, dataset=name, splitBy=splitBy)

        self.cats = self.coco_set.Cats

        self.class_list = self.get_class_list()

        self.sequence_list = self._get_sequence_list()

        if data_fraction is not None:
            self.sequence_list = random.sample(self.sequence_list, int(len(self.sequence_list) * data_fraction))
        # Must be built after the optional sub-sampling: it stores indices into self.sequence_list.
        self.seq_per_class = self._build_seq_per_class()

        self.multi_modal_vision = multi_modal_vision
        self.multi_modal_language = multi_modal_language

    @staticmethod
    def _category_name(category):
        """ Return the class name of a category entry.

        `_build_seq_per_class` uses the entries of `self.cats` as dictionary keys (i.e. as plain names), whereas
        COCO-style category entries are dicts with a 'name' field. Both forms are accepted here so that all
        methods of the class agree on what a class name is.
        """
        if isinstance(category, dict):
            return category['name']
        return category

    def _get_category(self, seq_id):
        """ Return the entry of `self.cats` belonging to the annotation of sequence `seq_id`. """
        return self.cats[self._get_anno(seq_id)['category_id']]

    def _get_image_name(self, seq_id):
        """ Return the file name of the image that sequence `seq_id` refers to. """
        image_id = self._get_anno(seq_id)['image_id']
        return self.coco_set.loadImgs([image_id])[0]['file_name']

    def _get_sequence_list(self):
        """ Return the ref ids of the configured split whose annotation is not flagged as crowd. """
        ref_ids = self.coco_set.getRefIds(split=self.split)
        return [ref_id for ref_id in ref_ids if self.coco_set.refToAnn[ref_id]['iscrowd'] == 0]

    def is_video_sequence(self):
        return False

    def get_num_classes(self):
        return len(self.class_list)

    def get_name(self):
        return self.dataset_name

    def has_class_info(self):
        return True

    def get_class_list(self):
        """ Return the class names of all categories known to the REFER API. """
        return [self._category_name(category) for category in self.cats.values()]

    def has_segmentation_info(self):
        return True

    def get_num_sequences(self):
        return len(self.sequence_list)

    def _build_seq_per_class(self):
        """ Map every class name to the list of sequence indices (positions in self.sequence_list) of that class. """
        seq_per_class = {}
        for index, ref_id in enumerate(self.sequence_list):
            category = self.cats[self.coco_set.refToAnn[ref_id]['category_id']]
            seq_per_class.setdefault(self._category_name(category), []).append(index)

        return seq_per_class

    def get_sequences_in_class(self, class_name):
        return self.seq_per_class[class_name]

    def get_sequence_info(self, seq_id):
        """ Return the annotation of sequence `seq_id`.

        returns:
            dict with
                'bbox'    - tensor of shape (1, 4) holding the annotated box
                'valid'   - bool tensor of shape (1,), True if box entries 2 and 3 are both larger than 50
                'visible' - byte copy of 'valid'
                'nlp'     - the referring sentence; only present if `multi_modal_language` is enabled
        """
        anno = self._get_anno(seq_id)

        bbox = torch.Tensor(anno['bbox']).view(1, 4)

        # 2021.1.3 To avoid too small bounding boxes. Here we change the threshold to 50 pixels
        # (presumably entries 2 and 3 are width and height, as in COCO's [x, y, w, h] format - TODO confirm)
        valid = (bbox[:, 2] > 50) & (bbox[:, 3] > 50)

        visible = valid.clone().byte()

        output = {'bbox': bbox, 'valid': valid, 'visible': visible}
        if self.multi_modal_language:
            output['nlp'] = self._read_nlp(seq_id)

        # Return the dict built above. Re-creating it with an unconditional 'nlp' entry raised an
        # UnboundLocalError whenever multi_modal_language was disabled.
        return output

    def _read_nlp(self, seq_id):
        """ Return the text of the last sentence attached to the ref of sequence `seq_id`. """
        ref = self.coco_set.Refs[self.sequence_list[seq_id]]
        return ref['sentences'][-1]['sent']

    def _get_anno(self, seq_id):
        """ Return the annotation dict of the ref at position `seq_id` in self.sequence_list. """
        return self.coco_set.refToAnn[self.sequence_list[seq_id]]

    def _get_frames(self, seq_id):
        """ Load the image of sequence `seq_id` with the configured image loader. """
        img = self.image_loader(os.path.join(self.img_pth, self._get_image_name(seq_id)))
        if self.multi_modal_vision:
            # Duplicate the image along the last axis.
            img = np.concatenate((img, img), axis=-1)
        return img

    def get_meta_info(self, seq_id):
        """ Return the object meta information of sequence `seq_id`.

        'object_class_name' is the class name of the annotated object and 'major_class' its supercategory (only
        available if the category entry is a dict). If the category cannot be looked up, both are None. The
        remaining fields are always None.
        """
        try:
            category = self._get_category(seq_id)
            if isinstance(category, dict):
                class_name = category['name']
                major_class = category['supercategory']
            else:
                class_name, major_class = category, None
        except (KeyError, IndexError, TypeError):
            # Best effort: a failed lookup yields empty meta information instead of an error.
            class_name, major_class = None, None

        object_meta = OrderedDict({'object_class_name': class_name,
                                   'motion_class': None,
                                   'major_class': major_class,
                                   'root_class': None,
                                   'motion_adverb': None})
        return object_meta

    def get_class_name(self, seq_id):
        """ Return the class name of the object annotated in sequence `seq_id`. """
        return self._category_name(self._get_category(seq_id))

    def get_frames(self, seq_id=None, frame_ids=None, anno=None):
        """ Return the frames, per-frame annotations and meta information for the requested frame ids.

        As every sequence consists of a single image, a copy of that image (and of its annotation) is returned
        for each entry of `frame_ids`.

        args:
            seq_id - index of the sequence
            frame_ids - iterable of frame ids; only its length matters
            anno - sequence info as returned by `get_sequence_info`. If None, it is computed here.

        returns:
            list - one copy of the image per frame id
            dict - for every annotation key, a list with one entry per frame id
            OrderedDict - object meta information, see `get_meta_info`
        """
        frame = self._get_frames(seq_id)

        frame_list = [frame.copy() for _ in frame_ids]

        if anno is None:
            anno = self.get_sequence_info(seq_id)

        anno_frames = {}
        for key, value in anno.items():
            if key == 'nlp':
                # The sentence is a plain string and is repeated as is.
                anno_frames[key] = [value for _ in frame_ids]
            else:
                # Tensor annotations carry a leading dimension of size 1, which is dropped per frame.
                anno_frames[key] = [value[0, ...] for _ in frame_ids]

        object_meta = self.get_meta_info(seq_id)

        return frame_list, anno_frames, object_meta

    def get_path(self, seq_id, frame_ids):
        """ Return the path of the sequence's image, repeated once per requested frame id. """
        img_path = os.path.join(self.img_pth, self._get_image_name(seq_id))
        return [img_path] * len(frame_ids)

    def get_ref_id(self, seq_id):
        """ Return the REFER ref id stored at position `seq_id` in self.sequence_list. """
        return self.sequence_list[seq_id]
|
|
|