|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import json |
|
|
import logging |
|
|
import numpy as np |
|
|
import os |
|
|
|
|
|
from detectron2.structures import Boxes, BoxMode, PolygonMasks |
|
|
from detectron2.data import DatasetCatalog, MetadataCatalog |
|
|
from tqdm import tqdm |
|
|
""" |
|
|
This file contains functions to parse MeViS dataset of |
|
|
COCO-format annotations into dicts in "Detectron2 format". |
|
|
""" |
|
|
|
|
|
logger = logging.getLogger(__name__) |
|
|
|
|
|
__all__ = ["load_mevis_json", "register_mevis_instances"] |
|
|
|
|
|
|
|
|
def load_mevis_json(image_root, json_file): |
|
|
|
|
|
num_instances_without_valid_segmentation = 0 |
|
|
num_instances_valid_segmentation = 0 |
|
|
|
|
|
ann_file = json_file |
|
|
with open(str(ann_file), 'r') as f: |
|
|
subset_expressions_by_video = json.load(f)['videos'] |
|
|
videos = list(subset_expressions_by_video.keys()) |
|
|
print('number of video in the datasets:{}'.format(len(videos))) |
|
|
metas = [] |
|
|
if image_root.split('/')[-1] == 'train': |
|
|
mask_json = os.path.join(image_root, 'mask_dict_train.json') |
|
|
print(f'Loading masks form {mask_json} ...') |
|
|
with open(mask_json) as fp: |
|
|
mask_dict = json.load(fp) |
|
|
|
|
|
for vid in videos: |
|
|
vid_data = subset_expressions_by_video[vid] |
|
|
vid_frames = sorted(vid_data['frames']) |
|
|
vid_len = len(vid_frames) |
|
|
if vid_len < 2: |
|
|
continue |
|
|
for exp_id, exp_dict in vid_data['expressions'].items(): |
|
|
meta = {} |
|
|
meta['video'] = vid |
|
|
meta['exp'] = exp_dict['exp'] |
|
|
meta['idx'] = exp_dict['idx'] |
|
|
meta['obj_id'] = [int(x) for x in exp_dict['obj_id']] |
|
|
meta['anno_id'] = [str(x) for x in exp_dict['anno_id']] |
|
|
meta['frames'] = vid_frames |
|
|
meta['exp_id'] = exp_id |
|
|
meta['category'] = 0 |
|
|
meta['length'] = vid_len |
|
|
metas.append(meta) |
|
|
else: |
|
|
for vid in videos: |
|
|
vid_data = subset_expressions_by_video[vid] |
|
|
vid_frames = sorted(vid_data['frames']) |
|
|
vid_len = len(vid_frames) |
|
|
for exp_id, exp_dict in vid_data['expressions'].items(): |
|
|
meta = {} |
|
|
meta['video'] = vid |
|
|
meta['exp'] = exp_dict['exp'] |
|
|
meta['idx'] = -1 |
|
|
meta['obj_id'] = -1 |
|
|
meta['anno_id'] = -1 |
|
|
meta['frames'] = vid_frames |
|
|
meta['exp_id'] = exp_id |
|
|
meta['category'] = 0 |
|
|
meta['length'] = vid_len |
|
|
metas.append(meta) |
|
|
|
|
|
dataset_dicts = [] |
|
|
for vid_dict in tqdm(metas): |
|
|
record = {} |
|
|
record["file_names"] = [os.path.join(image_root, 'JPEGImages', vid_dict['video'], vid_dict["frames"][i]+ '.jpg') for i in range(vid_dict["length"])] |
|
|
record["length"] = vid_dict["length"] |
|
|
video_name, exp, anno_ids, obj_ids, category, exp_id, idx = \ |
|
|
vid_dict['video'], vid_dict['exp'], vid_dict['anno_id'], vid_dict['obj_id'], vid_dict['category'], vid_dict['exp_id'], vid_dict['idx'] |
|
|
|
|
|
exp = " ".join(exp.lower().split()) |
|
|
if "eval_idx" in vid_dict: |
|
|
record["eval_idx"] = vid_dict["eval_idx"] |
|
|
|
|
|
video_objs = [] |
|
|
if image_root.split('/')[-1] == 'train': |
|
|
for frame_idx in range(record["length"]): |
|
|
frame_objs = [] |
|
|
for x, obj_id in zip(anno_ids, obj_ids): |
|
|
obj = {} |
|
|
segm = mask_dict[x][frame_idx] |
|
|
if not segm: |
|
|
num_instances_without_valid_segmentation += 1 |
|
|
continue |
|
|
num_instances_valid_segmentation += 1 |
|
|
bbox = [0, 0, 0, 0] |
|
|
obj["id"] = obj_id |
|
|
obj["segmentation"] = segm |
|
|
obj["category_id"] = category |
|
|
obj["bbox"] = bbox |
|
|
obj["bbox_mode"] = BoxMode.XYXY_ABS |
|
|
frame_objs.append(obj) |
|
|
video_objs.append(frame_objs) |
|
|
record["annotations"] = video_objs |
|
|
record["sentence"] = exp |
|
|
record["refer_id"] = idx |
|
|
record["exp_id"] = exp_id |
|
|
record["video_name"] = video_name |
|
|
dataset_dicts.append(record) |
|
|
|
|
|
if num_instances_without_valid_segmentation > 0: |
|
|
logger.warning( |
|
|
"Total {} instance and Filtered out {} instances without valid segmentation. ".format( |
|
|
num_instances_valid_segmentation, num_instances_without_valid_segmentation |
|
|
) |
|
|
) |
|
|
return dataset_dicts |
|
|
|
|
|
|
|
|
|
|
|
def bounding_box(img): |
|
|
rows = np.any(img, axis=1) |
|
|
cols = np.any(img, axis=0) |
|
|
rmin, rmax = np.where(rows)[0][[0, -1]] |
|
|
cmin, cmax = np.where(cols)[0][[0, -1]] |
|
|
return rmin, rmax, cmin, cmax |
|
|
|
|
|
|
|
|
def register_mevis_instances(name, json_file, image_root): |
|
|
""" |
|
|
Register a dataset in YTVIS's json annotation format for |
|
|
instance tracking. |
|
|
|
|
|
Args: |
|
|
name (str): the name that identifies a dataset, e.g. "ytvis_train". |
|
|
metadata (dict): extra metadata associated with this dataset. You can |
|
|
leave it as an empty dict. |
|
|
json_file (str): path to the json instance annotation file. |
|
|
image_root (str or path-like): directory which contains all the images. |
|
|
""" |
|
|
assert isinstance(name, str), name |
|
|
assert isinstance(json_file, (str, os.PathLike)), json_file |
|
|
assert isinstance(image_root, (str, os.PathLike)), image_root |
|
|
|
|
|
DatasetCatalog.register(name, lambda: load_mevis_json(image_root, json_file)) |
|
|
|
|
|
|
|
|
|
|
|
MetadataCatalog.get(name).set( |
|
|
json_file=json_file, image_root=image_root, evaluator_type="mevis") |
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
|
""" |
|
|
Test the YTVIS json dataset loader. |
|
|
""" |
|
|
import detectron2.data.datasets |
|
|
|
|
|
image_root = "./datasets/mevis/valid_u" |
|
|
json_file = "./datasets/mevis/valid_u/meta_expressions.json" |
|
|
dicts = load_mevis_json(image_root, json_file) |
|
|
print("Done loading {} samples.".format(len(dicts))) |