ObjectRelator-Original / datasets /build_DAVIS.py

Upload folder using huggingface_hub

625a17f verified about 2 months ago

13.5 kB

	import json
	import os
	from PIL import Image
	import numpy as np
	from pycocotools.mask import encode, decode, frPyObjects
	from tqdm import tqdm
	import copy
	from natsort import natsorted

	if __name__ == '__main__':
	root_path = '/data/work2-gcp-europe-west4-a/yuqian_fu/Ego/data_segswap'
	# splits = ['trainval', 'test-dev']
	# we only do val evaluation
	# annotation_path = os.path.join(root_path, f'2017/{splits[0]}/Annotations/480p')
	# image_path = os.path.join(root_path, f'2017/{splits[0]}/JPEGImages/480p')
	#
	# set_path = os.path.join(root_path, f'2017/{splits[0]}/ImageSets/2017/val.txt')
	save_path = os.path.join(root_path, 'egoexo_val_psalm.json')

	# val_set = []
	# with open(set_path, 'r') as f:
	# for line in f:
	# val_set.append(line.strip())
	#修改
	split_path = "/home/yuqian_fu/Projects/ego-exo4d-relation/correspondence/SegSwap/data/split.json"
	with open(split_path, "r") as fp:
	data_split = json.load(fp)
	# val_set = data_split["val"]
	val_set = ["51fc36b3-e769-4617-b087-3826b280cad3"]
	new_img_id = 0
	egoexo_dataset = []

	'''
	build_DAVIS.py的代码逻辑是先处理每个视频的第一帧，第一帧中的unique_instances、高宽等信息用于该视频下后续的每一帧。
	注意，unique_instances代表的是第一帧下像素的所有类别信息，如果该视频下后续的帧中有像素的类别不在unique_instances中，会报错
	'''



	for val_name in tqdm(val_set):
	#不同视角下两个相机的总路径
	vid_root_path = os.path.join(root_path, val_name)
	anno_path = os.path.join(vid_root_path, "annotation.json")
	with open(anno_path, 'r') as fp:
	annotations = json.load(fp)
	#取出本take下的所有物体
	objs = list(annotations["masks"].keys())
	print(len(objs))
	print(f"objs:{objs}")
	#将物体名称映射为id "cook":1 从1开始，区别于背景
	#TODO看看这个要不要修改为以obj_ref中的物体为准
	coco_id_to_cont_id = {coco_id: cont_id+1 for cont_id, coco_id in enumerate(objs)}

	#区分相机
	valid_cams = os.listdir(vid_root_path)
	#这一行必须加
	valid_cams.remove("annotation.json")

	#给相机排序，方便取出01开头的相机，因为序号小的相机对应的物体更多
	valid_cams = natsorted(valid_cams)
	print(valid_cams)


	ego_cams = []
	exo_cams = []
	for vc in valid_cams:
	if 'aria' in vc:
	ego_cams.append(vc)
	else:
	exo_cams.append(vc)

	ego = ego_cams[0]
	exo = exo_cams[0]
	# print(ego, exo)

	#ego、exo相机路径
	vid_ego_path = os.path.join(vid_root_path, ego)
	vid_exo_path = os.path.join(vid_root_path, exo)

	# first_frame_annotation_path = os.path.join(anno_path, sorted(os.listdir(anno_path))[0])
	# first_frame_annotation_relpath = os.path.relpath(first_frame_annotation_path, root_path)

	#setting为ego->exo，所以ego作为第一帧，即visual prompt
	#取出第一帧ego图像的id
	#获取帧的索引时，不能简单地通过os.listdir来获取，因为路径下有的图片是没有标注的，需要以注释文件的索引为准
	#路径下图片的索引和注释文件中的索引的关系：图片名称里的subsample_idx是包含annotations里的idx的即有的图片是没有对应的注释的，所以会出现索引报错
	ego_frames = natsorted(os.listdir(vid_ego_path))
	ego_frames = [int(f.split(".")[0]) for f in ego_frames]
	print(f"vid_exo_path:{vid_exo_path}")
	exo_frames = natsorted(os.listdir(vid_exo_path))
	# exo_frames = [int(f.split(".")[0]) for f in exo_frames]
	#exo_frames是字符串形式 be like ['1' '2' '3']
	exo_frames = [f.split(".")[0] for f in exo_frames]


	#先选出两个摄像机下都出现的物体作为总的物体范围,然后再判断在该摄像机视角下每一帧中出现了哪些物体
	objs_both_have = []
	for obj in objs:
	if ego in annotations["masks"][obj].keys() and exo in annotations["masks"][obj].keys():
	objs_both_have.append(obj)

	# 获取ego注释文件中的所有索引，用于后续和exo的交叉
	# 取所有ego obj annotated_frames最长的作为基准帧数
	# 后续对exo的操作以基准帧为核心，而不是以物体为核心
	# 取ego视角下出现时间最长的物体对应的所有注释帧，作为基准帧
	obj_ref = objs_both_have[0]
	for obj in objs_both_have:
	if len(list(annotations["masks"][obj_ref][ego].keys())) < len(list(annotations["masks"][obj][ego].keys())):
	obj_ref = obj
	ego_anno_frames = natsorted(list(annotations["masks"][obj_ref][ego].keys()))
	# TODO给frames排个序
	frames = natsorted(np.intersect1d(ego_anno_frames, exo_frames))
	print(f"frames:{frames}")

	#查看每一个物体下具体有哪些帧
	# ego_anno_frames = natsorted(list(annotations["masks"][objs[0]][ego].keys()))
	# ego_anno_frames2 = natsorted(list(annotations["masks"][objs[1]][ego].keys()))
	# print(f"ego_anno_frames3:{ego_anno_frames3}")

	#TODO测试一下结果是什么样的，默认最好是字符串




	#获取ego有注释的第一帧作为参考图像
	all_ref_keys = np.asarray(
	natsorted(annotations["masks"][obj_ref][ego])
	).astype(np.int64)
	#first_anno_key是ego有注释第一张图片的索引
	first_anno_key = str(all_ref_keys[0])
	rgb_name = f"{first_anno_key}.jpg"
	first_frame_img_path = os.path.join(vid_ego_path, rgb_name)
	first_frame_img_relpath = os.path.relpath(first_frame_img_path, root_path)

	# first_frame_annotation_img = Image.open(first_frame_annotation_path)
	# first_frame_annotation = np.array(first_frame_annotation_img)
	# height, width = first_frame_annotation.shape


	#测试查看subsample_idx和注释文件中的idx索引是否相同
	# id_sort = natsorted(os.listdir(vid_ego_path))
	# all_ref_keys = natsorted(annotations["masks"][objs[0]][ego])
	# json_output_path1 = '/home/yuqian_fu/Projects/PSALM/annotation456.json'
	# json_output_path2 = '/home/yuqian_fu/Projects/PSALM/annotation123.json'
	# with open(json_output_path1, 'w') as json_file:
	# json.dump(id_sort, json_file)
	# with open(json_output_path2, 'w') as json_file:
	# json.dump(all_ref_keys, json_file)
	# print(first_frame_img_id, type(first_frame_img_id))
	# print(objs[0])
	# print(ego)


	# 改为通过json文件获取大小
	height, width = annotations["masks"][obj_ref][ego][first_anno_key]["size"]
	# print(annotations["masks"][objs[0]][ego].keys())


	#np.unique存储每一帧中的所有像素类别
	# unique_instances = np.unique(first_frame_annotation)
	# unique_instances = unique_instances[unique_instances != 0]
	coco_format_annotations = []
	# for semi-supervised VOS, we use first frame's GT for input
	# for instance_value in unique_instances:
	# binary_mask = (first_frame_annotation == instance_value).astype(np.uint8)
	# segmentation = encode(np.asfortranarray(binary_mask))
	# #可以直接从annotation中取出来
	# segmentation = {
	# 'counts': segmentation['counts'].decode('ascii'),
	# 'size': segmentation['size'],
	# }
	# #area可能得decode搞一下
	# area = binary_mask.sum().astype(float)
	# coco_format_annotations.append(
	# {
	# 'segmentation': segmentation,
	# 'area': area,
	# 'category_id': instance_value.astype(float),
	# }
	# )

	#统计每一帧下具体有哪些物体，这里统计的是参考帧ego的
	#追踪的物体范围以ego参考帧中的物体为准，因为你输入的mask不可能超过这个范围
	obj_list_ego = []
	for obj in objs_both_have:
	if first_anno_key in annotations["masks"][obj][ego].keys():
	obj_list_ego.append(obj)
	print(len(obj_list_ego))
	print(obj_list_ego)




	#处理ego帧中的物体mask
	for obj in obj_list_ego:
	# binary_mask = (first_frame_annotation == instance_value).astype(np.uint8)
	#TODO看看segmentation中count和size的顺序影不影响使用
	segmentation = annotations["masks"][obj][ego][first_anno_key]
	# 可以直接从annotation中取出来
	# segmentation = {
	# 'counts': segmentation['counts'].decode('ascii'),
	# 'size': segmentation['size'],
	# }
	# area可能得decode搞一下
	binary_mask = decode(segmentation)
	#TODO检查一下binary mask
	area = binary_mask.sum().astype(float)
	coco_format_annotations.append(
	{
	'segmentation': segmentation,
	'area': area,
	'category_id': float(coco_id_to_cont_id[obj]),
	}
	)

	#检查每个物体对应哪些摄像机，因为并不是每个物体对应所有的摄像机
	# for obj in objs:
	# cams = list(annotations["masks"][obj].keys())
	# print(f"{obj}:{cams}")


	#TODO
	for idx in frames[1:]:
	filename = f"{idx}.jpg"
	sample_img_path = os.path.join(vid_exo_path, filename)
	sample_img_relpath = os.path.relpath(sample_img_path, root_path)

	#统计每一exo帧下有哪些物体
	#有两种方式，第一种是统计该帧下在本take所有物体范围objs中出现的物体，可能会出现Found new target not in the first frame的错误
	#第二种方式是统计统计该帧下在参考帧范围obj_list_ego中出现的物体
	obj_list_exo = []
	for obj in obj_list_ego:
	if idx in annotations["masks"][obj][exo].keys():
	obj_list_exo.append(obj)
	height, width = annotations["masks"][obj_list_exo[0]][exo][idx]["size"]
	image_info = {
	'file_name': sample_img_relpath,
	'height': height,
	'width': width,
	}

	# sample_annotation_path = os.path.join(anno_path, annfilename)
	# sample_annotation = np.array(Image.open(sample_annotation_path))
	# sample_unique_instances = np.unique(sample_annotation)
	# sample_unique_instances = sample_unique_instances[sample_unique_instances != 0]
	anns = []

	#统计本帧下物体对应的id，方便后续根据category_id调整first_frame_anns
	sample_unique_instances = [float(coco_id_to_cont_id[obj]) for obj in obj_list_exo]
	print(f"sample_unique_instances:{sample_unique_instances}")

	for obj in obj_list_exo:
	# binary_mask = (sample_annotation == instance_value).astype(np.uint8)
	assert obj in obj_list_ego, 'Found new target not in the first frame'
	segmentation = annotations["masks"][obj][exo][idx]
	binary_mask = decode(segmentation)
	area = binary_mask.sum().astype(float)
	anns.append(
	{
	'segmentation': segmentation,
	'area': area,
	'category_id': float(coco_id_to_cont_id[obj]),
	}
	)

	#deepcopy的目的是，后续要根据本exo帧中物体的数量对参考帧的注释进行调整，防止修改原始注释
	first_frame_anns = copy.deepcopy(coco_format_annotations)
	#考虑本帧物体的数量小于参考帧的情况，仅取出参考帧中本帧有的物体的注释；但是实际情况下，有可能本帧物体的数量会大于参考帧，这时候就需要调整统计本帧下有哪些物体时，总的物体范围
	if len(anns) < len(first_frame_anns):
	first_frame_anns = [ann for ann in first_frame_anns if ann['category_id'] in sample_unique_instances]
	assert len(anns) == len(first_frame_anns)
	sample = {
	'image': sample_img_relpath,
	'image_info': image_info,
	'anns': anns,
	'first_frame_image': first_frame_img_relpath,
	'first_frame_anns': first_frame_anns,
	'new_img_id': new_img_id,
	'video_name': val_name,
	}
	egoexo_dataset.append(sample)
	new_img_id += 1

	with open(save_path, 'w') as f:
	json.dump(egoexo_dataset, f)
	print(f'Save at {save_path}. Total sample: {len(egoexo_dataset)}')