ObjectRelator-Original / datasets /build_ego_exosize.py

Upload folder using huggingface_hub

625a17f verified 2 months ago

15.1 kB

	import json
	import os
	from PIL import Image
	import numpy as np
	from pycocotools.mask import encode, decode, frPyObjects
	from tqdm import tqdm
	import copy
	from natsort import natsorted
	import cv2


	# Purpose of this script: build the ego->exo validation JSON in which every
	# ego (visual prompt) mask is rescaled to the size of the exo images, while
	# the ego images themselves keep their original size (704 x 704 according to
	# the original author's note).


	def _objects_shared_with(masks, objs, ego, cam):
	    """Return the objects of *objs* annotated under both *ego* and *cam*.

	    Args:
	        masks: the ``annotations["masks"]`` mapping, object -> camera -> frames.
	        objs: object names to test, in the order the result should keep.
	        ego: name of the ego camera.
	        cam: name of the candidate exo camera.
	    """
	    return [obj for obj in objs if ego in masks[obj] and cam in masks[obj]]


	def _resize_and_encode(mask, size_wh):
	    """Resize a decoded binary mask and re-encode it as a JSON-friendly RLE.

	    Args:
	        mask: array returned by pycocotools ``decode``.
	        size_wh: target ``(width, height)`` handed to ``cv2.resize``.

	    Returns:
	        ``(segmentation, area)`` where ``segmentation`` is a dict with an
	        ASCII ``counts`` string and ``size``, and ``area`` is the pixel sum of
	        the resized mask; ``None`` when the resized mask is empty.
	    """
	    # Nearest-neighbour keeps the mask binary.
	    resized = cv2.resize(mask, size_wh, interpolation=cv2.INTER_NEAREST)
	    area = float(resized.sum())
	    if area == 0:
	        return None
	    rle = encode(np.asfortranarray(resized))
	    segmentation = {
	        # ``counts`` is bytes; JSON needs text.
	        'counts': rle['counts'].decode('ascii'),
	        'size': rle['size'],
	    }
	    return segmentation, area


	if __name__ == '__main__':
	    root_path = '/data/work2-gcp-europe-west4-a/yuqian_fu/Ego/data_segswap'
	    # Change this output name when running a different experiment.
	    save_path = os.path.join(root_path, 'egoexo_val_exosize.json')

	    # Load the take ids of the validation split.
	    split_path = "/home/yuqian_fu/Projects/ego-exo4d-relation/correspondence/SegSwap/data/split.json"
	    with open(split_path, "r", encoding="utf-8") as fp:
	        data_split = json.load(fp)
	    # Change this (e.g. to a single take id) when running a quick experiment.
	    val_set = data_split["val"]

	    # Running id assigned to every emitted sample.
	    new_img_id = 0
	    # Samples that will be dumped to JSON.
	    egoexo_dataset = []
	    # Takes that had to be skipped.
	    bad_case = []

	    # Author's note: build_DAVIS.py processes the first frame of every video
	    # and reuses its instance set and size for all later frames; a later frame
	    # containing an instance that is missing from the first frame is an error.
	    # Here the first annotated ego frame plays the role of that first frame.
	    for val_name in tqdm(val_set):
	        # Folder holding one sub-folder per camera plus annotation.json.
	        vid_root_path = os.path.join(root_path, val_name)
	        anno_path = os.path.join(vid_root_path, "annotation.json")
	        with open(anno_path, 'r', encoding="utf-8") as fp:
	            annotations = json.load(fp)
	        masks = annotations["masks"]

	        # Natural sort keeps the object -> id mapping stable between runs.
	        objs = natsorted(list(masks.keys()))
	        print("the total obj num are:", len(objs))
	        print(f"objs:{objs}")
	        # Ids start at 1 so that 0 stays reserved for the background.
	        # TODO: consider restricting this mapping to the objects of obj_ref.
	        coco_id_to_cont_id = {coco_id: cont_id + 1 for cont_id, coco_id in enumerate(objs)}

	        # Every directory entry except the annotation file is a camera.
	        valid_cams = os.listdir(vid_root_path)
	        valid_cams.remove("annotation.json")
	        # Sorted so that the lowest-numbered camera comes first; per the
	        # author's note those cameras tend to see more objects.
	        valid_cams = natsorted(valid_cams)
	        print(valid_cams)

	        ego_cams = [vc for vc in valid_cams if 'aria' in vc]
	        exo_cams = [vc for vc in valid_cams if 'aria' not in vc]
	        if not ego_cams or not exo_cams:
	            # Without both views there is nothing to pair; previously this
	            # crashed with an IndexError.
	            bad_case.append(val_name)
	            continue
	        ego = ego_cams[0]
	        vid_ego_path = os.path.join(vid_root_path, ego)

	        # Pick the exo camera that shares the most objects with the ego
	        # camera; ties keep the earliest camera.
	        exo = exo_cams[0]
	        objs_both_have = _objects_shared_with(masks, objs, ego, exo)
	        for cam in exo_cams[1:]:
	            candidates = _objects_shared_with(masks, objs, ego, cam)
	            if len(candidates) > len(objs_both_have):
	                exo = cam
	                objs_both_have = candidates
	        print("objs_both_have num:", len(objs_both_have))
	        if not objs_both_have:
	            # No object is visible in both views: skip this take.
	            bad_case.append(val_name)
	            continue

	        print(ego, exo)
	        # The exo path can only be fixed once the final exo camera is known.
	        vid_exo_path = os.path.join(vid_root_path, exo)
	        print(f"vid_exo_path:{vid_exo_path}")
	        # Frame ids on disk, kept as strings (e.g. ['1', '2', '3']).
	        exo_frames = [f.split(".")[0] for f in natsorted(os.listdir(vid_exo_path))]

	        # The reference object is the one with the most annotated ego frames
	        # (first one wins on ties); its frames define the frame timeline.
	        obj_ref = max(objs_both_have, key=lambda obj: len(masks[obj][ego]))
	        ego_anno_frames = natsorted(list(masks[obj_ref][ego].keys()))
	        # Frame ids must come from the annotation file, not only from the
	        # directory listing, because some images on disk carry no annotation.
	        # ``sorted`` first makes the result independent of set ordering.
	        frames = natsorted(sorted(set(ego_anno_frames) & set(exo_frames)))
	        print(f"frames:{frames}")
	        if len(frames) < 2:
	            # frames[1] is needed to probe the exo image size and the target
	            # loop starts at frames[1]; previously this raised IndexError.
	            bad_case.append(val_name)
	            continue

	        # First annotated ego frame = reference image (visual prompt). The key
	        # is used verbatim so zero-padded keys still match the annotations.
	        first_anno_key = ego_anno_frames[0]
	        first_frame_img_path = os.path.join(vid_ego_path, f"{first_anno_key}.jpg")
	        first_frame_img_relpath = os.path.relpath(first_frame_img_path, root_path)

	        # Objects with a non-empty mask in the reference ego frame. Tracking is
	        # limited to these, since the prompt cannot contain anything else.
	        ego_masks = {}
	        for obj in objs_both_have:
	            if first_anno_key not in masks[obj][ego]:
	                continue
	            mask_ego = decode(masks[obj][ego][first_anno_key])
	            if mask_ego.sum() != 0:
	                ego_masks[obj] = mask_ego
	        print("total obj num in ego", len(ego_masks))
	        if not ego_masks:
	            bad_case.append(val_name)
	            continue

	        # Exo images do not all share one size (some are (960, 540) per the
	        # author's note), so the target size is read from an actual exo frame.
	        tmp_path = os.path.join(vid_exo_path, f"{frames[1]}.jpg")
	        with Image.open(tmp_path) as img_tmp:
	            w_tmp, h_tmp = img_tmp.size

	        # Rescale every reference ego mask to the exo image size.
	        # TODO: check whether the key order of 'counts'/'size' matters downstream.
	        coco_format_annotations = []
	        obj_list_ego_new = []
	        for obj, mask_ego in ego_masks.items():
	            encoded = _resize_and_encode(mask_ego, (w_tmp, h_tmp))
	            if encoded is None:
	                # The mask vanished after resizing.
	                continue
	            segmentation, area = encoded
	            obj_list_ego_new.append(obj)
	            coco_format_annotations.append(
	                {
	                    'segmentation': segmentation,
	                    'area': area,
	                    'category_id': float(coco_id_to_cont_id[obj]),
	                }
	            )
	        if not obj_list_ego_new:
	            bad_case.append(val_name)
	            continue

	        # Emit one sample per exo frame.
	        # NOTE(review): frames[0] is never emitted as a target frame; this
	        # mirrors the original loop - confirm that it is intended.
	        for idx in frames[1:]:
	            sample_img_path = os.path.join(vid_exo_path, f"{idx}.jpg")
	            sample_img_relpath = os.path.relpath(sample_img_path, root_path)

	            # Only objects present in the reference ego frame are considered,
	            # so a target can never contain an object the prompt lacks.
	            exo_masks = {}
	            for obj in obj_list_ego_new:
	                if idx not in masks[obj][exo]:
	                    continue
	                mask_exo = decode(masks[obj][exo][idx])
	                if mask_exo.sum() != 0:
	                    exo_masks[obj] = mask_exo
	            print("total obj num in exo", len(exo_masks))
	            if not exo_masks:
	                # Some frames contain none of the tracked objects.
	                continue

	            first_obj = next(iter(exo_masks))
	            height, width = masks[first_obj][exo][idx]["size"]
	            # Exo masks are stored at 4x the size used in the output.
	            image_info = {
	                'file_name': sample_img_relpath,
	                'height': height // 4,
	                'width': width // 4,
	            }

	            anns = []
	            obj_list_exo_new = []
	            for obj, mask_exo in exo_masks.items():
	                h, w = mask_exo.shape
	                encoded = _resize_and_encode(mask_exo, (w // 4, h // 4))
	                if encoded is None:
	                    continue
	                segmentation, area = encoded
	                obj_list_exo_new.append(obj)
	                anns.append(
	                    {
	                        'segmentation': segmentation,
	                        'area': area,
	                        'category_id': float(coco_id_to_cont_id[obj]),
	                    }
	                )
	            if not obj_list_exo_new:
	                continue

	            # Ids of the objects present in this frame, used to trim the
	            # reference annotations to the same set.
	            sample_unique_instances = [float(coco_id_to_cont_id[obj]) for obj in obj_list_exo_new]
	            print(f"sample_unique_instances in {idx}:{sample_unique_instances}")

	            # Deep copies keep the shared reference annotations untouched.
	            present_ids = set(sample_unique_instances)
	            first_frame_anns = [
	                copy.deepcopy(ann)
	                for ann in coco_format_annotations
	                if ann['category_id'] in present_ids
	            ]
	            # Internal invariant: prompt and target describe the same objects.
	            assert len(anns) == len(first_frame_anns)
	            sample = {
	                'image': sample_img_relpath,
	                'image_info': image_info,
	                'anns': anns,
	                'first_frame_image': first_frame_img_relpath,
	                'first_frame_anns': first_frame_anns,
	                'new_img_id': new_img_id,
	                'video_name': val_name,
	            }
	            egoexo_dataset.append(sample)
	            new_img_id += 1

	    print(bad_case)
	    with open(save_path, 'w') as f:
	        json.dump(egoexo_dataset, f)
	    print(f'Save at {save_path}. Total sample: {len(egoexo_dataset)}')