Spaces:

Jiahua0
/

vmem

Build error

App Files Files Community

vmem / extern /CUT3R /datasets_preprocess /preprocess_arkitscenes_highres.py

Jiahua0

Upload folder using huggingface_hub

ff47419 verified about 1 month ago

raw

history blame contribute delete

14.1 kB

	import os
	import json
	import os.path as osp
	import decimal
	import argparse
	import math
	from bisect import bisect_left
	from PIL import Image
	import numpy as np
	import quaternion
	from scipy import interpolate
	import cv2
	from tqdm import tqdm
	from multiprocessing import Pool


	def get_parser():
	    """Build the command-line parser for the preprocessing script.

	    Two optional string flags are registered:
	    ``--arkitscenes_dir`` (default: empty string) and
	    ``--output_dir`` (default: ``data/dust3r_data/processed_arkitscenes_highres``).
	    """
	    option_defaults = (
	        ("--arkitscenes_dir", ""),
	        ("--output_dir", "data/dust3r_data/processed_arkitscenes_highres"),
	    )
	    cli = argparse.ArgumentParser()
	    for flag, fallback in option_defaults:
	        cli.add_argument(flag, default=fallback)
	    return cli


	def value_to_decimal(value, decimal_places):
	    """Round *value* half-up to *decimal_places* fractional digits.

	    The value is converted with ``str(float(value))`` before building the
	    ``Decimal``, so the decimal is created from the float's printed
	    representation rather than from its exact binary expansion.

	    Args:
	        value: anything accepted by ``float()``.
	        decimal_places: number of digits kept after the decimal point.

	    Returns:
	        A ``decimal.Decimal`` quantized to ``1e-<decimal_places>``.
	    """
	    quantum = decimal.Decimal("1e-{}".format(decimal_places))
	    # The rounding mode is passed explicitly so that the caller's decimal
	    # context is left untouched (no global side effect per call).
	    return decimal.Decimal(str(float(value))).quantize(
	        quantum, rounding=decimal.ROUND_HALF_UP
	    )


	def closest(value, sorted_list):
	    """Return the element of *sorted_list* nearest to *value*.

	    *sorted_list* must be sorted ascending (it is searched with ``bisect_left``).
	    Values outside the list range clamp to the first / last element. When the
	    two neighbours are equally distant, the smaller one is returned.
	    """
	    pos = bisect_left(sorted_list, value)
	    if pos == 0:
	        return sorted_list[0]
	    if pos == len(sorted_list):
	        return sorted_list[-1]

	    lower, upper = sorted_list[pos - 1], sorted_list[pos]
	    # strict comparison: a tie resolves to the lower neighbour
	    return upper if upper - value < value - lower else lower


	def get_up_vectors(pose_device_to_world):
	    """Multiply the pose by the homogeneous column direction (0, -1, 0, 0).

	    Returns the matrix product of *pose_device_to_world* with that 4x1 column.
	    """
	    minus_y_axis = np.array([0.0, -1.0, 0.0, 0.0]).reshape(4, 1)
	    return np.matmul(pose_device_to_world, minus_y_axis)


	def get_right_vectors(pose_device_to_world):
	    """Multiply the pose by the homogeneous column direction (1, 0, 0, 0).

	    Returns the matrix product of *pose_device_to_world* with that 4x1 column.
	    """
	    plus_x_axis = np.array([1.0, 0.0, 0.0, 0.0]).reshape(4, 1)
	    return np.matmul(pose_device_to_world, plus_x_axis)


	def read_traj(traj_path):
	    """Parse a trajectory file into timestamps and inverted poses.

	    Every line must hold exactly 7 whitespace-separated numbers: a timestamp,
	    a 3-component rotation vector (fed to ``cv2.Rodrigues``) and a
	    3-component translation. Rotation and translation are assembled into a
	    4x4 matrix which is then inverted.

	    Returns:
	        A tuple ``(timestamps, poses, quaternions, poses_p_to_w)`` of lists with
	        one entry per line: the timestamp rounded to 3 decimals (as float), the
	        translation of the inverted matrix, its rotation as a quaternion, and
	        the full inverted 4x4 matrix.
	    """
	    stamps = []
	    translations = []
	    rotations = []
	    inverted_mats = []

	    with open(traj_path) as handle:
	        for row in handle:
	            fields = row.split()
	            assert len(fields) == 7
	            raw_stamp, rx, ry, rz, tx, ty, tz = map(float, fields)

	            # rounded to millisecond precision; used later for spline interpolation
	            stamps.append(float(value_to_decimal(raw_stamp, 3)))

	            rot_w_to_p, _ = cv2.Rodrigues(np.asarray([rx, ry, rz]))

	            mat_w_to_p = np.eye(4)
	            mat_w_to_p[:3, :3] = rot_w_to_p
	            mat_w_to_p[:3, 3] = np.asarray([tx, ty, tz])

	            mat_p_to_w = np.linalg.inv(mat_w_to_p)

	            inverted_mats.append(mat_p_to_w)
	            translations.append(mat_p_to_w[:3, 3])
	            rotations.append(quaternion.from_rotation_matrix(mat_p_to_w[:3, :3]))

	    return stamps, translations, rotations, inverted_mats


	def main(rootdir, outdir):
	    """Preprocess the "Validation" and "Training" splits found under *rootdir*.

	    For each split, every sub-directory is handed to ``process_scene`` through
	    a multiprocessing pool, and the non-None results are written as a JSON
	    list to ``<outdir>/<split>/scene_list.json``.

	    Args:
	        rootdir: directory containing the "Validation" and "Training" folders.
	        outdir: destination directory; created if missing.
	    """
	    os.makedirs(outdir, exist_ok=True)
	    for subdir in ("Validation", "Training"):
	        outsubdir = osp.join(outdir, subdir)
	        # scene_list.json is written even when no scene produced any output,
	        # so the split directory has to exist before it is opened.
	        os.makedirs(outsubdir, exist_ok=True)

	        split_root = osp.join(rootdir, subdir)
	        scene_dirs = sorted(
	            d for d in os.listdir(split_root) if osp.isdir(osp.join(split_root, d))
	        )
	        jobs = [
	            (rootdir, outdir, subdir, scene_subdir) for scene_subdir in scene_dirs
	        ]

	        with Pool() as pool:
	            results = list(tqdm(pool.imap(process_scene, jobs), total=len(jobs)))

	        # Filter None results and other post-processing
	        valid_scenes = [result for result in results if result is not None]
	        outlistfile = osp.join(outsubdir, "scene_list.json")
	        with open(outlistfile, "w") as f:
	            json.dump(valid_scenes, f)


	def process_scene(args):
	    """Convert one scene: rotate/resize its frames and write its metadata.

	    Args:
	        args: tuple ``(rootdir, outdir, subdir, scene_subdir)``; packed into a
	            single argument so the function can be used with ``Pool.imap``.

	    Returns:
	        ``None`` when a required input (``highres_depth``, ``vga_wide``,
	        ``vga_wide_intrinsics``, ``lowres_wide.traj``) is missing or when no
	        usable frame is found; otherwise *scene_subdir*, both for freshly
	        processed scenes and for scenes whose ``scene_metadata.npz`` already
	        exists. The caller keeps every non-None result as a valid scene.
	    """
	    rootdir, outdir, subdir, scene_subdir = args
	    scene_dir = osp.join(rootdir, subdir, scene_subdir)
	    out_scene_subdir = osp.join(outdir, subdir, scene_subdir)

	    depth_dir = osp.join(scene_dir, "highres_depth")
	    rgb_dir = osp.join(scene_dir, "vga_wide")
	    intrinsics_dir = osp.join(scene_dir, "vga_wide_intrinsics")
	    traj_path = osp.join(scene_dir, "lowres_wide.traj")

	    # Validation if necessary resources exist
	    required_inputs = (depth_dir, rgb_dir, intrinsics_dir, traj_path)
	    if not all(osp.exists(path) for path in required_inputs):
	        return None

	    # parse the scene and export the list of valid (K, pose, rgb, depth) and convert images
	    scene_metadata_path = osp.join(out_scene_subdir, "scene_metadata.npz")
	    if osp.isfile(scene_metadata_path):
	        # the metadata file is written last, so its presence marks a finished scene
	        print(f"Skipping {scene_subdir}")
	        return scene_subdir

	    print(f"parsing {scene_subdir}")
	    depth_files = sorted(os.listdir(depth_dir))
	    img_files = sorted(os.listdir(rgb_dir))

	    # loads traj
	    timestamps, poses, quaternions, poses_cam_to_world = read_traj(traj_path)

	    poses = np.array(poses)
	    quaternions = np.array(quaternions, dtype=np.quaternion)
	    quaternions = quaternion.unflip_rotors(quaternions)
	    timestamps = np.array(timestamps)

	    # depth files are named "<prefix>_<timestamp>.png"; order them by timestamp
	    all_depths = sorted(
	        [
	            (basename, basename.split(".png")[0].split("_")[1])
	            for basename in depth_files
	        ],
	        key=lambda x: float(x[1]),
	    )

	    # keep only frames inside the trajectory's time range (no extrapolation)
	    selected_depths = []
	    timestamps_selected = []
	    timestamp_min = timestamps.min()
	    timestamp_max = timestamps.max()
	    for basename, frame_id in all_depths:
	        frame_id = float(frame_id)
	        if frame_id < timestamp_min or frame_id > timestamp_max:
	            continue
	        selected_depths.append((basename, frame_id))
	        timestamps_selected.append(frame_id)

	    sky_direction_scene, trajectories, intrinsics, images, depths = (
	        convert_scene_metadata(
	            scene_subdir,
	            intrinsics_dir,
	            timestamps,
	            quaternions,
	            poses,
	            poses_cam_to_world,
	            img_files,
	            selected_depths,
	            timestamps_selected,
	        )
	    )

	    if len(images) == 0:
	        print(f"Skipping {scene_subdir}")
	        return None

	    os.makedirs(out_scene_subdir, exist_ok=True)
	    os.makedirs(os.path.join(out_scene_subdir, "vga_wide"), exist_ok=True)
	    os.makedirs(os.path.join(out_scene_subdir, "highres_depth"), exist_ok=True)
	    assert isinstance(sky_direction_scene, str)

	    # Newer Pillow exposes the rotation constants on Image.Transpose, older
	    # releases only on the Image module itself.
	    pil_rotations = getattr(Image, "Transpose", Image)
	    # sky direction -> (PIL transpose, matching OpenCV rotation); "UP" needs none
	    rotations = {
	        "RIGHT": (pil_rotations.ROTATE_90, cv2.ROTATE_90_COUNTERCLOCKWISE),
	        "LEFT": (pil_rotations.ROTATE_270, cv2.ROTATE_90_CLOCKWISE),
	        "DOWN": (pil_rotations.ROTATE_180, cv2.ROTATE_180),
	    }
	    rotation = rotations.get(sky_direction_scene)

	    for image_path, depth_path in zip(images, depths):
	        img_out = os.path.join(
	            out_scene_subdir, "vga_wide", image_path.replace(".png", ".jpg")
	        )
	        depth_out = os.path.join(out_scene_subdir, "highres_depth", depth_path)
	        if osp.isfile(img_out) and osp.isfile(depth_out):
	            continue

	        vga_wide_path = osp.join(rgb_dir, image_path)
	        depth_src_path = osp.join(depth_dir, depth_path)

	        # NOTE(review): a frame skipped here is still listed in the saved metadata
	        if not osp.isfile(vga_wide_path) or not osp.isfile(depth_src_path):
	            continue

	        depth = cv2.imread(depth_src_path, cv2.IMREAD_UNCHANGED)
	        with Image.open(vga_wide_path) as img:
	            # rotate the image
	            if rotation is not None:
	                img = img.transpose(rotation[0])
	                depth = cv2.rotate(depth, rotation[1])

	            W, H = img.size
	            if not osp.isfile(img_out):
	                img.save(img_out)

	        # the depth map is brought to the RGB resolution
	        depth = cv2.resize(depth, (W, H), interpolation=cv2.INTER_NEAREST)
	        if not osp.isfile(
	            depth_out
	        ):  # avoid destroying the base dataset when you mess up the paths
	            cv2.imwrite(depth_out, depth)

	    # save at the end
	    np.savez(
	        scene_metadata_path,
	        trajectories=trajectories,
	        intrinsics=intrinsics,
	        images=images,
	    )
	    return scene_subdir


	def convert_scene_metadata(
	    scene_subdir,
	    intrinsics_dir,
	    timestamps,
	    quaternions,
	    poses,
	    poses_cam_to_world,
	    all_images,
	    selected_depths,
	    timestamps_selected,
	):
	    """Collect intrinsics and interpolated poses for the selected depth frames.

	    Args:
	        scene_subdir: scene name, used as the file-name prefix.
	        intrinsics_dir: directory holding the ``.pincam`` intrinsics files.
	        timestamps, quaternions, poses: trajectory samples used to interpolate
	            a rotation (``quaternion.squad``) and a position (linear) per frame.
	        poses_cam_to_world: full trajectory matrices, used only to estimate
	            the scene orientation.
	        all_images: names of the available RGB images.
	        selected_depths: ``(basename, frame_id)`` pairs to convert.
	        timestamps_selected: timestamps matching *selected_depths* one-to-one.

	    Returns:
	        ``(sky_direction_scene, trajectories, intrinsics, images, depths)``.
	        The four lists have one entry per kept frame; frames without an
	        intrinsics file or a matching RGB image are skipped with a message.
	    """
	    # find scene orientation
	    sky_direction_scene, rotated_to_cam = find_scene_orientation(poses_cam_to_world)

	    # find/compute pose for selected timestamps
	    # most images have a valid timestamp / exact pose associated
	    timestamps_selected = np.array(timestamps_selected)
	    spline = interpolate.interp1d(timestamps, poses, kind="linear", axis=0)
	    interpolated_rotations = quaternion.squad(
	        quaternions, timestamps, timestamps_selected
	    )
	    interpolated_positions = spline(timestamps_selected)

	    # set membership keeps the per-frame image lookup O(1) instead of scanning
	    # the whole image list for every candidate name
	    known_images = set(all_images)
	    # search windows expressed in 1 ms steps
	    intrinsic_search_interval = int(0.1 / 0.001)
	    image_search_interval = int(0.001 / 0.001)
	    swap_axes = sky_direction_scene == "RIGHT" or sky_direction_scene == "LEFT"

	    trajectories = []
	    intrinsics = []
	    images = []
	    depths = []
	    for i, (basename, frame_id) in enumerate(selected_depths):
	        # try the exact name first, then nearby timestamps from -0.1 s to +0.1 s
	        # NOTE(review): candidates are tried in ascending offset order, not
	        # nearest-first - confirm this is intended.
	        intrinsic_fn = osp.join(intrinsics_dir, f"{scene_subdir}_{frame_id}.pincam")
	        for timestamp in range(
	            -intrinsic_search_interval, intrinsic_search_interval + 1
	        ):
	            if osp.exists(intrinsic_fn):
	                break
	            intrinsic_fn = osp.join(
	                intrinsics_dir,
	                f"{scene_subdir}_{float(frame_id) + timestamp * 0.001:.3f}.pincam",
	            )
	        if not osp.exists(intrinsic_fn):
	            print(f"Skipping {intrinsic_fn}")
	            continue

	        # NOTE(review): unlike the intrinsics search, the offset candidates below
	        # are not formatted with a fixed number of decimals - verify that the
	        # resulting names can actually match files on disk.
	        image_path = "{}_{}.png".format(scene_subdir, frame_id)
	        for timestamp in range(-image_search_interval, image_search_interval + 1):
	            if image_path in known_images:
	                break
	            image_path = "{}_{}.png".format(
	                scene_subdir, float(frame_id) + timestamp * 0.001
	            )
	        if image_path not in known_images:
	            print(f"Skipping {scene_subdir} {frame_id}")
	            continue

	        w, h, fx, fy, hw, hh = np.loadtxt(intrinsic_fn)  # PINHOLE

	        pose = np.eye(4)
	        pose[:3, :3] = quaternion.as_rotation_matrix(interpolated_rotations[i])
	        pose[:3, 3] = interpolated_positions[i]

	        # both lists record the depth file's basename
	        images.append(basename)
	        depths.append(basename)
	        if swap_axes:
	            intrinsics.append([h, w, fy, fx, hh, hw])  # swapped intrinsics
	        else:
	            intrinsics.append([w, h, fx, fy, hw, hh])
	        trajectories.append(
	            pose @ rotated_to_cam
	        )  # pose_cam_to_world @ rotated_to_cam = rotated(cam) to world

	    return sky_direction_scene, trajectories, intrinsics, images, depths


	def find_scene_orientation(poses_cam_to_world):
	    """Classify where the sky lies in the images and build the matching rotation.

	    The device "up" and "right" directions are averaged over all poses and
	    compared with the world up axis (0, 0, 1). With no poses, the canonical
	    device axes are used instead.

	    Returns:
	        ``(sky_direction_scene, rotated_to_cam)`` where the first item is one of
	        "UP", "DOWN", "LEFT", "RIGHT" and the second is the 4x4 inverse of the
	        corresponding rotation about the z axis (identity for "UP").
	    """
	    up_world = np.array([[0.0], [0.0], [1.0], [0.0]])

	    pose_count = len(poses_cam_to_world)
	    if pose_count > 0:
	        up_vector = sum(get_up_vectors(p) for p in poses_cam_to_world) / pose_count
	        right_vector = (
	            sum(get_right_vectors(p) for p in poses_cam_to_world) / pose_count
	        )
	    else:
	        up_vector = np.array([[0.0], [-1.0], [0.0], [0.0]])
	        right_vector = np.array([[1.0], [0.0], [0.0], [0.0]])

	    def angle_to_world_up(direction):
	        # value between 0, 180
	        cosine = np.clip(np.dot(np.transpose(up_world), direction), -1.0, 1.0)
	        return np.arccos(cosine).item() * 180.0 / np.pi

	    device_up_to_world_up_angle = angle_to_world_up(up_vector)
	    device_right_to_world_up_angle = angle_to_world_up(right_vector)

	    up_offset_from_90 = abs(device_up_to_world_up_angle - 90.0)
	    right_offset_from_90 = abs(device_right_to_world_up_angle - 90.0)

	    if up_offset_from_90 < right_offset_from_90:
	        # up is close to 90
	        assert up_offset_from_90 < 45.0
	        # note that in metadata.csv RIGHT does not exist, but again it's not accurate...
	        # well, turns out there are scenes oriented like this
	        # for example Training/41124801
	        sky_direction_scene = (
	            "LEFT" if device_right_to_world_up_angle > 90.0 else "RIGHT"
	        )
	    else:
	        # right is close to 90
	        assert right_offset_from_90 < 45.0
	        sky_direction_scene = "DOWN" if device_up_to_world_up_angle > 90.0 else "UP"

	    # rotation about z associated with each non-upright orientation
	    z_rotation = {
	        "LEFT": math.pi / 2.0,
	        "RIGHT": -math.pi / 2.0,
	        "DOWN": math.pi,
	    }
	    if sky_direction_scene == "UP":
	        cam_to_rotated_q = quaternion.quaternion(1, 0, 0, 0)
	    else:
	        cam_to_rotated_q = quaternion.from_rotation_vector(
	            [0.0, 0.0, z_rotation[sky_direction_scene]]
	        )

	    cam_to_rotated = np.eye(4)
	    cam_to_rotated[:3, :3] = quaternion.as_rotation_matrix(cam_to_rotated_q)
	    rotated_to_cam = np.linalg.inv(cam_to_rotated)
	    return sky_direction_scene, rotated_to_cam


	if __name__ == "__main__":
	    # Script entry point: read the two directory options and run the conversion.
	    cli_args = get_parser().parse_args()
	    main(cli_args.arkitscenes_dir, cli_args.output_dir)