vmem / extern /CUT3R /datasets_preprocess /preprocess_arkitscenes_highres.py
Jiahua0's picture
Upload folder using huggingface_hub
ff47419 verified
import os
import json
import os.path as osp
import decimal
import argparse
import math
from bisect import bisect_left
from PIL import Image
import numpy as np
import quaternion
from scipy import interpolate
import cv2
from tqdm import tqdm
from multiprocessing import Pool
def get_parser():
    """Build the command-line parser for this preprocessing script.

    Options:
        --arkitscenes_dir: root of the raw dataset (defaults to "").
        --output_dir: where the processed scenes are written.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--arkitscenes_dir", default="")
    cli.add_argument(
        "--output_dir", default="data/dust3r_data/processed_arkitscenes_highres"
    )
    return cli
def value_to_decimal(value, decimal_places):
    """Round *value* to *decimal_places* digits using round-half-up.

    The value is first converted through ``str(float(value))`` so the decimal
    is built from the float's shortest repr rather than its binary expansion.

    The rounding mode is passed directly to ``quantize`` instead of being
    written into the thread-wide decimal context, so calling this function
    does not change how unrelated ``decimal`` code rounds.

    Returns:
        A ``decimal.Decimal`` with exactly *decimal_places* fractional digits.
    """
    quantum = decimal.Decimal("1e-{}".format(decimal_places))
    return decimal.Decimal(str(float(value))).quantize(
        quantum, rounding=decimal.ROUND_HALF_UP
    )
def closest(value, sorted_list):
    """Return the element of *sorted_list* nearest to *value*.

    *sorted_list* must be sorted in ascending order. When *value* is exactly
    halfway between two neighbours, the smaller neighbour is returned.
    """
    pos = bisect_left(sorted_list, value)
    # Off either end of the list: clamp to the nearest extreme.
    if pos == 0:
        return sorted_list[0]
    if pos == len(sorted_list):
        return sorted_list[-1]
    lower, upper = sorted_list[pos - 1], sorted_list[pos]
    return upper if upper - value < value - lower else lower
def get_up_vectors(pose_device_to_world):
    """Map the homogeneous direction (0, -1, 0, 0) through the given pose.

    The last component is 0, so only the linear part of the matrix
    contributes (translation is ignored). For a 4x4 pose the result is a
    4x1 column vector.
    """
    minus_y_axis = np.array([0.0, -1.0, 0.0, 0.0]).reshape(4, 1)
    return np.matmul(pose_device_to_world, minus_y_axis)
def get_right_vectors(pose_device_to_world):
    """Map the homogeneous direction (1, 0, 0, 0) through the given pose.

    The last component is 0, so only the linear part of the matrix
    contributes (translation is ignored). For a 4x4 pose the result is a
    4x1 column vector.
    """
    plus_x_axis = np.array([1.0, 0.0, 0.0, 0.0]).reshape(4, 1)
    return np.matmul(pose_device_to_world, plus_x_axis)
def read_traj(traj_path):
    """Parse a trajectory text file into timestamps and inverted poses.

    Every non-empty line must contain 7 whitespace-separated numbers:
    ``timestamp rx ry rz tx ty tz``. ``(rx, ry, rz)`` is converted to a
    rotation matrix with ``cv2.Rodrigues`` and ``(tx, ty, tz)`` is used as the
    translation of a 4x4 "w_to_p" transform, which is then inverted to get
    the "p_to_w" pose. (Callers in this file treat "p" as the camera/device
    frame.)

    Blank lines (e.g. a trailing newline at the end of the file) are skipped.

    Args:
        traj_path: path of the trajectory file.

    Returns:
        A 4-tuple of parallel lists ``(timestamps, poses, quaternions,
        poses_p_to_w)``:
        - timestamps: floats rounded to 3 decimals (half-up),
        - poses: translation part of each inverted pose,
        - quaternions: rotation part of each inverted pose as a quaternion,
        - poses_p_to_w: the full 4x4 inverted pose matrices.

    Raises:
        ValueError: if a non-empty line does not have exactly 7 fields.
    """
    quaternions = []
    poses = []
    timestamps = []
    poses_p_to_w = []
    with open(traj_path) as f:
        for line_no, line in enumerate(f, start=1):
            tokens = line.split()
            if not tokens:
                # Tolerate blank lines instead of aborting the whole scene.
                continue
            if len(tokens) != 7:
                raise ValueError(
                    f"{traj_path}:{line_no}: expected 7 fields, got {len(tokens)}"
                )
            values = [float(tok) for tok in tokens]

            # Rounded to milliseconds; used later for spline interpolation.
            timestamps.append(float(value_to_decimal(values[0], 3)))

            r_w_to_p, _ = cv2.Rodrigues(np.asarray(values[1:4]))
            pose_w_to_p = np.eye(4)
            pose_w_to_p[:3, :3] = r_w_to_p
            pose_w_to_p[:3, 3] = np.asarray(values[4:7])

            pose_p_to_w = np.linalg.inv(pose_w_to_p)
            poses_p_to_w.append(pose_p_to_w)
            poses.append(pose_p_to_w[:3, 3])
            quaternions.append(
                quaternion.from_rotation_matrix(pose_p_to_w[:3, :3])
            )
    return timestamps, poses, quaternions, poses_p_to_w
def main(rootdir, outdir):
    """Preprocess the "Validation" and "Training" splits found under *rootdir*.

    For each split, every sub-directory is handed to ``process_scene`` through
    a multiprocessing pool. The non-None results are written, in scene order,
    to ``<outdir>/<split>/scene_list.json``.

    Args:
        rootdir: dataset root containing the split directories.
        outdir: output root; created if missing.
    """
    os.makedirs(outdir, exist_ok=True)
    for subdir in ("Validation", "Training"):
        split_dir = osp.join(rootdir, subdir)
        outsubdir = osp.join(outdir, subdir)
        # The split output directory is otherwise only created as a side
        # effect of a scene being exported; create it up front so writing the
        # scene list cannot fail when no scene produced output.
        os.makedirs(outsubdir, exist_ok=True)

        scene_dirs = sorted(
            d for d in os.listdir(split_dir) if osp.isdir(osp.join(split_dir, d))
        )
        jobs = [
            (rootdir, outdir, subdir, scene_subdir) for scene_subdir in scene_dirs
        ]
        with Pool() as pool:
            # imap keeps results in submission order, so the list stays sorted.
            results = list(tqdm(pool.imap(process_scene, jobs), total=len(jobs)))

        # Drop scenes that were rejected (reported as None).
        valid_scenes = [result for result in results if result is not None]
        outlistfile = osp.join(outsubdir, "scene_list.json")
        with open(outlistfile, "w") as f:
            json.dump(valid_scenes, f)
def _rotate_to_upright(img, depth, sky_direction):
    """Rotate an RGB image / depth map pair according to *sky_direction*.

    "RIGHT", "LEFT" and "DOWN" apply a 90, 270 and 180 degree PIL rotation
    respectively, with the matching OpenCV rotation on the depth map. Any
    other value returns both inputs unchanged.
    """
    # Newer Pillow exposes the constants on Image.Transpose, older versions
    # directly on the Image module.
    transpose_ops = getattr(Image, "Transpose", Image)
    rotations = {
        "RIGHT": (transpose_ops.ROTATE_90, cv2.ROTATE_90_COUNTERCLOCKWISE),
        "LEFT": (transpose_ops.ROTATE_270, cv2.ROTATE_90_CLOCKWISE),
        "DOWN": (transpose_ops.ROTATE_180, cv2.ROTATE_180),
    }
    if sky_direction not in rotations:
        return img, depth
    pil_op, cv_op = rotations[sky_direction]
    return img.transpose(pil_op), cv2.rotate(depth, cv_op)


def process_scene(args):
    """Convert one scene and write its images, depth maps and metadata.

    Args:
        args: tuple ``(rootdir, outdir, subdir, scene_subdir)``; packed into a
            single argument so the function can be used with ``Pool.imap``.

    Returns:
        ``scene_subdir`` when the scene's ``scene_metadata.npz`` already exists
        or has just been written, so the caller can list it as valid.
        ``None`` when a required input ("highres_depth", "vga_wide",
        "vga_wide_intrinsics", "lowres_wide.traj") is missing, the trajectory
        is empty, or no frame could be exported.
    """
    rootdir, outdir, subdir, scene_subdir = args
    scene_dir = osp.join(rootdir, subdir, scene_subdir)
    out_scene_subdir = osp.join(outdir, subdir, scene_subdir)

    depth_dir = osp.join(scene_dir, "highres_depth")
    rgb_dir = osp.join(scene_dir, "vga_wide")
    intrinsics_dir = osp.join(scene_dir, "vga_wide_intrinsics")
    traj_path = osp.join(scene_dir, "lowres_wide.traj")
    required = (depth_dir, rgb_dir, intrinsics_dir, traj_path)
    if not all(osp.exists(path) for path in required):
        return None

    scene_metadata_path = osp.join(out_scene_subdir, "scene_metadata.npz")
    if osp.isfile(scene_metadata_path):
        # Already processed in a previous run: still a valid scene.
        print(f"Skipping {scene_subdir}")
        return scene_subdir

    print(f"parsing {scene_subdir}")
    depth_files = sorted(os.listdir(depth_dir))
    img_files = sorted(os.listdir(rgb_dir))

    # Load the trajectory.
    timestamps, poses, quaternions, poses_cam_to_world = read_traj(traj_path)
    if len(timestamps) == 0:
        # Nothing to interpolate against; min()/max() below would raise.
        print(f"Skipping {scene_subdir}")
        return None
    poses = np.array(poses)
    quaternions = np.array(quaternions, dtype=np.quaternion)
    quaternions = quaternion.unflip_rotors(quaternions)
    timestamps = np.array(timestamps)

    # Depth files are named "<prefix>_<timestamp>.png"; order them by timestamp.
    all_depths = sorted(
        (
            (basename, basename.split(".png")[0].split("_")[1])
            for basename in depth_files
        ),
        key=lambda x: float(x[1]),
    )

    # Keep only depth frames whose timestamp lies inside the trajectory range.
    timestamp_min = timestamps.min()
    timestamp_max = timestamps.max()
    selected_depths = []
    timestamps_selected = []
    for basename, frame_id in all_depths:
        frame_id = float(frame_id)
        if frame_id < timestamp_min or frame_id > timestamp_max:
            continue
        selected_depths.append((basename, frame_id))
        timestamps_selected.append(frame_id)

    sky_direction_scene, trajectories, intrinsics, images, depths = (
        convert_scene_metadata(
            scene_subdir,
            intrinsics_dir,
            timestamps,
            quaternions,
            poses,
            poses_cam_to_world,
            img_files,
            selected_depths,
            timestamps_selected,
        )
    )
    if len(images) == 0:
        print(f"Skipping {scene_subdir}")
        return None

    os.makedirs(out_scene_subdir, exist_ok=True)
    os.makedirs(os.path.join(out_scene_subdir, "vga_wide"), exist_ok=True)
    os.makedirs(os.path.join(out_scene_subdir, "highres_depth"), exist_ok=True)
    assert isinstance(sky_direction_scene, str)

    for image_name, depth_name in zip(images, depths):
        img_out = os.path.join(
            out_scene_subdir, "vga_wide", image_name.replace(".png", ".jpg")
        )
        depth_out = os.path.join(out_scene_subdir, "highres_depth", depth_name)
        if osp.isfile(img_out) and osp.isfile(depth_out):
            continue

        vga_wide_path = osp.join(rgb_dir, image_name)
        depth_path = osp.join(depth_dir, depth_name)
        if not osp.isfile(vga_wide_path) or not osp.isfile(depth_path):
            continue

        # The context manager closes the source file handle once the
        # (possibly rotated) image has been saved.
        with Image.open(vga_wide_path) as src_img:
            depth = cv2.imread(depth_path, cv2.IMREAD_UNCHANGED)
            img, depth = _rotate_to_upright(src_img, depth, sky_direction_scene)

            W, H = img.size
            if not osp.isfile(img_out):
                img.save(img_out)

        # Bring the depth map to the RGB resolution without blending values.
        depth = cv2.resize(depth, (W, H), interpolation=cv2.INTER_NEAREST)
        # Never overwrite an existing file: avoids destroying the base dataset
        # if the paths are misconfigured.
        if not osp.isfile(depth_out):
            cv2.imwrite(depth_out, depth)

    # Save the metadata last.
    np.savez(
        scene_metadata_path,
        trajectories=trajectories,
        intrinsics=intrinsics,
        images=images,
    )
    return scene_subdir
def convert_scene_metadata(
    scene_subdir,
    intrinsics_dir,
    timestamps,
    quaternions,
    poses,
    poses_cam_to_world,
    all_images,
    selected_depths,
    timestamps_selected,
):
    """Collect intrinsics and an interpolated pose for each selected depth frame.

    Args:
        scene_subdir: scene name, used as the file-name prefix.
        intrinsics_dir: directory holding the ``.pincam`` intrinsics files.
        timestamps: trajectory timestamps.
        quaternions: trajectory rotations, interpolated with ``quaternion.squad``.
        poses: trajectory positions, interpolated linearly.
        poses_cam_to_world: full trajectory poses, used to estimate the scene
            orientation.
        all_images: names of the available RGB images.
        selected_depths: ``(basename, frame_id)`` pairs of candidate frames.
        timestamps_selected: timestamps of the candidate frames, in the same
            order as *selected_depths*.

    Returns:
        ``(sky_direction_scene, trajectories, intrinsics, images, depths)``.
        The four lists are parallel and only contain the frames for which
        both an intrinsics file and an RGB image name were found. ``images``
        and ``depths`` both hold the depth file's basename. Intrinsics are
        ``[w, h, fx, fy, hw, hh]``, with the axes swapped for "LEFT"/"RIGHT"
        scenes.
    """
    # Find the scene orientation.
    sky_direction_scene, rotated_to_cam = find_scene_orientation(poses_cam_to_world)

    # Compute a pose for every selected timestamp.
    timestamps_selected = np.array(timestamps_selected)
    spline = interpolate.interp1d(timestamps, poses, kind="linear", axis=0)
    interpolated_rotations = quaternion.squad(
        quaternions, timestamps, timestamps_selected
    )
    interpolated_positions = spline(timestamps_selected)

    # Hoisted loop invariants; the set makes each name lookup O(1).
    available_images = set(all_images)
    swap_axes = sky_direction_scene in ("RIGHT", "LEFT")
    intrinsics_search = int(0.1 / 0.001)  # +/- 100 steps of 1 ms
    image_search = int(0.001 / 0.001)  # +/- 1 step of 1 ms

    trajectories = []
    intrinsics = []
    images = []
    depths = []
    for i, (basename, frame_id) in enumerate(selected_depths):
        # Try the exact name first, then nearby millisecond timestamps.
        intrinsic_fn = osp.join(intrinsics_dir, f"{scene_subdir}_{frame_id}.pincam")
        for timestamp in range(-intrinsics_search, intrinsics_search + 1):
            if osp.exists(intrinsic_fn):
                break
            intrinsic_fn = osp.join(
                intrinsics_dir,
                f"{scene_subdir}_{float(frame_id) + timestamp * 0.001:.3f}.pincam",
            )
        if not osp.exists(intrinsic_fn):
            print(f"Skipping {intrinsic_fn}")
            continue

        # NOTE(review): unlike the intrinsics search above, these candidate
        # names are built from an unformatted float (no ":.3f"), so a
        # timestamp such as 12.340 is rendered "12.34". This looks
        # unintended but is preserved because changing it would change which
        # frames get exported - confirm before altering.
        image_path = "{}_{}.png".format(scene_subdir, frame_id)
        for timestamp in range(-image_search, image_search + 1):
            if image_path in available_images:
                break
            image_path = "{}_{}.png".format(
                scene_subdir, float(frame_id) + timestamp * 0.001
            )
        if image_path not in available_images:
            print(f"Skipping {scene_subdir} {frame_id}")
            continue

        w, h, fx, fy, hw, hh = np.loadtxt(intrinsic_fn)  # PINHOLE

        pose = np.eye(4)
        pose[:3, :3] = quaternion.as_rotation_matrix(interpolated_rotations[i])
        pose[:3, 3] = interpolated_positions[i]

        images.append(basename)
        depths.append(basename)
        if swap_axes:
            intrinsics.append([h, w, fy, fx, hh, hw])  # swapped intrinsics
        else:
            intrinsics.append([w, h, fx, fy, hw, hh])
        # pose_cam_to_world @ rotated_to_cam = rotated(cam) to world
        trajectories.append(pose @ rotated_to_cam)
    return sky_direction_scene, trajectories, intrinsics, images, depths
def find_scene_orientation(poses_cam_to_world):
    """Classify where the sky lies in the images and build the matching rotation.

    The per-pose "up" and "right" vectors are averaged and compared with the
    world up axis (0, 0, 1). The result is one of "UP", "DOWN", "LEFT" or
    "RIGHT". With no poses, the default axes are used.

    Returns:
        ``(sky_direction_scene, rotated_to_cam)`` where ``rotated_to_cam`` is
        the 4x4 inverse of the rotation about z chosen for that direction.
    """
    world_up = np.array([[0.0], [0.0], [1.0], [0.0]])

    n_poses = len(poses_cam_to_world)
    if n_poses > 0:
        mean_up = sum(get_up_vectors(p) for p in poses_cam_to_world) / n_poses
        mean_right = sum(get_right_vectors(p) for p in poses_cam_to_world) / n_poses
    else:
        mean_up = np.array([[0.0], [-1.0], [0.0], [0.0]])
        mean_right = np.array([[1.0], [0.0], [0.0], [0.0]])

    def angle_to_world_up(direction):
        # Angle in degrees, between 0 and 180.
        cosine = np.clip(np.dot(np.transpose(world_up), direction), -1.0, 1.0)
        return np.arccos(cosine).item() * 180.0 / np.pi

    device_up_to_world_up_angle = angle_to_world_up(mean_up)
    device_right_to_world_up_angle = angle_to_world_up(mean_right)

    up_offset = abs(device_up_to_world_up_angle - 90.0)
    right_offset = abs(device_right_to_world_up_angle - 90.0)

    if up_offset < right_offset:
        # The device up axis is the one closest to horizontal.
        assert up_offset < 45.0
        if device_right_to_world_up_angle > 90.0:
            sky_direction_scene = "LEFT"
            z_angle = math.pi / 2.0
        else:
            # note that in metadata.csv RIGHT does not exist, but again it's
            # not accurate... turns out there are scenes oriented like this,
            # for example Training/41124801
            sky_direction_scene = "RIGHT"
            z_angle = -math.pi / 2.0
        cam_to_rotated_q = quaternion.from_rotation_vector([0.0, 0.0, z_angle])
    else:
        # The device right axis is the one closest to horizontal.
        assert right_offset < 45.0
        if device_up_to_world_up_angle > 90.0:
            sky_direction_scene = "DOWN"
            cam_to_rotated_q = quaternion.from_rotation_vector([0.0, 0.0, math.pi])
        else:
            sky_direction_scene = "UP"
            cam_to_rotated_q = quaternion.quaternion(1, 0, 0, 0)

    cam_to_rotated = np.eye(4)
    cam_to_rotated[:3, :3] = quaternion.as_rotation_matrix(cam_to_rotated_q)
    rotated_to_cam = np.linalg.inv(cam_to_rotated)
    return sky_direction_scene, rotated_to_cam
if __name__ == "__main__":
    # Script entry point: read the CLI options and run the preprocessing.
    cli_args = get_parser().parse_args()
    main(cli_args.arkitscenes_dir, cli_args.output_dir)