|
|
import os |
|
|
import cv2 |
|
|
import json |
|
|
import numpy as np |
|
|
import os.path as osp |
|
|
from collections import deque |
|
|
import random |
|
|
from eval.mv_recon.base import BaseStereoViewDataset |
|
|
from dust3r.utils.image import imread_cv2 |
|
|
import eval.mv_recon.dataset_utils.cropping as cropping |
|
|
|
|
|
|
|
|
def shuffle_deque(dq, seed=None):
    """Return a new deque containing the items of *dq* in shuffled order.

    Args:
        dq: Input deque (left unmodified).
        seed: Optional int; when given, the shuffle is deterministic.

    Returns:
        A new ``collections.deque`` with the same items, shuffled.
    """
    # BUGFIX: use an instance-local PRNG instead of random.seed(), which
    # mutated the module-global random state as a hidden side effect.
    rng = random.Random(seed)
    shuffled_list = list(dq)
    rng.shuffle(shuffled_list)
    return deque(shuffled_list)
|
|
|
|
|
|
|
|
class SevenScenes(BaseStereoViewDataset):
    """7-Scenes RGB-D sequences for multi-view reconstruction evaluation.

    Each ``ROOT/<scene>/<seq-XX>`` directory holds per-frame triplets:
    ``frame-XXXXXX.color.png`` (RGB), ``frame-XXXXXX.depth.proj.png``
    (16-bit depth in millimeters, 65535 marking invalid pixels) and
    ``frame-XXXXXX.pose.txt`` (4x4 pose matrix).
    """

    def __init__(
        self,
        num_seq=1,
        num_frames=5,
        min_thresh=10,
        max_thresh=100,
        test_id=None,
        full_video=False,
        tuple_list=None,
        seq_id=None,
        rebuttal=False,
        shuffle_seed=-1,
        kf_every=1,
        *args,
        ROOT,
        **kwargs,
    ):
        self.ROOT = ROOT
        super().__init__(*args, **kwargs)
        self.num_seq = num_seq
        self.num_frames = num_frames
        self.max_thresh = max_thresh
        self.min_thresh = min_thresh
        self.test_id = test_id
        self.full_video = full_video
        self.kf_every = kf_every  # keyframe stride over the frame list
        self.seq_id = seq_id
        self.rebuttal = rebuttal
        # Negative value disables frame shuffling in _get_views.
        self.shuffle_seed = shuffle_seed

        self.load_all_tuples(tuple_list)
        self.load_all_scenes(ROOT)

    def __len__(self):
        if self.tuple_list is not None:
            return len(self.tuple_list)
        return len(self.scene_list) * self.num_seq

    def load_all_tuples(self, tuple_list):
        """Store an optional explicit list of '<scene>/<seq> <idx> <idx> ...' tuples."""
        if tuple_list is not None:
            self.tuple_list = tuple_list
        else:
            self.tuple_list = None

    def load_all_scenes(self, base_dir):
        """Populate ``self.scene_list`` with '<scene>/<seq-XX>' entries.

        When a tuple list was provided, a fixed hard-coded sequence list is
        used; otherwise sequences are read from the per-scene split files.
        """
        if self.tuple_list is not None:
            # Fixed sequence set used when evaluating from precomputed tuples.
            self.scene_list = [
                "stairs/seq-06",
                "stairs/seq-02",
                "pumpkin/seq-06",
                "chess/seq-01",
                "heads/seq-02",
                "fire/seq-02",
                "office/seq-03",
                "pumpkin/seq-03",
                "redkitchen/seq-07",
                "chess/seq-02",
                "office/seq-01",
                "redkitchen/seq-01",
                "fire/seq-01",
            ]
            print(f"Found {len(self.scene_list)} sequences in split {self.split}")
            return

        scenes = os.listdir(base_dir)

        file_split = {"train": "TrainSplit.txt", "test": "TestSplit.txt"}[self.split]

        self.scene_list = []
        for scene in scenes:
            if self.test_id is not None and scene != self.test_id:
                continue

            with open(osp.join(base_dir, scene, file_split)) as f:
                seq_ids = f.read().splitlines()

            for seq_id in seq_ids:
                # Split files store entries like "sequence1"; normalize to the
                # on-disk directory name "seq-01".
                num_part = "".join(filter(str.isdigit, seq_id))
                seq_id = f"seq-{num_part.zfill(2)}"
                if self.seq_id is not None and seq_id != self.seq_id:
                    continue
                self.scene_list.append(f"{scene}/{seq_id}")

        print(f"Found {len(self.scene_list)} sequences in split {self.split}")

    def _get_views(self, idx, resolution, rng):
        """Load all views (image, depth, pose, intrinsics) for dataset item *idx*."""
        if self.tuple_list is not None:
            line = self.tuple_list[idx].split(" ")
            scene_id = line[0]
            img_idxs = line[1:]
        else:
            scene_id = self.scene_list[idx // self.num_seq]

            data_path = osp.join(self.ROOT, scene_id)
            num_files = len([name for name in os.listdir(data_path) if "color" in name])
            img_idxs = [f"{i:06d}" for i in range(num_files)]
            img_idxs = img_idxs[:: self.kf_every]

        # Intrinsics shared by all 7-Scenes sequences (Kinect factory values).
        fx, fy, cx, cy = 525, 525, 320, 240
        intrinsics_ = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]], dtype=np.float32)

        views = []
        imgs_idxs = deque(img_idxs)
        if self.shuffle_seed >= 0:
            # BUGFIX: forward the configured seed; previously it was dropped,
            # so the "seeded" shuffle was not reproducible.
            imgs_idxs = shuffle_deque(imgs_idxs, seed=self.shuffle_seed)

        while len(imgs_idxs) > 0:
            im_idx = imgs_idxs.popleft()
            impath = osp.join(self.ROOT, scene_id, f"frame-{im_idx}.color.png")
            depthpath = osp.join(self.ROOT, scene_id, f"frame-{im_idx}.depth.proj.png")
            posepath = osp.join(self.ROOT, scene_id, f"frame-{im_idx}.pose.txt")

            rgb_image = imread_cv2(impath)
            depthmap = imread_cv2(depthpath, cv2.IMREAD_UNCHANGED)
            # Match RGB resolution to the depth map so they stay pixel-aligned.
            rgb_image = cv2.resize(rgb_image, (depthmap.shape[1], depthmap.shape[0]))

            depthmap[depthmap == 65535] = 0  # sensor's invalid-depth marker
            # BUGFIX: pass nan=0.0 by keyword; the second positional argument
            # of np.nan_to_num is `copy`, not the NaN replacement value.
            depthmap = np.nan_to_num(depthmap.astype(np.float32), nan=0.0) / 1000.0
            depthmap[depthmap > 10] = 0  # drop implausibly far readings (>10 m)
            depthmap[depthmap < 1e-3] = 0  # drop near-zero noise

            camera_pose = np.loadtxt(posepath).astype(np.float32)

            if resolution != (224, 224) or self.rebuttal:
                rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
                    rgb_image, depthmap, intrinsics_, resolution, rng=rng, info=impath
                )
            else:
                # For 224x224 output: resize to 512x384 first, then take a
                # centered 224x224 crop so the aspect ratio is preserved.
                rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
                    rgb_image, depthmap, intrinsics_, (512, 384), rng=rng, info=impath
                )
                W, H = rgb_image.size
                cx = W // 2
                cy = H // 2
                l, t = cx - 112, cy - 112
                r, b = cx + 112, cy + 112
                crop_bbox = (l, t, r, b)
                rgb_image, depthmap, intrinsics = cropping.crop_image_depthmap(
                    rgb_image, depthmap, intrinsics, crop_bbox
                )

            views.append(
                dict(
                    img=rgb_image,
                    depthmap=depthmap,
                    camera_pose=camera_pose,
                    camera_intrinsics=intrinsics,
                    dataset="7scenes",
                    label=osp.join(scene_id, im_idx),
                    instance=impath,
                )
            )
        return views
|
|
|
|
|
|
|
|
class DTU(BaseStereoViewDataset):
    """DTU MVS scenes for multi-view reconstruction evaluation.

    Each ``ROOT/<scene>`` directory contains ``images/*.jpg``,
    ``depths/*.npy``, ``binary_masks/*.png``, MVSNet-style ``cams/*_cam.txt``
    files, and a ``pair.txt`` view-pairing file.
    """

    def __init__(
        self,
        num_seq=49,
        num_frames=5,
        min_thresh=10,
        max_thresh=30,
        test_id=None,
        full_video=False,
        sample_pairs=False,
        kf_every=1,
        *args,
        ROOT,
        **kwargs,
    ):
        self.ROOT = ROOT
        super().__init__(*args, **kwargs)

        self.num_seq = num_seq
        self.num_frames = num_frames
        self.max_thresh = max_thresh
        self.min_thresh = min_thresh
        self.test_id = test_id
        self.full_video = full_video
        self.kf_every = kf_every
        # BUGFIX: store the boolean under a distinct name. Assigning it to
        # `self.sample_pairs` shadowed the `sample_pairs` pair-sampling method
        # that _get_views calls (not visible in this file — presumably
        # inherited; TODO(review): confirm where it is defined), making the
        # call raise "'bool' object is not callable" when full_video=False.
        self.sample_pairs_flag = sample_pairs

        self.load_all_scenes(ROOT)

    def __len__(self):
        return len(self.scene_list) * self.num_seq

    def load_all_scenes(self, base_dir):
        """Populate ``self.scene_list``: all scenes, or only ``test_id``."""
        if self.test_id is None:
            self.scene_list = os.listdir(osp.join(base_dir))
            print(f"Found {len(self.scene_list)} scenes in split {self.split}")
        else:
            if isinstance(self.test_id, list):
                self.scene_list = self.test_id
            else:
                self.scene_list = [self.test_id]

            print(f"Test_id: {self.test_id}")

    def load_cam_mvsnet(self, file, interval_scale=1):
        """Parse an MVSNet camera txt file.

        Layout: a 4x4 extrinsic matrix (words 1..16), a 3x3 intrinsic matrix
        (words 18..26), then optionally depth-range parameters
        [depth_min, interval, (num_planes), (depth_max)].

        Returns:
            (intrinsic, extrinsic): 4x4 float32 arrays; row 3 of the
            intrinsic block carries the depth-range parameters.
        """
        cam = np.zeros((2, 4, 4))
        words = file.read().split()

        # Extrinsic: words[1..16] row-major into cam[0].
        for i in range(0, 4):
            for j in range(0, 4):
                extrinsic_index = 4 * i + j + 1
                cam[0][i][j] = words[extrinsic_index]

        # Intrinsic: words[18..26] row-major into cam[1][:3, :3].
        for i in range(0, 3):
            for j in range(0, 3):
                intrinsic_index = 3 * i + j + 18
                cam[1][i][j] = words[intrinsic_index]

        # Depth range: variants with 2, 3 or 4 trailing values.
        if len(words) == 29:
            cam[1][3][0] = words[27]
            cam[1][3][1] = float(words[28]) * interval_scale
            cam[1][3][2] = 192  # default number of depth planes
            cam[1][3][3] = cam[1][3][0] + cam[1][3][1] * cam[1][3][2]
        elif len(words) == 30:
            cam[1][3][0] = words[27]
            cam[1][3][1] = float(words[28]) * interval_scale
            cam[1][3][2] = words[29]
            cam[1][3][3] = cam[1][3][0] + cam[1][3][1] * cam[1][3][2]
        elif len(words) == 31:
            cam[1][3][0] = words[27]
            cam[1][3][1] = float(words[28]) * interval_scale
            cam[1][3][2] = words[29]
            cam[1][3][3] = words[30]
        else:
            cam[1][3][0] = 0
            cam[1][3][1] = 0
            cam[1][3][2] = 0
            cam[1][3][3] = 0

        extrinsic = cam[0].astype(np.float32)
        intrinsic = cam[1].astype(np.float32)

        return intrinsic, extrinsic

    def _get_views(self, idx, resolution, rng):
        """Load all views (image, masked depth, pose, intrinsics) for item *idx*."""
        scene_id = self.scene_list[idx // self.num_seq]
        seq_id = idx % self.num_seq

        print("Scene ID:", scene_id)

        image_path = osp.join(self.ROOT, scene_id, "images")
        depth_path = osp.join(self.ROOT, scene_id, "depths")
        mask_path = osp.join(self.ROOT, scene_id, "binary_masks")
        cam_path = osp.join(self.ROOT, scene_id, "cams")
        pairs_path = osp.join(self.ROOT, scene_id, "pair.txt")

        if not self.full_video:
            img_idxs = self.sample_pairs(pairs_path, seq_id)
        else:
            img_idxs = sorted(os.listdir(image_path))
            img_idxs = img_idxs[:: self.kf_every]

        views = []
        imgs_idxs = deque(img_idxs)

        while len(imgs_idxs) > 0:
            im_idx = imgs_idxs.pop()
            impath = osp.join(image_path, im_idx)
            depthpath = osp.join(depth_path, im_idx.replace(".jpg", ".npy"))
            campath = osp.join(cam_path, im_idx.replace(".jpg", "_cam.txt"))
            maskpath = osp.join(mask_path, im_idx.replace(".jpg", ".png"))

            rgb_image = imread_cv2(impath)
            depthmap = np.load(depthpath)
            # BUGFIX: pass nan=0.0 by keyword; the second positional argument
            # of np.nan_to_num is `copy`, not the NaN replacement value.
            depthmap = np.nan_to_num(depthmap.astype(np.float32), nan=0.0)

            # Binarize the foreground mask and align it with the depth map.
            mask = imread_cv2(maskpath, cv2.IMREAD_UNCHANGED) / 255.0
            mask = mask.astype(np.float32)

            mask[mask > 0.5] = 1.0
            mask[mask < 0.5] = 0.0

            mask = cv2.resize(
                mask,
                (depthmap.shape[1], depthmap.shape[0]),
                interpolation=cv2.INTER_NEAREST,
            )
            # Erode to drop unreliable depth at the mask boundary.
            kernel = np.ones((10, 10), np.uint8)
            mask = cv2.erode(mask, kernel, iterations=1)
            depthmap = depthmap * mask

            # BUGFIX: close the camera file (was opened without ever closing).
            with open(campath, "r") as cam_file:
                cur_intrinsics, camera_pose = self.load_cam_mvsnet(cam_file)
            intrinsics = cur_intrinsics[:3, :3]
            # MVSNet stores world-to-camera; invert to camera-to-world.
            camera_pose = np.linalg.inv(camera_pose)

            if resolution != (224, 224):
                rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
                    rgb_image, depthmap, intrinsics, resolution, rng=rng, info=impath
                )
            else:
                # For 224x224 output: resize to 512x384 first, then take a
                # centered 224x224 crop so the aspect ratio is preserved.
                rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
                    rgb_image, depthmap, intrinsics, (512, 384), rng=rng, info=impath
                )
                W, H = rgb_image.size
                cx = W // 2
                cy = H // 2
                l, t = cx - 112, cy - 112
                r, b = cx + 112, cy + 112
                crop_bbox = (l, t, r, b)
                rgb_image, depthmap, intrinsics = cropping.crop_image_depthmap(
                    rgb_image, depthmap, intrinsics, crop_bbox
                )

            views.append(
                dict(
                    img=rgb_image,
                    depthmap=depthmap,
                    camera_pose=camera_pose,
                    camera_intrinsics=intrinsics,
                    dataset="dtu",
                    label=osp.join(scene_id, im_idx),
                    instance=impath,
                )
            )

        return views
|
|
|
|
|
|
|
|
class NRGBD(BaseStereoViewDataset):
    """NeuralRGBD scenes for multi-view reconstruction evaluation.

    Each ``ROOT/<scene>`` directory contains ``images/imgN.png``,
    ``depth/depthN.png`` (millimeters) and a ``poses.txt`` file with one
    4x4 pose matrix per frame (4 text lines each).
    """

    def __init__(
        self,
        num_seq=1,
        num_frames=5,
        min_thresh=10,
        max_thresh=100,
        test_id=None,
        full_video=False,
        tuple_list=None,
        seq_id=None,
        rebuttal=False,
        shuffle_seed=-1,
        kf_every=1,
        *args,
        ROOT,
        **kwargs,
    ):
        self.ROOT = ROOT
        super().__init__(*args, **kwargs)
        self.num_seq = num_seq
        self.num_frames = num_frames
        self.max_thresh = max_thresh
        self.min_thresh = min_thresh
        self.test_id = test_id
        self.full_video = full_video
        self.kf_every = kf_every  # keyframe stride over the frame list
        self.seq_id = seq_id
        self.rebuttal = rebuttal
        # Negative value disables frame shuffling in _get_views.
        self.shuffle_seed = shuffle_seed

        self.load_all_tuples(tuple_list)
        self.load_all_scenes(ROOT)

    def __len__(self):
        if self.tuple_list is not None:
            return len(self.tuple_list)
        return len(self.scene_list) * self.num_seq

    def load_all_tuples(self, tuple_list):
        """Store an optional explicit list of '<scene> <idx> <idx> ...' tuples."""
        if tuple_list is not None:
            self.tuple_list = tuple_list
        else:
            self.tuple_list = None

    def load_all_scenes(self, base_dir):
        """Populate ``self.scene_list``: every scene directory, or only ``test_id``."""
        scenes = [
            d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))
        ]

        if self.test_id is not None:
            self.scene_list = [self.test_id]
        else:
            self.scene_list = scenes

        print(f"Found {len(self.scene_list)} sequences in split {self.split}")

    def load_poses(self, path):
        """Parse ``poses.txt``: consecutive 4-line blocks, one 4x4 matrix each.

        Returns:
            (poses, valid): float32 array of shape (N, 4, 4) and a list of
            bools; invalid (NaN-containing) poses are replaced by identity.
        """
        with open(path, "r") as file:
            lines = file.readlines()
        poses = []
        valid = []
        lines_per_matrix = 4
        for i in range(0, len(lines), lines_per_matrix):
            matrix_lines = lines[i : i + lines_per_matrix]
            # BUGFIX: scan all four lines of the matrix for NaNs; previously
            # only the first line was checked, so a NaN in rows 2-4 slipped
            # through as a "valid" pose.
            if any("nan" in line for line in matrix_lines):
                valid.append(False)
                poses.append(np.eye(4, 4, dtype=np.float32).tolist())
            else:
                valid.append(True)
                pose_floats = [
                    [float(x) for x in line.split()] for line in matrix_lines
                ]
                poses.append(pose_floats)

        return np.array(poses, dtype=np.float32), valid

    def _get_views(self, idx, resolution, rng):
        """Load all views (image, depth, pose, intrinsics) for dataset item *idx*."""
        if self.tuple_list is not None:
            line = self.tuple_list[idx].split(" ")
            scene_id = line[0]
            img_idxs = line[1:]
        else:
            scene_id = self.scene_list[idx // self.num_seq]

            num_files = len(os.listdir(os.path.join(self.ROOT, scene_id, "images")))
            img_idxs = [f"{i}" for i in range(num_files)]
            # Cap the stride so at least ~2 frames survive; BUGFIX: clamp to
            # >= 1 so tiny sequences cannot produce a zero slice step.
            step = max(1, min(self.kf_every, len(img_idxs) // 2))
            img_idxs = img_idxs[::step]

        # Fixed pinhole intrinsics shared by all NeuralRGBD sequences.
        fx, fy, cx, cy = 554.2562584220408, 554.2562584220408, 320, 240
        intrinsics_ = np.array([[fx, 0, cx], [0, fy, cy], [0, 0, 1]], dtype=np.float32)

        posepath = osp.join(self.ROOT, scene_id, "poses.txt")
        camera_poses, valids = self.load_poses(posepath)

        imgs_idxs = deque(img_idxs)
        if self.shuffle_seed >= 0:
            # BUGFIX: forward the configured seed; previously it was dropped,
            # so the "seeded" shuffle was not reproducible.
            imgs_idxs = shuffle_deque(imgs_idxs, seed=self.shuffle_seed)
        views = []

        while len(imgs_idxs) > 0:
            im_idx = imgs_idxs.popleft()

            impath = osp.join(self.ROOT, scene_id, "images", f"img{im_idx}.png")
            depthpath = osp.join(self.ROOT, scene_id, "depth", f"depth{im_idx}.png")

            rgb_image = imread_cv2(impath)
            depthmap = imread_cv2(depthpath, cv2.IMREAD_UNCHANGED)
            # BUGFIX: pass nan=0.0 by keyword; the second positional argument
            # of np.nan_to_num is `copy`, not the NaN replacement value.
            depthmap = np.nan_to_num(depthmap.astype(np.float32), nan=0.0) / 1000.0
            depthmap[depthmap > 10] = 0  # drop implausibly far readings (>10 m)
            depthmap[depthmap < 1e-3] = 0  # drop near-zero noise

            # Match RGB resolution to the depth map so they stay pixel-aligned.
            rgb_image = cv2.resize(rgb_image, (depthmap.shape[1], depthmap.shape[0]))

            camera_pose = camera_poses[int(im_idx)]

            # Flip Y and Z axes — presumably OpenGL-to-OpenCV camera
            # convention; TODO(review): confirm against the dataset docs.
            camera_pose[:, 1:3] *= -1.0
            if resolution != (224, 224) or self.rebuttal:
                rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
                    rgb_image, depthmap, intrinsics_, resolution, rng=rng, info=impath
                )
            else:
                # For 224x224 output: resize to 512x384 first, then take a
                # centered 224x224 crop so the aspect ratio is preserved.
                rgb_image, depthmap, intrinsics = self._crop_resize_if_necessary(
                    rgb_image, depthmap, intrinsics_, (512, 384), rng=rng, info=impath
                )
                W, H = rgb_image.size
                cx = W // 2
                cy = H // 2
                l, t = cx - 112, cy - 112
                r, b = cx + 112, cy + 112
                crop_bbox = (l, t, r, b)
                rgb_image, depthmap, intrinsics = cropping.crop_image_depthmap(
                    rgb_image, depthmap, intrinsics, crop_bbox
                )

            views.append(
                dict(
                    img=rgb_image,
                    depthmap=depthmap,
                    camera_pose=camera_pose,
                    camera_intrinsics=intrinsics,
                    dataset="nrgbd",
                    label=osp.join(scene_id, im_idx),
                    instance=impath,
                )
            )

        return views
|
|
|