Spaces:

CgvKodai
/

_vggt

Runtime error

App Files Files Community

_vggt / training /data /dataset_util.py

CgvKodai

Upload folder using huggingface_hub

66003a2 verified about 1 month ago

raw

history blame contribute delete

24.6 kB

	# Copyright (c) Meta Platforms, Inc. and affiliates.
	# All rights reserved.
	#
	# This source code is licensed under the license found in the
	# LICENSE file in the root directory of this source tree.

	import os
	os.environ["OPENCV_IO_ENABLE_OPENEXR"] = "1"
	import cv2
	import math
	import numpy as np
	from PIL import Image
	import PIL
	# Pick the resampling filters from the `Resampling` namespace when this Pillow
	# build provides it; otherwise use the constants exposed directly on PIL.Image.
	if hasattr(PIL.Image, "Resampling"):
	    lanczos = PIL.Image.Resampling.LANCZOS
	    bicubic = PIL.Image.Resampling.BICUBIC
	else:
	    lanczos = PIL.Image.LANCZOS
	    bicubic = PIL.Image.BICUBIC

	from vggt.utils.geometry import closed_form_inverse_se3



	#####################################################################################################################
	def crop_image_depth_and_intrinsic_by_pp(
	    image, depth_map, intrinsic, target_shape, track=None, filepath=None, strict=False
	):
	    """
	    Crops the given image and depth map around the camera's principal point, as defined by `intrinsic`.

	    Axis convention: every shape in this function is (rows, cols), i.e. (height, width).
	    The principal point is read as intrinsic[1, 2] along the row axis and
	    intrinsic[0, 2] along the column axis.

	    Specifically:
	      - The crop window is placed around the principal point.
	      - If `strict=True` and the crop comes out smaller than `target_shape`, the image
	        (and depth map) are zero-padded at the bottom/right to reach `target_shape`.
	      - The principal point in the intrinsic (and `track`, if provided) is shifted by the
	        crop offset. Neither input array is modified; copies are returned.

	    Args:
	        image (np.ndarray):
	            Input image array of shape (H, W, 3).
	        depth_map (np.ndarray or None):
	            Depth map array of shape (H, W), or None if not available.
	        intrinsic (np.ndarray):
	            Camera intrinsic matrix (3x3).
	        target_shape (tuple[int, int] or np.ndarray):
	            Desired output shape as (height, width).
	        track (np.ndarray or None):
	            Optional array of shape (N, 2). Interpreted as (x, y) pixel coordinates.
	            A shifted copy is returned.
	        filepath (str or None):
	            An optional file path, only used in the message printed when strict mode pads.
	        strict (bool):
	            If True, will zero-pad to ensure the exact target_shape even if the cropped region is smaller.

	    Raises:
	        AssertionError:
	            If the input image is smaller than `target_shape`, or if the computed crop
	            start is negative.
	        ValueError:
	            If the cropped image is larger than `target_shape` (in strict mode), which should not normally happen.

	    Returns:
	        tuple:
	            (cropped_image, cropped_depth_map, updated_intrinsic, updated_track)

	            - cropped_image (np.ndarray): Cropped (and optionally padded) image.
	            - cropped_depth_map (np.ndarray or None): Cropped (and optionally padded) depth map.
	            - updated_intrinsic (np.ndarray): Intrinsic matrix adjusted for the crop.
	            - updated_track (np.ndarray or None): Track array adjusted for the crop, or None if track was not provided.
	    """
	    image_rows, image_cols = image.shape[:2]
	    target_rows, target_cols = target_shape[0], target_shape[1]
	    intrinsic = np.copy(intrinsic)

	    if image_rows < target_rows:
	        error_message = (
	            f"Height check failed: original height {image_rows} "
	            f"is less than target height {target_rows}."
	        )
	        print(error_message)
	        raise AssertionError(error_message)

	    if image_cols < target_cols:
	        error_message = (
	            f"Width check failed: original width {image_cols} "
	            f"is less than target width {target_cols}."
	        )
	        print(error_message)
	        raise AssertionError(error_message)

	    # Principal point expressed as (row, col) so it lines up with array indexing.
	    pp_row = intrinsic[1, 2]
	    pp_col = intrinsic[0, 2]

	    # Half-extent of the crop window along each axis. In non-strict mode the window
	    # is also limited by the distance to the far image border so it stays symmetric.
	    if strict:
	        half_rows = min(target_rows / 2, pp_row)
	        half_cols = min(target_cols / 2, pp_col)
	    else:
	        half_rows = min(target_rows / 2, pp_row, image_rows - pp_row)
	        half_cols = min(target_cols / 2, pp_col, image_cols - pp_col)

	    start_row = math.floor(pp_row) - math.floor(half_rows)
	    start_col = math.floor(pp_col) - math.floor(half_cols)

	    # Explicit raise (rather than `assert`) so the check survives `python -O`.
	    if start_row < 0 or start_col < 0:
	        raise AssertionError(
	            f"Crop start must be non-negative, got ({start_row}, {start_col})."
	        )

	    if strict:
	        end_row = start_row + target_rows
	        end_col = start_col + target_cols
	    else:
	        end_row = start_row + 2 * math.floor(half_rows)
	        end_col = start_col + 2 * math.floor(half_cols)

	    # Slicing past the image border silently truncates; strict mode pads it back below.
	    image = image[start_row:end_row, start_col:end_col, :]
	    if depth_map is not None:
	        depth_map = depth_map[start_row:end_row, start_col:end_col]

	    # Shift the principal point by the crop offset.
	    intrinsic[1, 2] = intrinsic[1, 2] - start_row
	    intrinsic[0, 2] = intrinsic[0, 2] - start_col

	    # Shift the track on a copy so the caller's array is left untouched.
	    if track is not None:
	        track = np.copy(track)
	        track[:, 1] = track[:, 1] - start_row
	        track[:, 0] = track[:, 0] - start_col

	    # Tuple comparison works whether target_shape was given as a tuple or an array.
	    if strict and tuple(image.shape[:2]) != (target_rows, target_cols):
	        print(f"{filepath} does not meet the target shape")
	        current_h, current_w = image.shape[:2]
	        target_h, target_w = target_rows, target_cols
	        pad_h = target_h - current_h
	        pad_w = target_w - current_w
	        if pad_h < 0 or pad_w < 0:
	            raise ValueError(
	                f"The cropped image is bigger than the target shape: "
	                f"cropped=({current_h},{current_w}), "
	                f"target=({target_h},{target_w})."
	            )
	        image = np.pad(
	            image,
	            pad_width=((0, pad_h), (0, pad_w), (0, 0)),
	            mode="constant",
	            constant_values=0,
	        )
	        if depth_map is not None:
	            depth_map = np.pad(
	                depth_map,
	                pad_width=((0, pad_h), (0, pad_w)),
	                mode="constant",
	                constant_values=0,
	            )

	    return image, depth_map, intrinsic, track


	def resize_image_depth_and_intrinsic(
	    image,
	    depth_map,
	    intrinsic,
	    target_shape,
	    original_size,
	    track=None,
	    pixel_center=True,
	    safe_bound=4,
	    rescale_aug=True,
	):
	    """
	    Resizes the given image and depth map (if provided) to slightly larger than `target_shape`,
	    updating the intrinsic matrix (and track array if present). Optionally uses random rescaling
	    to create some additional margin (based on `rescale_aug`).

	    Steps:
	      1. Compute a single scaling factor so that the resized result is at least `target_shape + safe_bound`
	         along both axes (the larger of the two per-axis ratios is used).
	      2. If `rescale_aug=True`, first enlarge `safe_bound` by a random fraction of the larger
	         target side, drawn from a triangular distribution on [0, 0.3] with its mode at 0.
	      3. Resize the image with LANCZOS if downscaling, BICUBIC otherwise.
	      4. Resize the depth map with nearest-neighbor.
	      5. Update the camera intrinsic and track coordinates (if any).

	    Args:
	        image (np.ndarray):
	            Input image array (H, W, 3).
	        depth_map (np.ndarray or None):
	            Depth map array (H, W), or None if unavailable.
	        intrinsic (np.ndarray):
	            Camera intrinsic matrix (3x3). Not modified; a scaled copy is returned.
	        target_shape (np.ndarray or tuple[int, int]):
	            Desired final shape (height, width).
	        original_size (np.ndarray or tuple[int, int]):
	            Original size of the image in (height, width).
	        track (np.ndarray or None):
	            Optional (N, 2) array of pixel coordinates. Will be scaled.
	        pixel_center (bool):
	            If True, accounts for 0.5 pixel center shift when scaling the principal point.
	        safe_bound (int or float):
	            Additional margin (in pixels) to add to target_shape before resizing.
	        rescale_aug (bool):
	            If True, randomly increase the `safe_bound` within a certain range to simulate augmentation.

	    Returns:
	        tuple:
	            (resized_image, resized_depth_map, updated_intrinsic, updated_track)

	            - resized_image (np.ndarray): The resized image.
	            - resized_depth_map (np.ndarray or None): The resized depth map.
	            - updated_intrinsic (np.ndarray): Camera intrinsic updated for new resolution.
	            - updated_track (np.ndarray or None): Track array updated or None if not provided.

	    Raises:
	        AssertionError:
	            If the shapes of the resized image and depth map do not match.
	    """
	    # Accept plain tuples as documented; the arithmetic below needs arrays.
	    target_shape = np.asarray(target_shape)
	    original_size = np.asarray(original_size)

	    if rescale_aug:
	        random_boundary = np.random.triangular(0, 0, 0.3)
	        safe_bound = safe_bound + random_boundary * target_shape.max()

	    resize_scales = (target_shape + safe_bound) / original_size
	    max_resize_scale = np.max(resize_scales)
	    intrinsic = np.copy(intrinsic)

	    # Convert image to PIL for resizing. PIL reports size as (width, height).
	    pil_image = Image.fromarray(image)
	    input_resolution = np.array(pil_image.size)
	    output_resolution = np.floor(input_resolution * max_resize_scale).astype(int)
	    # Plain Python ints keep both PIL and OpenCV happy about the size argument.
	    out_width, out_height = int(output_resolution[0]), int(output_resolution[1])
	    pil_image = pil_image.resize(
	        (out_width, out_height),
	        resample=lanczos if max_resize_scale < 1 else bicubic,
	    )
	    image = np.array(pil_image)

	    if depth_map is not None:
	        # An explicit output size is given, so no fx/fy scale factors are passed.
	        depth_map = cv2.resize(
	            depth_map,
	            (out_width, out_height),
	            interpolation=cv2.INTER_NEAREST,
	        )

	    # The floor() above means the achieved scale can differ slightly from the requested one.
	    actual_size = np.array(image.shape[:2])
	    actual_resize_scale = np.max(actual_size / original_size)

	    if pixel_center:
	        intrinsic[0, 2] = intrinsic[0, 2] + 0.5
	        intrinsic[1, 2] = intrinsic[1, 2] + 0.5

	    intrinsic[:2, :] = intrinsic[:2, :] * actual_resize_scale

	    # NOTE(review): the track is scaled without the 0.5 pixel-center shift that the
	    # principal point receives — confirm this is intended.
	    if track is not None:
	        track = track * actual_resize_scale

	    if pixel_center:
	        intrinsic[0, 2] = intrinsic[0, 2] - 0.5
	        intrinsic[1, 2] = intrinsic[1, 2] - 0.5

	    # Only compare shapes when a depth map exists; None is a documented input.
	    if depth_map is not None and image.shape[:2] != depth_map.shape[:2]:
	        raise AssertionError(
	            f"Resized image shape {image.shape[:2]} does not match "
	            f"resized depth shape {depth_map.shape[:2]}."
	        )
	    return image, depth_map, intrinsic, track


	def threshold_depth_map(
	    depth_map: np.ndarray,
	    max_percentile: float = 99,
	    min_percentile: float = 1,
	    max_depth: float = -1,
	) -> np.ndarray:
	    """
	    Zeroes out depth outliers using an optional absolute cap and percentile limits.

	    The input is never modified; the work happens on a float copy.

	    Order of operations:
	      1. When `max_depth > 0`, every value above `max_depth` becomes zero.
	      2. The upper and lower percentile limits are computed (NaN-aware) on the
	         result of step 1, each only when its percentile argument is > 0.
	      3. Values above the upper limit, then values below the lower limit, become
	         zero. A limit is only applied when it is itself > 0.

	    Args:
	        depth_map (np.ndarray):
	            Input depth map (H, W).
	        max_percentile (float):
	            Upper percentile (0-100). Values above this will be set to zero.
	        min_percentile (float):
	            Lower percentile (0-100). Values below this will be set to zero.
	        max_depth (float):
	            Absolute maximum depth. If > 0, any depth above this is set to zero.
	            If <= 0, no maximum-depth cap is applied.

	    Returns:
	        np.ndarray:
	            Float depth map (H, W) after thresholding. Some or all values may be zero.
	            Returns None if depth_map is None.
	    """
	    if depth_map is None:
	        return None

	    filtered = depth_map.astype(float, copy=True)

	    # Absolute cap first, so it also influences the percentiles computed below.
	    if max_depth > 0:
	        filtered = np.where(filtered > max_depth, 0.0, filtered)

	    # Both limits are taken from the same snapshot, before either is applied.
	    upper_limit = None
	    if max_percentile > 0:
	        upper_limit = np.nanpercentile(filtered, max_percentile)
	    lower_limit = None
	    if min_percentile > 0:
	        lower_limit = np.nanpercentile(filtered, min_percentile)

	    if upper_limit is not None and upper_limit > 0:
	        filtered = np.where(filtered > upper_limit, 0.0, filtered)
	    if lower_limit is not None and lower_limit > 0:
	        filtered = np.where(filtered < lower_limit, 0.0, filtered)

	    return filtered


	def depth_to_world_coords_points(
	    depth_map: np.ndarray,
	    extrinsic: np.ndarray,
	    intrinsic: np.ndarray,
	    eps=1e-8,
	) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
	    """
	    Lifts a depth map to 3D points in the world frame.

	    The depth map is first unprojected into the camera frame with
	    `depth_to_cam_coords_points`, then moved to the world frame using the
	    inverse of `extrinsic`.

	    Args:
	        depth_map (np.ndarray):
	            Depth map of shape (H, W).
	        extrinsic (np.ndarray):
	            Extrinsic matrix of shape (3, 4), representing the camera pose in OpenCV convention (camera-from-world).
	        intrinsic (np.ndarray):
	            Intrinsic matrix of shape (3, 3).
	        eps (float):
	            Depth values must be strictly greater than this to count as valid.

	    Returns:
	        tuple[np.ndarray, np.ndarray, np.ndarray]:
	            (world_coords_points, cam_coords_points, point_mask)

	            - world_coords_points: (H, W, 3) array of 3D points in world frame.
	            - cam_coords_points: (H, W, 3) array of 3D points in camera frame.
	            - point_mask: (H, W) boolean array, True where depth > eps.

	            All three are None when `depth_map` is None.
	    """
	    if depth_map is None:
	        return None, None, None

	    cam_coords_points = depth_to_cam_coords_points(depth_map, intrinsic)

	    # `extrinsic` maps world -> camera; its inverse maps camera -> world.
	    # The helper works on a batch, hence the added and removed leading axis.
	    world_from_cam = closed_form_inverse_se3(extrinsic[None])[0]
	    rotation = world_from_cam[:3, :3]
	    translation = world_from_cam[:3, 3]

	    # Row-vector form of R @ p + t, applied to every pixel: (H, W, 3) x (3, 3).
	    world_coords_points = np.dot(cam_coords_points, rotation.T) + translation

	    point_mask = depth_map > eps
	    return world_coords_points, cam_coords_points, point_mask


	def depth_to_cam_coords_points(
	    depth_map: np.ndarray, intrinsic: np.ndarray
	) -> np.ndarray:
	    """
	    Back-projects every pixel of a depth map into the camera frame.

	    For pixel column u, row v and depth d the result is
	    ((u - cx) * d / fx, (v - cy) * d / fy, d).

	    Args:
	        depth_map (np.ndarray):
	            Depth map of shape (H, W).
	        intrinsic (np.ndarray):
	            3x3 camera intrinsic matrix.
	            Must have zero skew and the standard OpenCV layout:
	                [ fx   0  cx ]
	                [  0  fy  cy ]
	                [  0   0   1 ]

	    Returns:
	        np.ndarray:
	            A float32 (H, W, 3) array holding (x, y, z) in the camera frame for each pixel.

	    Raises:
	        AssertionError:
	            If `intrinsic` is not 3x3 or has a non-zero skew entry.
	    """
	    height, width = depth_map.shape
	    assert intrinsic.shape == (3, 3), "Intrinsic matrix must be 3x3"
	    assert (
	        intrinsic[0, 1] == 0 and intrinsic[1, 0] == 0
	    ), "Intrinsic matrix must have zero skew"

	    focal_u, focal_v = intrinsic[0, 0], intrinsic[1, 1]
	    center_u, center_v = intrinsic[0, 2], intrinsic[1, 2]

	    # Column indices as a row vector and row indices as a column vector;
	    # broadcasting against the (H, W) depth map expands them to the full grid.
	    col_index = np.arange(width)[np.newaxis, :]
	    row_index = np.arange(height)[:, np.newaxis]

	    x_cam = (col_index - center_u) * depth_map / focal_u
	    y_cam = (row_index - center_v) * depth_map / focal_v

	    return np.stack((x_cam, y_cam, depth_map), axis=-1).astype(np.float32)


	def rotate_90_degrees(
	    image, depth_map, extri_opencv, intri_opencv, clockwise=True, track=None
	):
	    """
	    Applies one 90-degree in-plane rotation to an image and everything tied to it.

	    This is a thin coordinator: the image/depth pixels, the intrinsic matrix, the
	    extrinsic matrix and the optional track are each handed to their dedicated
	    `*_rot90` helper with the same `clockwise` flag.

	    Args:
	        image (np.ndarray):
	            Input image of shape (H, W, 3).
	        depth_map (np.ndarray or None):
	            Depth map of shape (H, W), or None if not available.
	        extri_opencv (np.ndarray):
	            Extrinsic matrix (3x4) in OpenCV convention.
	        intri_opencv (np.ndarray):
	            Intrinsic matrix (3x3).
	        clockwise (bool):
	            If True, rotates the image 90 degrees clockwise; else 90 degrees counterclockwise.
	        track (np.ndarray or None):
	            Optional (N, 2) track array. Will be rotated accordingly.

	    Returns:
	        tuple:
	            (
	                rotated_image,
	                rotated_depth_map,
	                new_extri_opencv,
	                new_intri_opencv,
	                new_track
	            )

	            Note the order: extrinsic comes before intrinsic. `new_track` is None
	            when no track was passed in.
	    """
	    # The helpers need the size of the image *before* rotation.
	    height_before, width_before = image.shape[:2]

	    rotated_image, rotated_depth_map = rotate_image_and_depth_rot90(
	        image, depth_map, clockwise
	    )
	    new_intri_opencv = adjust_intrinsic_matrix_rot90(
	        intri_opencv, width_before, height_before, clockwise
	    )
	    new_track = (
	        None
	        if track is None
	        else adjust_track_rot90(track, width_before, height_before, clockwise)
	    )
	    new_extri_opencv = adjust_extrinsic_matrix_rot90(extri_opencv, clockwise)

	    return (
	        rotated_image,
	        rotated_depth_map,
	        new_extri_opencv,
	        new_intri_opencv,
	        new_track,
	    )


	def rotate_image_and_depth_rot90(image, depth_map, clockwise):
	    """
	    Rotates the given image and depth map by 90 degrees (clockwise or counterclockwise)
	    in the plane of the first two axes.

	    Args:
	        image (np.ndarray):
	            Input image of shape (H, W, 3).
	        depth_map (np.ndarray or None):
	            Depth map of shape (H, W), or None if not available.
	        clockwise (bool):
	            If True, rotate 90 degrees clockwise; else 90 degrees counterclockwise.

	    Returns:
	        tuple:
	            (rotated_image, rotated_depth_map)

	            Both arrays are fresh copies with shape (W, H, ...). `rotated_depth_map`
	            is None (not an array wrapping None) when `depth_map` is None, so callers
	            can keep using `is not None` checks.
	    """
	    # np.rot90 counts counterclockwise turns, so one clockwise turn is k=-1.
	    # This yields the same result as transpose followed by a flip.
	    quarter_turns = -1 if clockwise else 1

	    rotated_image = np.copy(np.rot90(image, k=quarter_turns, axes=(0, 1)))
	    if depth_map is None:
	        return rotated_image, None

	    rotated_depth_map = np.copy(np.rot90(depth_map, k=quarter_turns, axes=(0, 1)))
	    return rotated_image, rotated_depth_map


	def adjust_extrinsic_matrix_rot90(extri_opencv, clockwise):
	    """
	    Updates a 3x4 extrinsic matrix so it matches an image rotated by 90 degrees.

	    A quarter-turn rotation about the camera z-axis is left-multiplied onto both
	    the rotation part and the translation part of the extrinsic. The direction of
	    the quarter turn is selected by `clockwise`.

	    Args:
	        extri_opencv (np.ndarray):
	            Extrinsic matrix (3x4) in OpenCV convention.
	        clockwise (bool):
	            If True, rotate extrinsic for a 90-degree clockwise image rotation;
	            otherwise, counterclockwise.

	    Returns:
	        np.ndarray:
	            A new 3x4 extrinsic matrix after the rotation. The input is not modified.
	    """
	    # The two quarter turns differ only in the sign of the off-diagonal pair.
	    sign = 1 if clockwise else -1
	    quarter_turn = np.array([
	        [0, -sign, 0],
	        [sign, 0, 0],
	        [0, 0, 1],
	    ])

	    rotation_part = extri_opencv[:, :3]
	    translation_part = extri_opencv[:, 3]

	    turned_rotation = quarter_turn @ rotation_part
	    turned_translation = quarter_turn @ translation_part

	    return np.concatenate(
	        (turned_rotation, turned_translation[:, np.newaxis]), axis=1
	    )


	def adjust_intrinsic_matrix_rot90(intri_opencv, image_width, image_height, clockwise):
	    """
	    Builds the intrinsic matrix that corresponds to an image rotated by 90 degrees.

	    The two focal lengths trade places. The principal point becomes
	    (image_height - cy, cx) for a clockwise turn and (cy, image_width - cx) for a
	    counterclockwise turn. Only fx, fy, cx and cy are read from the input; every
	    other entry of the result comes from the identity matrix.

	    Args:
	        intri_opencv (np.ndarray):
	            Intrinsic matrix (3x3).
	        image_width (int):
	            Original width of the image.
	        image_height (int):
	            Original height of the image.
	        clockwise (bool):
	            If True, rotate 90 degrees clockwise; else 90 degrees counterclockwise.

	    Returns:
	        np.ndarray:
	            A new 3x3 intrinsic matrix after the rotation.
	    """
	    focal_x, focal_y = intri_opencv[0, 0], intri_opencv[1, 1]
	    center_x, center_y = intri_opencv[0, 2], intri_opencv[1, 2]

	    if clockwise:
	        rotated_center = (image_height - center_y, center_x)
	    else:
	        rotated_center = (center_y, image_width - center_x)

	    rotated = np.eye(3)
	    rotated[0, 0], rotated[1, 1] = focal_y, focal_x
	    rotated[0, 2], rotated[1, 2] = rotated_center
	    return rotated


	def adjust_track_rot90(track, image_width, image_height, clockwise):
	    """
	    Adjusts a track (N, 2) for a 90-degree rotation of the image in the image plane.

	    The mapping follows where a pixel lands when the image array itself is rotated:
	    a pixel at column x, row y of an (image_height, image_width) image moves to
	      - clockwise:        (image_height - 1 - y, x)
	      - counterclockwise: (y, image_width - 1 - x)
	    For example, under a clockwise turn the bottom-left pixel (0, image_height - 1)
	    becomes the top-left pixel (0, 0).

	    Args:
	        track (np.ndarray):
	            (N, 2) array of pixel coordinates, each row is (x, y).
	        image_width (int):
	            Original image width.
	        image_height (int):
	            Original image height.
	        clockwise (bool):
	            Whether the rotation is 90 degrees clockwise or counterclockwise.

	    Returns:
	        np.ndarray:
	            A new track of shape (N, 2) after rotation. The input is not modified.
	    """
	    x, y = track[:, 0], track[:, 1]

	    if clockwise:
	        # Old rows become new columns, counted from the opposite side.
	        new_track = np.stack((image_height - 1 - y, x), axis=-1)
	    else:
	        # Old columns become new rows, counted from the opposite side.
	        new_track = np.stack((y, image_width - 1 - x), axis=-1)

	    return new_track


	def read_image_cv2(path: str, rgb: bool = True) -> np.ndarray:
	    """
	    Loads an image from disk with OpenCV, retrying the read once on failure.

	    Args:
	        path (str):
	            File path to the image.
	        rgb (bool):
	            If True, a 3-channel result is converted from BGR to RGB (a 2D
	            grayscale result is expanded to three channels instead).
	            If False, the array is returned exactly as OpenCV loaded it.

	    Returns:
	        np.ndarray or None:
	            The loaded image array, or None if the file is missing, empty,
	            or could not be decoded after one retry. A message is printed in
	            each of those failure cases.
	    """
	    file_is_usable = os.path.exists(path) and os.path.getsize(path) != 0
	    if not file_is_usable:
	        print(f"File does not exist or is empty: {path}")
	        return None

	    img = cv2.imread(path)
	    if img is None:
	        # One more attempt before giving up.
	        print(f"Could not load image={path}. Retrying...")
	        img = cv2.imread(path)
	    if img is None:
	        print("Retry failed.")
	        return None

	    if not rgb:
	        return img

	    is_grayscale = len(img.shape) == 2
	    conversion = cv2.COLOR_GRAY2BGR if is_grayscale else cv2.COLOR_BGR2RGB
	    return cv2.cvtColor(img, conversion)


	def read_depth(path: str, scale_adjustment=1.0) -> np.ndarray:
	    """
	    Reads a depth map from disk in either .exr or .png format. The .exr is loaded using OpenCV
	    (which requires the environment variable OPENCV_IO_ENABLE_OPENEXR=1). The .png is read
	    through `load_16big_png_depth`, i.e. it is assumed to be a 16-bit PNG holding half floats.

	    Args:
	        path (str):
	            File path to the depth image. Must end with .exr or .png (case-insensitive).
	        scale_adjustment (float):
	            A multiplier for adjusting the loaded depth values (default=1.0).

	    Returns:
	        np.ndarray:
	            An (H, W) array containing the loaded depth, multiplied by `scale_adjustment`.
	            Non-finite values, and .exr values above 1e9, are replaced by zero, so zeros
	            may indicate invalid regions.

	    Raises:
	        ValueError:
	            If the file extension is not supported.
	        OSError:
	            If OpenCV could not read the .exr file.
	    """
	    lowered_path = path.lower()

	    if lowered_path.endswith(".exr"):
	        d = cv2.imread(path, cv2.IMREAD_ANYCOLOR | cv2.IMREAD_ANYDEPTH)
	        if d is None:
	            # cv2.imread reports failure by returning None rather than raising.
	            raise OSError(f'could not read depth file "{path}"')
	        # Multi-channel files: keep the first channel. A single-channel file is
	        # already (H, W) and must not be indexed, or a column would be dropped.
	        if d.ndim == 3:
	            d = d[..., 0]
	        # Very large values are treated as "no depth".
	        d[d > 1e9] = 0.0
	    elif lowered_path.endswith(".png"):
	        d = load_16big_png_depth(path)
	    else:
	        raise ValueError(f'unsupported depth file name "{path}"')

	    d = d * scale_adjustment
	    d[~np.isfinite(d)] = 0.0

	    return d


	def load_16big_png_depth(depth_png: str) -> np.ndarray:
	    """
	    Decodes a PNG whose 16-bit pixels store half-precision floats.

	    The pixel values are read as unsigned 16-bit integers, their raw bits are
	    reinterpreted (not numerically converted) as float16, and the result is
	    widened to float32.

	    Args:
	        depth_png (str):
	            File path to the 16-bit PNG.

	    Returns:
	        np.ndarray:
	            A float32 depth array of shape (H, W).
	    """
	    with Image.open(depth_png) as depth_pil:
	        # PIL reports size as (width, height); the array is laid out (height, width).
	        width, height = depth_pil.size
	        raw_bits = np.array(depth_pil, dtype=np.uint16)
	        half_floats = np.frombuffer(raw_bits, dtype=np.float16)
	        depth = half_floats.astype(np.float32).reshape((height, width))
	    return depth