Spaces:

uva-cv-lab
/

wildrayzer

Running on Zero

App Files Files Community

wildrayzer / utils /pose_utils.py

Xuweiyi

Initial upload: WildRayZer Gradio Space

13f3c3a verified about 1 month ago

raw

history blame contribute delete

4.94 kB

	import torch
	import torch.nn.functional as F


	def rot6d2mat(x):
	    """Convert a 6D rotation representation to a 3x3 rotation matrix.

	    Based on Zhou et al., "On the Continuity of Rotation Representations in
	    Neural Networks", CVPR 2019.

	    Args:
	        x: Tensor of shape [..., 6]. The first three entries of the last axis
	            are the first raw axis, the next three the second raw axis. Any
	            number of leading batch dimensions is accepted (the usual case
	            is [B, 6]).

	    Returns:
	        Tensor of shape [..., 3, 3] whose columns are the orthonormalised
	        basis vectors b1, b2, b3.
	    """
	    a1 = x[..., 0:3]
	    a2 = x[..., 3:6]

	    # Gram-Schmidt: normalise the first axis, strip its component from the
	    # second axis, normalise again; the third axis is their cross product.
	    b1 = F.normalize(a1, dim=-1)
	    b2 = F.normalize(a2 - (b1 * a2).sum(dim=-1, keepdim=True) * b1, dim=-1)
	    b3 = torch.cross(b1, b2, dim=-1)

	    # Stacking on the last axis places b1, b2, b3 as matrix columns.
	    return torch.stack((b1, b2, b3), dim=-1)


	def quat2mat(quat):
	    """Convert quaternion coefficients to rotation matrices.

	    Args:
	        quat: Tensor of shape [..., 4] holding quaternions in (w, x, y, z)
	            order. They do not need to be unit length; each one is divided
	            by its L2 norm first. Any number of leading batch dimensions is
	            accepted (the usual case is [B, 4]).

	    Returns:
	        Tensor of shape [..., 3, 3] with the corresponding rotation matrices.
	    """
	    # A zero-norm quaternion divides by zero here and yields NaNs.
	    norm_quat = quat / quat.norm(p=2, dim=-1, keepdim=True)
	    w = norm_quat[..., 0]
	    x = norm_quat[..., 1]
	    y = norm_quat[..., 2]
	    z = norm_quat[..., 3]

	    w2, x2, y2, z2 = w.pow(2), x.pow(2), y.pow(2), z.pow(2)
	    wx, wy, wz = w * x, w * y, w * z
	    xy, xz, yz = x * y, x * z, y * z

	    # Row-major list of the nine matrix entries.
	    entries = [
	        w2 + x2 - y2 - z2, 2 * xy - 2 * wz, 2 * wy + 2 * xz,
	        2 * wz + 2 * xy, w2 - x2 + y2 - z2, 2 * yz - 2 * wx,
	        2 * xz - 2 * wy, 2 * wx + 2 * yz, w2 - x2 - y2 + z2,
	    ]
	    rotMat = torch.stack(entries, dim=-1)
	    return rotMat.reshape(*quat.shape[:-1], 3, 3)


	def mat2quat(x):
	    """Convert batched SE(3) matrices to quaternion-plus-translation vectors.

	    Args:
	        x: SE(3) matrices of shape [B, 4, 4].

	    Returns:
	        Tensor of shape [B, 7]: the 4 quaternion coefficients produced by
	        ``mat2quat_transform`` followed by the 3 translation components.
	    """
	    rotation_block = x[:, :3, :3]
	    translation = x[:, :3, 3]
	    return torch.cat((mat2quat_transform(rotation_block), translation), dim=1)


	def mat2quat_transform(rotation_matrix, eps=1e-6):
	    """Convert a batch of 3x3 rotation matrices to quaternions.

	    Four candidate quaternions are built, each scaled by a different
	    combination of the diagonal entries; the signs and ordering of the
	    diagonal decide which candidate is used for each matrix.

	    Args:
	        rotation_matrix: Tensor of shape [B, 3, 3].
	        eps: Threshold on the [2, 2] entry. Below it one of the two
	            x/y-dominant candidates is used, otherwise one of the
	            z/w-dominant ones.

	    Returns:
	        Tensor of shape [B, 4] with quaternions in (w, x, y, z) order.

	    Raises:
	        TypeError: If ``rotation_matrix`` is not a torch.Tensor.
	        ValueError: If ``rotation_matrix`` is not of shape [B, 3, 3].
	    """
	    if not torch.is_tensor(rotation_matrix):
	        raise TypeError("Input type is not a torch.Tensor. Got {}".format(
	            type(rotation_matrix)))

	    # The indexing below needs exactly three dimensions; rejecting 2-D input
	    # here gives a clear error instead of an obscure failure further down.
	    if rotation_matrix.dim() != 3:
	        raise ValueError(
	            "Input size must be a three dimensional tensor. Got {}".format(
	                rotation_matrix.shape))
	    if rotation_matrix.shape[-2:] != (3, 3):
	        raise ValueError(
	            "Input size must be a N x 3 x 3 tensor. Got {}".format(
	                rotation_matrix.shape))

	    r00 = rotation_matrix[:, 0, 0]
	    r01 = rotation_matrix[:, 0, 1]
	    r02 = rotation_matrix[:, 0, 2]
	    r10 = rotation_matrix[:, 1, 0]
	    r11 = rotation_matrix[:, 1, 1]
	    r12 = rotation_matrix[:, 1, 2]
	    r20 = rotation_matrix[:, 2, 0]
	    r21 = rotation_matrix[:, 2, 1]
	    r22 = rotation_matrix[:, 2, 2]

	    # Candidate 0: x component carries the scale term.
	    t0 = 1 + r00 - r11 - r22
	    q0 = torch.stack([r21 - r12, t0, r10 + r01, r02 + r20], -1)

	    # Candidate 1: y component carries the scale term.
	    t1 = 1 - r00 + r11 - r22
	    q1 = torch.stack([r02 - r20, r10 + r01, t1, r21 + r12], -1)

	    # Candidate 2: z component carries the scale term.
	    t2 = 1 - r00 - r11 + r22
	    q2 = torch.stack([r10 - r01, r02 + r20, r21 + r12, t2], -1)

	    # Candidate 3: w component carries the scale term.
	    t3 = 1 + r00 + r11 + r22
	    q3 = torch.stack([t3, r21 - r12, r02 - r20, r10 - r01], -1)

	    # Per-matrix branch conditions, shaped [B, 1] to broadcast over [B, 4].
	    small_d2 = (r22 < eps).unsqueeze(-1)
	    d0_gt_d1 = (r00 > r11).unsqueeze(-1)
	    d0_lt_neg_d1 = (r00 < -r11).unsqueeze(-1)

	    # torch.where picks exactly one candidate per matrix, so a non-finite
	    # value in an unused candidate cannot leak into the result.
	    q = torch.where(
	        small_d2,
	        torch.where(d0_gt_d1, q0, q1),
	        torch.where(d0_lt_neg_d1, q2, q3),
	    )
	    t = torch.where(
	        small_d2,
	        torch.where(d0_gt_d1, t0.unsqueeze(-1), t1.unsqueeze(-1)),
	        torch.where(d0_lt_neg_d1, t2.unsqueeze(-1), t3.unsqueeze(-1)),
	    )

	    return q / torch.sqrt(t) * 0.5


	def compute_scene_scale(cam_translations):
	    """
	    Return, for each scene, the largest distance between any camera and the
	    mean camera position of that scene.

	    Args:
	        cam_translations (Tensor): Camera translations of shape [B, V, 3],
	            i.e. B scenes with V cameras each.

	    Returns:
	        Tensor: Shape [B], one scale value per scene.
	    """
	    # Mean camera position per scene, kept as [B, 1, 3] so that it broadcasts
	    # against the [B, V, 3] translations.
	    centroid = cam_translations.mean(dim=1).unsqueeze(1)

	    # Euclidean distance of every camera from its scene centroid: [B, V].
	    offsets = cam_translations - centroid
	    radii = torch.norm(offsets, dim=2)

	    # The farthest camera defines the scale of the scene: [B].
	    farthest, _ = radii.max(dim=1)
	    return farthest