wildrayzer / utils /pose_utils.py
Xuweiyi's picture
Initial upload: WildRayZer Gradio Space
13f3c3a verified
import torch
import torch.nn.functional as F
def rot6d2mat(x):
"""Convert 6D rotation representation to 3x3 rotation matrix.
Based on Zhou et al., "On the Continuity of Rotation Representations in Neural Networks", CVPR 2019
"""
device = x.device
B = x.shape[0]
a1 = x[:, 0:3]
a2 = x[:, 3:6]
b1 = F.normalize(a1)
b2 = F.normalize(a2 - torch.einsum('bi,bi->b', b1, a2).unsqueeze(-1) * b1)
b3 = torch.cross(b1, b2, dim=1)
rotMat = torch.stack((b1, b2, b3), dim=-1) # [B,3,3]
return rotMat
def quat2mat(quat):
"""Convert quaternion coefficients to rotation matrix.
"""
norm_quat = quat
norm_quat = norm_quat / norm_quat.norm(p=2, dim=1, keepdim=True)
w, x, y, z = norm_quat[:, 0], norm_quat[:, 1], norm_quat[:, 2], norm_quat[:, 3]
B = quat.size(0)
w2, x2, y2, z2 = w.pow(2), x.pow(2), y.pow(2), z.pow(2)
wx, wy, wz = w * x, w * y, w * z
xy, xz, yz = x * y, x * z, y * z
rotMat = torch.stack([w2 + x2 - y2 - z2, 2 * xy - 2 * wz, 2 * wy + 2 * xz,
2 * wz + 2 * xy, w2 - x2 + y2 - z2, 2 * yz - 2 * wx,
2 * xz - 2 * wy, 2 * wx + 2 * yz, w2 - x2 - y2 + z2], dim=1).view(B, 3, 3)
return rotMat
def mat2quat(x):
# x: SE3 matrix in shape [B,4,4]
trans = x[:,:3,3]
rot = x[:,:3,:3]
quat = mat2quat_transform(rot)
return torch.cat([quat, trans], dim=1)
def mat2quat_transform(rotation_matrix, eps=1e-6):
"""Convert 3x3 rotation matrix to 4d quaternion vector"""
if not torch.is_tensor(rotation_matrix):
raise TypeError("Input type is not a torch.Tensor. Got {}".format(
type(rotation_matrix)))
if len(rotation_matrix.shape) > 3:
raise ValueError(
"Input size must be a three dimensional tensor. Got {}".format(
rotation_matrix.shape))
if not rotation_matrix.shape[-2:] == (3, 3):
raise ValueError(
"Input size must be a N x 3 x 4 tensor. Got {}".format(
rotation_matrix.shape))
rmat_t = torch.transpose(rotation_matrix, 1, 2)
mask_d2 = rmat_t[:, 2, 2] < eps
mask_d0_d1 = rmat_t[:, 0, 0] > rmat_t[:, 1, 1]
mask_d0_nd1 = rmat_t[:, 0, 0] < -rmat_t[:, 1, 1]
t0 = 1 + rmat_t[:, 0, 0] - rmat_t[:, 1, 1] - rmat_t[:, 2, 2]
q0 = torch.stack([rmat_t[:, 1, 2] - rmat_t[:, 2, 1],
t0, rmat_t[:, 0, 1] + rmat_t[:, 1, 0],
rmat_t[:, 2, 0] + rmat_t[:, 0, 2]], -1)
t0_rep = t0.repeat(4, 1).t()
t1 = 1 - rmat_t[:, 0, 0] + rmat_t[:, 1, 1] - rmat_t[:, 2, 2]
q1 = torch.stack([rmat_t[:, 2, 0] - rmat_t[:, 0, 2],
rmat_t[:, 0, 1] + rmat_t[:, 1, 0],
t1, rmat_t[:, 1, 2] + rmat_t[:, 2, 1]], -1)
t1_rep = t1.repeat(4, 1).t()
t2 = 1 - rmat_t[:, 0, 0] - rmat_t[:, 1, 1] + rmat_t[:, 2, 2]
q2 = torch.stack([rmat_t[:, 0, 1] - rmat_t[:, 1, 0],
rmat_t[:, 2, 0] + rmat_t[:, 0, 2],
rmat_t[:, 1, 2] + rmat_t[:, 2, 1], t2], -1)
t2_rep = t2.repeat(4, 1).t()
t3 = 1 + rmat_t[:, 0, 0] + rmat_t[:, 1, 1] + rmat_t[:, 2, 2]
q3 = torch.stack([t3, rmat_t[:, 1, 2] - rmat_t[:, 2, 1],
rmat_t[:, 2, 0] - rmat_t[:, 0, 2],
rmat_t[:, 0, 1] - rmat_t[:, 1, 0]], -1)
t3_rep = t3.repeat(4, 1).t()
mask_c0 = mask_d2 * mask_d0_d1
mask_c1 = mask_d2 * ~mask_d0_d1
mask_c2 = ~mask_d2 * mask_d0_nd1
mask_c3 = ~mask_d2 * ~mask_d0_nd1
mask_c0 = mask_c0.view(-1, 1).type_as(q0)
mask_c1 = mask_c1.view(-1, 1).type_as(q1)
mask_c2 = mask_c2.view(-1, 1).type_as(q2)
mask_c3 = mask_c3.view(-1, 1).type_as(q3)
q = q0 * mask_c0 + q1 * mask_c1 + q2 * mask_c2 + q3 * mask_c3
q /= torch.sqrt(t0_rep * mask_c0 + t1_rep * mask_c1 + # noqa
t2_rep * mask_c2 + t3_rep * mask_c3) # noqa
q *= 0.5
return q
def compute_scene_scale(cam_translations):
"""
Computes the scale for each scene based on camera translations.
Args:
cam_translations (Tensor): Tensor of shape [B, V, 3] representing camera translations
for B scenes with V cameras per scene.
Returns:
Tensor: A tensor of shape [B] containing the scale for each scene.
"""
# Compute the camera center for each scene as the mean translation across all cameras
cam_centers = cam_translations.mean(dim=1) # shape: [B, 3]
# Compute the Euclidean distances from each camera to the scene's center
# Expand cam_centers to [B, 1, 3] so that it can be broadcasted against cam_translations [B, V, 3]
distances = torch.norm(cam_translations - cam_centers.unsqueeze(1), dim=2) # shape: [B, V]
# The scale is the maximum distance from the center for each scene
scene_scales = distances.max(dim=1)[0] # shape: [B]
return scene_scales