Spaces:
Running on Zero
Running on Zero
| import torch | |
| import torch.nn.functional as F | |
def rot6d2mat(x):
    """Convert a 6D rotation representation to a 3x3 rotation matrix.

    Based on Zhou et al., "On the Continuity of Rotation Representations in
    Neural Networks", CVPR 2019. The first three components (a1) and the next
    three (a2) of the last dimension are orthonormalised with Gram-Schmidt,
    and the third axis is their cross product.

    Args:
        x: Tensor of shape [..., 6] (typically [B, 6]). Any number of leading
            batch dimensions is accepted.

    Returns:
        Tensor of shape [..., 3, 3] whose columns are b1, b2, b3.
    """
    a1 = x[..., 0:3]
    a2 = x[..., 3:6]
    b1 = F.normalize(a1, dim=-1)
    # Remove the component of a2 along b1 before normalising (Gram-Schmidt).
    proj = (b1 * a2).sum(dim=-1, keepdim=True)
    b2 = F.normalize(a2 - proj * b1, dim=-1)
    b3 = torch.cross(b1, b2, dim=-1)
    # Stack along the last axis so that b1, b2, b3 become matrix columns.
    return torch.stack((b1, b2, b3), dim=-1)
def quat2mat(quat):
    """Turn a batch of quaternions into 3x3 rotation matrices.

    Args:
        quat: Tensor of shape [B, 4] holding (w, x, y, z) coefficients. The
            input does not need to be unit length; it is divided by its L2
            norm along dim 1 before use.

    Returns:
        Tensor of shape [B, 3, 3].
    """
    unit = quat / quat.norm(p=2, dim=1, keepdim=True)
    w, x, y, z = (unit[:, i] for i in range(4))

    # Squared terms and pairwise products used by the matrix entries.
    ww, xx, yy, zz = w.pow(2), x.pow(2), y.pow(2), z.pow(2)
    wx, wy, wz = w * x, w * y, w * z
    xy, xz, yz = x * y, x * z, y * z

    row0 = torch.stack([ww + xx - yy - zz, 2 * xy - 2 * wz, 2 * wy + 2 * xz], dim=1)
    row1 = torch.stack([2 * wz + 2 * xy, ww - xx + yy - zz, 2 * yz - 2 * wx], dim=1)
    row2 = torch.stack([2 * xz - 2 * wy, 2 * wx + 2 * yz, ww - xx - yy + zz], dim=1)
    return torch.stack([row0, row1, row2], dim=1)
def mat2quat(x):
    """Convert batched 4x4 rigid transforms to quaternion-plus-translation vectors.

    Args:
        x: SE3 matrices of shape [B, 4, 4].

    Returns:
        Tensor of shape [B, 7]: the quaternion produced by
        ``mat2quat_transform`` for the upper-left 3x3 block, followed by the
        translation taken from the first three rows of the last column.
    """
    rotation = x[:, :3, :3]
    translation = x[:, :3, 3]
    return torch.cat((mat2quat_transform(rotation), translation), dim=1)
def mat2quat_transform(rotation_matrix, eps=1e-6):
    """Convert a batch of 3x3 rotation matrices to quaternions.

    Four candidate quaternions are built, each using a different diagonal
    combination ``t`` as its dominant component, and one is selected per
    matrix from sign tests on the diagonal. The selection is intended to keep
    the ``sqrt`` argument away from zero for valid rotation matrices.

    Args:
        rotation_matrix: Tensor of shape [N, 3, 3].
        eps: Threshold compared against the (2, 2) diagonal entry when
            choosing between candidates.

    Returns:
        Tensor of shape [N, 4] with components ordered (w, x, y, z).

    Raises:
        TypeError: If ``rotation_matrix`` is not a torch.Tensor.
        ValueError: If the input is not three dimensional or its trailing
            dimensions are not (3, 3).
    """
    if not torch.is_tensor(rotation_matrix):
        raise TypeError("Input type is not a torch.Tensor. Got {}".format(
            type(rotation_matrix)))
    # Exactly three dimensions are required: the indexing below assumes a
    # batch axis, so a bare (3, 3) matrix must be rejected up front.
    if rotation_matrix.dim() != 3:
        raise ValueError(
            "Input size must be a three dimensional tensor. Got {}".format(
                rotation_matrix.shape))
    if rotation_matrix.shape[-2:] != (3, 3):
        raise ValueError(
            "Input size must be a N x 3 x 3 tensor. Got {}".format(
                rotation_matrix.shape))

    # The formulas below are written against the transposed matrix.
    rmat_t = torch.transpose(rotation_matrix, 1, 2)
    m00 = rmat_t[:, 0, 0]
    m11 = rmat_t[:, 1, 1]
    m22 = rmat_t[:, 2, 2]

    mask_d2 = m22 < eps
    mask_d0_d1 = m00 > m11
    mask_d0_nd1 = m00 < -m11

    # Candidate with dominant x component.
    t0 = 1 + m00 - m11 - m22
    q0 = torch.stack([rmat_t[:, 1, 2] - rmat_t[:, 2, 1],
                      t0,
                      rmat_t[:, 0, 1] + rmat_t[:, 1, 0],
                      rmat_t[:, 2, 0] + rmat_t[:, 0, 2]], -1)

    # Candidate with dominant y component.
    t1 = 1 - m00 + m11 - m22
    q1 = torch.stack([rmat_t[:, 2, 0] - rmat_t[:, 0, 2],
                      rmat_t[:, 0, 1] + rmat_t[:, 1, 0],
                      t1,
                      rmat_t[:, 1, 2] + rmat_t[:, 2, 1]], -1)

    # Candidate with dominant z component.
    t2 = 1 - m00 - m11 + m22
    q2 = torch.stack([rmat_t[:, 0, 1] - rmat_t[:, 1, 0],
                      rmat_t[:, 2, 0] + rmat_t[:, 0, 2],
                      rmat_t[:, 1, 2] + rmat_t[:, 2, 1],
                      t2], -1)

    # Candidate with dominant w component.
    t3 = 1 + m00 + m11 + m22
    q3 = torch.stack([t3,
                      rmat_t[:, 1, 2] - rmat_t[:, 2, 1],
                      rmat_t[:, 2, 0] - rmat_t[:, 0, 2],
                      rmat_t[:, 0, 1] - rmat_t[:, 1, 0]], -1)

    # Exactly one of the four masks is set per matrix; cast to the value
    # dtype so they can be used as 0/1 selectors of shape [N, 1].
    mask_c0 = (mask_d2 & mask_d0_d1).view(-1, 1).type_as(q0)
    mask_c1 = (mask_d2 & ~mask_d0_d1).view(-1, 1).type_as(q1)
    mask_c2 = (~mask_d2 & mask_d0_nd1).view(-1, 1).type_as(q2)
    mask_c3 = (~mask_d2 & ~mask_d0_nd1).view(-1, 1).type_as(q3)

    q = q0 * mask_c0 + q1 * mask_c1 + q2 * mask_c2 + q3 * mask_c3
    # Selected t per matrix, shape [N, 1]; broadcasts across the 4 components.
    t = (t0.unsqueeze(-1) * mask_c0 + t1.unsqueeze(-1) * mask_c1 +
         t2.unsqueeze(-1) * mask_c2 + t3.unsqueeze(-1) * mask_c3)
    return q / torch.sqrt(t) * 0.5
def compute_scene_scale(cam_translations):
    """Return, per scene, the largest camera distance from the mean camera position.

    Args:
        cam_translations (Tensor): Camera translations of shape [B, V, 3],
            i.e. V cameras for each of B scenes.

    Returns:
        Tensor: Shape [B]; for each scene, the maximum Euclidean distance
        between a camera and the mean of that scene's camera translations.
    """
    # Keep the reduced axis so the centre broadcasts against [B, V, 3].
    centre = cam_translations.mean(dim=1, keepdim=True)  # [B, 1, 3]
    offsets = cam_translations - centre  # [B, V, 3]
    radii = offsets.norm(dim=2)  # [B, V]
    return radii.max(dim=1).values  # [B]