Spaces:
Running on Zero
Running on Zero
| """ | |
| Camera transformation utilities for relative pose dataset. | |
| Provides unified camera transformation functions for converting between | |
| different camera representations and computing relative poses. | |
| Coordinate System: OpenCV/COLMAP convention | |
| - X: right | |
| - Y: down | |
| - Z: forward (viewing direction, into the scene) | |
| - Transformation: X_cam = R @ X_world + T (left-multiply convention) | |
| Note: This differs from: | |
| - PyTorch3D: X left, Y up, Z forward (same Z direction, flipped X and Y) | |
| - Blender camera: X right, Y up, Z backward (same X direction, flipped Y and Z) | |
| (Blender world coordinates use Z-up: X right, Y forward, Z up) | |
| """ | |
| import torch | |
| import numpy as np | |
| from typing import Dict, List, Tuple | |
| from scipy.spatial.transform import Rotation | |
| class CameraTransformUtils: | |
| """Unified camera transformation utilities for relative pose training.""" | |
def rotation_matrix_to_6d(R: np.ndarray) -> np.ndarray:
    """
    Convert a 3x3 rotation matrix to the 6D rotation representation.

    The result is the first column of R followed by the second column:
    [R[0,0], R[1,0], R[2,0], R[0,1], R[1,1], R[2,1]].

    Args:
        R: (3, 3) rotation matrix (any array-like accepted)

    Returns:
        rot_6d: (6,) vector [r1, r2] where r1 and r2 are the first two columns of R

    Example:
        >>> R = np.eye(3)
        >>> rot_6d = rotation_matrix_to_6d(R)
        >>> np.allclose(rot_6d, [1, 0, 0, 0, 1, 0])
        True
    """
    # R[:, :2] has shape (3, 2); a plain row-major flatten would interleave the
    # two columns ([1, 0, 0, 1, 0, 0] for the identity). Transposing first keeps
    # each column contiguous, which is the documented [r1, r2] layout.
    first_two_columns = np.asarray(R)[:, :2]
    return first_two_columns.T.flatten()
def rotation_matrix_to_axis_angle(R: np.ndarray) -> np.ndarray:
    """
    Express a 3x3 rotation matrix as an axis-angle (rotation) vector.

    The conversion is delegated to scipy's Rotation class.

    Args:
        R: (3, 3) rotation matrix

    Returns:
        axis_angle: (3,) rotation vector where:
            - the direction is the rotation axis
            - the magnitude is the rotation angle in radians

    Example:
        >>> axis_angle = rotation_matrix_to_axis_angle(np.eye(3))
        >>> np.allclose(axis_angle, [0, 0, 0])
        True
    """
    # from_matrix -> as_rotvec yields the axis scaled by the angle
    return Rotation.from_matrix(R).as_rotvec()
def axis_angle_to_rotation_matrix(axis_angle: np.ndarray) -> np.ndarray:
    """
    Turn an axis-angle (rotation) vector into a 3x3 rotation matrix.

    Args:
        axis_angle: (3,) vector [rot_x, rot_y, rot_z]
            - the direction is the rotation axis
            - the magnitude is the rotation angle in radians

    Returns:
        R: (3, 3) rotation matrix

    Example:
        >>> R = axis_angle_to_rotation_matrix(np.array([0, 0, np.pi/2]))
        >>> np.allclose(R, [[0, -1, 0], [1, 0, 0], [0, 0, 1]])
        True
    """
    # scipy interprets the vector as axis * angle
    return Rotation.from_rotvec(axis_angle).as_matrix()
def compute_relative_pose(
    R_src: np.ndarray,
    T_src: np.ndarray,
    R_tgt: np.ndarray,
    T_tgt: np.ndarray,
) -> Dict[str, np.ndarray]:
    """
    Compute the pose that maps source-camera coordinates to target-camera coordinates.

    Math:
        R_rel = R_tgt @ R_src.T
        T_rel = T_tgt - R_rel @ T_src
    so that X_tgt = R_rel @ X_src + T_rel.

    Args:
        R_src: (3, 3) source camera rotation matrix
        T_src: (3,) source camera translation vector
        R_tgt: (3, 3) target camera rotation matrix
        T_tgt: (3,) target camera translation vector

    Returns:
        relative_pose: {
            'rotation_6d': 6D encoding of R_rel (see rotation_matrix_to_6d),
            'rotation': (3,) axis-angle [rot_x, rot_y, rot_z],
            'translation': (3,) [trans_x, trans_y, trans_z]
        }

    Example:
        >>> # Same camera -> identity transformation
        >>> R = np.eye(3)
        >>> T = np.array([1.0, 2.0, 3.0])
        >>> rel = compute_relative_pose(R, T, R, T)
        >>> np.allclose(rel['rotation'], [0, 0, 0])
        True
        >>> np.allclose(rel['translation'], [0, 0, 0])
        True
    """
    # Rotation taking source-camera coordinates into target-camera coordinates
    rel_rotation = R_tgt @ R_src.T

    # Translation that completes X_tgt = rel_rotation @ X_src + rel_translation
    rel_translation = T_tgt - rel_rotation @ T_src

    # Both rotation encodings are derived from the same relative rotation matrix
    axis_angle = CameraTransformUtils.rotation_matrix_to_axis_angle(rel_rotation)
    six_d = CameraTransformUtils.rotation_matrix_to_6d(rel_rotation)

    pose = {}
    pose['rotation_6d'] = six_d
    pose['rotation'] = axis_angle
    pose['translation'] = rel_translation
    return pose
def apply_relative_pose(
    R_src: np.ndarray,
    T_src: np.ndarray,
    relative_pose: Dict[str, np.ndarray]
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Recover the target camera from a source camera and a relative pose.

    Inverts compute_relative_pose, so it can be used as a round-trip check
    of the relative pose computation.

    Args:
        R_src: (3, 3) source camera rotation matrix
        T_src: (3,) source camera translation vector
        relative_pose: dict with:
            - 'rotation': (3,) axis-angle
            - 'translation': (3,) relative translation

    Returns:
        R_tgt: (3, 3) target camera rotation matrix
        T_tgt: (3,) target camera translation vector

    Example:
        >>> # Round-trip test
        >>> R1, T1 = random_rotation(), random_translation()
        >>> R2, T2 = random_rotation(), random_translation()
        >>> rel = compute_relative_pose(R1, T1, R2, T2)
        >>> R2_rec, T2_rec = apply_relative_pose(R1, T1, rel)
        >>> np.allclose(R2, R2_rec) and np.allclose(T2, T2_rec)
        True
    """
    # Decode the axis-angle entry back into a matrix
    rel_rotation = CameraTransformUtils.axis_angle_to_rotation_matrix(
        relative_pose['rotation']
    )
    rel_translation = relative_pose['translation']

    # R_rel = R_tgt @ R_src.T         =>  R_tgt = R_rel @ R_src
    # T_rel = T_tgt - R_rel @ T_src   =>  T_tgt = R_rel @ T_src + T_rel
    return rel_rotation @ R_src, rel_rotation @ T_src + rel_translation
def normalize_scene_translations(
    translations: List[np.ndarray],
    scale_factor: float = 2.0
) -> Tuple[List[np.ndarray], float]:
    """
    Normalize a scene's translations by the radius of their bounding sphere.

    The radius is the largest Euclidean norm among the given vectors (i.e. the
    sphere is centered at the origin). Every vector is divided by
    scale_factor * radius.

    Args:
        translations: List of (3,) translation vectors (may be empty)
        scale_factor: Divisor for normalization (default: 2.0)
            - scale_factor=2.0 → range ≈ [-0.5, 0.5]
            - scale_factor=1.0 → range ≈ [-1.0, 1.0]

    Returns:
        normalized_translations: List of (3,) normalized vectors (always a new list)
        scene_radius: Python float, radius of the bounding sphere; 1.0 when the
            input is empty or all vectors are (numerically) zero

    Example:
        >>> translations = [np.array([1, 0, 0]), np.array([0, 2, 0])]
        >>> norm_trans, radius = normalize_scene_translations(translations)
        >>> radius  # max(||T||) = 2.0
        2.0
        >>> # With scale_factor=2.0, translations are in [-0.5, 0.5] range
    """
    # Empty scene: max() of an empty sequence would raise, and there is
    # nothing to scale anyway.
    if len(translations) == 0:
        return [], 1.0

    # Bounding sphere radius (maximum distance from origin)
    scene_radius = max(np.linalg.norm(T) for T in translations)

    if scene_radius < 1e-6:
        # Degenerate case: all vectors at the origin. Return a copy so callers
        # never alias the input list.
        return list(translations), 1.0

    # The division keeps the NumPy scalar so array dtypes are handled exactly
    # as before; only the returned radius is converted to a plain float.
    normalized = [T / (scale_factor * scene_radius) for T in translations]
    return normalized, float(scene_radius)
def create_lookat_rotation(
    camera_pos: np.ndarray,
    target_pos: np.ndarray,
    up_vector: np.ndarray = np.array([0.0, 0.0, 1.0])
) -> np.ndarray:
    """
    Create a look-at rotation matrix for left-multiply convention.

    NOTE: This function produces Blender convention (+X right, +Y up, +Z backward).
    Must apply Y,Z flip (diag(1, -1, -1)) to convert to the OpenCV/COLMAP standard.

    Constructs a world-to-camera rotation matrix where:
        - Camera -Z axis points from camera to target (viewing direction)
        - Camera +Y axis aligns with the up vector as much as possible
        - Camera +X axis is the right direction (right-handed system)

    Args:
        camera_pos: (3,) camera position in world coordinates (array-like)
        target_pos: (3,) target position to look at (array-like)
        up_vector: (3,) approximate up direction in world (default: [0,0,1], Z-up).
            The default array is never modified by this function.

    Returns:
        R: (3, 3) float32 world-to-camera rotation matrix (Blender convention).
            Rows are the camera axes (X, Y, Z) expressed in world coordinates.

    Raises:
        ValueError: if camera_pos and target_pos are closer than 1e-6.

    Example:
        >>> R = create_lookat_rotation([4.0, 0.0, 2.0], [0.0, 0.0, 0.0])
        >>> np.allclose(R[0], [0, 1, 0])   # camera X axis
        True
    """
    # Accept plain lists/tuples as well as arrays; without this, subtracting
    # two Python lists below raises TypeError.
    camera_pos = np.asarray(camera_pos)
    target_pos = np.asarray(target_pos)
    up_vector = np.asarray(up_vector)

    # Forward direction (from camera toward target)
    forward = target_pos - camera_pos
    forward_norm = np.linalg.norm(forward)
    if forward_norm < 1e-6:
        raise ValueError("Camera position and target position are too close")
    forward = forward / forward_norm

    # Camera -Z points toward target, so camera +Z points away from target
    z_axis = -forward

    # Right direction: X = up × Z
    right = np.cross(up_vector, z_axis)
    right_norm = np.linalg.norm(right)

    if right_norm < 1e-6:
        # up_vector is (anti)parallel to the viewing axis, so the cross product
        # vanishes. Fall back to world X, then world Y, each made perpendicular
        # to z_axis by removing its component along it.
        for fallback in (np.array([1.0, 0.0, 0.0]), np.array([0.0, 1.0, 0.0])):
            right = fallback - np.dot(fallback, z_axis) * z_axis
            right_norm = np.linalg.norm(right)
            if right_norm >= 1e-6:
                break

    right = right / right_norm

    # Recompute up so the basis is orthonormal: Y = Z × X (right-handed)
    up_actual = np.cross(z_axis, right)

    # Rows are camera axes in world coordinates (left-multiply convention)
    return np.array([right, up_actual, z_axis], dtype=np.float32)
def spherical_to_rotation_matrix(
    azimuth: float,
    elevation: float,
    roll: float = 0.0
) -> np.ndarray:
    """
    Build a rotation matrix from azimuth, elevation and roll angles.

    DEPRECATED: Use create_lookat_rotation instead for accurate look-at cameras.

    NOTE: This produces Blender convention (+X right, +Y up, +Z backward).
    Must apply Y,Z flip (diag(1, -1, -1)) to convert to the OpenCV/COLMAP standard.

    The result is the product R_z(roll) @ R_x(elevation) @ R_y(azimuth), i.e.
    the azimuth rotation about Y is applied first, then the elevation rotation
    about X, then the roll rotation about Z.

    Args:
        azimuth: Rotation about the Y axis, in radians
        elevation: Rotation about the X axis, in radians
        roll: Rotation about the Z axis, in radians (default: 0)

    Returns:
        R: (3, 3) rotation matrix (Blender convention)

    Example:
        >>> np.allclose(spherical_to_rotation_matrix(0.0, 0.0), np.eye(3))
        True
    """
    ca, sa = np.cos(azimuth), np.sin(azimuth)
    ce, se = np.cos(elevation), np.sin(elevation)
    cr, sr = np.cos(roll), np.sin(roll)

    # Elementary rotations about the Z, X and Y axes respectively
    about_z = np.array([
        [cr, -sr, 0],
        [sr, cr, 0],
        [0, 0, 1],
    ])
    about_x = np.array([
        [1, 0, 0],
        [0, ce, -se],
        [0, se, ce],
    ])
    about_y = np.array([
        [ca, 0, sa],
        [0, 1, 0],
        [-sa, 0, ca],
    ])

    # Rightmost factor acts first: azimuth (Y), then elevation (X), then roll (Z)
    return about_z @ about_x @ about_y
def rotation_matrix_to_spherical(R: np.ndarray) -> Tuple[float, float, float]:
    """
    Recover azimuth, elevation and roll from a rotation matrix.

    Inverse of spherical_to_rotation_matrix, which builds
    R = R_z(roll) @ R_x(elevation) @ R_y(azimuth).

    Args:
        R: (3, 3) rotation matrix

    Returns:
        azimuth: Rotation about Y in radians
        elevation: Rotation about X in radians
        roll: Rotation about Z in radians

    Example:
        >>> # Round-trip test
        >>> R_orig = spherical_to_rotation_matrix(azimuth=0.5, elevation=0.3, roll=0.1)
        >>> az, el, roll = rotation_matrix_to_spherical(R_orig)
        >>> R_reconstructed = spherical_to_rotation_matrix(az, el, roll)
        >>> np.allclose(R_orig, R_reconstructed)
        True
    """
    # scipy convention: lowercase axis letters denote EXTRINSIC rotations
    # (about fixed axes), uppercase denote intrinsic ones. Extrinsic 'yxz'
    # composes as R_z @ R_x @ R_y, which is exactly the construction order
    # used by spherical_to_rotation_matrix.
    azimuth, elevation, roll = Rotation.from_matrix(R).as_euler('yxz', degrees=False)
    return azimuth, elevation, roll
def rotation_matrix_to_camera_angles(R: np.ndarray) -> Dict[str, float]:
    """
    Extract viewing direction angles from rotation matrix.

    Azimuth and elevation are computed from row 2 of R, which is treated as the
    camera's forward (+Z) axis in world coordinates. Roll is the signed angle,
    about that forward axis, between row 1 of a zero-roll reference rotation and
    row 1 of R.

    Args:
        R: (3, 3) rotation matrix (world-to-camera, OpenCV/COLMAP convention)
            - Rows are camera axes in world coordinates
            - Convention: +X right, +Y down, +Z forward

    Returns:
        dict with:
            - 'azimuth': atan2(forward_z, forward_x), radians in [-π, π]
            - 'elevation': atan2(-forward_y, sqrt(forward_x² + forward_z²)),
              radians in [-π/2, π/2]
            - 'roll': radians in [-π, π]; 0.0 in the degenerate case where a
              projected axis has (near-)zero length

    Example:
        >>> R = create_lookat_rotation(np.array([2.0, 1.0, 0.0]), np.zeros(3))
        >>> # (After applying Y,Z flip to R)
        >>> angles = rotation_matrix_to_camera_angles(R)
    """
    # Camera viewing direction in world coords (camera looks along +Z in this convention)
    forward = R[2, :]  # Z-axis (row 2)

    # Azimuth: angle of the forward vector within the world XZ plane
    azimuth = np.arctan2(forward[2], forward[0])

    # Elevation: angle of the forward vector out of the XZ plane.
    # +Y is down here, so a negative Y component means looking up;
    # atan2 is used instead of arcsin so an unnormalized row still works.
    elevation = np.arctan2(-forward[1], np.sqrt(forward[0]**2 + forward[2]**2))

    # Roll: compare the camera's row-1 axis against that of a reference
    # rotation built from the same azimuth/elevation with zero roll.
    # NOTE(review): spherical_to_rotation_matrix is documented as producing the
    # Blender convention, and row 2 of its output for (azimuth, elevation) is
    # [-cos(el)·sin(az), sin(el), cos(el)·cos(az)], which is not in general equal
    # to `forward` above. Confirm that its row 1 is the intended zero-roll
    # reference for this (+Y down, +Z forward) convention.
    R_no_roll = CameraTransformUtils.spherical_to_rotation_matrix(azimuth, elevation, roll=0.0)
    expected_up = R_no_roll[1, :]  # Reference row-1 axis (zero roll)
    actual_up = R[1, :]  # Actual row-1 axis of the input rotation

    # Project both onto the plane perpendicular to forward
    expected_up_proj = expected_up - np.dot(expected_up, forward) * forward
    actual_up_proj = actual_up - np.dot(actual_up, forward) * forward

    # Lengths of the projections, used both for normalization and as a
    # degeneracy check
    expected_up_norm = np.linalg.norm(expected_up_proj)
    actual_up_norm = np.linalg.norm(actual_up_proj)

    if expected_up_norm > 1e-6 and actual_up_norm > 1e-6:
        expected_up_proj = expected_up_proj / expected_up_norm
        actual_up_proj = actual_up_proj / actual_up_norm

        # Signed angle via atan2 so the quadrant is resolved correctly
        cos_roll = np.dot(expected_up_proj, actual_up_proj)
        # The cross product lies along ±forward; its component along forward
        # carries the sign of the rotation
        cross_prod = np.cross(expected_up_proj, actual_up_proj)
        sin_roll = np.dot(cross_prod, forward)
        roll = np.arctan2(sin_roll, cos_roll)
    else:
        # Degenerate case: one of the projections vanished, roll is undefined
        roll = 0.0

    return {
        'azimuth': azimuth,
        'elevation': elevation,
        'roll': roll
    }
def clip_and_warn(
    values: np.ndarray,
    min_val: float,
    max_val: float,
    name: str = "values"
) -> np.ndarray:
    """
    Clamp an array to [min_val, max_val], printing a warning if anything was clamped.

    Handy for spotting normalization problems: the warning reports how many
    entries fell outside each bound together with the offending extreme value.

    Args:
        values: Array to clip
        min_val: Lower bound
        max_val: Upper bound
        name: Label used in the warning message

    Returns:
        clipped: Clipped copy of `values`
    """
    clipped = np.clip(values, min_val, max_val)

    # Count the out-of-range entries on each side
    n_below = np.sum(values < min_val)
    n_above = np.sum(values > max_val)

    # Nothing was clamped: stay silent
    if n_below == 0 and n_above == 0:
        return clipped

    print(f"Warning: {name} clipped:")
    if n_below > 0:
        print(f" {n_below} values below {min_val} (min: {values.min():.4f})")
    if n_above > 0:
        print(f" {n_above} values above {max_val} (max: {values.max():.4f})")
    return clipped
def compute_angular_offset(rotation: 'torch.Tensor', translation: 'torch.Tensor', normalizer: float = 7.0) -> 'torch.Tensor':
    """
    Measure how far a camera's viewing axis deviates from pointing at the origin.

    The direction from the camera center to the world origin is expressed in
    the camera frame (X=right, Y=up, Z=backward, i.e. the camera looks along
    -Z) and decomposed into two angles. A camera aimed exactly at the origin
    yields (0, 0).

    Args:
        rotation: (3, 3) rotation matrix [R] from world-to-camera
        translation: (3,) translation vector [T] from world-to-camera
        normalizer: Accepted for interface compatibility; not used in the computation

    Returns:
        angular_offset: (2,) float32 tensor with [pitch, yaw] in radians
            - pitch: atan2(y, -z) of the to-origin direction in the camera frame
              (positive = camera tilted UP, object appears below center)
            - yaw: atan2(x, -z) of the to-origin direction in the camera frame
              (positive = camera turned RIGHT, object appears left of center)
    """
    import torch

    # Camera center in world coordinates: C = -R^T @ T
    cam_center = -rotation.T @ translation

    # Unit vector from the camera center toward the world origin
    to_origin_world = -cam_center / torch.norm(cam_center)

    # Same direction in the camera frame; equals (0, 0, -1) when the camera
    # looks straight at the origin
    to_origin_cam = rotation @ to_origin_world

    # Component along the viewing axis (-Z), shared by both angles
    along_view = -to_origin_cam[2]

    # Pitch: angle in the camera YZ plane; yaw: angle in the camera XZ plane
    pitch = torch.atan2(to_origin_cam[1], along_view)
    yaw = torch.atan2(to_origin_cam[0], along_view)

    return torch.tensor([pitch, yaw], dtype=torch.float32)
def reconstruct_camera_from_factorized(
    azimuth: float,
    elevation: float,
    radius: float,
    pitch: float = 0.0,
    yaw: float = 0.0,
    return_numpy: bool = True
):
    """
    Reconstruct camera rotation and translation from factorized parameters.

    The camera is placed on a sphere around the origin (Z-up spherical
    coordinates), oriented to look at the origin via create_lookat_rotation,
    and then rotated in its own frame so that the direction toward the origin
    has the requested pitch/yaw. The pitch/yaw definitions mirror the atan2
    formulas used by compute_angular_offset.

    Args:
        azimuth: Azimuth angle in radians (horizontal rotation)
        elevation: Elevation angle in radians (vertical rotation)
        radius: Distance from origin
        pitch: Pitch offset in radians (rotation around camera X-axis, up/down)
            Positive = camera tilted UP from origin-pointing
        yaw: Yaw offset in radians (rotation around camera Y-axis, left/right)
            Positive = camera turned RIGHT from origin-pointing
        return_numpy: If True, return numpy arrays; if False, return torch tensors

    Returns:
        R: (3, 3) world-to-camera rotation matrix
        T: (3,) world-to-camera translation vector, T = -R @ camera_pos

    Example:
        >>> # Camera at azimuth=45°, elevation=30°, radius=7, looking at origin
        >>> R, T = reconstruct_camera_from_factorized(
        ...     azimuth=np.pi/4, elevation=np.pi/6, radius=7.0, pitch=0.0, yaw=0.0
        ... )
        >>> # Camera position is (7*cos(30°)*cos(45°), 7*cos(30°)*sin(45°), 7*sin(30°))
    """
    import torch
    import numpy as np

    # 1. Camera position in world coordinates (spherical to Cartesian, Z-up)
    cos_el = np.cos(elevation)
    sin_el = np.sin(elevation)
    cos_az = np.cos(azimuth)
    sin_az = np.sin(azimuth)

    camera_pos = np.array([
        radius * cos_el * cos_az,
        radius * cos_el * sin_az,
        radius * sin_el
    ], dtype=np.float32)

    # 2. Base rotation: camera looking at the origin (pitch = yaw = 0).
    # create_lookat_rotation raises ValueError if camera_pos is within 1e-6 of
    # the target, which here means a (near-)zero radius.
    target_pos = np.array([0.0, 0.0, 0.0], dtype=np.float32)
    up_vector = np.array([0.0, 0.0, 1.0], dtype=np.float32)

    R_base = CameraTransformUtils.create_lookat_rotation(camera_pos, target_pos, up_vector)

    # 3. Rotation offset from pitch/yaw, expressed in the camera frame.
    # With R_base alone the direction toward the origin is (0, 0, -1) in camera
    # coordinates; R_offset moves that direction to the one encoded by pitch/yaw.
    if abs(pitch) < 1e-9 and abs(yaw) < 1e-9:
        # No offset requested: skip the construction below entirely
        R = R_base
    else:
        # Invert the extraction formulas
        #   pitch = atan2(y, -z)  →  tan(pitch) = y / (-z)
        #   yaw   = atan2(x, -z)  →  tan(yaw)   = x / (-z)
        # by picking (x, y, z) ∝ (tan(yaw), tan(pitch), -1) and normalizing.
        tan_pitch = np.tan(pitch)
        tan_yaw = np.tan(yaw)

        # Length of (tan_yaw, tan_pitch, -1)
        norm_factor = np.sqrt(1 + tan_pitch**2 + tan_yaw**2)

        # Unit direction whose atan2-derived pitch/yaw match the inputs
        target_dir_cam = np.array([
            tan_yaw / norm_factor,      # x component
            tan_pitch / norm_factor,    # y component
            -1.0 / norm_factor          # z component (negative = in front of the camera)
        ], dtype=np.float32)

        # Direction toward the origin in the camera frame before the offset
        initial_dir_cam = np.array([0.0, 0.0, -1.0], dtype=np.float32)

        # Rotation taking initial_dir_cam to target_dir_cam:
        #   axis = initial × target, angle = acos(initial · target)
        dot_product = np.dot(initial_dir_cam, target_dir_cam)

        if dot_product > 0.999999:
            # Directions are nearly identical, no rotation needed
            R_offset = np.eye(3, dtype=np.float32)
        elif dot_product < -0.999999:
            # Directions are opposite: rotate 180° around the Y axis.
            # NOTE(review): target_dir_cam's z component is -1/norm_factor, so
            # dot_product = 1/norm_factor is positive for finite pitch/yaw and
            # this branch appears unreachable — kept as a safeguard.
            R_offset = np.array([
                [-1, 0, 0],
                [0, 1, 0],
                [0, 0, -1]
            ], dtype=np.float32)
        else:
            # General case: Rodrigues' formula
            axis = np.cross(initial_dir_cam, target_dir_cam)
            axis = axis / np.linalg.norm(axis)
            # clip guards acos against round-off pushing the dot outside [-1, 1]
            angle = np.arccos(np.clip(dot_product, -1.0, 1.0))

            # R = I + sin(θ)K + (1-cos(θ))K^2, K being the skew-symmetric
            # cross-product matrix of the unit axis
            K = np.array([
                [0, -axis[2], axis[1]],
                [axis[2], 0, -axis[0]],
                [-axis[1], axis[0], 0]
            ], dtype=np.float32)

            R_offset = (np.eye(3, dtype=np.float32) +
                        np.sin(angle) * K +
                        (1 - np.cos(angle)) * (K @ K))

        # Left-multiplying applies the offset in the camera frame
        R = R_offset @ R_base

    # 4. World-to-camera translation for a camera centered at camera_pos
    T = -R @ camera_pos

    if return_numpy:
        return R, T
    else:
        return torch.from_numpy(R).float(), torch.from_numpy(T).float()