"""
Camera transformation utilities for relative pose dataset.

Provides unified camera transformation functions for converting between
different camera representations and computing relative poses.

Coordinate System: OpenCV/COLMAP convention
    - X: right
    - Y: down
    - Z: forward (viewing direction, into the scene)
    - Transformation: X_cam = R @ X_world + T (left-multiply convention)

Note: This differs from:
    - PyTorch3D: X left, Y up, Z forward (same Z direction, flipped X and Y)
    - Blender camera: X right, Y up, Z backward (same X direction, flipped Y and Z)
      (Blender world coordinates use Z-up: X right, Y forward, Z up)

Some helpers in this module (``create_lookat_rotation``,
``spherical_to_rotation_matrix`` and the two module-level functions
``compute_angular_offset`` / ``reconstruct_camera_from_factorized``) work in the
Blender/OpenGL camera convention instead; each of them says so in its docstring.
"""

import torch
import numpy as np
from typing import Dict, List, Optional, Tuple
from scipy.spatial.transform import Rotation


class CameraTransformUtils:
    """Unified camera transformation utilities for relative pose training."""

    @staticmethod
    def rotation_matrix_to_6d(R: np.ndarray) -> np.ndarray:
        """
        Convert 3x3 rotation matrix to 6D representation.

        Uses the first two columns of the rotation matrix, concatenated.

        Args:
            R: (3, 3) rotation matrix

        Returns:
            rot_6d: (6,) vector [r1, r2] where r1 and r2 are the first two
                columns of R, i.e. [R00, R10, R20, R01, R11, R21]

        Example:
            >>> R = np.eye(3)
            >>> rot_6d = CameraTransformUtils.rotation_matrix_to_6d(R)
            >>> np.allclose(rot_6d, [1, 0, 0, 0, 1, 0])
            True
        """
        # Column-major flatten so that the output is column 0 followed by
        # column 1. A plain (row-major) flatten would interleave the two
        # columns and contradict the documented [r1, r2] layout.
        return np.asarray(R)[:, :2].flatten(order="F")

    @staticmethod
    def rotation_matrix_to_axis_angle(R: np.ndarray) -> np.ndarray:
        """
        Convert 3x3 rotation matrix to axis-angle representation.

        Delegates to scipy's ``Rotation`` class.

        Args:
            R: (3, 3) rotation matrix

        Returns:
            axis_angle: (3,) vector where:
                - direction is the rotation axis
                - magnitude is the rotation angle in radians
                - range: each component in [-π, π]

        Example:
            >>> R = np.eye(3)  # Identity rotation
            >>> axis_angle = CameraTransformUtils.rotation_matrix_to_axis_angle(R)
            >>> np.allclose(axis_angle, [0, 0, 0])
            True
        """
        return Rotation.from_matrix(R).as_rotvec()

    @staticmethod
    def axis_angle_to_rotation_matrix(axis_angle: np.ndarray) -> np.ndarray:
        """
        Convert axis-angle representation to 3x3 rotation matrix.

        Args:
            axis_angle: (3,) vector [rot_x, rot_y, rot_z]
                - direction is the rotation axis
                - magnitude is the rotation angle in radians

        Returns:
            R: (3, 3) rotation matrix

        Example:
            >>> axis_angle = np.array([0, 0, np.pi/2])  # 90° rotation around Z
            >>> R = CameraTransformUtils.axis_angle_to_rotation_matrix(axis_angle)
            >>> # Should be approximately [[0, -1, 0], [1, 0, 0], [0, 0, 1]]
        """
        return Rotation.from_rotvec(axis_angle).as_matrix()

    @staticmethod
    def compute_relative_pose(
        R_src: np.ndarray,
        T_src: np.ndarray,
        R_tgt: np.ndarray,
        T_tgt: np.ndarray,
    ) -> Dict[str, np.ndarray]:
        """
        Compute relative pose from source camera to target camera.

        The relative pose transforms points from source camera frame to
        target camera frame.

        Math:
            R_rel = R_tgt @ R_src.T
            T_rel = T_tgt - R_rel @ T_src

        This means: X_tgt = R_rel @ X_src + T_rel

        Args:
            R_src: (3, 3) source camera rotation matrix
            T_src: (3,) source camera translation vector
            R_tgt: (3, 3) target camera rotation matrix
            T_tgt: (3,) target camera translation vector

        Returns:
            relative_pose: {
                'rotation_6d': (6,) first two columns of R_rel, concatenated,
                'rotation': (3,) axis-angle [rot_x, rot_y, rot_z],
                'translation': (3,) [trans_x, trans_y, trans_z]
            }

        Example:
            >>> # Same camera -> identity transformation
            >>> R = np.eye(3)
            >>> T = np.array([1.0, 2.0, 3.0])
            >>> rel = CameraTransformUtils.compute_relative_pose(R, T, R, T)
            >>> np.allclose(rel['rotation'], [0, 0, 0])
            True
            >>> np.allclose(rel['translation'], [0, 0, 0])
            True
        """
        # Relative rotation taking source-camera coordinates to target-camera
        # coordinates.
        R_rel = R_tgt @ R_src.T

        # Relative translation. Setting X_src = 0 in X_tgt = R_rel @ X_src + T_rel
        # shows that T_rel is the source camera centre expressed in the target
        # camera's coordinate frame.
        T_rel = T_tgt - R_rel @ T_src

        return {
            'rotation_6d': CameraTransformUtils.rotation_matrix_to_6d(R_rel),
            'rotation': CameraTransformUtils.rotation_matrix_to_axis_angle(R_rel),
            'translation': T_rel,
        }

    @staticmethod
    def apply_relative_pose(
        R_src: np.ndarray,
        T_src: np.ndarray,
        relative_pose: Dict[str, np.ndarray]
    ) -> Tuple[np.ndarray, np.ndarray]:
        """
        Apply relative pose to source camera to get target camera.

        This is the inverse operation of compute_relative_pose. Can be used
        to validate that relative pose computation is correct. Only the
        'rotation' (axis-angle) and 'translation' entries are read.

        Args:
            R_src: (3, 3) source camera rotation matrix
            T_src: (3,) source camera translation vector
            relative_pose: dict with:
                - 'rotation': (3,) axis-angle
                - 'translation': (3,) relative translation

        Returns:
            R_tgt: (3, 3) target camera rotation matrix
            T_tgt: (3,) target camera translation vector

        Example:
            >>> # Round-trip test
            >>> R1, T1 = random_rotation(), random_translation()
            >>> R2, T2 = random_rotation(), random_translation()
            >>> rel = CameraTransformUtils.compute_relative_pose(R1, T1, R2, T2)
            >>> R2_rec, T2_rec = CameraTransformUtils.apply_relative_pose(R1, T1, rel)
            >>> np.allclose(R2, R2_rec) and np.allclose(T2, T2_rec)
            True
        """
        R_rel = CameraTransformUtils.axis_angle_to_rotation_matrix(
            relative_pose['rotation']
        )
        T_rel = relative_pose['translation']

        # R_rel = R_tgt @ R_src.T  =>  R_tgt = R_rel @ R_src
        R_tgt = R_rel @ R_src

        # T_rel = T_tgt - R_rel @ T_src  =>  T_tgt = R_rel @ T_src + T_rel
        T_tgt = R_rel @ T_src + T_rel

        return R_tgt, T_tgt

    @staticmethod
    def normalize_scene_translations(
        translations: List[np.ndarray],
        scale_factor: float = 2.0
    ) -> Tuple[List[np.ndarray], float]:
        """
        Normalize translations for a scene based on bounding sphere.

        Takes the largest translation norm as the radius of an
        origin-centred bounding sphere, then divides every translation by
        ``scale_factor * radius``.

        Args:
            translations: List of (3,) translation vectors
            scale_factor: Divisor for normalization (default: 2.0)
                - scale_factor=2.0 → range ≈ [-0.5, 0.5]
                - scale_factor=1.0 → range ≈ [-1.0, 1.0]

        Returns:
            normalized_translations: List of (3,) normalized vectors. For an
                empty input an empty list is returned; when every norm is
                below 1e-6 the input is returned unchanged.
            scene_radius: Float, radius of bounding sphere (1.0 in the two
                degenerate cases above)

        Example:
            >>> translations = [np.array([1, 0, 0]), np.array([0, 2, 0])]
            >>> norm_trans, radius = CameraTransformUtils.normalize_scene_translations(translations)
            >>> radius  # max(||T||) = 2.0
            2.0
            >>> # With scale_factor=2.0, translations are in [-0.5, 0.5] range
        """
        # Nothing to normalize; avoid max() on an empty sequence.
        if len(translations) == 0:
            return [], 1.0

        # Bounding sphere radius (maximum distance from origin).
        scene_radius = max(np.linalg.norm(T) for T in translations)

        if scene_radius < 1e-6:
            # Degenerate case: all cameras at origin
            return translations, 1.0

        divisor = scale_factor * scene_radius
        normalized = [T / divisor for T in translations]

        return normalized, scene_radius

    @staticmethod
    def create_lookat_rotation(
        camera_pos: np.ndarray,
        target_pos: np.ndarray,
        up_vector: Optional[np.ndarray] = None
    ) -> np.ndarray:
        """
        Create a look-at rotation matrix for left-multiply convention.

        NOTE: This function produces Blender convention (+X right, +Y up, +Z backward).
        Must apply Y,Z flip (diag(1, -1, -1)) to convert to our standard.

        Constructs a world-to-camera rotation matrix where:
            - Camera -Z axis points from camera to target (viewing direction)
            - Camera +Y axis aligns with the up vector as much as possible
            - Camera +X axis is the right direction (right-handed system)

        Args:
            camera_pos: (3,) camera position in world coordinates
            target_pos: (3,) target position to look at (usually origin)
            up_vector: (3,) approximate up direction in world. ``None``
                (the default) means [0, 0, 1], i.e. Blender Z-up.

        Returns:
            R: (3, 3) float32 world-to-camera rotation matrix (Blender convention)
               Rows are camera axes (X, Y, Z) expressed in world coordinates
               Output: +X right, +Y up (aligns with world Z-up), +Z backward (away from scene)

        Raises:
            ValueError: if camera_pos and target_pos are closer than 1e-6.

        Example:
            >>> camera_pos = np.array([4.0, 0.0, 2.0])
            >>> target_pos = np.array([0.0, 0.0, 0.0])
            >>> R = CameraTransformUtils.create_lookat_rotation(camera_pos, target_pos)
            >>> # Camera at (4, 0, 2) in world Z-up coordinates, looking at origin
            >>> # Camera Y-axis aligns with world Z-up
            >>> # Apply diag(1, -1, -1) to convert to our standard
        """
        # Resolve the default here rather than in the signature so that no
        # array object is shared between calls.
        if up_vector is None:
            up_vector = np.array([0.0, 0.0, 1.0])

        # Accept plain sequences as well as arrays (dtype is left untouched).
        camera_pos = np.asarray(camera_pos)
        target_pos = np.asarray(target_pos)
        up_vector = np.asarray(up_vector)

        # Forward direction (from camera toward target)
        forward = target_pos - camera_pos
        forward_norm = np.linalg.norm(forward)
        if forward_norm < 1e-6:
            raise ValueError("Camera position and target position are too close")
        forward = forward / forward_norm

        # Camera -Z points toward target, so camera +Z points away from target
        z_axis = -forward

        # Right direction (X axis): X = up × Z
        right = np.cross(up_vector, z_axis)
        right_norm = np.linalg.norm(right)

        if right_norm < 1e-6:
            # Camera is looking straight along the up vector, so the cross
            # product vanishes. Fall back to world X, made perpendicular to
            # z_axis by removing its component along z_axis.
            right = np.array([1.0, 0.0, 0.0])
            right = right - np.dot(right, z_axis) * z_axis
            right_norm = np.linalg.norm(right)

            if right_norm < 1e-6:
                # World X is itself parallel to z_axis; use world Y instead.
                right = np.array([0.0, 1.0, 0.0])
                right = right - np.dot(right, z_axis) * z_axis
                right_norm = np.linalg.norm(right)

        right = right / right_norm

        # Recompute up to ensure orthonormality: Y = Z × X (right-handed)
        up_actual = np.cross(z_axis, right)

        # Rows are camera axes in world coordinates (left-multiply convention)
        R = np.array([
            right,      # Camera X axis (right)
            up_actual,  # Camera Y axis (up)
            z_axis      # Camera Z axis (backward, away from target)
        ], dtype=np.float32)

        return R

    @staticmethod
    def spherical_to_rotation_matrix(
        azimuth: float,
        elevation: float,
        roll: float = 0.0
    ) -> np.ndarray:
        """
        Convert spherical camera angles to rotation matrix.

        DEPRECATED: Use create_lookat_rotation instead for accurate look-at cameras.

        NOTE: This produces Blender convention (+X right, +Y up, +Z backward).
        Must apply Y,Z flip (diag(1, -1, -1)) to convert to our standard.

        Assumes camera is on a sphere looking at the origin.
        Rotation order: Y (azimuth) → X (elevation) → Z (roll), each about
        the fixed axes, i.e. R = R_z @ R_x @ R_y.

        This is commonly used for Objaverse-style datasets where cameras
        are positioned on a sphere around an object.

        Args:
            azimuth: Horizontal rotation in radians [-π, π]
                - 0: camera at +X
                - π/2: camera at +Z
            elevation: Vertical rotation in radians [-π/2, π/2]
                - 0: camera on equator
                - π/2: camera at north pole (+Y)
            roll: Roll rotation in radians (default: 0)

        Returns:
            R: (3, 3) rotation matrix (Blender convention)
               Output: +X right, +Y up, +Z backward (away from scene)

        Example:
            >>> # Camera at +X looking at origin
            >>> R = CameraTransformUtils.spherical_to_rotation_matrix(azimuth=0, elevation=0)
            >>> # R produces Blender convention: +Z backward (toward camera)
        """
        # Rotation around Y-axis (azimuth)
        cos_az, sin_az = np.cos(azimuth), np.sin(azimuth)
        R_y = np.array([
            [cos_az, 0, sin_az],
            [0, 1, 0],
            [-sin_az, 0, cos_az]
        ])

        # Rotation around X-axis (elevation)
        cos_el, sin_el = np.cos(elevation), np.sin(elevation)
        R_x = np.array([
            [1, 0, 0],
            [0, cos_el, -sin_el],
            [0, sin_el, cos_el]
        ])

        # Rotation around Z-axis (roll)
        cos_roll, sin_roll = np.cos(roll), np.sin(roll)
        R_z = np.array([
            [cos_roll, -sin_roll, 0],
            [sin_roll, cos_roll, 0],
            [0, 0, 1]
        ])

        # Combined rotation: first azimuth (Y), then elevation (X), then roll (Z)
        return R_z @ R_x @ R_y

    @staticmethod
    def rotation_matrix_to_spherical(R: np.ndarray) -> Tuple[float, float, float]:
        """
        Extract azimuth, elevation, and roll from rotation matrix.

        This is the inverse of spherical_to_rotation_matrix.
        Extracts the Euler angles assuming rotation order:
        Y (azimuth) → X (elevation) → Z (roll)

        Args:
            R: (3, 3) rotation matrix

        Returns:
            azimuth: Horizontal rotation in radians [-π, π]
            elevation: Vertical rotation in radians [-π/2, π/2]
            roll: Roll rotation in radians [-π, π]

        Example:
            >>> # Round-trip test
            >>> U = CameraTransformUtils
            >>> R_orig = U.spherical_to_rotation_matrix(azimuth=0.5, elevation=0.3, roll=0.1)
            >>> az, el, roll = U.rotation_matrix_to_spherical(R_orig)
            >>> R_reconstructed = U.spherical_to_rotation_matrix(az, el, roll)
            >>> np.allclose(R_orig, R_reconstructed)
            True
        """
        # spherical_to_rotation_matrix builds R = R_z @ R_x @ R_y: applied to
        # a vector this rotates about the fixed Y axis, then fixed X, then
        # fixed Z, which is an *extrinsic* y-x-z sequence. In scipy,
        # lowercase axis letters select extrinsic rotations (uppercase
        # selects intrinsic), so 'yxz' returns the angles in construction
        # order.
        azimuth, elevation, roll = Rotation.from_matrix(R).as_euler(
            'yxz', degrees=False
        )

        return azimuth, elevation, roll

    @staticmethod
    def rotation_matrix_to_camera_angles(R: np.ndarray) -> Dict[str, float]:
        """
        Extract viewing direction angles from rotation matrix.

        Computes azimuth and elevation from the camera's forward direction
        (+Z axis, row 2 of R). Roll is measured against the up row of
        ``spherical_to_rotation_matrix(azimuth, elevation, roll=0)``.

        NOTE(review): that reference matrix is documented as Blender
        convention while R is expected in the OpenCV convention; the roll
        value is only meaningful relative to this particular reference —
        confirm it matches what callers expect.

        Args:
            R: (3, 3) rotation matrix (world-to-camera, our convention)
                - Rows are camera axes in world coordinates
                - Our convention: +X right, +Y down, +Z forward

        Returns:
            dict with:
                - 'azimuth': Angle of the forward vector in the world XZ
                  plane, atan2(z, x) (radians) [-π, π]
                - 'elevation': Angle of the forward vector from the XZ plane,
                  atan2(-y, sqrt(x² + z²)) (radians) [-π/2, π/2]
                - 'roll': Roll around viewing direction (radians) [-π, π];
                  0.0 when either projected up vector is shorter than 1e-6

        Example:
            >>> R = CameraTransformUtils.create_lookat_rotation([2, 1, 0], [0, 0, 0])
            >>> R = np.diag([1.0, -1.0, -1.0]) @ R  # Blender -> our convention
            >>> angles = CameraTransformUtils.rotation_matrix_to_camera_angles(R)
            >>> # forward = (-2, -1, 0)/sqrt(5), so
            >>> # azimuth = atan2(0, -2) = π, elevation = atan2(1, 2)
        """
        # Camera viewing direction in world coords (camera looks along +Z in
        # our convention, and rows of R are the camera axes).
        forward = R[2, :]

        # Azimuth: angle in XZ plane (rotation around Y axis)
        azimuth = np.arctan2(forward[2], forward[0])

        # Elevation: angle from XZ plane. atan2(-y, sqrt(x^2 + z^2)) is used
        # instead of arcsin(-y) because it stays well-defined when forward is
        # not exactly unit length.
        elevation = np.arctan2(-forward[1], np.sqrt(forward[0]**2 + forward[2]**2))

        # Roll: compare the actual up row with the up row of a rotation that
        # has the same azimuth/elevation but zero roll.
        R_no_roll = CameraTransformUtils.spherical_to_rotation_matrix(
            azimuth, elevation, roll=0.0
        )
        expected_up = R_no_roll[1, :]
        actual_up = R[1, :]

        # Project both onto the plane perpendicular to forward
        expected_up_proj = expected_up - np.dot(expected_up, forward) * forward
        actual_up_proj = actual_up - np.dot(actual_up, forward) * forward

        expected_up_norm = np.linalg.norm(expected_up_proj)
        actual_up_norm = np.linalg.norm(actual_up_proj)

        if expected_up_norm > 1e-6 and actual_up_norm > 1e-6:
            expected_up_proj = expected_up_proj / expected_up_norm
            actual_up_proj = actual_up_proj / actual_up_norm

            # atan2 of (signed sine, cosine) gives the proper quadrant. The
            # cross product lies along forward; its sign gives the rotation
            # direction.
            cos_roll = np.dot(expected_up_proj, actual_up_proj)
            cross_prod = np.cross(expected_up_proj, actual_up_proj)
            sin_roll = np.dot(cross_prod, forward)
            roll = np.arctan2(sin_roll, cos_roll)
        else:
            # Degenerate case: a projected up vector vanished
            roll = 0.0

        return {
            'azimuth': azimuth,
            'elevation': elevation,
            'roll': roll
        }

    @staticmethod
    def clip_and_warn(
        values: np.ndarray,
        min_val: float,
        max_val: float,
        name: str = "values"
    ) -> np.ndarray:
        """
        Clip values to range and warn if clipping occurs.

        Useful for detecting normalization issues. The warning is printed to
        stdout.

        Args:
            values: Array (or array-like) to clip
            min_val: Minimum value
            max_val: Maximum value
            name: Name for warning message

        Returns:
            clipped: Clipped array
        """
        # Allow plain sequences; the comparisons and .min()/.max() below need
        # an ndarray.
        values = np.asarray(values)
        clipped = np.clip(values, min_val, max_val)

        # Count how many entries fell outside the range on each side
        num_clipped_low = np.sum(values < min_val)
        num_clipped_high = np.sum(values > max_val)

        if num_clipped_low > 0 or num_clipped_high > 0:
            print(f"Warning: {name} clipped:")
            if num_clipped_low > 0:
                print(f" {num_clipped_low} values below {min_val} (min: {values.min():.4f})")
            if num_clipped_high > 0:
                print(f" {num_clipped_high} values above {max_val} (max: {values.max():.4f})")

        return clipped


def compute_angular_offset(
    rotation: 'torch.Tensor',
    translation: 'torch.Tensor',
    normalizer: float = 7.0
) -> 'torch.Tensor':
    """
    Compute angular offset (pitch, yaw) between actual and expected looking directions.

    The camera is generated to look at the origin with some noise added to the
    direction. This function extracts that noise as angular deviations in the
    camera's local frame.

    Works in the OpenGL/Blender camera convention (X right, Y up, Z backward),
    i.e. the camera looks along -Z.

    Args:
        rotation: (3, 3) rotation matrix [R] from world-to-camera
        translation: (3,) translation vector [T] from world-to-camera
        normalizer: Scale factor for camera position (default: 7.0).
            Currently unused by the computation; kept for interface
            compatibility.

    Returns:
        angular_offset: (2,) float32 tensor with [pitch, yaw] in radians
            - pitch: Up/down angular offset
              (positive = camera tilted UP, object appears below center)
            - yaw: Left/right angular offset
              (positive = camera turned RIGHT, object appears left of center)
        If the camera centre is at the world origin the direction is
        undefined and the division by a zero norm yields NaN values.
    """
    # Camera position in world coordinates: C = -R^T @ T
    camera_position = -rotation.T @ translation

    # Expected direction: unit vector from the camera position toward the
    # origin, i.e. where the camera would look with no noise.
    expected_dir_world = -camera_position / torch.norm(camera_position)

    # Express that direction in the camera frame. If the camera looks
    # perfectly at the origin this is (0, 0, -1).
    expected_dir_cam = rotation @ expected_dir_world

    # The actual looking direction is (0, 0, -1) in camera space, so the
    # angles below measure how far the expected direction deviates from it.

    # Pitch (up/down): angle in the camera YZ plane, atan2(y, -z).
    # Positive = expected direction is above actual.
    pitch = torch.atan2(expected_dir_cam[1], -expected_dir_cam[2])

    # Yaw (left/right): angle in the camera XZ plane, atan2(x, -z).
    # Positive = expected direction is to the right.
    yaw = torch.atan2(expected_dir_cam[0], -expected_dir_cam[2])

    return torch.tensor([pitch, yaw], dtype=torch.float32)


def reconstruct_camera_from_factorized(
    azimuth: float,
    elevation: float,
    radius: float,
    pitch: float = 0.0,
    yaw: float = 0.0,
    return_numpy: bool = True
):
    """
    Reconstruct camera rotation and translation from factorized parameters.

    This is the inverse operation of compute_angular_offset. Given spherical
    coordinates and angular offsets, reconstructs the world-to-camera
    transformation (Blender/OpenGL camera convention: the camera looks
    along -Z, world is Z-up).

    The pitch/yaw round trip through compute_angular_offset holds for
    |pitch|, |yaw| < π/2, because the offsets are encoded through their
    tangents.

    Args:
        azimuth: Azimuth angle in radians (horizontal rotation)
        elevation: Elevation angle in radians (vertical rotation)
        radius: Distance from origin
        pitch: Pitch offset in radians (rotation around camera X-axis, up/down)
               Positive = camera tilted UP from origin-pointing
        yaw: Yaw offset in radians (rotation around camera Y-axis, left/right)
             Positive = camera turned RIGHT from origin-pointing
        return_numpy: If True, return numpy arrays; if False, return torch tensors

    Returns:
        R: (3, 3) float32 world-to-camera rotation matrix
        T: (3,) float32 world-to-camera translation vector

    Raises:
        ValueError: propagated from create_lookat_rotation when radius is
            (nearly) zero.

    Example:
        >>> # Camera at azimuth=45°, elevation=30°, radius=7, looking at origin
        >>> R, T = reconstruct_camera_from_factorized(
        ...     azimuth=np.pi/4, elevation=np.pi/6, radius=7.0, pitch=0.0, yaw=0.0
        ... )
        >>> # Camera position is
        >>> # (7*cos(30°)*cos(45°), 7*cos(30°)*sin(45°), 7*sin(30°))
    """
    # 1. Camera position in world coordinates (spherical to Cartesian, Z-up)
    cos_el = np.cos(elevation)
    sin_el = np.sin(elevation)
    cos_az = np.cos(azimuth)
    sin_az = np.sin(azimuth)

    camera_pos = np.array([
        radius * cos_el * cos_az,
        radius * cos_el * sin_az,
        radius * sin_el
    ], dtype=np.float32)

    # 2. Base rotation: camera looking exactly at the origin (pitch=yaw=0)
    target_pos = np.array([0.0, 0.0, 0.0], dtype=np.float32)
    up_vector = np.array([0.0, 0.0, 1.0], dtype=np.float32)
    R_base = CameraTransformUtils.create_lookat_rotation(camera_pos, target_pos, up_vector)

    # 3. Rotation offset from pitch/yaw in the camera frame.
    #    With R_base alone the direction to the origin maps to (0, 0, -1);
    #    R_offset moves it to the direction encoding the requested offsets.
    if abs(pitch) < 1e-9 and abs(yaw) < 1e-9:
        # No offset, use base rotation
        R = R_base
    else:
        # Working backwards from the extraction formulas
        #   pitch = atan2(y, -z)  →  tan(pitch) = y / (-z)
        #   yaw   = atan2(x, -z)  →  tan(yaw)   = x / (-z)
        # the unit vector satisfying both is (tan_yaw, tan_pitch, -1) / norm.
        tan_pitch = np.tan(pitch)
        tan_yaw = np.tan(yaw)
        norm_factor = np.sqrt(1 + tan_pitch**2 + tan_yaw**2)

        target_dir_cam = np.array([
            tan_yaw / norm_factor,    # x component
            tan_pitch / norm_factor,  # y component
            -1.0 / norm_factor        # z component (negative = in front of the camera)
        ], dtype=np.float32)

        # Direction to the origin in the camera frame after R_base
        initial_dir_cam = np.array([0.0, 0.0, -1.0], dtype=np.float32)

        # Rotation taking initial to target: axis = initial × target,
        # angle = acos(initial · target). The dot product equals
        # 1 / norm_factor, which is always positive, so the two directions
        # can never be opposite and no 180° special case is needed.
        dot_product = np.dot(initial_dir_cam, target_dir_cam)

        if dot_product > 0.999999:
            # Directions are nearly identical, no rotation needed
            R_offset = np.eye(3, dtype=np.float32)
        else:
            axis = np.cross(initial_dir_cam, target_dir_cam)
            axis = axis / np.linalg.norm(axis)
            angle = np.arccos(np.clip(dot_product, -1.0, 1.0))

            # Rodrigues' formula: R = I + sin(θ)K + (1-cos(θ))K^2
            # where K is the skew-symmetric matrix of the axis
            K = np.array([
                [0, -axis[2], axis[1]],
                [axis[2], 0, -axis[0]],
                [-axis[1], axis[0], 0]
            ], dtype=np.float32)

            R_offset = (np.eye(3, dtype=np.float32) +
                        np.sin(angle) * K +
                        (1 - np.cos(angle)) * (K @ K))

        # Final rotation: apply offset in camera frame
        R = R_offset @ R_base

    # 4. World-to-camera translation: T = -R @ C
    T = -R @ camera_pos

    if return_numpy:
        return R, T
    return torch.from_numpy(R).float(), torch.from_numpy(T).float()