Spaces:

XinxuanLu
/

viewtoken-harmon-demo

Running on Zero

File size: 27,901 Bytes

becf13a

"""
Camera transformation utilities for relative pose dataset.

Provides unified camera transformation functions for converting between
different camera representations and computing relative poses.

Coordinate System: OpenCV/COLMAP convention
- X: right
- Y: down
- Z: forward (viewing direction, into the scene)
- Transformation: X_cam = R @ X_world + T (left-multiply convention)

Note: This differs from:
- PyTorch3D: X left, Y up, Z forward (same Z direction, flipped X and Y)
- Blender camera: X right, Y up, Z backward (same X direction, flipped Y and Z)
  (Blender world coordinates use Z-up: X right, Y forward, Z up)
"""

import torch
import numpy as np
from typing import Dict, List, Tuple
from scipy.spatial.transform import Rotation


class CameraTransformUtils:
    """Unified camera transformation utilities for relative pose training."""

    @staticmethod
    def rotation_matrix_to_6d(R: np.ndarray) -> np.ndarray:
        """
        Convert 3x3 rotation matrix to 6D representation.

        Uses first two columns of rotation matrix, flattened.

        Args:
            R: (3, 3) rotation matrix

        Returns:
            rot_6d: (6,) vector [r1, r2] where r1 and r2 are the first two columns of R

        Example:
            >>> R = np.eye(3)
            >>> rot_6d = rotation_matrix_to_6d(R)
            >>> np.allclose(rot_6d, [1, 0, 0, 0, 1, 0])
            True
        """
        rot_6d = R[:, :2].flatten()
        return rot_6d

    @staticmethod
    def rotation_matrix_to_axis_angle(R: np.ndarray) -> np.ndarray:
        """
        Convert 3x3 rotation matrix to axis-angle representation.

        Uses Rodrigues formula via scipy.

        Args:
            R: (3, 3) rotation matrix

        Returns:
            axis_angle: (3,) vector where:
                - direction is the rotation axis
                - magnitude is the rotation angle in radians
                - range: each component in [-π, π]

        Example:
            >>> R = np.eye(3)  # Identity rotation
            >>> axis_angle = rotation_matrix_to_axis_angle(R)
            >>> np.allclose(axis_angle, [0, 0, 0])
            True
        """
        # Use scipy's Rotation class
        rot = Rotation.from_matrix(R)
        axis_angle = rot.as_rotvec()  # Returns axis-angle representation
        return axis_angle

    @staticmethod
    def axis_angle_to_rotation_matrix(axis_angle: np.ndarray) -> np.ndarray:
        """
        Convert axis-angle representation to 3x3 rotation matrix.

        Args:
            axis_angle: (3,) vector [rot_x, rot_y, rot_z]
                - direction is the rotation axis
                - magnitude is the rotation angle in radians

        Returns:
            R: (3, 3) rotation matrix

        Example:
            >>> axis_angle = np.array([0, 0, np.pi/2])  # 90° rotation around Z
            >>> R = axis_angle_to_rotation_matrix(axis_angle)
            >>> # Should be approximately [[0, -1, 0], [1, 0, 0], [0, 0, 1]]
        """
        # Use scipy's Rotation class
        rot = Rotation.from_rotvec(axis_angle)
        R = rot.as_matrix()
        return R

    @staticmethod
    def compute_relative_pose(
        R_src: np.ndarray,
        T_src: np.ndarray,
        R_tgt: np.ndarray,
        T_tgt: np.ndarray,
    ) -> Dict[str, np.ndarray]:
        """
        Compute relative pose from source camera to target camera.

        The relative pose transforms points from source camera frame to target camera frame.

        Math:
            R_rel = R_tgt @ R_src.T
            T_rel = T_tgt - R_rel @ T_src

        This means: X_tgt = R_rel @ X_src + T_rel

        Args:
            R_src: (3, 3) source camera rotation matrix
            T_src: (3,) source camera translation vector
            R_tgt: (3, 3) target camera rotation matrix
            T_tgt: (3,) target camera translation vector

        Returns:
            relative_pose: {
                'rotation': (3,) axis-angle [rot_x, rot_y, rot_z],
                'translation': (3,) [trans_x, trans_y, trans_z]
            }

        Example:
            >>> # Same camera -> identity transformation
            >>> R = np.eye(3)
            >>> T = np.array([1.0, 2.0, 3.0])
            >>> rel = compute_relative_pose(R, T, R, T)
            >>> np.allclose(rel['rotation'], [0, 0, 0])
            True
            >>> np.allclose(rel['translation'], [0, 0, 0])
            True
        """
        # Compute relative rotation
        R_rel = R_tgt @ R_src.T

        # Compute relative translation
        # Formula: T_rel = T_tgt - R_rel @ T_src
        # This gives the translation from source camera origin to target camera origin
        # in the target camera's coordinate frame
        T_rel = T_tgt - R_rel @ T_src

        # Convert rotation to axis-angle
        rotation_axis_angle = CameraTransformUtils.rotation_matrix_to_axis_angle(R_rel)

        rotation_6d = CameraTransformUtils.rotation_matrix_to_6d(R_rel)

        return {
            'rotation_6d': rotation_6d,
            'rotation': rotation_axis_angle,
            'translation': T_rel,
        }

    @staticmethod
    def apply_relative_pose(
        R_src: np.ndarray,
        T_src: np.ndarray,
        relative_pose: Dict[str, np.ndarray]
    ) -> Tuple[np.ndarray, np.ndarray]:
        """
        Apply relative pose to source camera to get target camera.

        This is the inverse operation of compute_relative_pose.
        Can be used to validate that relative pose computation is correct.

        Args:
            R_src: (3, 3) source camera rotation matrix
            T_src: (3,) source camera translation vector
            relative_pose: dict with:
                - 'rotation': (3,) axis-angle
                - 'translation': (3,) relative translation

        Returns:
            R_tgt: (3, 3) target camera rotation matrix
            T_tgt: (3,) target camera translation vector

        Example:
            >>> # Round-trip test
            >>> R1, T1 = random_rotation(), random_translation()
            >>> R2, T2 = random_rotation(), random_translation()
            >>> rel = compute_relative_pose(R1, T1, R2, T2)
            >>> R2_rec, T2_rec = apply_relative_pose(R1, T1, rel)
            >>> np.allclose(R2, R2_rec) and np.allclose(T2, T2_rec)
            True
        """
        # Convert axis-angle to rotation matrix
        R_rel = CameraTransformUtils.axis_angle_to_rotation_matrix(relative_pose['rotation'])
        T_rel = relative_pose['translation']

        # Apply relative transformation
        # R_rel = R_tgt @ R_src.T  =>  R_tgt = R_rel @ R_src
        R_tgt = R_rel @ R_src

        # T_rel = T_tgt - R_rel @ T_src  =>  T_tgt = R_rel @ T_src + T_rel
        T_tgt = R_rel @ T_src + T_rel

        return R_tgt, T_tgt

    @staticmethod
    def normalize_scene_translations(
        translations: List[np.ndarray],
        scale_factor: float = 2.0
    ) -> Tuple[List[np.ndarray], float]:
        """
        Normalize translations for a scene based on bounding sphere.

        Computes the radius of the bounding sphere containing all camera positions,
        then normalizes all translations to approximately [-0.5, 0.5] range.

        Args:
            translations: List of (3,) translation vectors
            scale_factor: Divisor for normalization (default: 2.0)
                - scale_factor=2.0 → range ≈ [-0.5, 0.5]
                - scale_factor=1.0 → range ≈ [-1.0, 1.0]

        Returns:
            normalized_translations: List of (3,) normalized vectors
            scene_radius: Float, radius of bounding sphere

        Example:
            >>> translations = [np.array([1, 0, 0]), np.array([0, 2, 0])]
            >>> norm_trans, radius = normalize_scene_translations(translations)
            >>> radius  # max(||T||) = 2.0
            2.0
            >>> # With scale_factor=2.0, translations are in [-0.5, 0.5] range
        """
        # Compute bounding sphere radius (maximum distance from origin)
        norms = [np.linalg.norm(T) for T in translations]
        scene_radius = max(norms)

        if scene_radius < 1e-6:
            # Degenerate case: all cameras at origin
            return translations, 1.0

        # Normalize all translations
        normalized = [T / (scale_factor * scene_radius) for T in translations]

        return normalized, scene_radius

    @staticmethod
    def create_lookat_rotation(
        camera_pos: np.ndarray,
        target_pos: np.ndarray,
        up_vector: np.ndarray = np.array([0.0, 0.0, 1.0])
    ) -> np.ndarray:
        """
        Create a look-at rotation matrix for left-multiply convention.

        NOTE: This function produces Blender convention (+X right, +Y up, +Z backward).
        Must apply Y,Z flip (diag(1, -1, -1)) to convert to our standard.

        Constructs a world-to-camera rotation matrix where:
        - Camera -Z axis points from camera to target (viewing direction)
        - Camera +Y axis aligns with the up vector as much as possible
        - Camera +X axis is the right direction (right-handed system)

        Args:
            camera_pos: (3,) camera position in world coordinates
            target_pos: (3,) target position to look at (usually origin)
            up_vector: (3,) approximate up direction in world (default: [0,0,1] for Blender Z-up)

        Returns:
            R: (3, 3) world-to-camera rotation matrix (Blender convention)
               Rows are camera axes (X, Y, Z) expressed in world coordinates
               Output: +X right, +Y up (aligns with world Z-up), +Z backward (away from scene)

        Example:
            >>> camera_pos = np.array([4.0, 0.0, 2.0])
            >>> target_pos = np.array([0.0, 0.0, 0.0])
            >>> R = create_lookat_rotation(camera_pos, target_pos)
            >>> # Camera at (4, 0, 2) in world Z-up coordinates, looking at origin
            >>> # Camera Y-axis aligns with world Z-up
            >>> # Apply diag(1, -1, -1) to convert to our standard
        """
        # Compute forward direction (from camera toward target)
        forward = target_pos - camera_pos
        forward_norm = np.linalg.norm(forward)
        if forward_norm < 1e-6:
            raise ValueError("Camera position and target position are too close")
        forward = forward / forward_norm

        # Camera -Z points toward target, so camera +Z points away from target
        z_axis = -forward

        # Compute right direction (X axis)
        # X = up × Z (cross product gives right direction)
        right = np.cross(up_vector, z_axis)
        right_norm = np.linalg.norm(right)
        if right_norm < 1e-6:
            # Camera is looking straight up or down, choose arbitrary right vector
            # If looking up/down, use X axis as right
            right = np.array([1.0, 0.0, 0.0])
            # Make sure it's perpendicular to z_axis
            right = right - np.dot(right, z_axis) * z_axis
            right_norm = np.linalg.norm(right)
            if right_norm < 1e-6:
                # Use Y axis instead
                right = np.array([0.0, 1.0, 0.0])
                right = right - np.dot(right, z_axis) * z_axis
                right_norm = np.linalg.norm(right)
        right = right / right_norm

        # Recompute up to ensure orthonormality
        # Y = Z × X (ensures right-handed coordinate system)
        up_actual = np.cross(z_axis, right)

        # Construct rotation matrix
        # Rows are camera axes in world coordinates (for left-multiply convention)
        R = np.array([
            right,      # Camera X axis (right)
            up_actual,  # Camera Y axis (up)
            z_axis      # Camera Z axis (backward, away from target)
        ], dtype=np.float32)

        return R

    @staticmethod
    def spherical_to_rotation_matrix(
        azimuth: float,
        elevation: float,
        roll: float = 0.0
    ) -> np.ndarray:
        """
        Convert spherical camera angles to rotation matrix.

        DEPRECATED: Use create_lookat_rotation instead for accurate look-at cameras.

        NOTE: This produces Blender convention (+X right, +Y up, +Z backward).
        Must apply Y,Z flip (diag(1, -1, -1)) to convert to our standard.

        Assumes camera is on a sphere looking at the origin.
        Rotation order: Y (azimuth) → X (elevation) → Z (roll)

        This is commonly used for Objaverse-style datasets where cameras
        are positioned on a sphere around an object.

        Args:
            azimuth: Horizontal rotation in radians [-π, π]
                - 0: camera at +X
                - π/2: camera at +Z
            elevation: Vertical rotation in radians [-π/2, π/2]
                - 0: camera on equator
                - π/2: camera at north pole (+Y)
            roll: Roll rotation in radians (default: 0)

        Returns:
            R: (3, 3) rotation matrix (Blender convention)
               Output: +X right, +Y up, +Z backward (away from scene)

        Example:
            >>> # Camera at +X looking at origin
            >>> R = spherical_to_rotation_matrix(azimuth=0, elevation=0)
            >>> # R produces Blender convention: +Z backward (toward camera)
        """
        # Rotation matrix for rotation around Y-axis (azimuth)
        cos_az, sin_az = np.cos(azimuth), np.sin(azimuth)
        R_y = np.array([
            [cos_az, 0, sin_az],
            [0, 1, 0],
            [-sin_az, 0, cos_az]
        ])

        # Rotation matrix for rotation around X-axis (elevation)
        cos_el, sin_el = np.cos(elevation), np.sin(elevation)
        R_x = np.array([
            [1, 0, 0],
            [0, cos_el, -sin_el],
            [0, sin_el, cos_el]
        ])

        # Rotation matrix for rotation around Z-axis (roll)
        cos_roll, sin_roll = np.cos(roll), np.sin(roll)
        R_z = np.array([
            [cos_roll, -sin_roll, 0],
            [sin_roll, cos_roll, 0],
            [0, 0, 1]
        ])

        # Combined rotation: first azimuth (Y), then elevation (X), then roll (Z)
        R = R_z @ R_x @ R_y

        return R

    @staticmethod
    def rotation_matrix_to_spherical(R: np.ndarray) -> Tuple[float, float, float]:
        """
        Extract azimuth, elevation, and roll from rotation matrix.

        This is the inverse of spherical_to_rotation_matrix.
        Extracts the Euler angles assuming rotation order: Y (azimuth) → X (elevation) → Z (roll)

        Args:
            R: (3, 3) rotation matrix

        Returns:
            azimuth: Horizontal rotation in radians [-π, π]
                - 0: camera looking along +X
                - π/2: camera looking along +Z
            elevation: Vertical rotation in radians [-π/2, π/2]
                - 0: camera on equator
                - π/2: camera at north pole (+Y)
            roll: Roll rotation in radians [-π, π]
                - 0: no roll

        Example:
            >>> # Round-trip test
            >>> R_orig = spherical_to_rotation_matrix(az=0.5, el=0.3, roll=0.1)
            >>> az, el, roll = rotation_matrix_to_spherical(R_orig)
            >>> R_reconstructed = spherical_to_rotation_matrix(az, el, roll)
            >>> np.allclose(R_orig, R_reconstructed)
            True
        """
        from scipy.spatial.transform import Rotation as R_scipy

        # Convert to scipy Rotation
        rot = R_scipy.from_matrix(R)

        # Extract Euler angles in YXZ order (intrinsic)
        # This matches the construction order in spherical_to_rotation_matrix:
        # R = R_z @ R_x @ R_y means intrinsic rotations: Y, then X, then Z
        # In scipy, lowercase = intrinsic, uppercase = extrinsic
        angles = rot.as_euler('yxz', degrees=False)

        azimuth = angles[0]
        elevation = angles[1]
        roll = angles[2]

        return azimuth, elevation, roll

    @staticmethod
    def rotation_matrix_to_camera_angles(R: np.ndarray) -> Dict[str, float]:
        """
        Extract viewing direction angles from rotation matrix.

        Computes azimuth and elevation from the camera's forward direction (+Z axis).
        More robust than rotation_matrix_to_spherical for general rotations.

        Args:
            R: (3, 3) rotation matrix (world-to-camera, our convention)
                - Rows are camera axes in world coordinates
                - Our convention: +X right, +Y down, +Z forward

        Returns:
            dict with:
                - 'azimuth': Horizontal angle in XZ plane (radians) [-π, π]
                - 'elevation': Vertical angle from XZ plane (radians) [-π/2, π/2]
                - 'roll': Roll around viewing direction (radians) [-π, π]

        Example:
            >>> R = create_lookat_rotation([2, 1, 0], [0, 0, 0])
            >>> # (After applying Y,Z flip to R)
            >>> angles = rotation_matrix_to_camera_angles(R)
            >>> # azimuth ≈ 0, elevation ≈ atan2(-1, 2) (Y is down!)
        """
        # Camera viewing direction in world coords (camera looks along +Z in our convention)
        forward = R[2, :]  # Z-axis (row 2)

        # Azimuth: angle in XZ plane (rotation around Y axis)
        azimuth = np.arctan2(forward[2], forward[0])

        # Elevation: angle from XZ plane (rotation around X axis)
        # In our convention, +Y is down, so negative Y component means looking up
        # elevation = arcsin(-forward_y) if forward is normalized
        # More stable: elevation = atan2(-y, sqrt(x^2 + z^2))
        elevation = np.arctan2(-forward[1], np.sqrt(forward[0]**2 + forward[2]**2))

        # Roll: rotation around viewing direction
        # Compute expected up vector for zero roll
        # Create rotation with same azimuth/elevation but zero roll
        R_no_roll = CameraTransformUtils.spherical_to_rotation_matrix(azimuth, elevation, roll=0.0)
        expected_up = R_no_roll[1, :]  # Expected up vector (row 1)
        actual_up = R[1, :]  # Actual up vector

        # Project both onto plane perpendicular to forward
        expected_up_proj = expected_up - np.dot(expected_up, forward) * forward
        actual_up_proj = actual_up - np.dot(actual_up, forward) * forward

        # Normalize projections
        expected_up_norm = np.linalg.norm(expected_up_proj)
        actual_up_norm = np.linalg.norm(actual_up_proj)

        if expected_up_norm > 1e-6 and actual_up_norm > 1e-6:
            expected_up_proj = expected_up_proj / expected_up_norm
            actual_up_proj = actual_up_proj / actual_up_norm

            # Compute roll angle using atan2 for proper quadrant
            cos_roll = np.dot(expected_up_proj, actual_up_proj)
            # Cross product gives vector along forward direction, sign indicates rotation direction
            cross_prod = np.cross(expected_up_proj, actual_up_proj)
            sin_roll = np.dot(cross_prod, forward)
            roll = np.arctan2(sin_roll, cos_roll)
        else:
            # Degenerate case: camera pointing straight up or down
            roll = 0.0

        return {
            'azimuth': azimuth,
            'elevation': elevation,
            'roll': roll
        }

    @staticmethod
    def clip_and_warn(
        values: np.ndarray,
        min_val: float,
        max_val: float,
        name: str = "values"
    ) -> np.ndarray:
        """
        Clip values to range and warn if clipping occurs.

        Useful for detecting normalization issues.

        Args:
            values: Array to clip
            min_val: Minimum value
            max_val: Maximum value
            name: Name for warning message

        Returns:
            clipped: Clipped array
        """
        clipped = np.clip(values, min_val, max_val)

        # Check if any values were clipped
        num_clipped_low = np.sum(values < min_val)
        num_clipped_high = np.sum(values > max_val)

        if num_clipped_low > 0 or num_clipped_high > 0:
            print(f"Warning: {name} clipped:")
            if num_clipped_low > 0:
                print(f"  {num_clipped_low} values below {min_val} (min: {values.min():.4f})")
            if num_clipped_high > 0:
                print(f"  {num_clipped_high} values above {max_val} (max: {values.max():.4f})")

        return clipped


def compute_angular_offset(rotation: 'torch.Tensor', translation: 'torch.Tensor', normalizer: float = 7.0) -> 'torch.Tensor':
    """
    Measure how far a camera's view axis deviates from pointing at the world origin.

    The camera centre is recovered from the world-to-camera pose, the unit
    direction from that centre to the origin is expressed in the camera frame,
    and its deviation from the camera's -Z axis is reported as two angles.
    A camera whose -Z axis points exactly at the origin yields [0, 0].

    Args:
        rotation: (3, 3) world-to-camera rotation matrix R
        translation: (3,) world-to-camera translation vector T
        normalizer: Accepted for interface compatibility; not used by the
            computation.

    Returns:
        angular_offset: (2,) float32 tensor [pitch, yaw] in radians, where
            (x, y, z) is the origin direction in camera coordinates:
            - pitch = atan2(y, -z): positive when the origin lies toward the
              camera's +Y side of the -Z view axis
            - yaw = atan2(x, -z): positive when the origin lies toward the
              camera's +X side of the -Z view axis
            The result is a freshly built tensor (via torch.tensor).
    """
    import torch

    # Camera centre in world coordinates: C = -R^T @ T.
    cam_center = -rotation.T @ translation

    # Unit vector from the camera centre toward the world origin.
    to_origin_world = -cam_center / torch.norm(cam_center)

    # The same direction in camera coordinates. The code treats -Z as the
    # viewing axis, so a perfectly aimed camera gives (0, 0, -1) here.
    to_origin_cam = rotation @ to_origin_world
    x_cam, y_cam, z_cam = to_origin_cam[0], to_origin_cam[1], to_origin_cam[2]

    # Distance along the viewing axis (-Z); shared denominator of both angles.
    depth = -z_cam

    # Angle within the camera YZ plane (rotation about the camera X axis).
    pitch = torch.atan2(y_cam, depth)

    # Angle within the camera XZ plane (rotation about the camera Y axis).
    yaw = torch.atan2(x_cam, depth)

    return torch.tensor([pitch, yaw], dtype=torch.float32)


def reconstruct_camera_from_factorized(
    azimuth: float,
    elevation: float,
    radius: float,
    pitch: float = 0.0,
    yaw: float = 0.0,
    return_numpy: bool = True
):
    """
    Rebuild a world-to-camera pose from spherical position plus pitch/yaw offsets.

    Intended as the counterpart of compute_angular_offset: the camera is placed
    on a sphere around the origin, aimed at the origin with
    CameraTransformUtils.create_lookat_rotation (Z-up world, camera looks along
    its -Z axis), and then rotated in its own frame so that the direction to
    the origin satisfies pitch = atan2(y, -z) and yaw = atan2(x, -z).

    Args:
        azimuth: Azimuth angle in radians (angle in the world XY plane)
        elevation: Elevation angle in radians (angle above the world XY plane)
        radius: Distance of the camera from the origin
        pitch: Pitch offset in radians (rotation about the camera X axis)
        yaw: Yaw offset in radians (rotation about the camera Y axis)
        return_numpy: If True, return numpy arrays; if False, return torch tensors

    Returns:
        R: (3, 3) world-to-camera rotation matrix
        T: (3,) world-to-camera translation vector, T = -R @ camera_position

    Example:
        >>> # Camera at azimuth=45°, elevation=30°, radius=7, looking at origin
        >>> R, T = reconstruct_camera_from_factorized(
        ...     azimuth=np.pi/4, elevation=np.pi/6, radius=7.0, pitch=0.0, yaw=0.0
        ... )
        >>> # Camera position: (7*cos(30°)*cos(45°), 7*cos(30°)*sin(45°), 7*sin(30°))
    """
    import torch
    import numpy as np

    # 1. Spherical -> Cartesian camera position (world Z is up).
    cos_el, sin_el = np.cos(elevation), np.sin(elevation)
    cos_az, sin_az = np.cos(azimuth), np.sin(azimuth)
    camera_pos = np.array([
        radius * cos_el * cos_az,
        radius * cos_el * sin_az,
        radius * sin_el
    ], dtype=np.float32)

    # 2. Base orientation: camera aimed exactly at the origin (pitch = yaw = 0).
    origin = np.array([0.0, 0.0, 0.0], dtype=np.float32)
    world_up = np.array([0.0, 0.0, 1.0], dtype=np.float32)
    R_base = CameraTransformUtils.create_lookat_rotation(camera_pos, origin, world_up)

    # 3. Optional offset rotation, applied in the camera frame.
    R = R_base
    if not (abs(pitch) < 1e-9 and abs(yaw) < 1e-9):
        # Direction the origin must have in camera coordinates so that
        #   pitch = atan2(y, -z)  and  yaw = atan2(x, -z)
        # hold simultaneously: (tan(yaw), tan(pitch), -1), scaled to unit length.
        tan_pitch = np.tan(pitch)
        tan_yaw = np.tan(yaw)
        length = np.sqrt(1 + tan_pitch**2 + tan_yaw**2)
        target_dir_cam = np.array([
            tan_yaw / length,
            tan_pitch / length,
            -1.0 / length
        ], dtype=np.float32)

        # With R_base alone the origin sits straight ahead, at (0, 0, -1).
        initial_dir_cam = np.array([0.0, 0.0, -1.0], dtype=np.float32)

        # Cosine of the angle between the two unit directions.
        cos_between = np.dot(initial_dir_cam, target_dir_cam)

        if cos_between > 0.999999:
            # Directions coincide to within tolerance: no offset needed.
            R_offset = np.eye(3, dtype=np.float32)
        elif cos_between < -0.999999:
            # Opposite directions: 180° turn about the Y axis. Defensive only;
            # the target's z component is negative, so this cosine is positive.
            R_offset = np.array([
                [-1, 0, 0],
                [0, 1, 0],
                [0, 0, -1]
            ], dtype=np.float32)
        else:
            # Rodrigues' formula for the rotation taking initial onto target:
            #   R = I + sin(θ) K + (1 - cos(θ)) K², K = skew(axis),
            #   axis = initial × target (normalised), θ = acos(initial · target).
            axis = np.cross(initial_dir_cam, target_dir_cam)
            axis = axis / np.linalg.norm(axis)
            angle = np.arccos(np.clip(cos_between, -1.0, 1.0))

            K = np.array([
                [0, -axis[2], axis[1]],
                [axis[2], 0, -axis[0]],
                [-axis[1], axis[0], 0]
            ], dtype=np.float32)

            R_offset = (np.eye(3, dtype=np.float32) +
                        np.sin(angle) * K +
                        (1 - np.cos(angle)) * (K @ K))

        # Left-multiplying applies the offset in camera coordinates.
        R = R_offset @ R_base

    # 4. World-to-camera translation for a camera centred at camera_pos.
    T = -R @ camera_pos

    if not return_numpy:
        return torch.from_numpy(R).float(), torch.from_numpy(T).float()
    return R, T