# Copyright (c) Meta Platforms, Inc. and affiliates.
import torch
import numpy as np
import open3d as o3d
import trimesh
from pytorch3d.structures import Meshes
from pytorch3d.transforms import quaternion_to_matrix, Transform3d, matrix_to_quaternion
from sam3d_objects.data.dataset.tdfy.transforms_3d import compose_transform, decompose_transform
from sam3d_objects.data.dataset.tdfy.pose_target import PoseTargetConverter
from loguru import logger
from sam3d_objects.pipeline.layout_post_optimization_utils import (
    run_ICP,
    compute_iou,
    set_seed,
    apply_transform,
    get_mesh,
    get_mask_renderer,
    run_alignment,
    run_render_compare,
    check_occlusion,
)


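# Per-channel normalization statistics: ROTATION_6D_* de-normalize the 6D
# rotation outputs in pose_decoder below; SLAT_* are the corresponding moments
# for the structured latent (not used directly in this file).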
SLAT_STD = torch.tensor(
    [
        2.377650737762451,
        2.386378288269043,
        2.124418020248413,
        2.1748552322387695,
        2.663944721221924,
        2.371192216873169,
        2.6217446327209473,
        2.684523105621338,
    ]
)
SLAT_MEAN = torch.tensor(
    [
        -2.1687545776367188,
        -0.004347046371549368,
        -0.13352349400520325,
        -0.08418072760105133,
        -0.5271206498146057,
        0.7238689064979553,
        -1.1414450407028198,
        1.2039363384246826,
    ]
)

ROTATION_6D_MEAN = torch.tensor(
    [
        -0.06366084883674913,
        0.008438224692279752,
        0.00017084786438302483,
        0.0007126610473540038,
        -0.0030916726538816417,
        0.5166093753457688,
    ]
)
ROTATION_6D_STD = torch.tensor(
    [
        0.6656971967514863,
        0.6787012271867754,
        0.30345010594844524,
        0.4394504420678794,
        0.39817973931717104,
        0.6176286868761914,
    ]
)


def layout_post_optimization(
    Mesh,
    Quaternion,
    Translation,
    Scale,
    Mask,
    Point_Map,
    Intrinsics,
    Enable_shape_ICP=True,
    Enable_rendering_optimization=True,
    min_size=512,
    device=None,
):
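    """
    Refine a predicted object layout (quaternion / translation / scale) against
    an instance mask and point map in up to three steps: (1) manual alignment,
    (2) shape ICP, (3) render-and-compare, each accepted only if it improves
    the rendered-mask IoU over the previous estimate.

    Returns:
        (quaternion, translation, scale, final_iou, Flag_ICP, Flag_optim).
        The inputs are returned unchanged with final_iou == -1.0 when the
        object is occluded or no target points are found.
    """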

    set_seed(100)
    if device is None:
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # init transform and process mesh
    Rotation = quaternion_to_matrix(Quaternion.squeeze(1))
    center = Translation[0].clone()
    tfm_ori = compose_transform(scale=Scale, rotation=Rotation, translation=Translation)
    mesh, faces_idx, textures = get_mesh(Mesh, tfm_ori, device)

    # get mask and renderer
    mask, renderer = get_mask_renderer(Mask, min_size, Intrinsics, device)

    # check occlusion
    if check_occlusion(mask[0, 0].cpu().numpy(), Point_Map.cpu().numpy()):
        return (
            Quaternion,
            Translation,
            Scale,
            -1.0,
            False,
            False,
        )

    # Step 1: Manual Alignment
    source_points, target_points, center, tfm1, mesh, ori_iou, final_iou, flag_notgt = (
        run_alignment(
            Point_Map, mask, mesh, center, faces_idx, textures, renderer, device
        )
    )

    # Return the original layout if no target points were found.
    if flag_notgt:
        return (
            Quaternion,
            Translation,
            Scale,
            -1.0,
            False,
            False,
        )

    # Step 2: Shape ICP
    if Enable_shape_ICP:
        Flag_ICP = True
        points_aligned_icp, transformation = run_ICP(
            mesh, source_points, target_points, threshold=0.05
        )
        mesh_ICP = Meshes(
            verts=[points_aligned_icp], faces=[faces_idx], textures=textures
        )
        rendered = renderer(mesh_ICP)
        ori_iou_shapeICP = compute_iou(
            rendered[..., 3][0][None, None], mask, threshold=0.5
        )
        # Accept the ICP result only if it improves the rendered-mask IoU
        if ori_iou_shapeICP > ori_iou:
            mesh = mesh_ICP
            final_iou = ori_iou_shapeICP.cpu().item()
            T_o3d = torch.tensor(transformation, dtype=torch.float32, device=device)
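            # Open3D returns the transform in column-vector convention; transpose
            # into PyTorch3D's row-vector layout (translation in the last row).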
            T_o3d = T_o3d.T
            A = T_o3d[:3, :3]
            t = T_o3d[3, :3]
            scale = A.norm(dim=1)
            R = A / scale[:, None]
            center = ((center[None] * scale) @ R + t)[0]  # transform center
            tfm2 = (
                Transform3d(device=device)
                .scale(scale[None])
                .rotate(R[None])
                .translate(t[None])
            )
        else:
            Flag_ICP = False
            scale_2, translation_2 = torch.tensor(1).to(device), torch.zeros([3]).to(
                device
            )
            tfm2 = (
                Transform3d(device=device)
                .scale(scale_2.expand(3)[None])
                .translate(translation_2[None])
            )
    else:
        Flag_ICP = False
        scale_2, translation_2 = torch.tensor(1).to(device), torch.zeros([3]).to(device)
        tfm2 = (
            Transform3d(device=device)
            .scale(scale_2.expand(3)[None])
            .translate(translation_2[None])
        )

    # Step 3: Render-and-Compare
    if not Enable_rendering_optimization:
        Flag_optim = False
        tfm = tfm_ori.compose(tfm1).compose(tfm2)
    else:
        quat, translation, scale, R = run_render_compare(
            mesh, center, renderer, mask, device
        )
        with torch.no_grad():
            transformed = apply_transform(mesh, center, quat, translation, scale)
            rendered = renderer(transformed)
        optimized_iou = compute_iou(
            rendered[..., 3][0][None, None], mask, threshold=0.5
        )
        # Criterion for accepting the render-and-compare optimization
        if optimized_iou < 0.5 or optimized_iou <= ori_iou:
            Flag_optim = False
            tfm = tfm_ori  # reject manual alignment and ICP as well.
            # tfm = tfm_ori.compose(tfm1).compose(tfm2)  # only reject render-compare but keep manual alignment and ICP.
        else:
            Flag_optim = True
            final_iou = optimized_iou.detach().cpu().item()
            tfm3 = (
                Transform3d(device=device)
                .translate(-center[None])  # move to center
                .scale(scale.expand(3)[None])
                .rotate(R.T[None])
                .translate(center[None])  # move back
                .translate(translation[None])
            )
            tfm = tfm_ori.compose(tfm1).compose(tfm2).compose(tfm3)

    M = tfm.get_matrix()[0]
    T_final = M[3, :3][None]
    A = M[:3, :3]
    scale_final = A.norm(dim=1)[None]
    R_final = A / scale_final[:, None]
    quat_final = matrix_to_quaternion(R_final)

    return (
        quat_final,
        T_final,
        scale_final,
        round(float(final_iou), 4),
        Flag_ICP,
        Flag_optim,
    )
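

# A minimal call sketch for layout_post_optimization (hypothetical names and
# shapes; `mesh_tm`, `K`, etc. are illustrative, not defined in this module):
#
#     quat, t, s, iou, used_icp, used_optim = layout_post_optimization(
#         Mesh=mesh_tm,          # mesh in whatever format get_mesh accepts
#         Quaternion=quat0,      # (1, 1, 4) wxyz quaternion
#         Translation=t0,        # (1, 3)
#         Scale=s0,              # (1, 3)
#         Mask=mask0,            # instance mask
#         Point_Map=pm,          # per-pixel 3D points
#         Intrinsics=K,          # camera intrinsics
#     )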


def pose_decoder(
    pose_target_convention,
):
    def decode(model_output_dict, scene_scale=None, scene_shift=None):
        x = model_output_dict

        # BEGIN: copied from generative.py
        key_mapping = {
            "shape": "x_shape_latent",
            "quaternion": "x_instance_rotation",
            "6drotation": "x_instance_rotation_6d",
            "6drotation_normalized": "x_instance_rotation_6d_normalized",
            "translation": "x_instance_translation",
            "scale": "x_instance_scale",
            "translation_scale": "x_translation_scale",
        }

        # Decodes for metrics
        pose_target_dict = {}
        for k, v in x.items():
            pose_target_dict[key_mapping.get(k, k)] = v

        # TODO(Hao & Bowen): clean this up.
        # Convert 6D rotation to quaternion if needed
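        # The two 3D vectors below are orthonormalized via Gram-Schmidt, following
        # the continuous 6D rotation representation (Zhou et al., CVPR 2019).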
        if (
            "x_instance_rotation_6d" in pose_target_dict
            or "x_instance_rotation_6d_normalized" in pose_target_dict
        ):
            # Extract the two 3D vectors
            if "x_instance_rotation_6d_normalized" in pose_target_dict:
                rot_6d = pose_target_dict[
                    "x_instance_rotation_6d_normalized"
                ] * ROTATION_6D_STD.to(
                    pose_target_dict["x_instance_rotation_6d_normalized"].device
                ) + ROTATION_6D_MEAN.to(
                    pose_target_dict["x_instance_rotation_6d_normalized"].device
                )
            else:
                rot_6d = pose_target_dict["x_instance_rotation_6d"]
            a1 = rot_6d[..., 0:3]
            a2 = rot_6d[..., 3:6]

            # Normalize first vector
            b1 = torch.nn.functional.normalize(a1, dim=-1)

            # Make second vector orthogonal to first
            b2 = a2 - torch.sum(b1 * a2, dim=-1, keepdim=True) * b1
            b2 = torch.nn.functional.normalize(b2, dim=-1)

            # Compute third vector as cross product
            b3 = torch.cross(b1, b2, dim=-1)

            # Stack to create rotation matrix
            rotation_matrix = torch.stack([b1, b2, b3], dim=-1)

            # Convert to quaternion
            quaternion = matrix_to_quaternion(rotation_matrix)
            pose_target_dict["x_instance_rotation"] = quaternion

        if "x_instance_scale" in pose_target_dict:
            pose_target_dict["x_instance_scale"] = torch.exp(
                pose_target_dict["x_instance_scale"]
            )

        if "x_translation_scale" in pose_target_dict:
            pose_target_dict["x_translation_scale"] = torch.exp(
                pose_target_dict["x_translation_scale"]
            )

        pose_target_dict["pose_target_convention"] = [pose_target_convention] * x[
            "shape"
        ].shape[0]
        # END: copied from generative.py

        # Fake pointmap moments
        device = x["shape"].device
        _scene_scale = (
            scene_scale if scene_scale is not None else torch.tensor(1.0, device=device)
        )
        _scene_shift = (
            scene_shift
            if scene_shift is not None
            else torch.tensor([[0, 0, 0]], device=device)
        )
        pose_target_dict["x_scene_scale"] = _scene_scale
        pose_target_dict["x_scene_center"] = _scene_shift

        # Convert to instance pose
        pose_instance_dict = PoseTargetConverter.dicts_pose_target_to_instance_pose(
            pose_target_convention=pose_target_convention,
            x_instance_scale=pose_target_dict["x_instance_scale"],
            x_instance_translation=pose_target_dict["x_instance_translation"],
            x_instance_rotation=pose_target_dict["x_instance_rotation"],
            x_translation_scale=pose_target_dict["x_translation_scale"],
            x_scene_scale=pose_target_dict["x_scene_scale"],
            x_scene_center=pose_target_dict["x_scene_center"],
        )
        return {
            "translation": pose_instance_dict["instance_position_l2c"].squeeze(0),
            "rotation": pose_instance_dict["instance_quaternion_l2c"].squeeze(0),
            "scale": pose_instance_dict["instance_scale_l2c"].squeeze(0).mean(-1, keepdim=True).expand(1,3),
        }

    return decode


def zero_prediction_decoder():
    def decode(model_output_dict, scene_scale=None, scene_shift=None):
        import copy
        _pose_decoder = pose_decoder("ScaleShiftInvariant")
        model_output_dict = copy.deepcopy(model_output_dict)
        logger.warning("Overwriting predictions to zero prediction")
        model_output_dict["translation"] = torch.zeros_like(model_output_dict["translation"])
        model_output_dict["translation_scale"] = torch.zeros_like(model_output_dict["translation_scale"])
        model_output_dict["scale"] = torch.zeros_like(model_output_dict["scale"]) + 1.337 # Empirical average on R3
        return _pose_decoder(model_output_dict, scene_scale, scene_shift)

    return decode


def get_default_pose_decoder():
    def decode(model_output_dict, **kwargs):
        return {}

    return decode


POSE_DECODERS = {
    "default": get_default_pose_decoder(),
    "ApparentSize": pose_decoder("ApparentSize"),
    "DisparitySpace": pose_decoder("DisparitySpace"),
    "ScaleShiftInvariant": pose_decoder("ScaleShiftInvariant"),
    "ZeroPredictionScaleShiftInvariant": zero_prediction_decoder(),
}


def get_pose_decoder(name):
    if name not in POSE_DECODERS:
        raise NotImplementedError

    return POSE_DECODERS[name]
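

# Usage sketch (hypothetical `model_output_dict` with the keys listed in
# pose_decoder's key_mapping, e.g. "shape", "quaternion", "translation", ...):
#
#     decode = get_pose_decoder("ScaleShiftInvariant")
#     pose = decode(model_output_dict)  # {"translation", "rotation", "scale"}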


def prune_sparse_structure(
    coord_batch,
    max_neighbor_axes_dist=1,
):
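    """
    Drop interior voxels from a sparse (N, 4) [batch, x, y, z] coordinate list,
    keeping only "surface" voxels whose (2k+1)^3 neighborhood is not fully
    occupied (k = max_neighbor_axes_dist).
    """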
    coords, batch = coord_batch[:, 1:], coord_batch[:, 0].unsqueeze(-1)
    device = coords.device
    # 1) shift coords so minimum is zero
    min_xyz = coords.min(0)[0]
    coords0 = coords - min_xyz
    # 2) build occupancy grid
    max_xyz = coords0.max(0)[0] + 1  # size in each dim
    D, H, W = max_xyz.tolist()
    # shape (1,1,D,H,W)
    occ = torch.zeros((1, 1, D, H, W), dtype=torch.uint8, device=device)
    x, y, z = coords0.unbind(1)
    occ[0, 0, x, y, z] = 1
    # 3) (2k+1)^3 convolution (3x3x3 for the default k=1) to count each voxel plus its neighbors
    kernel = torch.ones(
        (
            1,
            1,
            2 * max_neighbor_axes_dist + 1,
            2 * max_neighbor_axes_dist + 1,
            2 * max_neighbor_axes_dist + 1,
        ),
        dtype=torch.uint8,
        device=device,
    )
    # pad so output is same size
    pad = max_neighbor_axes_dist
    counts = torch.nn.functional.conv3d(occ.float(), kernel.float(), padding=pad)
    # interior voxels have count == (2*max_neighbor_axes_dist+1)**3
    full_count = (2 * max_neighbor_axes_dist + 1) ** 3
    # 4) lookup counts at each original coord
    counts_at_pts = counts[0, 0, x, y, z]  # (N,)
    is_surface = counts_at_pts < full_count
    # 5) keep only surface voxels (coords are in the original, unshifted frame)
    kept = is_surface.nonzero(as_tuple=False).squeeze(1)
    out_batch = batch[kept]
    out_coords = coords[kept]
    return torch.cat([out_batch, out_coords], dim=1)


def downsample_sparse_structure(
    coord_batch,
    max_coords=42000,
    downsample_factor=2,
):
    """
    Downsample sparse structure coordinates when there are more than max_coords.

    Downsamples by rescaling coordinates, effectively shrinking the grid while preserving
    the structure. The downsampled grid is centered in the original space.

    Args:
        coord_batch: tensor of shape (N, 4) where [:, 0] is batch index and [:, 1:] are coords
        max_coords: maximum number of coordinates to keep
            42000 should be a safe number: max(int32) / (64*768) ~= 43691.
            Only needed for mesh decoding.
        downsample_factor: factor by which to downsample (e.g., 2 means half resolution)

    Returns:
        Tuple (coord_batch, factor): the possibly rescaled and deduplicated
        coordinates, and the downsample factor applied (1 if none was needed).
    """
    if coord_batch.shape[0] <= max_coords:
        return coord_batch, 1

    # Extract coordinates and batch indices
    coords = coord_batch[:, 1:].float()  # Shape: (N, 3), convert to float for scaling
    batch_indices = coord_batch[:, 0:1]  # Shape: (N, 1)

    # Find the actual coordinate bounds
    coords_min = coords.min(dim=0)[0]  # Shape: (3,)
    coords_max = coords.max(dim=0)[0]  # Shape: (3,)
    original_size = coords_max - coords_min + 1  # Add 1 since coordinates are discrete

    # Calculate target size after downsampling
    target_size = original_size / downsample_factor

    # Calculate the offset to center the downsampled grid
    offset = (original_size - target_size) / 2
    target_min = coords_min + offset
    target_max = coords_min + offset + target_size - 1

    # Normalize coordinates to [0, 1] within their actual range
    coords_normalized = (coords - coords_min) / (coords_max - coords_min)

    # Scale to the target range
    coords_rescaled = coords_normalized * (target_size - 1) + target_min

    # Round to integers to get discrete grid coordinates
    coords_rescaled = torch.round(coords_rescaled).int()

    # Clamp to ensure we stay within bounds
    coords_rescaled = torch.clamp(coords_rescaled, target_min.int(), target_max.int())

    # Remove duplicates that may have been created by the downsampling
    # Concatenate batch and coords for duplicate removal
    combined = torch.cat([batch_indices, coords_rescaled], dim=1)
    unique_combined = torch.unique(combined, dim=0)

    # If still too many after deduplication, randomly subsample
    if unique_combined.shape[0] > max_coords:
        indices = torch.randperm(unique_combined.shape[0], device=coord_batch.device)[
            :max_coords
        ]
        unique_combined = unique_combined[indices]

    return unique_combined.int(), downsample_factor
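

# Usage sketch (coord_batch is an (N, 4) int tensor of [batch, x, y, z] rows):
#
#     coords, factor = downsample_sparse_structure(coord_batch)
#     # factor == 1 means no downsampling was necessary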


def normalize_mesh_verts(verts):
    vmin = verts.min(axis=0)
    vmax = verts.max(axis=0)
    center = (vmax + vmin) / 2.0
    extent = vmax - vmin  # side length along each axis
    max_extent = np.max(extent)
    if max_extent == 0:
        vertices = verts - center
        scale = 1
    else:
        scale = 1.0 / max_extent
        vertices = (verts - center) * scale
    return vertices, scale, center


def voxelize_mesh(mesh, resolution=64):
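    """
    Voxelize an open3d triangle mesh into a dense (1, R, R, R) occupancy grid
    (R = resolution), after rotating it from z-up to y-up and normalizing the
    vertices into the [-0.5, 0.5] cube. Returns (grid, scale, center); scale
    and center are None when the mesh was already normalized.
    """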
    verts = np.asarray(mesh.vertices)
    # rotate mesh (from z-up to y-up)
    verts = verts @ np.array([[1, 0, 0], [0, 0, -1], [0, 1, 0]]).T
    # Normalize vertices into the [-0.5, 0.5] cube; skip normalization when the
    # mesh already spans it, to avoid shifting points unnecessarily.
    if np.abs(verts.min() + 0.5) < 1e-3 and np.abs(verts.max() - 0.5) < 1e-3:
        vertices, scale, center = verts, None, None
    else:
        vertices, scale, center = normalize_mesh_verts(verts)

    vertices = np.clip(vertices, -0.5 + 1e-6, 0.5 - 1e-6)
    mesh.vertices = o3d.utility.Vector3dVector(vertices)
    voxel_grid = o3d.geometry.VoxelGrid.create_from_triangle_mesh_within_bounds(
        mesh,
        voxel_size=1 / resolution,
        min_bound=(-0.5, -0.5, -0.5),
        max_bound=(0.5, 0.5, 0.5),
    )
    vertices = np.array([voxel.grid_index for voxel in voxel_grid.get_voxels()])
    vertices = (vertices + 0.5) / resolution - 0.5
    coords = ((torch.tensor(vertices) + 0.5) * resolution).int().contiguous()
    ss = torch.zeros(1, resolution, resolution, resolution, dtype=torch.long)
    ss[:, coords[:, 0], coords[:, 1], coords[:, 2]] = 1
    return ss, scale, center


def preprocess_mesh(mesh: trimesh.Trimesh):
    verts = mesh.vertices
    if np.abs(verts.min() + 0.5) < 1e-3 and np.abs(verts.max() - 0.5) < 1e-3:
        return mesh
    vertices, _, _ = normalize_mesh_verts(verts)
    mesh.vertices = vertices
    return mesh


def trimesh2o3d_mesh(trimesh_mesh):
    verts = np.asarray(trimesh_mesh.vertices)
    faces = np.asarray(trimesh_mesh.faces)
    return o3d.geometry.TriangleMesh(
        o3d.utility.Vector3dVector(verts), o3d.utility.Vector3iVector(faces)
    )


def update_layout(pred_t, pred_s, pred_quat, center, scale, to_halo=True):
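    """
    Compose the predicted layout with the mesh normalization transform (when
    `center`/`scale` are given), convert to the Halo axis convention, and
    return the decomposed (translation, scale, quaternion).
    """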
    if center is None and not to_halo:
        return pred_t, pred_s, pred_quat
    pred_transform = compose_transform(
        pred_s, quaternion_to_matrix(pred_quat[0]), pred_t
    )
    if center is None:
        comb_transform = pred_transform
    else:
        norm_transform = compose_transform(
            scale * torch.ones_like(pred_t),
            torch.eye(3, dtype=pred_t.dtype).to(pred_t.device)[None],
            scale * -torch.tensor(center, dtype=pred_t.dtype).to(pred_t.device)[None],
        )
        comb_transform = norm_transform.compose(pred_transform)
    comb_transform = convert_to_halo(comb_transform, pred_t.device, pred_t.dtype)
    decomposed = decompose_transform(comb_transform)
    quat = matrix_to_quaternion(decomposed.rotation)
    return decomposed.translation, decomposed.scale, quat


def convert_to_halo(pred_transform, device, dtype):
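    """
    Wrap a transform with fixed axis-convention rotations: `on_mesh_transform`
    re-orients the mesh frame before the prediction and `on_pm_transform`
    re-orients the result into the Halo/pointmap frame.
    """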
    on_mesh_transform = Transform3d(dtype=dtype, device=device).rotate(
        torch.tensor(
            [
                [1, 0, 0],
                [0, 0, 1],
                [0, -1, 0],
            ],
            dtype=dtype,
        )
    )
    on_pm_transform = Transform3d(dtype=dtype, device=device).rotate(
        torch.tensor(
            [
                [-1, 0, 0],
                [0, -1, 0],
                [0, 0, 1],
            ],
            dtype=dtype,
        )
    )
    return on_mesh_transform.compose(pred_transform).compose(on_pm_transform)


def quat_wxyz_to_euler_XYZ(q: torch.Tensor) -> torch.Tensor:
    """
    Convert PyTorch3D quaternions (w,x,y,z) to SciPy-style Euler angles
    with sequence 'XYZ' (extrinsic, radians). Works with batch dims.

    Args:
        q: (..., 4) tensor in w,x,y,z order. Doesn't need to be normalized.
    Returns:
        angles: (..., 3) tensor [alpha_X, beta_Y, gamma_Z] in radians.
    """
    q = q / q.norm(dim=-1, keepdim=True)  # normalize
    R = quaternion_to_matrix(q)  # (..., 3, 3)
    R = R.transpose(-1, -2)

    r00 = R[..., 0, 0]
    r10 = R[..., 1, 0]
    r20 = R[..., 2, 0]
    r21 = R[..., 2, 1]
    r22 = R[..., 2, 2]

    # For extrinsic XYZ (R = Rz(gamma) @ Ry(beta) @ Rx(alpha)):
    # beta = atan2(-r20, sqrt(r00^2 + r10^2))
    # alpha = atan2(r21, r22)
    # gamma = atan2(r10, r00)
    eps = torch.finfo(R.dtype).eps
    beta = torch.atan2(-r20, torch.clamp((r00 * r00 + r10 * r10).sqrt(), min=eps))
    alpha = torch.atan2(r21, r22)
    gamma = torch.atan2(r10, r00)

    return -torch.stack((alpha, beta, gamma), dim=-1)


def format_to_halo(layout_output):
    json_out = {}
    quaternion = layout_output["quaternion"][0, 0]
    translation = layout_output["translation"][0]
    scale = list(layout_output["scale"][0])

    euler = quat_wxyz_to_euler_XYZ(quaternion)
    json_out["roll"] = float(euler[0])
    json_out["pitch"] = float(euler[1])
    json_out["yaw"] = float(euler[2])
    json_out["pred_scale"] = [float(s) for s in scale]
    rot_matrix = quaternion_to_matrix(quaternion)
    pred_transform = torch.eye(4, dtype=quaternion.dtype).to(quaternion.device)
    pred_transform[:3, :3] = rot_matrix
    pred_transform[:3, 3] = translation
    pred_transform_list = [
        [float(t) for t in trans_row] for trans_row in pred_transform
    ]
    json_out["pred_transform"] = pred_transform_list
    return json_out


def json_to_halo_payloads(target_data):
    pred_transform = target_data["pred_transform"]
    pred_scale = target_data["pred_scale"]
    roll = target_data.get("roll", 0)
    pitch = target_data.get("pitch", 0)
    yaw = target_data.get("yaw", 0)
    # Update positions, rotation, and scale in the payload
    item_attachments = {}
    item_attachments["positions"] = {
        "x": pred_transform[0][3],
        "y": pred_transform[1][3],
        "z": pred_transform[2][3] - 1,  # Adjust for Halo design
    }
    item_attachments["rotation"] = {"x": roll, "y": pitch, "z": yaw}
    item_attachments["scale"] = {
        "x": pred_scale[0],
        "y": pred_scale[1],
        "z": pred_scale[2],
    }
    return item_attachments
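

# Round-trip note: format_to_halo() produces exactly the JSON dict consumed
# here, e.g. json_to_halo_payloads(format_to_halo(layout_output)).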


def o3d_plane_estimation(points):
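    """
    Fit a plane to a point cloud with RANSAC (Open3D segment_plane), remove
    flying points by a depth quantile, estimate a camera-aligned in-plane
    bounding box with asymmetric margins, and return it as a single-quad
    trimesh (rotated from z-up to y-up).
    """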
    pcd = o3d.geometry.PointCloud()
    pcd.points = o3d.utility.Vector3dVector(points)
    plane_model, inliers = pcd.segment_plane(
        distance_threshold=0.02, ransac_n=3, num_iterations=1000
    )

    [a, b, c, d] = plane_model
    logger.info(f"Plane equation: {a:.2f}x + {b:.2f}y + {c:.2f}z + {d:.2f} = 0")

    # Get the inlier points from RANSAC
    inlier_points = np.asarray(pcd.points)[inliers]

    # Adaptive flying point removal based on Z-range
    z_range = np.max(inlier_points[:, 2]) - np.min(inlier_points[:, 2])
    if z_range > 6.0:       # Large range - likely flying points
        thresh = 0.90       # Remove 10%
    elif z_range > 2.0:     # Moderate range
        thresh = 0.93       # Remove 7%
    else:                   # Small range - clean
        thresh = 0.95       # Remove 5%

    depth_quantile = np.quantile(inlier_points[:, 2], thresh)
    clean_points = inlier_points[inlier_points[:, 2] <= depth_quantile]

    logger.info(f"Flying point removal: {len(inlier_points)} -> {len(clean_points)} points (z_range: {z_range:.2f}m, thresh: {thresh})")
    logger.info(f"Clean points Z range: [{clean_points[:, 2].min():.3f}, {clean_points[:, 2].max():.3f}]")

    # Get the normal vector of the plane
    normal = np.array([a, b, c])
    normal = normal / np.linalg.norm(normal)

    # Create two orthogonal vectors in the plane using camera-aware approach
    # Use Z-axis as primary tangent (depth direction in camera coords)
    # This helps align one plane axis with the camera's depth direction
    if abs(normal[2]) < 0.9:  # Use Z-axis if normal isn't too close to Z
        tangent = np.array([0, 0, 1])
    else:
        tangent = np.array([1, 0, 0])  # Use X-axis otherwise

    v1 = np.cross(normal, tangent)
    v1 = v1 / np.linalg.norm(v1)
    v2 = np.cross(normal, v1)
    v2 = v2 / np.linalg.norm(v2)  # Explicit normalization for numerical stability

    # Ensure consistent right-handed coordinate system
    if np.dot(np.cross(v1, v2), normal) < 0:
        v2 = -v2

    logger.info(f"Plane basis vectors - v1: [{v1[0]:.3f}, {v1[1]:.3f}, {v1[2]:.3f}], v2: [{v2[0]:.3f}, {v2[1]:.3f}, {v2[2]:.3f}]")

    # Calculate centroid using bounding box center (more robust to density bias)
    min_vals = np.min(clean_points, axis=0)
    max_vals = np.max(clean_points, axis=0)
    centroid = (min_vals + max_vals) / 2
    logger.info(f"Bbox centroid: [{centroid[0]:.3f}, {centroid[1]:.3f}, {centroid[2]:.3f}]")

    # Project clean points onto the plane's coordinate system
    relative_points = clean_points - centroid
    u_coords = np.dot(relative_points, v1)  # coordinates along v1 direction
    v_coords = np.dot(relative_points, v2)  # coordinates along v2 direction

    # Flying points are already removed, so take the full [0, 100] percentile range
    u_min, u_max = np.percentile(u_coords, [0, 100])
    v_min, v_max = np.percentile(v_coords, [0, 100])

    # Calculate extents
    u_extent = u_max - u_min
    v_extent = v_max - v_min

    # Ensure minimum size
    u_extent = max(u_extent, 0.1)  # minimum 10cm
    v_extent = max(v_extent, 0.1)
    logger.info(f"Plane size: {u_extent:.3f}m x {v_extent:.3f}m")

    # Calculate direction away from camera center (at origin [0,0,0])
    camera_pos = np.array([0, 0, 0])  # Camera at origin
    camera_to_centroid = centroid - camera_pos  # Direction from camera to plane center
    camera_distance = np.linalg.norm(camera_to_centroid)
    away_direction = camera_to_centroid / camera_distance

    # Project away direction onto the plane (remove component normal to plane)
    away_in_plane = away_direction - np.dot(away_direction, normal) * normal
    away_in_plane_norm = np.linalg.norm(away_in_plane)

    # Create plane coordinate system based on camera direction
    if away_in_plane_norm > 1e-6:  # Only if there's a meaningful in-plane component
        # Define plane axes directly based on camera direction
        away_axis = away_in_plane / away_in_plane_norm  # Away from camera direction (in plane)
        perp_axis = np.cross(normal, away_axis)  # Perpendicular to away direction (in plane)
        perp_axis = perp_axis / np.linalg.norm(perp_axis)

        logger.info(f"Camera-based plane axes:")
        logger.info(f"  Away axis: [{away_axis[0]:.3f}, {away_axis[1]:.3f}, {away_axis[2]:.3f}]")
        logger.info(f"  Perp axis: [{perp_axis[0]:.3f}, {perp_axis[1]:.3f}, {perp_axis[2]:.3f}]")

        # Project all points onto this camera-aligned coordinate system
        relative_points = clean_points - centroid
        away_coords = np.dot(relative_points, away_axis)  # coordinates along away direction
        perp_coords = np.dot(relative_points, perp_axis)  # coordinates perpendicular to away

        # Calculate extents in camera-aligned system
        away_min, away_max = np.percentile(away_coords, [0, 100])
        perp_min, perp_max = np.percentile(perp_coords, [0, 100])

        away_extent = max(away_max - away_min, 0.1)
        perp_extent = max(perp_max - perp_min, 0.1)

        # Asymmetric extension: 10% towards camera, 50% away from camera, 20% perpendicular both sides
        away_extent_extended = away_extent * 1.6  # 60% larger in away direction (10% + 50%)
        perp_extent_extended = perp_extent * 1.4  # 40% larger in perpendicular direction (20% each side)

        logger.info(f"Original extents: away={away_extent:.3f}m, perp={perp_extent:.3f}m")
        logger.info(f"Extended extents: away={away_extent_extended:.3f}m, perp={perp_extent_extended:.3f}m")

        # Extension amounts for each direction
        away_extension_near = away_extent * 0.1   # 10% extension towards camera (near side)
        away_extension_far = away_extent * 0.5    # 50% extension away from camera (far side)
        perp_extension = perp_extent * 0.2        # 20% extension on each perpendicular side

        logger.info(f"Extensions: near={away_extension_near:.3f}m, far={away_extension_far:.3f}m, perp={perp_extension:.3f}m per side")
        logger.info(f"Extending plane asymmetrically: 10% towards camera, 50% away from camera, 20% perpendicular both sides")

        corners = []
        for da in [-1, 1]:
            for dp in [-1, 1]:
                # Asymmetric extension along the away-from-camera direction
                if da == 1:  # far side (away from camera): extend by 50%
                    away_distance = da * (away_extent / 2 + away_extension_far)
                else:  # near side (towards camera): extend by 10%
                    away_distance = da * (away_extent / 2 + away_extension_near)

                # Extend perpendicular direction by 20% on both sides
                perp_distance = dp * (perp_extent/2 + perp_extension)

                corner = (centroid +
                         away_distance * away_axis +
                         perp_distance * perp_axis)
                corners.append(corner)
    else:
        # If plane is parallel to camera direction, use original v1/v2 system
        logger.info("Plane parallel to camera direction, using original coordinate system")
        corners = []
        for dx in [-1, 1]:
            for dy in [-1, 1]:
                corner = centroid + dx * (u_extent/2) * v1 + dy * (v_extent/2) * v2
                corners.append(corner)
    corners = np.array(corners)
    # Build a single-quad mesh from the 4 corners using trimesh
    vertices = corners
    # One quad face; vertex order matters for correct orientation
    faces = np.array([[0, 1, 3, 2]])

    # rotate mesh (from z-up to y-up)
    vertices = vertices @ np.array([[1, 0, 0], [0, 0, -1], [0, 1, 0]])
    mesh = trimesh.Trimesh(
        vertices=vertices,
        faces=faces,
        process=False  # Important: prevents automatic triangulation
    )
    # Optional: set face colors
    mesh.visual.face_colors = [128, 128, 128, 255]  # gray color (RGBA)

    return mesh


def estimate_plane_area(mask):
    """
    Calculate the area covered by the mask's 2D bounding box as a fraction of total image area.
    """
    if mask.numel() == 0:
        return 0.0

    # Find coordinates where mask > 0.5 (valid mask pixels)
    valid_mask = mask > 0.5

    # If no valid pixels, return 0
    if not torch.any(valid_mask):
        return 0.0

    # Get mask dimensions
    H, W = mask.shape
    total_area = H * W

    # Find bounding box coordinates
    # Get row and column indices of valid pixels
    valid_coords = torch.nonzero(valid_mask, as_tuple=False)  # Returns [N, 2] array of [row, col]

    if valid_coords.size(0) == 0:
        return 0.0

    # Find min/max coordinates to form bounding box
    min_row = torch.min(valid_coords[:, 0]).item()
    max_row = torch.max(valid_coords[:, 0]).item()
    min_col = torch.min(valid_coords[:, 1]).item()
    max_col = torch.max(valid_coords[:, 1]).item()

    # Calculate bounding box dimensions
    bbox_height = max_row - min_row + 1
    bbox_width = max_col - min_col + 1
    bbox_area = bbox_height * bbox_width

    # Return ratio of bounding box area to total image area
    return bbox_area / total_area