File size: 6,706 Bytes

2c76547

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

from dataclasses import dataclass
from typing import Dict, Optional, Union

import torch
from pytorch3d.utils import opencv_from_cameras_projection


@dataclass(eq=True, frozen=True)
class PerceptionMetric:
    metric: str
    depth_scaling_norm: Optional[str] = None
    suffix: str = ""
    index: str = ""

    def __str__(self):
        return (
            self.metric
            + self.index
            + (
                ("_norm_" + self.depth_scaling_norm)
                if self.depth_scaling_norm is not None
                else ""
            )
            + self.suffix
        )


def eval_endpoint_error_sequence(
    x: torch.Tensor,
    y: torch.Tensor,
    mask: torch.Tensor,
    crop: int = 0,
    mask_thr: float = 0.5,
    clamp_thr: float = 1e-5,
) -> Dict[str, torch.Tensor]:

    assert len(x.shape) == len(y.shape) == len(mask.shape) == 4, (
        x.shape,
        y.shape,
        mask.shape,
    )
    assert x.shape[0] == y.shape[0] == mask.shape[0], (x.shape, y.shape, mask.shape)

    # chuck out the border
    if crop > 0:
        if crop > min(y.shape[2:]) - crop:
            raise ValueError("Incorrect crop size.")
        y = y[:, :, crop:-crop, crop:-crop]
        x = x[:, :, crop:-crop, crop:-crop]
        mask = mask[:, :, crop:-crop, crop:-crop]

    y = y * (mask > mask_thr).float()
    x = x * (mask > mask_thr).float()
    y[torch.isnan(y)] = 0

    results = {}
    for epe_name in ("epe", "temp_epe"):
        if epe_name == "epe":
            endpoint_error = (mask * (x - y) ** 2).sum(dim=1).sqrt()
        elif epe_name == "temp_epe":
            delta_mask = mask[:-1] * mask[1:]
            endpoint_error = (
                (delta_mask * ((x[:-1] - x[1:]) - (y[:-1] - y[1:])) ** 2)
                .sum(dim=1)
                .sqrt()
            )

        # epe_nonzero = endpoint_error != 0
        nonzero = torch.count_nonzero(endpoint_error)

        epe_mean = endpoint_error.sum() / torch.clamp(
            nonzero, clamp_thr
        )  # average error for all the sequence pixels
        epe_inv_accuracy_05px = (endpoint_error > 0.5).sum() / torch.clamp(
            nonzero, clamp_thr
        )
        epe_inv_accuracy_1px = (endpoint_error > 1).sum() / torch.clamp(
            nonzero, clamp_thr
        )
        epe_inv_accuracy_2px = (endpoint_error > 2).sum() / torch.clamp(
            nonzero, clamp_thr
        )
        epe_inv_accuracy_3px = (endpoint_error > 3).sum() / torch.clamp(
            nonzero, clamp_thr
        )

        results[f"{epe_name}_mean"] = epe_mean[None]
        results[f"{epe_name}_bad_0.5px"] = epe_inv_accuracy_05px[None] * 100
        results[f"{epe_name}_bad_1px"] = epe_inv_accuracy_1px[None] * 100
        results[f"{epe_name}_bad_2px"] = epe_inv_accuracy_2px[None] * 100
        results[f"{epe_name}_bad_3px"] = epe_inv_accuracy_3px[None] * 100
    return results


def depth2disparity_scale(left_camera, right_camera, image_size_tensor):
    # # opencv camera matrices
    (_, T1, K1), (_, T2, _) = [
        opencv_from_cameras_projection(
            f,
            image_size_tensor,
        )
        for f in (left_camera, right_camera)
    ]
    fix_baseline = T1[0][0] - T2[0][0]
    focal_length_px = K1[0][0][0]
    # following this https://github.com/princeton-vl/RAFT-Stereo#converting-disparity-to-depth
    return focal_length_px * fix_baseline


def depth_to_pcd(
    depth_map,
    img,
    focal_length,
    cx,
    cy,
    step: int = None,
    inv_extrinsic=None,
    mask=None,
    filter=False,
):
    __, w, __ = img.shape
    if step is None:
        step = int(w / 100)
    Z = depth_map[::step, ::step]
    colors = img[::step, ::step, :]

    Pixels_Y = torch.arange(Z.shape[0]).to(Z.device) * step
    Pixels_X = torch.arange(Z.shape[1]).to(Z.device) * step

    X = (Pixels_X[None, :] - cx) * Z / focal_length
    Y = (Pixels_Y[:, None] - cy) * Z / focal_length

    inds = Z > 0

    if mask is not None:
        inds = inds * (mask[::step, ::step] > 0)

    X = X[inds].reshape(-1)
    Y = Y[inds].reshape(-1)
    Z = Z[inds].reshape(-1)
    colors = colors[inds]
    pcd = torch.stack([X, Y, Z]).T

    if inv_extrinsic is not None:
        pcd_ext = torch.vstack([pcd.T, torch.ones((1, pcd.shape[0])).to(Z.device)])
        pcd = (inv_extrinsic @ pcd_ext)[:3, :].T

    if filter:
        pcd, filt_inds = filter_outliers(pcd)
        colors = colors[filt_inds]
    return pcd, colors


def filter_outliers(pcd, sigma=3):
    mean = pcd.mean(0)
    std = pcd.std(0)
    inds = ((pcd - mean).abs() < sigma * std)[:, 2]
    pcd = pcd[inds]
    return pcd, inds

# -- Modified by Chu King on 22nd November 2025 to fix the resolution during evaluation.
def eval_batch(batch_dict, predictions, resolution=[480, 640]) -> Dict[str, Union[float, torch.Tensor]]:
    """
    Produce performance metrics for a single batch of perception
    predictions.
    Args:
        frame_data: A PixarFrameData object containing the input to the new view
            synthesis method.
        preds: A PerceptionPrediction object with the predicted data.
    Returns:
        results: A dictionary holding evaluation metrics.
    """
    results = {}

    assert "disparity" in predictions
    mask_now = torch.ones_like(batch_dict["fg_mask"][..., :resolution[0], :resolution[1]])

    mask_now = mask_now * batch_dict["disparity_mask"][..., :resolution[0], :resolution[1]]

    eval_flow_traj_output = eval_endpoint_error_sequence(
        predictions["disparity"], batch_dict["disparity"][..., :resolution[0], :resolution[1]], mask_now
    )
    for epe_name in ("epe", "temp_epe"):
        results[PerceptionMetric(f"disp_{epe_name}_mean")] = eval_flow_traj_output[
            f"{epe_name}_mean"
        ]

        results[PerceptionMetric(f"disp_{epe_name}_bad_3px")] = eval_flow_traj_output[
            f"{epe_name}_bad_3px"
        ]

        results[PerceptionMetric(f"disp_{epe_name}_bad_2px")] = eval_flow_traj_output[
            f"{epe_name}_bad_2px"
        ]

        results[PerceptionMetric(f"disp_{epe_name}_bad_1px")] = eval_flow_traj_output[
            f"{epe_name}_bad_1px"
        ]

        results[PerceptionMetric(f"disp_{epe_name}_bad_0.5px")] = eval_flow_traj_output[
            f"{epe_name}_bad_0.5px"
        ]
    if "endpoint_error_per_pixel" in eval_flow_traj_output:
        results["disp_endpoint_error_per_pixel"] = eval_flow_traj_output[
            "endpoint_error_per_pixel"
        ]
    return (results, len(predictions["disparity"]))