# E2E_SCSI/evaluation/utils/eval_utils.py
# Copied from GitHub repository (commit 2c76547) by kungchuking.
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
from dataclasses import dataclass
from typing import Dict, Optional, Union
import torch
from pytorch3d.utils import opencv_from_cameras_projection
@dataclass(eq=True, frozen=True)
class PerceptionMetric:
    """Hashable, immutable identifier for one evaluation metric.

    ``__str__`` renders the display name as
    ``<metric><index>[_norm_<depth_scaling_norm>]<suffix>``.
    """

    metric: str
    # Name of the depth-scaling normalisation, or None when not normalised.
    depth_scaling_norm: Optional[str] = None
    suffix: str = ""
    index: str = ""

    def __str__(self):
        parts = [self.metric, self.index]
        if self.depth_scaling_norm is not None:
            parts.append("_norm_" + self.depth_scaling_norm)
        parts.append(self.suffix)
        return "".join(parts)
def eval_endpoint_error_sequence(
    x: torch.Tensor,
    y: torch.Tensor,
    mask: torch.Tensor,
    crop: int = 0,
    mask_thr: float = 0.5,
    clamp_thr: float = 1e-5,
) -> Dict[str, torch.Tensor]:
    """Compute endpoint-error statistics for a prediction sequence.

    Args:
        x: Predicted values of shape (T, C, H, W).
        y: Ground-truth values, same shape as ``x``.
        mask: Validity mask, same shape as ``x``; entries > ``mask_thr``
            count as valid.
        crop: Number of border pixels to discard on each side before scoring.
        mask_thr: Threshold above which a mask entry is considered valid.
        clamp_thr: Lower clamp on the nonzero-pixel count, guarding against
            division by zero when the error map is all-zero or empty.

    Returns:
        Dict with, for each of "epe" (per-frame endpoint error) and
        "temp_epe" (endpoint error of frame-to-frame differences):
        "<name>_mean" and "<name>_bad_{0.5,1,2,3}px" (percentages),
        each a 1-element tensor.

    Raises:
        ValueError: If ``crop`` is too large for the spatial dimensions.
    """
    assert len(x.shape) == len(y.shape) == len(mask.shape) == 4, (
        x.shape,
        y.shape,
        mask.shape,
    )
    assert x.shape[0] == y.shape[0] == mask.shape[0], (x.shape, y.shape, mask.shape)
    # Chuck out the border.
    if crop > 0:
        if crop > min(y.shape[2:]) - crop:
            raise ValueError("Incorrect crop size.")
        y = y[:, :, crop:-crop, crop:-crop]
        x = x[:, :, crop:-crop, crop:-crop]
        mask = mask[:, :, crop:-crop, crop:-crop]
    # Zero out invalid pixels. These multiplications also allocate fresh
    # tensors, so the in-place NaN fix below cannot mutate the caller's data.
    y = y * (mask > mask_thr).float()
    x = x * (mask > mask_thr).float()
    y[torch.isnan(y)] = 0
    results = {}
    for epe_name in ("epe", "temp_epe"):
        if epe_name == "epe":
            endpoint_error = (mask * (x - y) ** 2).sum(dim=1).sqrt()
        else:
            # Temporal EPE: error of frame-to-frame differences, valid only
            # where both consecutive frames are valid.
            delta_mask = mask[:-1] * mask[1:]
            endpoint_error = (
                (delta_mask * ((x[:-1] - x[1:]) - (y[:-1] - y[1:])) ** 2)
                .sum(dim=1)
                .sqrt()
            )
        # Denominator: count of pixels with non-zero error, clamped away
        # from zero. Hoisted here instead of being recomputed per statistic.
        denom = torch.clamp(torch.count_nonzero(endpoint_error), clamp_thr)
        # Average error over all the sequence pixels with non-zero error.
        results[f"{epe_name}_mean"] = (endpoint_error.sum() / denom)[None]
        # Fraction of pixels whose error exceeds each threshold, as a percent.
        for thr_name, thr in (("0.5px", 0.5), ("1px", 1), ("2px", 2), ("3px", 3)):
            bad_frac = (endpoint_error > thr).sum() / denom
            results[f"{epe_name}_bad_{thr_name}"] = bad_frac[None] * 100
    return results
def depth2disparity_scale(left_camera, right_camera, image_size_tensor):
    """Return the depth-to-disparity scale (focal length * baseline) of a stereo pair.

    Follows https://github.com/princeton-vl/RAFT-Stereo#converting-disparity-to-depth
    """
    # Convert both PyTorch3D cameras into OpenCV (R, T, K) convention.
    _, t_left, k_left = opencv_from_cameras_projection(left_camera, image_size_tensor)
    _, t_right, _ = opencv_from_cameras_projection(right_camera, image_size_tensor)
    fix_baseline = t_left[0][0] - t_right[0][0]
    focal_length_px = k_left[0][0][0]
    return focal_length_px * fix_baseline
def depth_to_pcd(
    depth_map,
    img,
    focal_length,
    cx,
    cy,
    step: Optional[int] = None,
    inv_extrinsic=None,
    mask=None,
    filter=False,
):
    """Back-project a depth map into a colored point cloud.

    Args:
        depth_map: (H, W) depth tensor; pixels with depth <= 0 are dropped.
        img: (H, W, C) color tensor, sampled at the same pixels.
        focal_length: Focal length in pixels (assumed equal for x and y).
        cx: Principal-point x coordinate in pixels.
        cy: Principal-point y coordinate in pixels.
        step: Subsampling stride; defaults to ~1/100th of the image width,
            clamped to at least 1.
        inv_extrinsic: Optional (4, 4) inverse extrinsic matrix; when given,
            points are transformed from camera to world coordinates.
        mask: Optional (H, W) mask; pixels with mask <= 0 are dropped.
        filter: When True, remove statistical outliers via ``filter_outliers``.

    Returns:
        Tuple of an (N, 3) point tensor and an (N, C) color tensor.
    """
    __, w, __ = img.shape
    if step is None:
        # max(..., 1) guards against a zero stride for images narrower than
        # 100 px, which used to raise "slice step cannot be zero".
        step = max(1, w // 100)
    Z = depth_map[::step, ::step]
    colors = img[::step, ::step, :]
    # Pixel-grid coordinates of the subsampled depth samples.
    Pixels_Y = torch.arange(Z.shape[0]).to(Z.device) * step
    Pixels_X = torch.arange(Z.shape[1]).to(Z.device) * step
    # Pinhole back-projection: X = (u - cx) * Z / f, Y = (v - cy) * Z / f.
    X = (Pixels_X[None, :] - cx) * Z / focal_length
    Y = (Pixels_Y[:, None] - cy) * Z / focal_length
    inds = Z > 0
    if mask is not None:
        inds = inds * (mask[::step, ::step] > 0)
    X = X[inds].reshape(-1)
    Y = Y[inds].reshape(-1)
    Z = Z[inds].reshape(-1)
    colors = colors[inds]
    pcd = torch.stack([X, Y, Z]).T
    if inv_extrinsic is not None:
        # Homogeneous coordinates -> world frame -> back to (N, 3).
        pcd_ext = torch.vstack([pcd.T, torch.ones((1, pcd.shape[0])).to(Z.device)])
        pcd = (inv_extrinsic @ pcd_ext)[:3, :].T
    if filter:
        pcd, filt_inds = filter_outliers(pcd)
        colors = colors[filt_inds]
    return pcd, colors
def filter_outliers(pcd, sigma=3):
    """Drop points whose z-coordinate is more than ``sigma`` stds from the mean.

    Returns the filtered (M, 3) point cloud and the boolean keep-mask of
    length N so callers can subset parallel arrays (e.g. colors).
    """
    deviation = (pcd - pcd.mean(dim=0)).abs()
    within = deviation < sigma * pcd.std(dim=0)
    # Only the z-axis (column 2) decides which points survive.
    keep = within[:, 2]
    return pcd[keep], keep
# -- Modified by Chu King on 22nd November 2025 to fix the resolution during evaluation.
def eval_batch(batch_dict, predictions, resolution=(480, 640)) -> Dict[str, Union[float, torch.Tensor]]:
    """Produce performance metrics for a single batch of disparity predictions.

    Args:
        batch_dict: Mapping of ground-truth tensors; must contain "fg_mask",
            "disparity_mask" and "disparity". Spatial dims are cropped to
            ``resolution`` before scoring.
        predictions: Mapping with at least a "disparity" prediction whose
            spatial dims already match ``resolution``.
        resolution: (height, width) crop applied to ground truth and masks.
            A tuple default avoids the shared-mutable-default pitfall of the
            previous ``[480, 640]`` list.

    Returns:
        A tuple of (results, batch size), where results maps PerceptionMetric
        keys to 1-element metric tensors.
    """
    results = {}
    assert "disparity" in predictions
    h, w = resolution
    # Valid pixels are wherever the disparity mask allows; fg_mask only
    # contributes its (cropped) shape here, matching the original code.
    mask_now = torch.ones_like(batch_dict["fg_mask"][..., :h, :w])
    mask_now = mask_now * batch_dict["disparity_mask"][..., :h, :w]
    eval_flow_traj_output = eval_endpoint_error_sequence(
        predictions["disparity"], batch_dict["disparity"][..., :h, :w], mask_now
    )
    # Copy every statistic for both plain and temporal endpoint error,
    # in the same insertion order as the original hand-unrolled version.
    for epe_name in ("epe", "temp_epe"):
        for stat in ("mean", "bad_3px", "bad_2px", "bad_1px", "bad_0.5px"):
            results[PerceptionMetric(f"disp_{epe_name}_{stat}")] = (
                eval_flow_traj_output[f"{epe_name}_{stat}"]
            )
    if "endpoint_error_per_pixel" in eval_flow_traj_output:
        results["disp_endpoint_error_per_pixel"] = eval_flow_traj_output[
            "endpoint_error_per_pixel"
        ]
    return (results, len(predictions["disparity"]))