# E2E_SCSI/evaluation/utils/eval_utils.py
# Copied from GitHub repository (commit 2c76547) by kungchuking.
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
from dataclasses import dataclass
from typing import Dict, Optional, Union
import torch
from pytorch3d.utils import opencv_from_cameras_projection
@dataclass(eq=True, frozen=True)
class PerceptionMetric:
    """Hashable, immutable identifier for one evaluation metric.

    ``__str__`` renders the display name as
    ``<metric><index>[_norm_<depth_scaling_norm>]<suffix>``.
    """

    metric: str
    # Name of the depth-scaling normalisation, or None when not normalised.
    depth_scaling_norm: Optional[str] = None
    suffix: str = ""
    index: str = ""

    def __str__(self):
        parts = [self.metric, self.index]
        if self.depth_scaling_norm is not None:
            parts.append("_norm_" + self.depth_scaling_norm)
        parts.append(self.suffix)
        return "".join(parts)
def eval_endpoint_error_sequence(
    x: torch.Tensor,
    y: torch.Tensor,
    mask: torch.Tensor,
    crop: int = 0,
    mask_thr: float = 0.5,
    clamp_thr: float = 1e-5,
) -> Dict[str, torch.Tensor]:
    """Compute endpoint-error statistics for a prediction sequence.

    Args:
        x: Predicted values of shape (T, C, H, W).
        y: Ground-truth values, same shape as ``x``.
        mask: Validity mask, same shape as ``x``; entries > ``mask_thr``
            count as valid.
        crop: Number of border pixels to discard on each side before scoring.
        mask_thr: Threshold above which a mask entry is considered valid.
        clamp_thr: Lower clamp on the nonzero-pixel count, guarding against
            division by zero when the error map is all-zero or empty.

    Returns:
        Dict with, for each of "epe" (per-frame endpoint error) and
        "temp_epe" (endpoint error of frame-to-frame differences):
        "<name>_mean" and "<name>_bad_{0.5,1,2,3}px" (percentages),
        each a 1-element tensor.

    Raises:
        ValueError: If ``crop`` is too large for the spatial dimensions.
    """
    assert len(x.shape) == len(y.shape) == len(mask.shape) == 4, (
        x.shape,
        y.shape,
        mask.shape,
    )
    assert x.shape[0] == y.shape[0] == mask.shape[0], (x.shape, y.shape, mask.shape)
    # Chuck out the border.
    if crop > 0:
        if crop > min(y.shape[2:]) - crop:
            raise ValueError("Incorrect crop size.")
        y = y[:, :, crop:-crop, crop:-crop]
        x = x[:, :, crop:-crop, crop:-crop]
        mask = mask[:, :, crop:-crop, crop:-crop]
    # Zero out invalid pixels. These multiplications also allocate fresh
    # tensors, so the in-place NaN fix below cannot mutate the caller's data.
    y = y * (mask > mask_thr).float()
    x = x * (mask > mask_thr).float()
    y[torch.isnan(y)] = 0
    results = {}
    for epe_name in ("epe", "temp_epe"):
        if epe_name == "epe":
            endpoint_error = (mask * (x - y) ** 2).sum(dim=1).sqrt()
        else:
            # Temporal EPE: error of frame-to-frame differences, valid only
            # where both consecutive frames are valid.
            delta_mask = mask[:-1] * mask[1:]
            endpoint_error = (
                (delta_mask * ((x[:-1] - x[1:]) - (y[:-1] - y[1:])) ** 2)
                .sum(dim=1)
                .sqrt()
            )
        # Denominator: count of pixels with non-zero error, clamped away
        # from zero. Hoisted here instead of being recomputed per statistic.
        denom = torch.clamp(torch.count_nonzero(endpoint_error), clamp_thr)
        # Average error over all the sequence pixels with non-zero error.
        results[f"{epe_name}_mean"] = (endpoint_error.sum() / denom)[None]
        # Fraction of pixels whose error exceeds each threshold, as a percent.
        for thr_name, thr in (("0.5px", 0.5), ("1px", 1), ("2px", 2), ("3px", 3)):
            bad_frac = (endpoint_error > thr).sum() / denom
            results[f"{epe_name}_bad_{thr_name}"] = bad_frac[None] * 100
    return results
def depth2disparity_scale(left_camera, right_camera, image_size_tensor):
    """Return the depth-to-disparity scale (focal length * baseline) of a stereo pair.

    Follows https://github.com/princeton-vl/RAFT-Stereo#converting-disparity-to-depth
    """
    # Convert both PyTorch3D cameras into OpenCV (R, T, K) convention.
    _, t_left, k_left = opencv_from_cameras_projection(left_camera, image_size_tensor)
    _, t_right, _ = opencv_from_cameras_projection(right_camera, image_size_tensor)
    fix_baseline = t_left[0][0] - t_right[0][0]
    focal_length_px = k_left[0][0][0]
    return focal_length_px * fix_baseline
def depth_to_pcd(
    depth_map,
    img,
    focal_length,
    cx,
    cy,
    step: Optional[int] = None,
    inv_extrinsic=None,
    mask=None,
    filter=False,
):
    """Back-project a depth map into a colored point cloud.

    Args:
        depth_map: (H, W) depth tensor; pixels with depth <= 0 are dropped.
        img: (H, W, C) color tensor, sampled at the same pixels.
        focal_length: Focal length in pixels (assumed equal for x and y).
        cx: Principal-point x coordinate in pixels.
        cy: Principal-point y coordinate in pixels.
        step: Subsampling stride; defaults to ~1/100th of the image width,
            clamped to at least 1.
        inv_extrinsic: Optional (4, 4) inverse extrinsic matrix; when given,
            points are transformed from camera to world coordinates.
        mask: Optional (H, W) mask; pixels with mask <= 0 are dropped.
        filter: When True, remove statistical outliers via ``filter_outliers``.

    Returns:
        Tuple of an (N, 3) point tensor and an (N, C) color tensor.
    """
    __, w, __ = img.shape
    if step is None:
        # max(..., 1) guards against a zero stride for images narrower than
        # 100 px, which used to raise "slice step cannot be zero".
        step = max(1, w // 100)
    Z = depth_map[::step, ::step]
    colors = img[::step, ::step, :]
    # Pixel-grid coordinates of the subsampled depth samples.
    Pixels_Y = torch.arange(Z.shape[0]).to(Z.device) * step
    Pixels_X = torch.arange(Z.shape[1]).to(Z.device) * step
    # Pinhole back-projection: X = (u - cx) * Z / f, Y = (v - cy) * Z / f.
    X = (Pixels_X[None, :] - cx) * Z / focal_length
    Y = (Pixels_Y[:, None] - cy) * Z / focal_length
    inds = Z > 0
    if mask is not None:
        inds = inds * (mask[::step, ::step] > 0)
    X = X[inds].reshape(-1)
    Y = Y[inds].reshape(-1)
    Z = Z[inds].reshape(-1)
    colors = colors[inds]
    pcd = torch.stack([X, Y, Z]).T
    if inv_extrinsic is not None:
        # Homogeneous coordinates -> world frame -> back to (N, 3).
        pcd_ext = torch.vstack([pcd.T, torch.ones((1, pcd.shape[0])).to(Z.device)])
        pcd = (inv_extrinsic @ pcd_ext)[:3, :].T
    if filter:
        pcd, filt_inds = filter_outliers(pcd)
        colors = colors[filt_inds]
    return pcd, colors
def filter_outliers(pcd, sigma=3):
    """Drop points whose z-coordinate is more than ``sigma`` stds from the mean.

    Returns the filtered (M, 3) point cloud and the boolean keep-mask of
    length N so callers can subset parallel arrays (e.g. colors).
    """
    deviation = (pcd - pcd.mean(dim=0)).abs()
    within = deviation < sigma * pcd.std(dim=0)
    # Only the z-axis (column 2) decides which points survive.
    keep = within[:, 2]
    return pcd[keep], keep
# -- Modified by Chu King on 22nd November 2025 to fix the resolution during evaluation.
def eval_batch(batch_dict, predictions, resolution=(480, 640)) -> Dict[str, Union[float, torch.Tensor]]:
    """Produce performance metrics for a single batch of disparity predictions.

    Args:
        batch_dict: Mapping of ground-truth tensors; must contain "fg_mask",
            "disparity_mask" and "disparity". Spatial dims are cropped to
            ``resolution`` before scoring.
        predictions: Mapping with at least a "disparity" prediction whose
            spatial dims already match ``resolution``.
        resolution: (height, width) crop applied to ground truth and masks.
            A tuple default avoids the shared-mutable-default pitfall of the
            previous ``[480, 640]`` list.

    Returns:
        A tuple of (results, batch size), where results maps PerceptionMetric
        keys to 1-element metric tensors.
    """
    results = {}
    assert "disparity" in predictions
    h, w = resolution
    # Valid pixels are wherever the disparity mask allows; fg_mask only
    # contributes its (cropped) shape here, matching the original code.
    mask_now = torch.ones_like(batch_dict["fg_mask"][..., :h, :w])
    mask_now = mask_now * batch_dict["disparity_mask"][..., :h, :w]
    eval_flow_traj_output = eval_endpoint_error_sequence(
        predictions["disparity"], batch_dict["disparity"][..., :h, :w], mask_now
    )
    # Copy every statistic for both plain and temporal endpoint error,
    # in the same insertion order as the original hand-unrolled version.
    for epe_name in ("epe", "temp_epe"):
        for stat in ("mean", "bad_3px", "bad_2px", "bad_1px", "bad_0.5px"):
            results[PerceptionMetric(f"disp_{epe_name}_{stat}")] = (
                eval_flow_traj_output[f"{epe_name}_{stat}"]
            )
    if "endpoint_error_per_pixel" in eval_flow_traj_output:
        results["disp_endpoint_error_per_pixel"] = eval_flow_traj_output[
            "endpoint_error_per_pixel"
        ]
    return (results, len(predictions["disparity"]))