"""Evaluate LongStream predictions against GT poses, depths, and point clouds."""

import json
import os

import cv2
import numpy as np
import matplotlib

matplotlib.use("Agg")  # headless backend: figures are only saved, never shown
import matplotlib.pyplot as plt

from longstream.data import LongStreamDataLoader
from longstream.eval.io import (
    frame_stems,
    read_depth,
    read_opencv_camera_yml,
    read_pointcloud_xyz,
    read_pred_w2c_txt,
)
from longstream.eval.metrics import ate_rmse, chamfer_and_f1, transform_points
from longstream.utils.sky_mask import sky_mask_filename


def _ensure_dir(path):
    os.makedirs(path, exist_ok=True)


def _sequence_output_dir(output_root, seq_name):
    return os.path.join(output_root, seq_name)


def _sequence_metrics_path(output_root, seq_name):
    return os.path.join(output_root, "metrics", f"{seq_name}.json")


def _sequence_plot_path(output_root, seq_name):
    return os.path.join(output_root, "plots", f"{seq_name}_traj_3d.png")


def _world_xyz_to_plot_xyz(xyz):
    # Rearrange world axes for plotting: (x, y, z) -> (x, z, -y), matching the
    # x_right / z_forward / y_up labels used in _save_traj_plot_3d.
    xyz = np.asarray(xyz, dtype=np.float64)
    return np.stack([xyz[:, 0], xyz[:, 2], -xyz[:, 1]], axis=-1)


def _set_equal_3d_axes(ax, xyz):
    # Cubic axis limits around the bounding-box center, so the trajectory is
    # not distorted by unequal per-axis scales.
    mins = xyz.min(axis=0)
    maxs = xyz.max(axis=0)
    center = 0.5 * (mins + maxs)
    radius = 0.5 * np.max(np.maximum(maxs - mins, 1e-6))
    ax.set_xlim(center[0] - radius, center[0] + radius)
    ax.set_ylim(center[1] - radius, center[1] + radius)
    ax.set_zlim(center[2] - radius, center[2] + radius)


def _load_gt_pose_data(seq_info):
    # Prefer per-camera calibration files; fall back to the scene-level ones.
    if seq_info.camera is not None:
        cam_dir = os.path.join(seq_info.scene_root, "cameras", seq_info.camera)
        extri_path = os.path.join(cam_dir, "extri.yml")
        intri_path = os.path.join(cam_dir, "intri.yml")
        if os.path.exists(extri_path):
            return read_opencv_camera_yml(extri_path, intri_path)
    extri_path = os.path.join(seq_info.scene_root, "extri.yml")
    intri_path = os.path.join(seq_info.scene_root, "intri.yml")
    if not os.path.exists(extri_path):
        return None, None, None
    return read_opencv_camera_yml(extri_path, intri_path)


def _resolve_gt_depth_root(seq_info):
    if seq_info.camera is not None:
        camera_depth_root = os.path.join(
            seq_info.scene_root, "depths", seq_info.camera
        )
        if os.path.isdir(camera_depth_root):
            return camera_depth_root
    depth_root = os.path.join(seq_info.scene_root, "depths")
    if os.path.isdir(depth_root):
        return depth_root
    return None


def _resolve_gt_depth_path(seq_info, depth_root, image_path, stem):
    # Try the known GT depth layouts: flat by stem, mirroring the image tree,
    # and one subdirectory per frame stem.
    rel_path = os.path.relpath(image_path, seq_info.image_dir)
    rel_stem = os.path.splitext(rel_path)[0]
    file_stem = os.path.splitext(os.path.basename(image_path))[0]
    candidates = [
        os.path.join(depth_root, f"{stem}.exr"),
        os.path.join(depth_root, rel_stem + ".exr"),
        os.path.join(depth_root, stem, f"{file_stem}.exr"),
    ]
    for candidate in candidates:
        if os.path.exists(candidate):
            return candidate
    return None


def _resize_long_edge(arr, long_edge_size, interpolation):
    h, w = arr.shape[:2]
    scale = float(long_edge_size) / float(max(h, w))
    new_w = int(round(w * scale))
    new_h = int(round(h * scale))
    return cv2.resize(arr, (new_w, new_h), interpolation=interpolation)


def _prepare_map_for_eval(
    arr, size, crop, patch_size, target_shape, interpolation, square_ok=False
):
    # Replay the data loader's resize / center-crop preprocessing on a GT map
    # so it aligns pixel-for-pixel with a prediction of `target_shape`.
    h0, w0 = arr.shape[:2]
    long_edge = round(size * max(w0 / h0, h0 / w0)) if size == 224 else size
    arr = _resize_long_edge(arr, long_edge, interpolation)
    h, w = arr.shape[:2]
    cx, cy = w // 2, h // 2
    if size == 224:
        # Square regime: crop (or squash) to the largest centered square.
        half = min(cx, cy)
        target_w = 2 * half
        target_h = 2 * half
        if crop:
            arr = arr[cy - half : cy + half, cx - half : cx + half]
        else:
            arr = cv2.resize(arr, (target_w, target_h), interpolation=interpolation)
    else:
        # General regime: round both sides down to multiples of `patch_size`.
        halfw = ((2 * cx) // patch_size) * (patch_size // 2)
        halfh = ((2 * cy) // patch_size) * (patch_size // 2)
        if not square_ok and w == h:
            halfh = int(3 * halfw / 4)
        target_w = 2 * halfw
        target_h = 2 * halfh
        if crop:
            arr = arr[cy - halfh : cy + halfh, cx - halfw : cx + halfw]
        else:
            arr = cv2.resize(arr, (target_w, target_h), interpolation=interpolation)
    if arr.shape[:2] != tuple(target_shape):
        # Last-resort fallback: force the exact prediction resolution.
        arr = cv2.resize(
            arr, (target_shape[1], target_shape[0]), interpolation=interpolation
        )
    return arr
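
# Illustrative sketch (not used by the pipeline): how `_prepare_map_for_eval`
# maps a raw GT depth map onto the prediction grid. The 1920x1080 input and
# the (280, 518) prediction shape are assumptions for this example.
def _example_align_gt_depth():
    gt = np.random.default_rng(0).random((1080, 1920)).astype(np.float32)
    pred_shape = (280, 518)  # long edge 518, both sides multiples of patch_size=14
    aligned = _prepare_map_for_eval(
        gt,
        size=518,
        crop=False,
        patch_size=14,
        target_shape=pred_shape,
        interpolation=cv2.INTER_NEAREST,
    )
    assert aligned.shape == pred_shape
    return aligned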

def _sky_mask_path(seq_dir, image_path):
    return os.path.join(seq_dir, "sky_masks", sky_mask_filename(image_path))


def _sample_frame_points(points, max_points, rng):
    if max_points is None or len(points) <= max_points:
        return points
    keep = rng.choice(len(points), size=max_points, replace=False)
    return points[keep]


def _depth_to_world_points(depth, intri, extri, valid_mask):
    # Pinhole back-projection of valid depth pixels into world coordinates.
    # `extri` is world-to-camera (X_cam = R @ X_world + t), so the inverse
    # rigid transform is applied after back-projection.
    ys, xs = np.nonzero(valid_mask)
    if ys.size == 0:
        return np.empty((0, 3), dtype=np.float32)
    z = depth[ys, xs].astype(np.float64)
    fx = float(intri[0, 0])
    fy = float(intri[1, 1])
    cx = float(intri[0, 2])
    cy = float(intri[1, 2])
    x = (xs.astype(np.float64) - cx) * z / max(fx, 1e-12)
    y = (ys.astype(np.float64) - cy) * z / max(fy, 1e-12)
    pts_cam = np.stack([x, y, z], axis=1)
    R = extri[:3, :3]
    t = extri[:3, 3]
    pts_world = (R.T @ (pts_cam.T - t[:, None])).T
    return pts_world.astype(np.float32, copy=False)


def _load_gt_pointcloud(seq_info, seq_dir, gt_extri, gt_intri, eval_cfg):
    # Build the GT cloud by back-projecting per-frame GT depths (sky-masked),
    # sampling each frame down to an oversampled per-frame point budget.
    if not gt_extri or not gt_intri:
        return None
    gt_dir = _resolve_gt_depth_root(seq_info)
    if gt_dir is None:
        return None
    eval_max_points = int(eval_cfg.get("point_eval_max_points", 100000))
    oversample_factor = int(eval_cfg.get("point_eval_oversample_factor", 4))
    per_frame_budget = max(
        (eval_max_points * oversample_factor) // max(len(seq_info.image_paths), 1), 1
    )
    rng = np.random.default_rng(0)
    chunks = []
    for image_path, stem in zip(
        seq_info.image_paths, frame_stems(seq_info.image_paths)
    ):
        depth_path = _resolve_gt_depth_path(seq_info, gt_dir, image_path, stem)
        if depth_path is None or stem not in gt_extri or stem not in gt_intri:
            continue
        depth = read_depth(depth_path)
        valid = np.isfinite(depth) & (depth > 0)
        if not np.any(valid):
            continue
        sky_path = _sky_mask_path(seq_dir, image_path)
        if os.path.exists(sky_path):
            sky_mask = cv2.imread(sky_path, cv2.IMREAD_GRAYSCALE)
            if sky_mask is not None:
                if sky_mask.shape[:2] != depth.shape[:2]:
                    sky_mask = cv2.resize(
                        sky_mask,
                        (depth.shape[1], depth.shape[0]),
                        interpolation=cv2.INTER_NEAREST,
                    )
                valid &= sky_mask > 0
        if not np.any(valid):
            continue
        pts_world = _depth_to_world_points(depth, gt_intri[stem], gt_extri[stem], valid)
        if len(pts_world) == 0:
            continue
        chunks.append(_sample_frame_points(pts_world, per_frame_budget, rng))
    if not chunks:
        return None
    return np.concatenate(chunks, axis=0)
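
# Illustrative round-trip check for `_depth_to_world_points` (not used by the
# pipeline). The identity extrinsics and the toy 2x2 depth map are assumptions.
def _example_backproject_roundtrip():
    intri = np.array([[100.0, 0.0, 1.0], [0.0, 100.0, 1.0], [0.0, 0.0, 1.0]])
    extri = np.eye(4)  # identity world-to-camera: camera frame == world frame
    depth = np.full((2, 2), 2.0, dtype=np.float32)
    pts = _depth_to_world_points(depth, intri, extri, np.ones((2, 2), dtype=bool))
    # With identity extrinsics every returned point keeps z == depth.
    assert np.allclose(pts[:, 2], 2.0)
    return pts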

def _evaluate_pointclouds(seq_info, seq_dir, eval_cfg, pose_align, gt_cloud):
    # Chamfer distance / F1 for both point branches, after mapping predictions
    # into the GT frame with the Sim(3) estimated from the trajectories.
    if pose_align is None or gt_cloud is None:
        return None
    scale, R, t = pose_align
    point_paths = {
        "point_head": [
            os.path.join(seq_dir, "points", "point_head_full.npy"),
            os.path.join(seq_dir, "points", "point_head_full.npz"),
            os.path.join(seq_dir, "points", "point_head_full.ply"),
        ],
        "dpt_unproj": [
            os.path.join(seq_dir, "points", "dpt_unproj_full.npy"),
            os.path.join(seq_dir, "points", "dpt_unproj_full.npz"),
            os.path.join(seq_dir, "points", "dpt_unproj_full.ply"),
        ],
    }
    threshold = float(eval_cfg.get("point_f1_threshold", 0.25))
    max_points = int(eval_cfg.get("point_eval_max_points", 100000))
    voxel_size = eval_cfg.get("point_eval_voxel_size", None)
    voxel_size = None if voxel_size in (None, "", "null") else float(voxel_size)
    metrics_by_branch = {}
    for branch, candidates in point_paths.items():
        path = next(
            (candidate for candidate in candidates if os.path.exists(candidate)), None
        )
        if path is None:
            continue
        pred_cloud = read_pointcloud_xyz(path)
        pred_cloud = transform_points(pred_cloud, scale, R, t)
        metrics = chamfer_and_f1(
            pred_cloud,
            gt_cloud,
            threshold=threshold,
            max_points=max_points,
            voxel_size=voxel_size,
            seed=0 if branch == "point_head" else 1,
        )
        if metrics is not None:
            metrics_by_branch[branch] = metrics
    return metrics_by_branch or None


def _evaluate_video_dpt(seq_info, seq_dir, eval_cfg, data_cfg):
    # Standard monocular depth metrics accumulated over valid (non-sky) pixels
    # of all frames: AbsRel and the delta < 1.25 inlier ratio.
    pred_dir = os.path.join(seq_dir, "depth", "dpt")
    gt_dir = _resolve_gt_depth_root(seq_info)
    if not os.path.isdir(pred_dir) or gt_dir is None:
        return None
    size = int(data_cfg.get("size", 518))
    crop = bool(data_cfg.get("crop", False))
    patch_size = int(data_cfg.get("patch_size", 14))
    rel_delta_threshold = float(eval_cfg.get("depth_rel_delta_threshold", 1.25))
    abs_rel_sum = 0.0
    rel_delta_hits = 0
    valid_pixels = 0
    evaluated_frames = 0
    stems = frame_stems(seq_info.image_paths)
    for frame_id, stem in enumerate(stems):
        pred_path = os.path.join(pred_dir, f"frame_{frame_id:06d}.npy")
        gt_path = _resolve_gt_depth_path(
            seq_info, gt_dir, seq_info.image_paths[frame_id], stem
        )
        if not os.path.exists(pred_path) or gt_path is None:
            continue
        pred = np.load(pred_path).astype(np.float32)
        gt = read_depth(gt_path)
        gt = _prepare_map_for_eval(
            gt,
            size=size,
            crop=crop,
            patch_size=patch_size,
            target_shape=pred.shape,
            interpolation=cv2.INTER_NEAREST,
        )
        valid = np.isfinite(gt) & (gt > 0)
        if not np.any(valid):
            continue
        sky_mask_path = _sky_mask_path(seq_dir, seq_info.image_paths[frame_id])
        if os.path.exists(sky_mask_path):
            sky_mask = cv2.imread(sky_mask_path, cv2.IMREAD_GRAYSCALE)
            if sky_mask is not None:
                sky_mask = _prepare_map_for_eval(
                    sky_mask,
                    size=size,
                    crop=crop,
                    patch_size=patch_size,
                    target_shape=pred.shape,
                    interpolation=cv2.INTER_NEAREST,
                )
                valid &= sky_mask > 0
        valid &= np.isfinite(pred)
        if not np.any(valid):
            continue
        pred_valid = pred[valid].astype(np.float64)
        gt_valid = gt[valid].astype(np.float64)
        pred_safe = np.clip(pred_valid, 1e-6, None)
        gt_safe = np.clip(gt_valid, 1e-6, None)
        abs_rel_sum += np.sum(np.abs(pred_valid - gt_valid) / gt_safe)
        rel_ratio = np.maximum(gt_safe / pred_safe, pred_safe / gt_safe)
        rel_delta_hits += int(np.sum(rel_ratio < rel_delta_threshold))
        valid_pixels += int(gt_valid.size)
        evaluated_frames += 1
    if valid_pixels == 0:
        return None
    return {
        "abs_rel": float(abs_rel_sum / valid_pixels),
        "rel_delta": float(rel_delta_hits / valid_pixels),
        "rel_delta_threshold": rel_delta_threshold,
        "num_valid_pixels": int(valid_pixels),
        "num_frames": int(evaluated_frames),
    }


def _extract_pose_pairs(seq_info, pred_pose_path, gt_extri):
    # Pair predicted and GT camera centers by frame stem. Both pose sets are
    # stored world-to-camera, so invert to get camera-to-world translations.
    frame_ids, pred_w2c = read_pred_w2c_txt(pred_pose_path)
    if not pred_w2c:
        return None
    stems = frame_stems(seq_info.image_paths)
    pred_xyz = []
    gt_xyz = []
    for frame_id, pred_mat in zip(frame_ids, pred_w2c):
        if frame_id < 0 or frame_id >= len(stems):
            continue
        stem = stems[frame_id]
        if stem not in gt_extri:
            continue
        pred_c2w = np.linalg.inv(pred_mat)
        gt_c2w = np.linalg.inv(gt_extri[stem])
        pred_xyz.append(pred_c2w[:3, 3])
        gt_xyz.append(gt_c2w[:3, 3])
    if len(pred_xyz) < 3:
        return None
    return np.asarray(pred_xyz, dtype=np.float64), np.asarray(gt_xyz, dtype=np.float64)
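
# Illustrative sketch of the Sim(3) alignment used in `evaluate_sequence`
# below (not called by the pipeline). The synthetic trajectories are
# assumptions; the `ate_rmse` keys follow the usage in this module.
def _example_sim3_align():
    rng = np.random.default_rng(0)
    gt_xyz = rng.normal(size=(10, 3))
    pred_xyz = 2.0 * gt_xyz + np.array([1.0, 0.0, 0.0])  # scaled + shifted GT
    m = ate_rmse(pred_xyz, gt_xyz, align_scale=True)
    aligned = transform_points(
        pred_xyz,
        float(m.get("sim3_scale", 1.0)),
        np.asarray(m["sim3_rotation"], dtype=np.float64),
        np.asarray(m["sim3_translation"], dtype=np.float64),
    )
    # After alignment the residual to gt_xyz should be ~0 for this toy case.
    return aligned, m["ate_rmse"]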

def _save_traj_plot_3d(path, pred_xyz, gt_xyz):
    # Plot both trajectories re-centered at the first GT position, using the
    # right/forward/up plotting axes from _world_xyz_to_plot_xyz.
    _ensure_dir(os.path.dirname(path))
    pred_plot = _world_xyz_to_plot_xyz(pred_xyz)
    gt_plot = _world_xyz_to_plot_xyz(gt_xyz)
    origin = gt_plot[:1]
    pred_plot = pred_plot - origin
    gt_plot = gt_plot - origin
    all_plot = np.concatenate([pred_plot, gt_plot], axis=0)
    fig = plt.figure(figsize=(7, 6))
    ax = fig.add_subplot(111, projection="3d")
    ax.plot(
        gt_plot[:, 0],
        gt_plot[:, 1],
        gt_plot[:, 2],
        label="gt",
        linewidth=2.0,
        color="#1f77b4",
    )
    ax.plot(
        pred_plot[:, 0],
        pred_plot[:, 1],
        pred_plot[:, 2],
        label="pred",
        linewidth=2.0,
        color="#d62728",
    )
    _set_equal_3d_axes(ax, all_plot)
    ax.view_init(elev=24, azim=-118)
    ax.set_xlabel("x_right")
    ax.set_ylabel("z_forward")
    ax.set_zlabel("y_up")
    ax.legend(loc="best")
    ax.set_title("Trajectory 3D (Sim3-aligned view)")
    fig.tight_layout()
    fig.savefig(path, dpi=180)
    plt.close(fig)


def evaluate_sequence(seq_info, output_root, eval_cfg, data_cfg):
    # Evaluate one sequence: ATE with Sim(3) alignment, video depth metrics,
    # and point cloud metrics; every block is skipped when its GT is missing.
    seq_dir = _sequence_output_dir(output_root, seq_info.name)
    result = {
        "sequence": seq_info.name,
        "output_dir": seq_dir,
        "has_gt": False,
        "has_gt_pose": False,
        "has_gt_depth": False,
    }
    gt_extri, gt_intri, _ = _load_gt_pose_data(seq_info)
    pose_align = None
    if gt_extri:
        result["has_gt"] = True
        result["has_gt_pose"] = True
        pred_pose_path = os.path.join(seq_dir, "poses", "abs_pose.txt")
        pairs = _extract_pose_pairs(seq_info, pred_pose_path, gt_extri)
        if pairs is not None:
            pred_xyz, gt_xyz = pairs
            pose_metrics = ate_rmse(
                pred_xyz, gt_xyz, align_scale=bool(eval_cfg.get("align_scale", True))
            )
            sim3_scale = float(pose_metrics.get("sim3_scale", 1.0))
            pred_xyz_aligned = transform_points(
                pred_xyz,
                sim3_scale,
                np.asarray(pose_metrics["sim3_rotation"], dtype=np.float64),
                np.asarray(pose_metrics["sim3_translation"], dtype=np.float64),
            )
            pose_align = (
                sim3_scale,
                np.asarray(pose_metrics["sim3_rotation"], dtype=np.float64),
                np.asarray(pose_metrics["sim3_translation"], dtype=np.float64),
            )
            plot_path = _sequence_plot_path(output_root, seq_info.name)
            _save_traj_plot_3d(plot_path, pred_xyz_aligned, gt_xyz)
            pose_metrics.pop("sim3_scale", None)
            pose_metrics["traj_3d_plot"] = plot_path
            result["pose"] = pose_metrics
    video_dpt_metrics = _evaluate_video_dpt(seq_info, seq_dir, eval_cfg, data_cfg)
    if video_dpt_metrics is not None:
        result["has_gt"] = True
        result["has_gt_depth"] = True
        result["video_dpt"] = video_dpt_metrics
    gt_cloud = _load_gt_pointcloud(seq_info, seq_dir, gt_extri, gt_intri, eval_cfg)
    pointcloud_metrics = _evaluate_pointclouds(
        seq_info, seq_dir, eval_cfg, pose_align, gt_cloud
    )
    if pointcloud_metrics is not None:
        result["has_gt"] = True
        result["has_gt_depth"] = True
        result["pointcloud"] = pointcloud_metrics
    if not result["has_gt"]:
        result["skipped"] = "missing_gt"
    return result


def _mean_metric(sequence_results, group_name, metric_name):
    # Average `metric_name` over all sequences that have it, following the
    # dot-separated `group_name` path into each per-sequence result dict.
    values = []
    for item in sequence_results:
        group = item
        for key in group_name.split("."):
            if not isinstance(group, dict):
                group = None
                break
            group = group.get(key)
        if not isinstance(group, dict):
            continue
        if metric_name in group:
            values.append(float(group[metric_name]))
    if not values:
        return None
    return float(np.mean(values))
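
# Worked example for `_mean_metric` (illustrative only; the toy results are
# assumptions): dotted paths walk nested dicts, and sequences that lack the
# group or the metric simply do not contribute to the mean.
def _example_mean_metric():
    results = [
        {"pointcloud": {"point_head": {"f1": 0.8}}},
        {"pointcloud": {"point_head": {"f1": 0.6}}},
        {"skipped": "missing_gt"},  # contributes nothing to the mean
    ]
    return _mean_metric(results, "pointcloud.point_head", "f1")  # ~0.7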

def evaluate_predictions_cfg(cfg):
    data_cfg = dict(cfg.get("data", {}))
    data_cfg["format"] = "generalizable"  # evaluation always uses this layout
    output_cfg = cfg.get("output", {})
    eval_cfg = cfg.get("evaluation", {})
    output_root = output_cfg.get("root", "outputs")
    _ensure_dir(output_root)
    loader = LongStreamDataLoader(data_cfg)
    sequence_results = []
    for seq_info in loader.iter_sequence_infos():
        print(f"[longstream] eval {seq_info.name}: start", flush=True)
        metrics = evaluate_sequence(seq_info, output_root, eval_cfg, data_cfg)
        sequence_results.append(metrics)
        metrics_path = _sequence_metrics_path(output_root, seq_info.name)
        _ensure_dir(os.path.dirname(metrics_path))
        with open(metrics_path, "w") as f:
            json.dump(metrics, f, indent=2)
        print(f"[longstream] eval {seq_info.name}: wrote {metrics_path}", flush=True)
    summary = {
        "num_sequences": len(sequence_results),
        "num_sequences_with_gt": sum(1 for x in sequence_results if x.get("has_gt")),
        "num_sequences_with_pose_gt": sum(
            1 for x in sequence_results if x.get("has_gt_pose")
        ),
        "num_sequences_with_depth_gt": sum(
            1 for x in sequence_results if x.get("has_gt_depth")
        ),
        "ate_mean": _mean_metric(sequence_results, "pose", "ate_mean"),
        "ate_rmse_mean": _mean_metric(sequence_results, "pose", "ate_rmse"),
        "video_dpt_abs_rel_mean": _mean_metric(
            sequence_results, "video_dpt", "abs_rel"
        ),
        "video_dpt_rel_delta_mean": _mean_metric(
            sequence_results, "video_dpt", "rel_delta"
        ),
        "point_head_cd_mean": _mean_metric(
            sequence_results, "pointcloud.point_head", "cd"
        ),
        "point_head_f1_mean": _mean_metric(
            sequence_results, "pointcloud.point_head", "f1"
        ),
        "dpt_unproj_cd_mean": _mean_metric(
            sequence_results, "pointcloud.dpt_unproj", "cd"
        ),
        "dpt_unproj_f1_mean": _mean_metric(
            sequence_results, "pointcloud.dpt_unproj", "f1"
        ),
        "sequences": sequence_results,
    }
    summary_path = os.path.join(output_root, "summary.json")
    with open(summary_path, "w") as f:
        json.dump(summary, f, indent=2)
    print(f"[longstream] eval: wrote {summary_path}", flush=True)
    return summary
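
if __name__ == "__main__":
    # Minimal usage sketch. Only size / crop / patch_size, output.root, and the
    # evaluation keys are read by this module; any further `data` keys required
    # by LongStreamDataLoader are dataset-specific and omitted here.
    demo_cfg = {
        "data": {"size": 518, "crop": False, "patch_size": 14},
        "output": {"root": "outputs"},
        "evaluation": {"align_scale": True, "point_f1_threshold": 0.25},
    }
    demo_summary = evaluate_predictions_cfg(demo_cfg)
    top_level = {k: v for k, v in demo_summary.items() if k != "sequences"}
    print(json.dumps(top_level, indent=2))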