#!/usr/bin/env python3
"""
Compare Unity GENMO-exported .pt inputs between two sequences.

Goal: find *input* differences (K/bbox/kp2d/cam vel) that could plausibly cause
incam instability/oscillation for a specific clip.

Run:
  /root/miniconda3/envs/gvhmr/bin/python debug_compare_unity_pt.py \\
    --a 101_biboo_birthday_speech_explosion_2 \\
    --b 107_biboo_birthday_speech_explosion_8
"""

from __future__ import annotations

import argparse
from dataclasses import dataclass
from pathlib import Path

import numpy as np
import torch

from genmo.utils.pylogger import Log
from third_party.GVHMR.hmr4d.utils.geo.hmr_cam import (
    compute_bbox_info_bedlam,
    get_bbx_xys,
    get_a_pred_cam,
    safely_render_x3d_K,
)
from third_party.GVHMR.hmr4d.utils.smplx_utils import make_smplx


@dataclass
class SeqStats:
    """Summary statistics of the inputs stored in one exported ``.pt`` sequence.

    All values are produced by :func:`compute_seq_stats`. "saved" refers to
    the ``bbx_xys`` stored in the file; "gt" refers to the bbox re-derived
    from the projected SMPL-X vertices; "delta" is ``saved - gt``.
    """

    # Sequence identity and image/intrinsics summary.
    vid: str
    L: int
    W: int
    H: int
    fx_mean: float
    fx_std: float

    # Bounding-box statistics: center is (x, y), size is the third bbx column.
    bbx_saved_c_std: tuple[float, float]
    bbx_saved_s_std: float
    bbx_gt_c_std: tuple[float, float]
    bbx_gt_s_std: float
    bbx_delta_c_mean: tuple[float, float]
    bbx_delta_c_std: tuple[float, float]
    bbx_delta_s_mean: float
    bbx_delta_s_std: float

    # Per-column std of the `compute_bbox_info_bedlam` output.
    fcliff_saved_std: tuple[float, float, float]
    fcliff_gt_std: tuple[float, float, float]
    fcliff_delta_std: tuple[float, float, float]

    # 2D keypoint sanity fractions (confidence threshold 0.5).
    kp2d_conf_gt05_frac: float
    kp2d_oof_conf_gt05_frac: float

    # Statistics of `get_a_pred_cam` output and of its frame-to-frame difference.
    gt_pred_cam_std: tuple[float, float, float]
    gt_pred_cam_d_std: tuple[float, float, float]
    gt_pred_cam_d_p95: tuple[float, float, float]
    gt_pred_cam_norm_std: float
    gt_pred_cam_d_norm_p95: float


def _as_np(x: torch.Tensor) -> np.ndarray:
    """Detach *x*, move it to the CPU and return it as a NumPy array."""
    return x.detach().cpu().numpy()


def _infer_wh_from_K(K_fullimg: np.ndarray) -> tuple[int, int]:
    """Estimate the image size as twice the median principal point.

    This assumes the principal point sits at the image center. Both
    dimensions are clamped to be at least 1.
    """
    cx = float(np.median(K_fullimg[:, 0, 2]))
    cy = float(np.median(K_fullimg[:, 1, 2]))
    W = int(round(cx * 2.0))
    H = int(round(cy * 2.0))
    return max(W, 1), max(H, 1)


def _column_stds(arr: np.ndarray, n_cols: int) -> tuple[float, ...]:
    """Return the standard deviation of each of the first *n_cols* columns."""
    return tuple(float(arr[:, i].std()) for i in range(n_cols))


def _as_floats(values: np.ndarray, n: int = 3) -> tuple[float, ...]:
    """Return the first *n* entries of *values* as plain Python floats."""
    return tuple(float(values[i]) for i in range(n))


def compute_seq_stats(pt_path: Path, smplx_model) -> SeqStats:
    """Load one exported sequence and summarise its network inputs.

    Args:
        pt_path: Path of the ``.pt`` file. It must contain ``bbx_xys``,
            ``K_fullimg`` and ``smpl_params_c``; ``kp2d`` is optional and
            defaults to zeros of shape ``(L, 17, 3)``.
        smplx_model: Callable body model taking ``global_orient``,
            ``body_pose``, ``betas`` and ``transl`` and returning an object
            with a ``vertices`` attribute.

    Returns:
        A :class:`SeqStats` whose ``vid`` is the file stem.
    """
    # NOTE(security): weights_only=False unpickles arbitrary Python objects.
    # Only point this script at trusted, locally exported files.
    data = torch.load(pt_path, map_location="cpu", weights_only=False)
    vid = pt_path.stem

    bbx_saved = _as_np(data["bbx_xys"]).astype(np.float64)  # (L,3)
    K = _as_np(data["K_fullimg"]).astype(np.float64)  # (L,3,3)
    kp2d = _as_np(data.get("kp2d", torch.zeros((bbx_saved.shape[0], 17, 3)))).astype(
        np.float64
    )
    smpl_params_c = data["smpl_params_c"]
    transl_c = _as_np(smpl_params_c["transl"]).astype(np.float64)  # (L,3)

    L = int(bbx_saved.shape[0])
    W, H = _infer_wh_from_K(K)
    fx = K[:, 0, 0]

    # Compute GT-projected bbox from verts (same logic used during training when bbx is missing).
    with torch.no_grad():
        out = smplx_model(
            global_orient=smpl_params_c["global_orient"].float(),
            body_pose=smpl_params_c["body_pose"].float(),
            betas=smpl_params_c["betas"].float(),
            transl=smpl_params_c["transl"].float(),
        )
        verts = out.vertices  # (L, V, 3)
        verts_b = verts[None]  # (1,L,V,3)
        K_b = torch.from_numpy(K).float()[None]  # (1,L,3,3)
        i_x2d = safely_render_x3d_K(verts_b, K_b, thr=0.3)  # (1,L,V,2)
        bbx_gt = (
            get_bbx_xys(i_x2d, do_augment=False)[0].detach().cpu().numpy().astype(np.float64)
        )  # (L,3)

    # bbox stats
    bbx_saved_c = bbx_saved[:, :2]
    bbx_saved_s = bbx_saved[:, 2]
    bbx_gt_c = bbx_gt[:, :2]
    bbx_gt_s = bbx_gt[:, 2]
    bbx_delta_c = bbx_saved_c - bbx_gt_c
    bbx_delta_s = bbx_saved_s - bbx_gt_s

    # f_cliffcam stats (this is what the network sees)
    bbx_saved_t = torch.from_numpy(bbx_saved).float()
    bbx_gt_t = torch.from_numpy(bbx_gt).float()
    K_t = torch.from_numpy(K).float()
    fcliff_saved = compute_bbox_info_bedlam(bbx_saved_t, K_t).numpy().astype(np.float64)
    fcliff_gt = compute_bbox_info_bedlam(bbx_gt_t, K_t).numpy().astype(np.float64)
    fcliff_delta = fcliff_saved - fcliff_gt

    # Conditioning target used by incam translation loss: gt_pred_cam (s,tx,ty)
    # (see `third_party/.../hmr_cam.py:get_a_pred_cam`).
    gt_pred_cam = get_a_pred_cam(
        torch.from_numpy(transl_c).float(),
        bbx_saved_t,
        K_t,
    ).numpy().astype(np.float64)  # (L,3)

    if gt_pred_cam.shape[0] >= 2:
        d_gt_pred_cam = np.diff(gt_pred_cam, axis=0)
    else:
        # With fewer than two frames there is no frame-to-frame change. Use a
        # single zero row so the reductions below report 0 instead of being
        # evaluated on an empty array.
        d_gt_pred_cam = np.zeros((1, gt_pred_cam.shape[1]), dtype=np.float64)

    gt_pred_cam_std = gt_pred_cam.std(axis=0)
    gt_pred_cam_norm_std = float(
        np.linalg.norm(gt_pred_cam - gt_pred_cam.mean(axis=0), axis=1).std()
    )
    gt_pred_cam_d_std = d_gt_pred_cam.std(axis=0)
    gt_pred_cam_d_p95 = np.percentile(np.abs(d_gt_pred_cam), 95, axis=0)
    gt_pred_cam_d_norm_p95 = float(np.percentile(np.linalg.norm(d_gt_pred_cam, axis=1), 95))

    # kp2d sanity: how much of provided kp2d is confidently in-frame?
    conf = kp2d[..., 2]
    x = kp2d[..., 0]
    y = kp2d[..., 1]
    conf_gt05 = conf > 0.5
    conf_gt05_frac = float(conf_gt05.mean()) if conf.size else 0.0
    oof = (x < 0.0) | (x > (W - 1.0)) | (y < 0.0) | (y > (H - 1.0))
    oof_conf = oof & conf_gt05
    # Denominator is clamped to 1 so a clip with no confident keypoints yields 0.
    oof_conf_frac = float(oof_conf.sum() / max(conf_gt05.sum(), 1.0))

    return SeqStats(
        vid=vid,
        L=L,
        W=W,
        H=H,
        fx_mean=float(fx.mean()),
        fx_std=float(fx.std()),
        bbx_saved_c_std=_column_stds(bbx_saved_c, 2),
        bbx_saved_s_std=float(bbx_saved_s.std()),
        bbx_gt_c_std=_column_stds(bbx_gt_c, 2),
        bbx_gt_s_std=float(bbx_gt_s.std()),
        bbx_delta_c_mean=(
            float(bbx_delta_c[:, 0].mean()),
            float(bbx_delta_c[:, 1].mean()),
        ),
        bbx_delta_c_std=_column_stds(bbx_delta_c, 2),
        bbx_delta_s_mean=float(bbx_delta_s.mean()),
        bbx_delta_s_std=float(bbx_delta_s.std()),
        fcliff_saved_std=_column_stds(fcliff_saved, 3),
        fcliff_gt_std=_column_stds(fcliff_gt, 3),
        fcliff_delta_std=_column_stds(fcliff_delta, 3),
        kp2d_conf_gt05_frac=conf_gt05_frac,
        kp2d_oof_conf_gt05_frac=oof_conf_frac,
        gt_pred_cam_std=_as_floats(gt_pred_cam_std),
        gt_pred_cam_d_std=_as_floats(gt_pred_cam_d_std),
        gt_pred_cam_d_p95=_as_floats(gt_pred_cam_d_p95),
        gt_pred_cam_norm_std=gt_pred_cam_norm_std,
        gt_pred_cam_d_norm_p95=gt_pred_cam_d_norm_p95,
    )


def _print_stats(s: SeqStats) -> None:
    """Log every field group of *s* through ``Log.info``."""
    Log.info(f"=== {s.vid} ===")
    Log.info(f"L={s.L} W×H={s.W}×{s.H} fx_mean/std={s.fx_mean:.3f}/{s.fx_std:.3f}")
    Log.info(
        "bbx_saved std: center(x/y)=(%.2f,%.2f) size=%.2f"
        % (*s.bbx_saved_c_std, s.bbx_saved_s_std)
    )
    Log.info(
        "bbx_gt std: center(x/y)=(%.2f,%.2f) size=%.2f"
        % (*s.bbx_gt_c_std, s.bbx_gt_s_std)
    )
    Log.info(
        "bbx(saved-gt) mean: center(x/y)=(%.2f,%.2f) size=%.2f"
        % (*s.bbx_delta_c_mean, s.bbx_delta_s_mean)
    )
    Log.info(
        "bbx(saved-gt) std : center(x/y)=(%.2f,%.2f) size=%.2f"
        % (*s.bbx_delta_c_std, s.bbx_delta_s_std)
    )
    Log.info(
        "f_cliff std saved=(%.4f,%.4f,%.4f) gt=(%.4f,%.4f,%.4f) delta_std=(%.4f,%.4f,%.4f)"
        % (
            *s.fcliff_saved_std,
            *s.fcliff_gt_std,
            *s.fcliff_delta_std,
        )
    )
    Log.info(
        "kp2d conf>0.5 frac=%.3f oof|conf>0.5 frac=%.3f"
        % (s.kp2d_conf_gt05_frac, s.kp2d_oof_conf_gt05_frac)
    )
    Log.info(
        "gt_pred_cam std(s/tx/ty)=(%.4f,%.4f,%.4f) norm_std=%.4f"
        % (*s.gt_pred_cam_std, s.gt_pred_cam_norm_std)
    )
    Log.info(
        "d(gt_pred_cam) std(s/tx/ty)=(%.4f,%.4f,%.4f) p95_abs(s/tx/ty)=(%.4f,%.4f,%.4f) p95_norm=%.4f"
        % (*s.gt_pred_cam_d_std, *s.gt_pred_cam_d_p95, s.gt_pred_cam_d_norm_p95)
    )


def main() -> None:
    """Parse ``--root/--a/--b``, compute stats for both clips and log them.

    Raises:
        FileNotFoundError: If either ``<root>/<name>.pt`` does not exist.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--root", default="processed_dataset/genmo_features")
    ap.add_argument("--a", required=True)
    ap.add_argument("--b", required=True)
    args = ap.parse_args()

    root = Path(args.root)
    pt_a = root / f"{args.a}.pt"
    pt_b = root / f"{args.b}.pt"
    # Fail before the body model is built if an input is missing.
    for pt_path in (pt_a, pt_b):
        if not pt_path.exists():
            raise FileNotFoundError(pt_path)

    smplx_model = make_smplx("supermotion").eval()

    s_a = compute_seq_stats(pt_a, smplx_model)
    s_b = compute_seq_stats(pt_b, smplx_model)
    _print_stats(s_a)
    _print_stats(s_b)

    Log.info("=== delta (A - B) ===")
    Log.info(
        "fcliff_delta_std A vs B: (%.4f,%.4f,%.4f) vs (%.4f,%.4f,%.4f)"
        % (*s_a.fcliff_delta_std, *s_b.fcliff_delta_std)
    )
    Log.info(
        "bbx(saved-gt) center std A vs B: (%.2f,%.2f) vs (%.2f,%.2f)"
        % (*s_a.bbx_delta_c_std, *s_b.bbx_delta_c_std)
    )


if __name__ == "__main__":
    main()