# vr-hmr / debug_compare_unity_pt.py
# zirobtc's picture
# Upload folder using huggingface_hub
# 7e120dd
#!/usr/bin/env python3
"""
Compare Unity GENMO-exported .pt inputs between two sequences.
Goal: find *input* differences (K/bbox/kp2d/cam vel) that could plausibly cause
incam instability/oscillation for a specific clip.
Run:
/root/miniconda3/envs/gvhmr/bin/python debug_compare_unity_pt.py \\
--a 101_biboo_birthday_speech_explosion_2 \\
--b 107_biboo_birthday_speech_explosion_8
"""
from __future__ import annotations
import argparse
from dataclasses import dataclass
from pathlib import Path
import numpy as np
import torch
from genmo.utils.pylogger import Log
from third_party.GVHMR.hmr4d.utils.geo.hmr_cam import (
compute_bbox_info_bedlam,
get_bbx_xys,
get_a_pred_cam,
safely_render_x3d_K,
)
from third_party.GVHMR.hmr4d.utils.smplx_utils import make_smplx
@dataclass
class SeqStats:
    """Per-sequence summary statistics produced by ``compute_seq_stats``.

    Field order is part of the interface (positional construction), so new
    fields should only ever be appended.
    """

    # --- identity / geometry -------------------------------------------
    vid: str  # stem of the .pt file the stats were computed from
    L: int  # number of frames (first dimension of the saved bbox array)
    W: int  # image width inferred from the intrinsics
    H: int  # image height inferred from the intrinsics
    fx_mean: float  # mean of K[:, 0, 0] over the sequence
    fx_std: float  # std of K[:, 0, 0] over the sequence

    # --- bounding boxes: saved vs. re-projected from the SMPL-X verts ---
    bbx_saved_c_std: tuple[float, float]  # std of saved bbox centre (x, y)
    bbx_saved_s_std: float  # std of saved bbox size
    bbx_gt_c_std: tuple[float, float]  # std of re-projected bbox centre (x, y)
    bbx_gt_s_std: float  # std of re-projected bbox size
    bbx_delta_c_mean: tuple[float, float]  # mean of (saved - gt) centre
    bbx_delta_c_std: tuple[float, float]  # std of (saved - gt) centre
    bbx_delta_s_mean: float  # mean of (saved - gt) size
    bbx_delta_s_std: float  # std of (saved - gt) size

    # --- f_cliffcam features derived from each bbox variant --------------
    fcliff_saved_std: tuple[float, float, float]
    fcliff_gt_std: tuple[float, float, float]
    fcliff_delta_std: tuple[float, float, float]  # std of (saved - gt)

    # --- 2D keypoint sanity ----------------------------------------------
    kp2d_conf_gt05_frac: float  # fraction of keypoints with conf > 0.5
    kp2d_oof_conf_gt05_frac: float  # of those, fraction lying outside the frame

    # --- gt_pred_cam (s, tx, ty) and its frame-to-frame differences ------
    gt_pred_cam_std: tuple[float, float, float]
    gt_pred_cam_d_std: tuple[float, float, float]
    gt_pred_cam_d_p95: tuple[float, float, float]  # 95th pct of |diff| per axis
    gt_pred_cam_norm_std: float  # std of the norm of the mean-centred vector
    gt_pred_cam_d_norm_p95: float  # 95th pct of the norm of the diff
def _as_np(x: torch.Tensor) -> np.ndarray:
    """Return the contents of *x* as a NumPy array.

    The tensor is first detached from the autograd graph and moved to the
    CPU, so it is safe to call on tensors that require grad.
    """
    cpu_tensor = x.detach().cpu()
    return cpu_tensor.numpy()
def _infer_wh_from_K(K_fullimg: np.ndarray) -> tuple[int, int]:
    """Estimate the image size ``(W, H)`` from a stack of intrinsics.

    The principal point is taken as the median of ``K[:, 0, 2]`` and
    ``K[:, 1, 2]`` over the sequence; each image dimension is twice the
    corresponding coordinate, rounded to an integer and clamped to at least 1.
    """
    principal_point = (K_fullimg[:, 0, 2], K_fullimg[:, 1, 2])
    width, height = (
        int(round(float(np.median(coord)) * 2.0)) for coord in principal_point
    )
    return max(width, 1), max(height, 1)
def _column_stds(values: np.ndarray, n_cols: int) -> tuple[float, ...]:
    """Return the std of each of the first *n_cols* columns of *values* as plain floats."""
    return tuple(float(values[:, col].std()) for col in range(n_cols))


def _float_tuple(values: np.ndarray) -> tuple[float, ...]:
    """Convert a 1-D array into a tuple of plain Python floats."""
    return tuple(float(v) for v in values)


def compute_seq_stats(pt_path: Path, smplx_model) -> SeqStats:
    """Load one exported ``.pt`` sequence and summarise its network inputs.

    Args:
        pt_path: Path to the exported feature file. It must contain
            ``bbx_xys``, ``K_fullimg`` and ``smpl_params_c`` (with
            ``global_orient``, ``body_pose``, ``betas``, ``transl``);
            ``kp2d`` is optional and replaced by zeros when missing.
        smplx_model: Callable body model; its output must expose ``.vertices``.

    Returns:
        A ``SeqStats`` holding bbox, f_cliffcam, kp2d and gt_pred_cam
        statistics. For a sequence with fewer than two frames the
        frame-to-frame (``gt_pred_cam_d_*``) statistics are NaN.

    Raises:
        KeyError: If a required entry is missing from the file.
    """
    # NOTE(security): weights_only=False unpickles arbitrary Python objects.
    # Only point this script at exports you produced yourself.
    data = torch.load(pt_path, map_location="cpu", weights_only=False)
    vid = pt_path.stem

    bbx_saved = _as_np(data["bbx_xys"]).astype(np.float64)  # (L,3)
    K = _as_np(data["K_fullimg"]).astype(np.float64)  # (L,3,3)
    L = int(bbx_saved.shape[0])
    kp2d = _as_np(data.get("kp2d", torch.zeros((L, 17, 3)))).astype(np.float64)
    smpl_params_c = data["smpl_params_c"]
    transl_c = _as_np(smpl_params_c["transl"]).astype(np.float64)  # (L,3)

    W, H = _infer_wh_from_K(K)
    fx = K[:, 0, 0]

    # Compute GT-projected bbox from verts (same logic used during training when bbx is missing).
    with torch.no_grad():
        out = smplx_model(
            global_orient=smpl_params_c["global_orient"].float(),
            body_pose=smpl_params_c["body_pose"].float(),
            betas=smpl_params_c["betas"].float(),
            transl=smpl_params_c["transl"].float(),
        )
        verts = out.vertices  # (L, V, 3)
        verts_b = verts[None]  # (1,L,V,3)
        K_b = torch.from_numpy(K).float()[None]  # (1,L,3,3)
        i_x2d = safely_render_x3d_K(verts_b, K_b, thr=0.3)  # (1,L,V,2)
        bbx_gt = _as_np(get_bbx_xys(i_x2d, do_augment=False)[0]).astype(np.float64)  # (L,3)

    # bbox stats
    bbx_saved_c = bbx_saved[:, :2]
    bbx_saved_s = bbx_saved[:, 2]
    bbx_gt_c = bbx_gt[:, :2]
    bbx_gt_s = bbx_gt[:, 2]
    bbx_delta_c = bbx_saved_c - bbx_gt_c
    bbx_delta_s = bbx_saved_s - bbx_gt_s

    # f_cliffcam stats (this is what the network sees)
    bbx_saved_t = torch.from_numpy(bbx_saved).float()
    bbx_gt_t = torch.from_numpy(bbx_gt).float()
    K_t = torch.from_numpy(K).float()
    fcliff_saved = compute_bbox_info_bedlam(bbx_saved_t, K_t).numpy().astype(np.float64)
    fcliff_gt = compute_bbox_info_bedlam(bbx_gt_t, K_t).numpy().astype(np.float64)
    fcliff_delta = fcliff_saved - fcliff_gt

    # Conditioning target used by incam translation loss: gt_pred_cam (s,tx,ty)
    # (see `third_party/.../hmr_cam.py:get_a_pred_cam`).
    gt_pred_cam = get_a_pred_cam(
        torch.from_numpy(transl_c).float(),
        bbx_saved_t,
        K_t,
    ).numpy().astype(np.float64)  # (L,3)
    gt_pred_cam_std = gt_pred_cam.std(axis=0)
    gt_pred_cam_norm_std = float(
        np.linalg.norm(gt_pred_cam - gt_pred_cam.mean(axis=0), axis=1).std()
    )

    d_gt_pred_cam = np.diff(gt_pred_cam, axis=0)  # (L-1,3)
    if d_gt_pred_cam.shape[0] == 0:
        # Fewer than two frames: there is no frame-to-frame difference, and
        # np.percentile over an empty axis raises or warns depending on the
        # NumPy version. Report the delta statistics as undefined instead.
        undefined = np.full(gt_pred_cam.shape[1:], np.nan)
        gt_pred_cam_d_std = undefined
        gt_pred_cam_d_p95 = undefined
        gt_pred_cam_d_norm_p95 = float("nan")
    else:
        gt_pred_cam_d_std = d_gt_pred_cam.std(axis=0)
        gt_pred_cam_d_p95 = np.percentile(np.abs(d_gt_pred_cam), 95, axis=0)
        gt_pred_cam_d_norm_p95 = float(
            np.percentile(np.linalg.norm(d_gt_pred_cam, axis=1), 95)
        )

    # kp2d sanity: how much of provided kp2d is confidently in-frame?
    conf = kp2d[..., 2]
    x = kp2d[..., 0]
    y = kp2d[..., 1]
    conf_gt05 = conf > 0.5
    conf_gt05_frac = float(conf_gt05.mean()) if conf.size else 0.0
    oof = (x < 0.0) | (x > (W - 1.0)) | (y < 0.0) | (y > (H - 1.0))
    oof_conf = oof & conf_gt05
    # max(..., 1.0) keeps the ratio at 0 when no keypoint is confident.
    oof_conf_frac = float(oof_conf.sum() / max(conf_gt05.sum(), 1.0))

    return SeqStats(
        vid=vid,
        L=L,
        W=W,
        H=H,
        fx_mean=float(fx.mean()),
        fx_std=float(fx.std()),
        bbx_saved_c_std=_column_stds(bbx_saved_c, 2),
        bbx_saved_s_std=float(bbx_saved_s.std()),
        bbx_gt_c_std=_column_stds(bbx_gt_c, 2),
        bbx_gt_s_std=float(bbx_gt_s.std()),
        bbx_delta_c_mean=(
            float(bbx_delta_c[:, 0].mean()),
            float(bbx_delta_c[:, 1].mean()),
        ),
        bbx_delta_c_std=_column_stds(bbx_delta_c, 2),
        bbx_delta_s_mean=float(bbx_delta_s.mean()),
        bbx_delta_s_std=float(bbx_delta_s.std()),
        fcliff_saved_std=_column_stds(fcliff_saved, 3),
        fcliff_gt_std=_column_stds(fcliff_gt, 3),
        fcliff_delta_std=_column_stds(fcliff_delta, 3),
        kp2d_conf_gt05_frac=conf_gt05_frac,
        kp2d_oof_conf_gt05_frac=oof_conf_frac,
        gt_pred_cam_std=_float_tuple(gt_pred_cam_std[:3]),
        gt_pred_cam_d_std=_float_tuple(gt_pred_cam_d_std[:3]),
        gt_pred_cam_d_p95=_float_tuple(gt_pred_cam_d_p95[:3]),
        gt_pred_cam_norm_std=gt_pred_cam_norm_std,
        gt_pred_cam_d_norm_p95=gt_pred_cam_d_norm_p95,
    )
def _print_stats(s: SeqStats) -> None:
    """Log every statistic of *s* through ``Log.info``, one topic per line."""
    emit = Log.info

    emit(f"=== {s.vid} ===")
    emit(f"L={s.L} W×H={s.W}×{s.H} fx_mean/std={s.fx_mean:.3f}/{s.fx_std:.3f}")

    # Bounding boxes: saved, re-projected ("gt"), and their difference.
    saved_std = (*s.bbx_saved_c_std, s.bbx_saved_s_std)
    emit("bbx_saved std: center(x/y)=(%.2f,%.2f) size=%.2f" % saved_std)

    gt_std = (*s.bbx_gt_c_std, s.bbx_gt_s_std)
    emit("bbx_gt std: center(x/y)=(%.2f,%.2f) size=%.2f" % gt_std)

    delta_mean = (*s.bbx_delta_c_mean, s.bbx_delta_s_mean)
    emit("bbx(saved-gt) mean: center(x/y)=(%.2f,%.2f) size=%.2f" % delta_mean)

    delta_std = (*s.bbx_delta_c_std, s.bbx_delta_s_std)
    emit("bbx(saved-gt) std : center(x/y)=(%.2f,%.2f) size=%.2f" % delta_std)

    # f_cliffcam features for both bbox variants.
    fcliff = (*s.fcliff_saved_std, *s.fcliff_gt_std, *s.fcliff_delta_std)
    emit(
        "f_cliff std saved=(%.4f,%.4f,%.4f) gt=(%.4f,%.4f,%.4f) delta_std=(%.4f,%.4f,%.4f)"
        % fcliff
    )

    # Keypoint confidence / out-of-frame fractions.
    kp2d = (s.kp2d_conf_gt05_frac, s.kp2d_oof_conf_gt05_frac)
    emit("kp2d conf>0.5 frac=%.3f oof|conf>0.5 frac=%.3f" % kp2d)

    # gt_pred_cam and its frame-to-frame differences.
    cam = (*s.gt_pred_cam_std, s.gt_pred_cam_norm_std)
    emit("gt_pred_cam std(s/tx/ty)=(%.4f,%.4f,%.4f) norm_std=%.4f" % cam)

    cam_delta = (*s.gt_pred_cam_d_std, *s.gt_pred_cam_d_p95, s.gt_pred_cam_d_norm_p95)
    emit(
        "d(gt_pred_cam) std(s/tx/ty)=(%.4f,%.4f,%.4f) p95_abs(s/tx/ty)=(%.4f,%.4f,%.4f) p95_norm=%.4f"
        % cam_delta
    )
def main() -> None:
    """Command-line entry point: compute and log stats for sequences A and B.

    Both ``<root>/<a>.pt`` and ``<root>/<b>.pt`` must exist; otherwise
    ``FileNotFoundError`` is raised for the first missing path.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--root", default="processed_dataset/genmo_features")
    for flag in ("--a", "--b"):
        parser.add_argument(flag, required=True)
    opts = parser.parse_args()

    feature_dir = Path(opts.root)
    pt_a, pt_b = (feature_dir / f"{name}.pt" for name in (opts.a, opts.b))
    for pt_path in (pt_a, pt_b):
        if not pt_path.exists():
            raise FileNotFoundError(pt_path)

    # One body model instance is shared by both sequences.
    smplx_model = make_smplx("supermotion").eval()
    s_a, s_b = (compute_seq_stats(pt_path, smplx_model) for pt_path in (pt_a, pt_b))

    for stats in (s_a, s_b):
        _print_stats(stats)

    Log.info("=== delta (A - B) ===")
    fcliff_pair = (*s_a.fcliff_delta_std, *s_b.fcliff_delta_std)
    Log.info(
        "fcliff_delta_std A vs B: (%.4f,%.4f,%.4f) vs (%.4f,%.4f,%.4f)" % fcliff_pair
    )
    center_pair = (*s_a.bbx_delta_c_std, *s_b.bbx_delta_c_std)
    Log.info(
        "bbx(saved-gt) center std A vs B: (%.2f,%.2f) vs (%.2f,%.2f)" % center_pair
    )


if __name__ == "__main__":
    main()