# vr-hmr/scripts/demo/render_npz_global.py
# (File uploaded via huggingface_hub; original commit 7e120dd.)
import argparse
from pathlib import Path
from typing import Dict, Tuple
import numpy as np
import torch
from genmo.utils.geo_transform import apply_T_on_points, compute_T_ayfz2ay
from genmo.utils.video_io_utils import get_writer
from genmo.utils.vis.renderer import (
Renderer,
get_global_cameras_static,
get_ground_params_from_points,
)
from third_party.GVHMR.hmr4d.utils.geo.hmr_cam import create_camera_sensor
from third_party.GVHMR.hmr4d.utils.smplx_utils import make_smplx
def _load_motion_npz(npz_path: Path) -> Tuple[np.ndarray, np.ndarray, np.ndarray, float, str]:
with np.load(npz_path, allow_pickle=True) as d:
poses = np.asarray(d["poses"], dtype=np.float32)
trans = np.asarray(d["trans"], dtype=np.float32)
betas = np.asarray(d["betas"], dtype=np.float32).reshape(-1)
fps = float(np.asarray(d.get("mocap_framerate", 30.0)))
gender = str(np.asarray(d.get("gender", "neutral")))
if poses.ndim != 2 or poses.shape[1] < 66:
raise ValueError(f"Expected poses (F,165) or (F,>=66); got {poses.shape}")
if trans.ndim != 2 or trans.shape[1] != 3:
raise ValueError(f"Expected trans (F,3); got {trans.shape}")
if betas.shape[0] < 10:
betas = np.pad(betas, (0, 10 - betas.shape[0]))
betas = betas[:10]
if trans.shape[0] != poses.shape[0]:
raise ValueError(f"poses and trans length mismatch: {poses.shape[0]} vs {trans.shape[0]}")
return poses, trans, betas, fps, gender
def _split_smplx_poses(poses165: torch.Tensor) -> Dict[str, torch.Tensor]:
# SMPL-X pose layout: [global(3), body(63), jaw(3), leye(3), reye(3), lhand(45), rhand(45)] = 165
global_orient = poses165[:, 0:3]
body_pose = poses165[:, 3:66]
extra = poses165[:, 66:]
params = {
"global_orient": global_orient,
"body_pose": body_pose,
}
if extra.shape[1] >= 99:
params.update(
{
"jaw_pose": extra[:, 0:3],
"leye_pose": extra[:, 3:6],
"reye_pose": extra[:, 6:9],
"left_hand_pose": extra[:, 9:54],
"right_hand_pose": extra[:, 54:99],
}
)
return params
def _try_smplx_forward(smplx, params: Dict[str, torch.Tensor]) -> torch.Tensor:
try:
out = smplx(**params)
verts = out.vertices if hasattr(out, "vertices") else out[0].vertices
return verts
except (TypeError, RuntimeError):
# Fallback: model variant doesn't take hand/face params (or expects PCA hand pose dims).
keep = {k: v for k, v in params.items() if k in {"global_orient", "body_pose", "betas", "transl"}}
out = smplx(**keep)
verts = out.vertices if hasattr(out, "vertices") else out[0].vertices
return verts
def main():
    """CLI entry: render a global-camera video of a SMPL-X motion .npz.

    Pipeline: load poses/trans/betas, run the body model, optionally convert
    to SMPL topology, align the motion to the ground plane, set up a static
    global camera, and write the rendered frames to --out.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--npz", required=True, type=str)
    ap.add_argument("--out", required=True, type=str)
    ap.add_argument("--max_frames", type=int, default=300, help="Max rendered frames (uniformly sampled). Use -1 for all.")
    ap.add_argument("--size", type=int, default=512)
    ap.add_argument("--f_mm", type=float, default=24.0)
    ap.add_argument("--crf", type=int, default=23)
    args = ap.parse_args()
    npz_path = Path(args.npz)
    out_path = Path(args.out)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    poses, trans, betas, fps, gender = _load_motion_npz(npz_path)
    total = poses.shape[0]
    # Frame selection: -1 (or any value >= total) keeps every frame; a
    # positive value subsamples indices uniformly across the whole clip.
    if args.max_frames is None or args.max_frames == 0:
        raise ValueError("--max_frames must be -1 or a positive integer")
    if args.max_frames < 0 or args.max_frames >= total:
        idxs = np.arange(total, dtype=np.int64)
    else:
        idxs = np.linspace(0, total - 1, int(args.max_frames), dtype=np.int64)
    # Keep the original FPS even when subsampling frames (avoids tiny fps which can
    # overflow pyav's rational conversion on some builds).
    fps_out = fps
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    smplx = make_smplx("supermotion").to(device).eval()
    poses_t = torch.from_numpy(poses[idxs]).to(device)
    trans_t = torch.from_numpy(trans[idxs]).to(device)
    # One shared shape vector broadcast to every selected frame.
    betas_t = torch.from_numpy(betas[None]).to(device).repeat(len(idxs), 1)
    params = _split_smplx_poses(poses_t)
    params["betas"] = betas_t
    params["transl"] = trans_t
    with torch.inference_mode():
        verts_smplx = _try_smplx_forward(smplx, params)
    # Convert to SMPL topology if possible (better matches the regressor + faster render).
    smplx2smpl_path = Path("third_party/GVHMR/hmr4d/utils/body_model/smplx2smpl_sparse.pt")
    if smplx2smpl_path.exists():
        smplx2smpl = torch.load(smplx2smpl_path, map_location=device)
        # Per-frame matmul: presumably a sparse mapping matrix — batched sparse
        # matmul support varies, so frames are mapped one at a time.
        verts = torch.stack([torch.matmul(smplx2smpl, v) for v in verts_smplx])
        faces = make_smplx("smpl", gender="male").faces
    else:
        verts = verts_smplx
        faces = smplx.faces
    # Align like infer_video.py (ground + face-Z)
    j_reg_path = Path("third_party/GVHMR/inputs/checkpoints/body_models/smpl_neutral_J_regressor.pt")
    J_reg = torch.load(j_reg_path, map_location=device) if j_reg_path.exists() else None
    if J_reg is not None and verts.shape[1] == J_reg.shape[-1]:
        # Regressed root joint (index 0) of the first frame becomes the origin.
        root0 = torch.matmul(J_reg, verts[0])[0]
        offset = root0.clone()
    else:
        # Regressor missing or vertex count mismatch: fall back to the
        # first frame's mean vertex and disable joint-based steps below.
        J_reg = None
        offset = verts[0].mean(0)
    # Drop the subject onto the ground: vertical offset is the global y-minimum.
    offset[1] = verts[..., 1].min()
    verts = verts - offset
    if J_reg is not None:
        # Yaw-align the first frame via the project's ayfz helper
        # (inverse=True maps ay -> ayfz, i.e. makes the subject face +Z).
        joints0 = torch.matmul(J_reg, verts[0])[None]
        T_ay2ayfz = compute_T_ayfz2ay(joints0, inverse=True)
        verts = apply_T_on_points(verts, T_ay2ayfz)
    size = int(args.size)
    _, _, K = create_camera_sensor(size, size, float(args.f_mm))
    renderer = Renderer(size, size, device=device, faces=faces, K=K.to(device), bin_size=0)
    global_R, global_T, global_lights = get_global_cameras_static(
        verts.detach().cpu(),
        beta=2.0,
        cam_height_degree=20,
        target_center_height=1.0,
        device=str(device),
    )
    # Per-frame root trajectory drives the ground-plane extent estimate.
    if J_reg is not None:
        roots = torch.einsum("jv,fvi->fji", J_reg, verts)[..., 0, :]
    else:
        roots = verts.mean(1)
    scale, cx, cz = get_ground_params_from_points(roots.detach().cpu(), verts.detach().cpu())
    renderer.set_ground(scale * 1.5, cx, cz)
    writer = get_writer(str(out_path), fps=float(fps_out), crf=int(args.crf))
    try:
        color = torch.tensor([[0.8, 0.2, 0.8]], device=device)  # purple-ish
        for i in range(verts.shape[0]):
            cameras = renderer.create_camera(global_R[i], global_T[i])
            img = renderer.render_with_ground(verts[[i]], color, cameras, global_lights)
            writer.write_frame(img.astype(np.uint8))
    finally:
        # Always close the writer so the video container is finalized, even on error.
        writer.close()


if __name__ == "__main__":
    main()