| import os |
| import cv2 |
| import torch |
| import numpy as np |
|
|
| |
# Directory produced by the inference pipeline for one video clip.
OUT_DIR = "./outputs/infer_video/VRM_JG6Z7WA_3780.00_3804.00"
# Rendered in-camera video the overlays are drawn on.
VIDEO_PATH = os.path.join(OUT_DIR, "1_incam.mp4")


# Preprocess artifacts: person bounding boxes and ViTPose 2D keypoints.
BBX_PATH = os.path.join(OUT_DIR, "preprocess", "bbx.pt")
VITPOSE_PATH = os.path.join(OUT_DIR, "preprocess", "vitpose.pt")


# Debug outputs: bbox-only overlay, and bbox + labeled keypoints overlay.
OUT_BBOX_ONLY = os.path.join(OUT_DIR, "debug_bbox_only_on_incam.mp4")
OUT_BBOX_KP = os.path.join(OUT_DIR, "debug_bbox_kp_on_incam.mp4")


# COCO 17-keypoint joint names, in ViTPose output order (used for labels).
COCO17_NAMES = [
    "nose", "l_eye", "r_eye", "l_ear", "r_ear",
    "l_sho", "r_sho", "l_elb", "r_elb", "l_wri", "r_wri",
    "l_hip", "r_hip", "l_knee", "r_knee", "l_ank", "r_ank"
]
|
|
|
|
def to_numpy(x):
    """Coerce *x* to a numpy array, detaching and moving torch tensors to CPU."""
    is_tensor = isinstance(x, torch.Tensor)
    return x.detach().cpu().numpy() if is_tensor else np.array(x)
|
|
|
|
def xyxy_to_xys(bbx_xyxy_t: torch.Tensor) -> torch.Tensor:
    """(L,4) xyxy -> (L,3) (cx,cy,s) where s is square side = max(w,h)."""
    x1, y1, x2, y2 = bbx_xyxy_t.unbind(-1)
    center_x = 0.5 * (x1 + x2)
    center_y = 0.5 * (y1 + y2)
    # Degenerate boxes are widened to at least 1px before taking the square side.
    width = (x2 - x1).clamp(min=1.0)
    height = (y2 - y1).clamp(min=1.0)
    side = torch.maximum(width, height)
    return torch.stack((center_x, center_y, side), dim=-1)
|
|
|
|
def xys_to_xyxy(bbx_xys_t: torch.Tensor) -> torch.Tensor:
    """(L,3) (cx,cy,s) -> (L,4) xyxy of square."""
    cx, cy, s = bbx_xys_t.unbind(-1)
    half = s * 0.5
    corners = (cx - half, cy - half, cx + half, cy + half)
    return torch.stack(corners, dim=-1)
|
|
|
|
def draw_bbox_xyxy(frame, xyxy, color=(0, 255, 0), thickness=2):
    """Draw one xyxy box on *frame* in place and return the frame."""
    pts = [int(round(float(v))) for v in xyxy]
    cv2.rectangle(frame, (pts[0], pts[1]), (pts[2], pts[3]), color, thickness)
    return frame
|
|
|
|
def draw_kps_with_names(frame, kps_xy, conf=None, radius=3, show_conf=True, conf_thr=0.0):
    """
    Draw labeled keypoints on *frame* (in place) and return it.

    kps_xy: (J,2) in image pixels
    conf: (J,) optional
    - Always renders the label (including confidence) so you can verify confidence behavior.
    - Text is BLACK with a white background box for readability.

    Joints are colored red when conf >= conf_thr (or conf is None), gray otherwise.
    Points outside the frame are skipped entirely (no dot, no label).
    """
    H, W = frame.shape[:2]


    for j, (x, y) in enumerate(kps_xy):
        x_i, y_i = int(round(float(x))), int(round(float(y))) 
        # Skip joints whose rounded position falls outside the image.
        if x_i < 0 or y_i < 0 or x_i >= W or y_i >= H:
            continue


        # Missing confidence is treated as fully confident.
        if conf is None:
            c = 1.0
        else:
            c = float(conf[j])


        ok = (c >= conf_thr)


        # Red for confident joints, gray for low-confidence ones.
        if conf is None:
            pt_color = (0, 0, 255)
        else:
            pt_color = (0, 0, 255) if ok else (150, 150, 150)


        cv2.circle(frame, (x_i, y_i), radius, pt_color, -1)


        # Label: joint name (fallback "j<idx>" past the COCO-17 list), plus score.
        name = COCO17_NAMES[j] if j < len(COCO17_NAMES) else f"j{j}"
        label = f"{name} {c:.2f}" if (conf is not None and show_conf) else name


        # Text origin slightly up-right of the dot so it doesn't cover it.
        org = (x_i + 4, y_i - 6)


        font = cv2.FONT_HERSHEY_SIMPLEX
        font_scale = 0.40
        thickness = 1
        (tw, th), baseline = cv2.getTextSize(label, font, font_scale, thickness)


        # White background rectangle sized from the measured text extents.
        x0, y0 = org[0], org[1] - th
        x1, y1 = org[0] + tw, org[1] + baseline


        # Clamp the background box to the frame so cv2.rectangle stays in-bounds.
        x0 = max(0, min(W - 1, x0))
        y0 = max(0, min(H - 1, y0))
        x1 = max(0, min(W - 1, x1))
        y1 = max(0, min(H - 1, y1))


        # Background first, then text on top.
        cv2.rectangle(frame, (x0, y0), (x1, y1), (255, 255, 255), -1)
        cv2.putText(frame, label, org, font, font_scale, (0, 0, 0), thickness, cv2.LINE_AA)


    return frame
|
|
|
|
|
|
def convert_kp_to_image_pixels(kp, bbx_xys, crop_size=256):
    """
    Convert keypoints to full-image pixel coords using HMR-style crop mapping:
        x_img = cx + x_norm * (s/2)
        y_img = cy + y_norm * (s/2)

    The input convention is inferred from the value range of *kp*:
      - values in roughly [-1, 1]        -> normalized crop coords ("norm_pm1")
      - values in roughly [0, crop_size] -> crop pixel coords ("crop_pixels")
      - anything larger                  -> assumed already image pixels,
                                            returned unchanged ("image_pixels")

    Args:
        kp:        (L, J, 2) keypoints.
        bbx_xys:   (L, 3) square crop boxes as (cx, cy, s) in image pixels.
        crop_size: side of the square crop the keypoints were predicted in.

    Returns:
        (kp_img, mode, (kp_min, kp_max)): kp_img is (L, J, 2) image-pixel
        coords, mode is the detected convention ("empty" for zero-size input),
        and the min/max pair is for diagnostic logging.
    """
    kp = np.asarray(kp, dtype=np.float32)
    bbx_xys = np.asarray(bbx_xys, dtype=np.float32)

    # Guard: np.nanmin/np.nanmax raise ValueError on empty arrays (e.g. when
    # no frames were usable); return the input unchanged instead of crashing.
    if kp.size == 0:
        return kp, "empty", (float("nan"), float("nan"))

    kp_min = float(np.nanmin(kp))
    kp_max = float(np.nanmax(kp))

    # Range heuristics; thresholds are deliberately loose to tolerate slight
    # overshoot from the pose model.
    if kp_min >= -1.5 and kp_max <= 1.5:
        mode = "norm_pm1"
        kp_norm = kp
    elif kp_min >= -5.0 and kp_max <= (crop_size + 5.0):
        mode = "crop_pixels"
        # Map [0, crop_size-1] -> [-1, 1].
        denom = (crop_size - 1.0)
        kp_norm = (kp / denom) * 2.0 - 1.0
    else:
        mode = "image_pixels"
        return kp, mode, (kp_min, kp_max)

    # Keep trailing singleton dims so (L,1) broadcasts against (L,J).
    cx = bbx_xys[:, 0:1]
    cy = bbx_xys[:, 1:2]
    s = bbx_xys[:, 2:3]
    hs = s * 0.5

    x_img = cx + kp_norm[..., 0] * hs
    y_img = cy + kp_norm[..., 1] * hs
    kp_img = np.stack([x_img, y_img], axis=-1)
    return kp_img, mode, (kp_min, kp_max)
|
|
|
|
def main():
    """Overlay preprocess bboxes (and keypoints) on the in-camera video and
    write two debug videos: bbox-only and bbox+labeled-keypoints."""
    # --- Load bounding boxes; accept either xyxy or xys form and derive the other.
    bbx = torch.load(BBX_PATH, map_location="cpu")


    bbx_xyxy_t = bbx.get("bbx_xyxy", None)
    bbx_xys_t = bbx.get("bbx_xys", None)


    if bbx_xyxy_t is None and bbx_xys_t is None:
        raise ValueError("bbx.pt must contain 'bbx_xyxy' and/or 'bbx_xys'.")


    if bbx_xys_t is None and bbx_xyxy_t is not None:
        bbx_xys_t = xyxy_to_xys(bbx_xyxy_t)


    if bbx_xyxy_t is None and bbx_xys_t is not None:
        bbx_xyxy_t = xys_to_xyxy(bbx_xys_t)


    bbx_xyxy = to_numpy(bbx_xyxy_t)
    bbx_xys = to_numpy(bbx_xys_t)


    print("bbx_xyxy shape:", bbx_xyxy.shape, "bbx_xys shape:", bbx_xys.shape)


    # --- Load keypoints; the .pt may be a raw array/tensor or a dict — probe
    # several common key names for keypoints and confidences.
    vitpose = torch.load(VITPOSE_PATH, map_location="cpu")


    conf = None
    if isinstance(vitpose, dict):
        kp = None
        for k in ["kp2d", "keypoints", "kps", "joints_2d", "vitpose"]:
            if k in vitpose:
                kp = vitpose[k]
                break
        if kp is None:
            print("vitpose.pt keys:", list(vitpose.keys()))
            raise ValueError("Couldn't find keypoints in vitpose dict.")
        kp = to_numpy(kp)


        for k in ["conf", "confidence", "scores", "kp2d_conf", "keypoint_scores"]:
            if k in vitpose:
                conf = to_numpy(vitpose[k])
                break
    else:
        kp = to_numpy(vitpose)


    if kp.ndim != 3:
        raise ValueError(f"Unexpected kp shape: {kp.shape} (expected L x J x 2/3)")


    # A trailing dim of 3 means (x, y, conf) packed together — split it out.
    if kp.shape[-1] == 3 and conf is None:
        conf = kp[..., 2]
        kp = kp[..., :2]
    elif kp.shape[-1] != 2:
        raise ValueError(f"Unexpected kp last dim: {kp.shape[-1]} (expected 2 or 3)")


    # --- Open the source video and read its geometry/frame rate.
    cap = cv2.VideoCapture(VIDEO_PATH)
    if not cap.isOpened():
        raise RuntimeError(f"Failed to open video: {VIDEO_PATH}")


    # `or 30.0` falls back when the container reports 0/NaN-ish fps.
    fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
    W = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    H = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    print("Video:", VIDEO_PATH, "W,H:", W, H, "fps:", fps)


    # Use the shortest of the three sequences so all indices stay valid.
    L = min(len(bbx_xyxy), len(bbx_xys), kp.shape[0])
    print("Using L =", L)


    # Map keypoints into full-image pixels (crop size assumed 256 — the
    # mode heuristic inside reports how the values were interpreted).
    kp_img, mode, (kp_min, kp_max) = convert_kp_to_image_pixels(kp[:L], bbx_xys[:L], crop_size=256)
    print(f"kp stats min/max: {kp_min:.3f} / {kp_max:.3f} -> interpreted as mode: {mode}")


    # Diagnostic: per-frame bbox-center displacement, to spot tracking jumps.
    centers = np.stack(
        [(bbx_xyxy[:L, 0] + bbx_xyxy[:L, 2]) * 0.5,
         (bbx_xyxy[:L, 1] + bbx_xyxy[:L, 3]) * 0.5],
        axis=-1
    )
    center_speed = np.linalg.norm(centers[1:] - centers[:-1], axis=-1)
    if len(center_speed) > 0:
        print("bbox center jump px (p50/p90/max):",
              float(np.percentile(center_speed, 50)),
              float(np.percentile(center_speed, 90)),
              float(center_speed.max()))


    if conf is not None:
        conf_use = conf[:L]
        print("kp conf (mean/p10):",
              float(np.mean(conf_use)),
              float(np.percentile(conf_use, 10)))


    # --- Write the two overlay videos frame by frame.
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    w_bbox = cv2.VideoWriter(OUT_BBOX_ONLY, fourcc, fps, (W, H))
    w_kp = cv2.VideoWriter(OUT_BBOX_KP, fourcc, fps, (W, H))


    t = 0
    while t < L:
        ok, frame = cap.read()
        if not ok:
            break


        # Separate copies so the two overlays don't draw on each other.
        f1 = frame.copy()
        f2 = frame.copy()


        draw_bbox_xyxy(f1, bbx_xyxy[t])
        draw_bbox_xyxy(f2, bbx_xyxy[t])


        c_t = conf[t] if conf is not None else None
        draw_kps_with_names(f2, kp_img[t], conf=c_t, show_conf=True)


        cv2.putText(f1, f"t={t}", (10, 25), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 255),
                    2, cv2.LINE_AA)
        cv2.putText(f2, f"t={t} mode={mode}", (10, 25), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 255),
                    2, cv2.LINE_AA)


        w_bbox.write(f1)
        w_kp.write(f2)
        t += 1


    cap.release()
    w_bbox.release()
    w_kp.release()


    print("Saved:", OUT_BBOX_ONLY)
    print("Saved:", OUT_BBOX_KP)
|
|
|
|
# Script entry point: run the overlay/export pipeline when invoked directly.
if __name__ == "__main__":
    main()
|
|