| import os |
| import cv2 |
| import torch |
| import numpy as np |
|
|
| |
# Directory produced by the inference pipeline for one video clip.
OUT_DIR = "./outputs/infer_video/VRM_JG6Z7WA_3780.00_3804.00"
# Rendered in-camera video the overlays are drawn on.
VIDEO_PATH = os.path.join(OUT_DIR, "1_incam.mp4")


# Preprocess artifacts: person bounding boxes and ViTPose 2D keypoints.
BBX_PATH = os.path.join(OUT_DIR, "preprocess", "bbx.pt")
VITPOSE_PATH = os.path.join(OUT_DIR, "preprocess", "vitpose.pt")


# Debug outputs: bbox-only overlay, and bbox + labeled keypoints overlay.
OUT_BBOX_ONLY = os.path.join(OUT_DIR, "debug_bbox_only_on_incam.mp4")
OUT_BBOX_KP = os.path.join(OUT_DIR, "debug_bbox_kp_on_incam.mp4")


# COCO 17-keypoint joint names, in ViTPose output order (used for labels).
COCO17_NAMES = [
    "nose", "l_eye", "r_eye", "l_ear", "r_ear",
    "l_sho", "r_sho", "l_elb", "r_elb", "l_wri", "r_wri",
    "l_hip", "r_hip", "l_knee", "r_knee", "l_ank", "r_ank"
]
|
|
|
|
def to_numpy(x):
    """Coerce *x* to a numpy array, detaching and moving torch tensors to CPU."""
    is_tensor = isinstance(x, torch.Tensor)
    return x.detach().cpu().numpy() if is_tensor else np.array(x)
|
|
|
|
def xyxy_to_xys(bbx_xyxy_t: torch.Tensor) -> torch.Tensor:
    """(L,4) xyxy -> (L,3) (cx,cy,s) where s is square side = max(w,h)."""
    x1, y1, x2, y2 = bbx_xyxy_t.unbind(-1)
    center_x = 0.5 * (x1 + x2)
    center_y = 0.5 * (y1 + y2)
    # Degenerate boxes are widened to at least 1px before taking the square side.
    width = (x2 - x1).clamp(min=1.0)
    height = (y2 - y1).clamp(min=1.0)
    side = torch.maximum(width, height)
    return torch.stack((center_x, center_y, side), dim=-1)
|
|
|
|
def xys_to_xyxy(bbx_xys_t: torch.Tensor) -> torch.Tensor:
    """(L,3) (cx,cy,s) -> (L,4) xyxy of square."""
    cx, cy, s = bbx_xys_t.unbind(-1)
    half = s * 0.5
    corners = (cx - half, cy - half, cx + half, cy + half)
    return torch.stack(corners, dim=-1)
|
|
|
|
def draw_bbox_xyxy(frame, xyxy, color=(0, 255, 0), thickness=2):
    """Draw one xyxy box on *frame* in place and return the frame."""
    pts = [int(round(float(v))) for v in xyxy]
    cv2.rectangle(frame, (pts[0], pts[1]), (pts[2], pts[3]), color, thickness)
    return frame
|
|
|
|
def draw_kps_with_names(frame, kps_xy, conf=None, radius=3, show_conf=True, conf_thr=0.0):
    """
    Draw labeled keypoints on *frame* (in place) and return it.

    kps_xy: (J,2) in image pixels
    conf: (J,) optional
    - Always renders the label (including confidence) so you can verify confidence behavior.
    - Text is BLACK with a white background box for readability.

    Joints are colored red when conf >= conf_thr (or conf is None), gray otherwise.
    Points outside the frame are skipped entirely (no dot, no label).
    """
    H, W = frame.shape[:2]


    for j, (x, y) in enumerate(kps_xy):
        x_i, y_i = int(round(float(x))), int(round(float(y))) 
        # Skip joints whose rounded position falls outside the image.
        if x_i < 0 or y_i < 0 or x_i >= W or y_i >= H:
            continue


        # Missing confidence is treated as fully confident.
        if conf is None:
            c = 1.0
        else:
            c = float(conf[j])


        ok = (c >= conf_thr)


        # Red for confident joints, gray for low-confidence ones.
        if conf is None:
            pt_color = (0, 0, 255)
        else:
            pt_color = (0, 0, 255) if ok else (150, 150, 150)


        cv2.circle(frame, (x_i, y_i), radius, pt_color, -1)


        # Label: joint name (fallback "j<idx>" past the COCO-17 list), plus score.
        name = COCO17_NAMES[j] if j < len(COCO17_NAMES) else f"j{j}"
        label = f"{name} {c:.2f}" if (conf is not None and show_conf) else name


        # Text origin slightly up-right of the dot so it doesn't cover it.
        org = (x_i + 4, y_i - 6)


        font = cv2.FONT_HERSHEY_SIMPLEX
        font_scale = 0.40
        thickness = 1
        (tw, th), baseline = cv2.getTextSize(label, font, font_scale, thickness)


        # White background rectangle sized from the measured text extents.
        x0, y0 = org[0], org[1] - th
        x1, y1 = org[0] + tw, org[1] + baseline


        # Clamp the background box to the frame so cv2.rectangle stays in-bounds.
        x0 = max(0, min(W - 1, x0))
        y0 = max(0, min(H - 1, y0))
        x1 = max(0, min(W - 1, x1))
        y1 = max(0, min(H - 1, y1))


        # Background first, then text on top.
        cv2.rectangle(frame, (x0, y0), (x1, y1), (255, 255, 255), -1)
        cv2.putText(frame, label, org, font, font_scale, (0, 0, 0), thickness, cv2.LINE_AA)


    return frame
|
|
|
|
|
|
def convert_kp_to_image_pixels(kp, bbx_xys, crop_size=256):
    """
    Convert keypoints to full-image pixel coords using HMR-style crop mapping:
        x_img = cx + x_norm * (s/2)
        y_img = cy + y_norm * (s/2)

    The input convention is inferred from the value range of *kp*:
      - values in roughly [-1, 1]        -> normalized crop coords ("norm_pm1")
      - values in roughly [0, crop_size] -> crop pixel coords ("crop_pixels")
      - anything larger                  -> assumed already image pixels,
                                            returned unchanged ("image_pixels")

    Args:
        kp:        (L, J, 2) keypoints.
        bbx_xys:   (L, 3) square crop boxes as (cx, cy, s) in image pixels.
        crop_size: side of the square crop the keypoints were predicted in.

    Returns:
        (kp_img, mode, (kp_min, kp_max)): kp_img is (L, J, 2) image-pixel
        coords, mode is the detected convention ("empty" for zero-size input),
        and the min/max pair is for diagnostic logging.
    """
    kp = np.asarray(kp, dtype=np.float32)
    bbx_xys = np.asarray(bbx_xys, dtype=np.float32)

    # Guard: np.nanmin/np.nanmax raise ValueError on empty arrays (e.g. when
    # no frames were usable); return the input unchanged instead of crashing.
    if kp.size == 0:
        return kp, "empty", (float("nan"), float("nan"))

    kp_min = float(np.nanmin(kp))
    kp_max = float(np.nanmax(kp))

    # Range heuristics; thresholds are deliberately loose to tolerate slight
    # overshoot from the pose model.
    if kp_min >= -1.5 and kp_max <= 1.5:
        mode = "norm_pm1"
        kp_norm = kp
    elif kp_min >= -5.0 and kp_max <= (crop_size + 5.0):
        mode = "crop_pixels"
        # Map [0, crop_size-1] -> [-1, 1].
        denom = (crop_size - 1.0)
        kp_norm = (kp / denom) * 2.0 - 1.0
    else:
        mode = "image_pixels"
        return kp, mode, (kp_min, kp_max)

    # Keep trailing singleton dims so (L,1) broadcasts against (L,J).
    cx = bbx_xys[:, 0:1]
    cy = bbx_xys[:, 1:2]
    s = bbx_xys[:, 2:3]
    hs = s * 0.5

    x_img = cx + kp_norm[..., 0] * hs
    y_img = cy + kp_norm[..., 1] * hs
    kp_img = np.stack([x_img, y_img], axis=-1)
    return kp_img, mode, (kp_min, kp_max)
|
|
|
|
def main():
    """Overlay preprocess bboxes (and keypoints) on the in-camera video and
    write two debug videos: bbox-only and bbox+labeled-keypoints."""
    # --- Load bounding boxes; accept either xyxy or xys form and derive the other.
    bbx = torch.load(BBX_PATH, map_location="cpu")


    bbx_xyxy_t = bbx.get("bbx_xyxy", None)
    bbx_xys_t = bbx.get("bbx_xys", None)


    if bbx_xyxy_t is None and bbx_xys_t is None:
        raise ValueError("bbx.pt must contain 'bbx_xyxy' and/or 'bbx_xys'.")


    if bbx_xys_t is None and bbx_xyxy_t is not None:
        bbx_xys_t = xyxy_to_xys(bbx_xyxy_t)


    if bbx_xyxy_t is None and bbx_xys_t is not None:
        bbx_xyxy_t = xys_to_xyxy(bbx_xys_t)


    bbx_xyxy = to_numpy(bbx_xyxy_t)
    bbx_xys = to_numpy(bbx_xys_t)


    print("bbx_xyxy shape:", bbx_xyxy.shape, "bbx_xys shape:", bbx_xys.shape)


    # --- Load keypoints; the .pt may be a raw array/tensor or a dict — probe
    # several common key names for keypoints and confidences.
    vitpose = torch.load(VITPOSE_PATH, map_location="cpu")


    conf = None
    if isinstance(vitpose, dict):
        kp = None
        for k in ["kp2d", "keypoints", "kps", "joints_2d", "vitpose"]:
            if k in vitpose:
                kp = vitpose[k]
                break
        if kp is None:
            print("vitpose.pt keys:", list(vitpose.keys()))
            raise ValueError("Couldn't find keypoints in vitpose dict.")
        kp = to_numpy(kp)


        for k in ["conf", "confidence", "scores", "kp2d_conf", "keypoint_scores"]:
            if k in vitpose:
                conf = to_numpy(vitpose[k])
                break
    else:
        kp = to_numpy(vitpose)


    if kp.ndim != 3:
        raise ValueError(f"Unexpected kp shape: {kp.shape} (expected L x J x 2/3)")


    # A trailing dim of 3 means (x, y, conf) packed together — split it out.
    if kp.shape[-1] == 3 and conf is None:
        conf = kp[..., 2]
        kp = kp[..., :2]
    elif kp.shape[-1] != 2:
        raise ValueError(f"Unexpected kp last dim: {kp.shape[-1]} (expected 2 or 3)")


    # --- Open the source video and read its geometry/frame rate.
    cap = cv2.VideoCapture(VIDEO_PATH)
    if not cap.isOpened():
        raise RuntimeError(f"Failed to open video: {VIDEO_PATH}")


    # `or 30.0` falls back when the container reports 0/NaN-ish fps.
    fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
    W = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    H = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    print("Video:", VIDEO_PATH, "W,H:", W, H, "fps:", fps)


    # Use the shortest of the three sequences so all indices stay valid.
    L = min(len(bbx_xyxy), len(bbx_xys), kp.shape[0])
    print("Using L =", L)


    # Map keypoints into full-image pixels (crop size assumed 256 — the
    # mode heuristic inside reports how the values were interpreted).
    kp_img, mode, (kp_min, kp_max) = convert_kp_to_image_pixels(kp[:L], bbx_xys[:L], crop_size=256)
    print(f"kp stats min/max: {kp_min:.3f} / {kp_max:.3f} -> interpreted as mode: {mode}")


    # Diagnostic: per-frame bbox-center displacement, to spot tracking jumps.
    centers = np.stack(
        [(bbx_xyxy[:L, 0] + bbx_xyxy[:L, 2]) * 0.5,
         (bbx_xyxy[:L, 1] + bbx_xyxy[:L, 3]) * 0.5],
        axis=-1
    )
    center_speed = np.linalg.norm(centers[1:] - centers[:-1], axis=-1)
    if len(center_speed) > 0:
        print("bbox center jump px (p50/p90/max):",
              float(np.percentile(center_speed, 50)),
              float(np.percentile(center_speed, 90)),
              float(center_speed.max()))


    if conf is not None:
        conf_use = conf[:L]
        print("kp conf (mean/p10):",
              float(np.mean(conf_use)),
              float(np.percentile(conf_use, 10)))


    # --- Write the two overlay videos frame by frame.
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    w_bbox = cv2.VideoWriter(OUT_BBOX_ONLY, fourcc, fps, (W, H))
    w_kp = cv2.VideoWriter(OUT_BBOX_KP, fourcc, fps, (W, H))


    t = 0
    while t < L:
        ok, frame = cap.read()
        if not ok:
            break


        # Separate copies so the two overlays don't draw on each other.
        f1 = frame.copy()
        f2 = frame.copy()


        draw_bbox_xyxy(f1, bbx_xyxy[t])
        draw_bbox_xyxy(f2, bbx_xyxy[t])


        c_t = conf[t] if conf is not None else None
        draw_kps_with_names(f2, kp_img[t], conf=c_t, show_conf=True)


        cv2.putText(f1, f"t={t}", (10, 25), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 255),
                    2, cv2.LINE_AA)
        cv2.putText(f2, f"t={t} mode={mode}", (10, 25), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 255),
                    2, cv2.LINE_AA)


        w_bbox.write(f1)
        w_kp.write(f2)
        t += 1


    cap.release()
    w_bbox.release()
    w_kp.release()


    print("Saved:", OUT_BBOX_ONLY)
    print("Saved:", OUT_BBOX_KP)
|
|
|
|
# Script entry point: run the overlay/export pipeline when invoked directly.
if __name__ == "__main__":
    main()
|
|