import os
import sys

sys.path.append("./")

from dataclasses import dataclass
from typing import Optional

import numpy as np
import torch
import torch.nn.functional as F
from PIL import Image

from engine.ouputs import BaseOutput
from engine.pose_estimation.model import Model

IMG_NORM_MEAN = [0.485, 0.456, 0.406]
IMG_NORM_STD = [0.229, 0.224, 0.225]
@dataclass
class SMPLXOutput(BaseOutput):
    beta: Optional[np.ndarray]  # SMPL-X shape parameters; None when detection fails
    is_full_body: bool
    msg: str
def normalize_rgb_tensor(img, imagenet_normalization=True):
    """Scale a [0, 255] RGB tensor to [0, 1] and optionally apply ImageNet normalization."""
    img = img / 255.0
    if imagenet_normalization:
        img = (
            img - torch.tensor(IMG_NORM_MEAN, device=img.device).view(1, 3, 1, 1)
        ) / torch.tensor(IMG_NORM_STD, device=img.device).view(1, 3, 1, 1)
    return img
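# Usage sketch (hypothetical values): `img` is expected in NCHW layout with
# values in [0, 255], e.g.
#   x = torch.rand(1, 3, 896, 896) * 255
#   x_norm = normalize_rgb_tensor(x)  # per-channel zero mean / unit variance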
def load_model(ckpt_path, model_path, device=torch.device("cuda")):
    """Open a checkpoint, build Multi-HMR from the saved arguments, and load the model weights."""
    assert os.path.isfile(ckpt_path), f"{ckpt_path} not found"

    ckpt = torch.load(ckpt_path, map_location=device)

    # Rebuild the constructor arguments from the ones stored in the checkpoint.
    kwargs = dict(vars(ckpt["args"]))
    if isinstance(ckpt["args"].img_size, list):
        kwargs["img_size"] = ckpt["args"].img_size[0]
    else:
        kwargs["img_size"] = ckpt["args"].img_size
    kwargs["smplx_dir"] = model_path

    print("Loading model...")
    model = Model(**kwargs).to(device)
    print("Model loaded")

    model.load_state_dict(ckpt["model_state_dict"], strict=False)
    model.output_mesh = True
    model.eval()
    return model
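# Usage sketch: the paths below are hypothetical and depend on your checkout layout.
#   model = load_model("./checkpoints/pose_estimate/multiHMR_896_L.pt",
#                      model_path="./checkpoints", device=torch.device("cuda"))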
def inverse_perspective_projection(points, K, distance):
    """
    Compute the inverse perspective projection of a set of 2D points given an estimated distance.
    Input:
        points (bs, N, 2): 2D points
        K (bs, 3, 3): camera intrinsics
        distance (bs, N, 1): distance in the 3D world
    Similar to:
        pts_l_norm = cv2.undistortPoints(np.expand_dims(pts_l, axis=1), cameraMatrix=K_l, distCoeffs=None)
    """
    # Lift to homogeneous coordinates and apply the inverse intrinsics,
    # yielding unit-depth rays in camera space.
    points = torch.cat([points, torch.ones_like(points[..., :1])], -1)
    points = torch.einsum("bij,bkj->bki", torch.inverse(K), points)

    # Without a distance estimate, return the rays themselves.
    if distance is None:
        return points
    return points * distance
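# Example (sketch, hypothetical values): lift a pixel to a 3D point 2.5 m away.
#   K = torch.eye(3).unsqueeze(0)                       # (1, 3, 3) intrinsics
#   pts = torch.tensor([[[448.0, 448.0]]])              # (1, 1, 2) pixel coords
#   rays = inverse_perspective_projection(pts, K, None)
#   xyz = inverse_perspective_projection(pts, K, torch.full((1, 1, 1), 2.5))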
class PoseEstimator(torch.nn.Module):
    """Single-image SMPL-X shape estimation built on a Multi-HMR backbone."""

    def __init__(self, model_path, device="cuda"):
        super().__init__()
        self.device = torch.device(device)
        self.mhmr_model = load_model(
            os.path.join(model_path, "pose_estimate", "multiHMR_896_L.pt"),
            model_path=model_path,
            device=self.device,
        )

        self.pad_ratio = 0.2  # extra margin added around the input image
        self.img_size = 896   # square network input resolution
        self.fov = 60         # assumed field of view, in degrees
    def get_camera_parameters(self):
        """Build pinhole intrinsics for a square image with the assumed FOV."""
        K = torch.eye(3)
        # focal = img_size / (2 * tan(fov / 2))
        focal = self.img_size / (2 * np.tan(np.radians(self.fov) / 2))
        K[0, 0], K[1, 1] = focal, focal
        # Principal point at the image center.
        K[0, -1], K[1, -1] = self.img_size // 2, self.img_size // 2

        K = K.unsqueeze(0).to(self.device)
        return K
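    # Worked example: with img_size=896 and fov=60 degrees,
    # focal = 896 / (2 * tan(30 deg)) ~ 896 / 1.1547 ~ 776 px,
    # and the principal point is (448, 448).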
    def img_center_padding(self, img_np):
        """Zero-pad the image on all sides by `pad_ratio`, keeping the content centered."""
        ori_h, ori_w = img_np.shape[:2]

        w = round((1 + self.pad_ratio) * ori_w)
        h = round((1 + self.pad_ratio) * ori_h)

        img_pad_np = np.zeros((h, w, 3), dtype=np.uint8)
        offset_h, offset_w = (h - ori_h) // 2, (w - ori_w) // 2
        img_pad_np[
            offset_h : offset_h + ori_h,
            offset_w : offset_w + ori_w,
        ] = img_np

        return img_pad_np, offset_w, offset_h
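    # Example (sketch): a 1000x800 (h x w) image with pad_ratio=0.2 becomes
    # 1200x960, with the original pixels offset by (100, 80) from the corner.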
    def _preprocess(self, img_np):
        """Resize and zero-pad the image to a square `img_size` network input."""
        raw_img_size = max(img_np.shape[:2])

        img_tensor = (
            torch.Tensor(img_np).to(self.device).unsqueeze(0).permute(0, 3, 1, 2)
        )

        # Resize so the longer side matches img_size, preserving aspect ratio.
        _, _, h, w = img_tensor.shape
        scale_factor = min(self.img_size / w, self.img_size / h)
        img_tensor = F.interpolate(
            img_tensor, scale_factor=scale_factor, mode="bilinear"
        )

        # Zero-pad the shorter side to a square img_size x img_size input.
        _, _, h, w = img_tensor.shape
        pad_left = (self.img_size - w) // 2
        pad_top = (self.img_size - h) // 2
        pad_right = self.img_size - w - pad_left
        pad_bottom = self.img_size - h - pad_top
        img_tensor = F.pad(
            img_tensor,
            (pad_left, pad_right, pad_top, pad_bottom),
            mode="constant",
            value=0,
        )

        resize_img = normalize_rgb_tensor(img_tensor)

        # Bookkeeping for mapping predictions back to input-image coordinates.
        annotation = (
            pad_left,
            pad_top,
            scale_factor,
            self.img_size / scale_factor,
            raw_img_size,
        )

        return resize_img, annotation
    @torch.no_grad()
    def forward(self, img_path):
        """Run Multi-HMR on one image and return an SMPLXOutput.

        Succeeds only when exactly one person is detected and (nearly) all of
        the predicted 2D joints fall inside the original image bounds.
        """
        img_np = np.asarray(Image.open(img_path).convert("RGB"))

        raw_h, raw_w, _ = img_np.shape
        img_np, offset_w, offset_h = self.img_center_padding(img_np)
        img_tensor, annotation = self._preprocess(img_np)
        K = self.get_camera_parameters()

        target_human = self.mhmr_model(
            img_tensor,
            is_training=False,
            nms_kernel_size=3,
            det_thresh=0.3,
            K=K,
            idx=None,
            max_dist=None,
        )

        if len(target_human) != 1:
            return SMPLXOutput(
                beta=None,
                is_full_body=False,
                msg="more than one human detected"
                if len(target_human) > 1
                else "no human detected",
            )

        # Map predicted 2D joints back to the original (unpadded) image frame:
        # undo the square-input padding and resize, then the center padding.
        pad_left, pad_top, scale_factor, _, _ = annotation
        j2d = target_human[0]["j2d"]
        j2d = (
            j2d - torch.tensor([pad_left, pad_top], device=self.device).unsqueeze(0)
        ) / scale_factor
        j2d = j2d - torch.tensor([offset_w, offset_h], device=self.device).unsqueeze(0)

        # Count joints inside the image with a small tolerance margin; at least
        # 95 joints must be in bounds to count as a full-body capture.
        scale_ratio = 0.025
        is_full_body = (
            (
                (j2d[..., 0] >= -raw_w * scale_ratio)
                & (j2d[..., 0] < raw_w * (1 + scale_ratio))
                & (j2d[..., 1] >= -raw_h * scale_ratio)
                & (j2d[..., 1] < raw_h * (1 + scale_ratio))
            )
            .sum(dim=-1)
            .item()
            >= 95
        )

        return SMPLXOutput(
            beta=target_human[0]["shape"].cpu().numpy(),
            is_full_body=is_full_body,
            msg="success" if is_full_body else "no full-body human detected",
        )
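if __name__ == "__main__":
    # Minimal usage sketch; the checkpoint directory and test image below are
    # hypothetical placeholders, not paths shipped with this module.
    estimator = PoseEstimator(model_path="./checkpoints", device="cuda")
    output = estimator("./assets/test_person.png")
    print(output.msg, output.is_full_body)
    if output.beta is not None:
        print("SMPL-X beta shape:", output.beta.shape)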