# Multi-HMR
# Copyright (c) 2024-present NAVER Corp.
# CC BY-NC-SA 4.0 license

import math

import numpy as np
import torch
import trimesh
from PIL import ImageFont, ImageDraw, Image
from scipy.spatial.transform import Rotation

# Negates the Y and Z axes of homogeneous coordinates. The matrix is its own
# inverse, so the same constant converts in both directions.
OPENCV_TO_OPENGL_CAMERA_CONVENTION = np.array(
    [[1, 0, 0, 0], [0, -1, 0, 0], [0, 0, -1, 0], [0, 0, 0, 1]]
)


def geotrf(Trf, pts, ncol=None, norm=False):
    """Apply a geometric transformation to a list of 3-D points.

    Args:
        Trf: 3x3 or 4x4 transformation matrix (typically a homography), or a
            batch of them with shape (B, d, d). numpy array or torch tensor.
        pts: numpy/torch/tuple of coordinates. Shape must be (..., 2) or
            (..., 3). It is converted to the same array library as ``Trf``.
        ncol: int. Number of columns of the result (2 or 3). Defaults to the
            size of the last dimension of ``pts``.
        norm: float. If != 0, the result is projected on the z=norm plane.

    Returns:
        The transformed points, with the leading dimensions of ``pts`` and
        ``ncol`` columns.
    """
    assert Trf.ndim in (2, 3)
    if isinstance(Trf, np.ndarray):
        pts = np.asarray(pts)
    elif isinstance(Trf, torch.Tensor):
        pts = torch.as_tensor(pts, dtype=Trf.dtype)

    ncol = ncol or pts.shape[-1]

    # adapt shape if necessary
    output_reshape = pts.shape[:-1]
    if Trf.ndim == 3:
        assert len(Trf) == len(pts), "batch size does not match"
    if Trf.ndim == 3 and pts.ndim > 3:
        # Trf == (B,d,d) & pts == (B,H,W,d) --> (B, H*W, d)
        pts = pts.reshape(pts.shape[0], -1, pts.shape[-1])
    elif Trf.ndim == 3 and pts.ndim == 2:
        # Trf == (B,d,d) & pts == (B,d) --> (B, 1, d)
        pts = pts[:, None, :]

    if pts.shape[-1] + 1 == Trf.shape[-1]:
        # Points have one coordinate less than the matrix: apply it as an
        # affine map (linear part, then the translation row).
        Trf = Trf.swapaxes(-1, -2)  # transpose Trf
        pts = pts @ Trf[..., :-1, :] + Trf[..., -1:, :]
    elif pts.shape[-1] == Trf.shape[-1]:
        Trf = Trf.swapaxes(-1, -2)  # transpose Trf
        pts = pts @ Trf
    else:
        pts = Trf @ pts.T
        if pts.ndim >= 2:
            pts = pts.swapaxes(-1, -2)

    if norm:
        pts = pts / pts[..., -1:]  # DONT DO /= BECAUSE OF WEIRD PYTORCH BUG
        if norm != 1:
            pts *= norm

    return pts[..., :ncol].reshape(*output_reshape, ncol)


def create_scene(
    img_pil,
    l_mesh,
    l_face,
    color=None,
    metallicFactor=0.0,
    roughnessFactor=0.5,
    focal=600,
):
    """Build a trimesh scene with the human meshes, the image and a camera.

    Args:
        img_pil: PIL image; used as the texture of the image plane and to
            size the camera frustum.
        l_mesh: sequence of per-human vertex arrays.
        l_face: sequence of per-human face arrays, indexed like ``l_mesh``.
        color: ``None`` to draw a random color per mesh, a list holding one
            color per mesh, or a single tuple shared by all meshes. The first
            three components are used as the PBR base color (presumably in
            [0, 1], like the randomly drawn ones).
        metallicFactor: metallic factor of the mesh material.
        roughnessFactor: roughness factor of the mesh material.
        focal: focal length, presumably in pixels of ``img_pil``; it sets the
            depth of the camera frustum.

    Returns:
        The assembled ``trimesh.Scene``.

    Raises:
        NotImplementedError: if ``color`` is neither ``None``, a list nor a
            tuple.
    """
    scene = trimesh.Scene(lights=trimesh.scene.lighting.Light(intensity=3.0))

    # Human meshes
    for i, mesh in enumerate(l_mesh):
        if color is None:
            _color = (
                np.random.choice(range(1, 225)) / 255,
                np.random.choice(range(1, 225)) / 255,
                np.random.choice(range(1, 225)) / 255,
            )
        else:
            if isinstance(color, list):
                _color = color[i]
            elif isinstance(color, tuple):
                _color = color
            else:
                raise NotImplementedError
        mesh = trimesh.Trimesh(mesh, l_face[i])
        mesh.visual = trimesh.visual.TextureVisuals(
            uv=None,
            material=trimesh.visual.material.PBRMaterial(
                metallicFactor=metallicFactor,
                roughnessFactor=roughnessFactor,
                alphaMode="OPAQUE",
                baseColorFactor=(_color[0], _color[1], _color[2], 1.0),
            ),
            image=None,
            face_materials=None,
        )
        scene.add_geometry(mesh)

    # Image
    # PIL reports size as (width, height). Reading it as (H, W) transposes the
    # frustum for non-square images; square images are unaffected.
    W, H = img_pil.size
    screen_width = 0.3
    height = focal * screen_width / H
    width = screen_width * 0.5**0.5
    rot45 = np.eye(4)
    rot45[:3, :3] = Rotation.from_euler("z", np.deg2rad(45)).as_matrix()
    rot45[2, 3] = -height  # set the tip of the cone = optical center
    aspect_ratio = np.eye(4)
    aspect_ratio[0, 0] = W / H
    transform = OPENCV_TO_OPENGL_CAMERA_CONVENTION @ aspect_ratio @ rot45
    # A 4-section cone is a pyramid, used here as the camera frustum.
    cam = trimesh.creation.cone(width, height, sections=4, transform=transform)

    # Textured quad carrying the image.
    # NOTE(review): indices [4, 5, 1, 3] are presumably the four base corners
    # of the cone mesh -- confirm against trimesh.creation.cone.
    vertices = cam.vertices[[4, 5, 1, 3]]
    # Each triangle is listed with both windings so the quad shows from both sides.
    faces = np.array([[0, 1, 2], [0, 2, 3], [2, 1, 0], [3, 2, 0]])
    img = trimesh.Trimesh(vertices=vertices, faces=faces)
    uv_coords = np.float32([[0, 0], [1, 0], [1, 1], [0, 1]])
    img.visual = trimesh.visual.TextureVisuals(uv=uv_coords, image=img_pil)
    scene.add_geometry(img)

    # this is the camera mesh
    rot2 = np.eye(4)
    rot2[:3, :3] = Rotation.from_euler("z", np.deg2rad(2)).as_matrix()
    # Three copies of the frustum vertices: original, shrunk by 5 %, and
    # rotated by 2 degrees about Z. Thin triangles between the copies act as
    # visible edges.
    vertices = np.r_[cam.vertices, 0.95 * cam.vertices, geotrf(rot2, cam.vertices)]
    n_verts = len(cam.vertices)
    faces = []
    for face in cam.faces:
        # Skip faces touching vertex 0 (presumably the base of the cone, so
        # that only the side faces are outlined -- TODO confirm).
        if 0 in face:
            continue
        a, b, c = face
        a2, b2, c2 = face + n_verts
        a3, b3, c3 = face + 2 * n_verts

        # add 3 pseudo-edges
        faces.append((a, b, b2))
        faces.append((a, a2, c))
        faces.append((c2, b, c))

        faces.append((a, b, b3))
        faces.append((a, a3, c))
        faces.append((c3, b, c))

    # no culling
    faces += [(c, b, a) for a, b, c in faces]

    cam = trimesh.Trimesh(vertices=vertices, faces=faces)
    cam.visual.face_colors[:, :3] = (255, 0, 0)
    scene.add_geometry(cam)

    # OpenCV to OpenGL
    rot = np.eye(4)
    cams2world = np.eye(4)
    rot[:3, :3] = Rotation.from_euler("y", np.deg2rad(180)).as_matrix()
    scene.apply_transform(
        np.linalg.inv(cams2world @ OPENCV_TO_OPENGL_CAMERA_CONVENTION @ rot)
    )

    return scene


def length(v):
    """Return the Euclidean norm of the first three components of ``v``."""
    return math.sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2])


def cross(v0, v1):
    """Return the cross product ``v0 x v1`` of two 3-vectors as a list."""
    return [
        v0[1] * v1[2] - v1[1] * v0[2],
        v0[2] * v1[0] - v1[2] * v0[0],
        v0[0] * v1[1] - v1[0] * v0[1],
    ]


def dot(v0, v1):
    """Return the dot product of two 3-vectors."""
    return v0[0] * v1[0] + v0[1] * v1[1] + v0[2] * v1[2]


def normalize(v, eps=1e-13):
    """Return ``v`` scaled to (almost) unit length, as a list.

    ``eps`` is added to the norm so that a zero vector does not divide by zero.
    """
    denom = length(v) + eps
    return [v[0] / denom, v[1] / denom, v[2] / denom]


def lookAt(eye, target, *args, **kwargs):
    """Build a 4x4 view matrix for a camera at ``eye`` looking at ``target``.

    The up direction is fixed to ``[0, -1, 0]``. The look-at matrix is
    computed first and then left-multiplied by
    ``OPENCV_TO_OPENGL_CAMERA_CONVENTION``.

    Args:
        eye: 3-vector, the point of view.
        target: 3-vector, the point which is looked at.
        *args, **kwargs: accepted but ignored.

    Returns:
        A (4, 4) numpy array.
    """
    up = [0, -1, 0]

    zaxis = normalize((target[0] - eye[0], target[1] - eye[1], target[2] - eye[2]))
    xaxis = normalize(cross(zaxis, up))
    yaxis = cross(xaxis, zaxis)

    # The camera looks down its negative Z axis.
    zaxis = [-zaxis[0], -zaxis[1], -zaxis[2]]

    viewMatrix = np.asarray(
        [
            [xaxis[0], xaxis[1], xaxis[2], -dot(xaxis, eye)],
            [yaxis[0], yaxis[1], yaxis[2], -dot(yaxis, eye)],
            [zaxis[0], zaxis[1], zaxis[2], -dot(zaxis, eye)],
            [0, 0, 0, 1],
        ]
    ).reshape(4, 4)

    # OpenGL to OpenCV
    viewMatrix = OPENCV_TO_OPENGL_CAMERA_CONVENTION @ viewMatrix

    return viewMatrix


def print_distance_on_image(pred_rend_array, humans, _color):
    """Write each human's distance to the camera above their 2D bounding box.

    Args:
        pred_rend_array: image array accepted by ``Image.fromarray``.
        humans: iterable of dicts with torch tensors under ``"transl_pelvis"``
            (3 values) and ``"j2d_smplx"`` (2D points of shape [k, 2]).
        _color: per-human colors, indexed like ``humans``; components are
            multiplied by 255 to obtain the text fill color.

    Returns:
        The annotated image as a numpy array.
    """
    # Add distance to the image.
    font = ImageFont.load_default()
    rend_pil = Image.fromarray(pred_rend_array)
    draw = ImageDraw.Draw(rend_pil)
    for i_hum, hum in enumerate(humans):
        # distance
        transl = hum["transl_pelvis"].cpu().numpy().reshape(3)
        dist_cam = np.sqrt(((transl[[0, 2]]) ** 2).sum())  # discarding Y axis
        # 2d - bbox
        bbox = get_bbox(
            hum["j2d_smplx"].cpu().numpy(), factor=1.35, output_format="x1y1x2y2"
        )
        # Anchor the label at the top-centre of the box.
        loc = [(bbox[0] + bbox[2]) / 2.0, bbox[1]]
        txt = f"{dist_cam:.2f}m"
        # Named txt_width so the module-level length() helper is not shadowed.
        txt_width = font.getlength(txt)
        loc[0] = loc[0] - txt_width // 2
        fill = tuple((np.asarray(_color[i_hum]) * 255).astype(np.int32).tolist())
        draw.text((loc[0], loc[1]), txt, fill=fill, font=font)
    return np.asarray(rend_pil)


def get_bbox(points, factor=1.0, output_format="xywh"):
    """Compute the (optionally enlarged) bounding box of a set of 2D points.

    Args:
        points: array of shape [k, 2].
        factor: scale applied to the box width and height around its centre.
        output_format: ``"xywh"`` for ``[x1, y1, width, height]`` or
            ``"x1y1x2y2"`` for the two corners.

    Returns:
        bbox: list of 4 ints in the requested format.

    Raises:
        NotImplementedError: if ``output_format`` is not supported.
    """
    assert (
        len(points.shape) == 2
    ), f"Wrong shape, expected two-dimensional array. Got shape {points.shape}"
    assert points.shape[1] == 2
    x1, x2 = points[:, 0].min(), points[:, 0].max()
    y1, y2 = points[:, 1].min(), points[:, 1].max()
    cx, cy = (x2 + x1) / 2.0, (y2 + y1) / 2.0
    sx, sy = np.abs(x2 - x1), np.abs(y2 - y1)
    sx, sy = int(factor * sx), int(factor * sy)
    x1, y1 = int(cx - sx / 2.0), int(cy - sy / 2.0)
    x2, y2 = int(cx + sx / 2.0), int(cy + sy / 2.0)
    if output_format == "xywh":
        return [x1, y1, sx, sy]
    elif output_format == "x1y1x2y2":
        return [x1, y1, x2, y2]
    else:
        raise NotImplementedError