import argparse
import os
import sys
import tarfile
import tempfile
from pathlib import Path

import cv2
import gradio as gr
import numpy as np
import torch
from PIL import Image

print(os.path.abspath(__file__))
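
# pyrender needs an offscreen OpenGL context on a machine with no display, so force EGL
# and install the bundled pyrender fork before importing anything that renders.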
os.environ["PYOPENGL_PLATFORM"] = "egl"
os.environ["MESA_GL_VERSION_OVERRIDE"] = "4.1"
os.system('pip install /home/user/app/pyrender')
sys.path.append('/home/user/app/pyrender')

from hamer.configs import get_config
from hamer.datasets.vitdet_dataset import (DEFAULT_MEAN, DEFAULT_STD,
                                           ViTDetDataset)
from hamer.models import HAMER
from hamer.utils import recursive_to
from hamer.utils.renderer import Renderer, cam_crop_to_full


def extract_tar() -> None:
    """Unpack the bundled mmdet configs once; skip if already extracted."""
    if Path('mmdet_configs/configs').exists():
        return
    with tarfile.open('mmdet_configs/configs.tar') as f:
        f.extractall('mmdet_configs')


extract_tar()
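
# Imported after the setup above (pyrender install, config extraction) is complete.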
from vitpose_model import ViTPoseModel

OUT_FOLDER = 'demo_out'
os.makedirs(OUT_FOLDER, exist_ok=True)

LIGHT_BLUE = (0.65098039, 0.74117647, 0.85882353)
DEFAULT_CHECKPOINT = '_DATA/hamer_ckpts/checkpoints/hamer.ckpt'

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model_cfg = str(Path(DEFAULT_CHECKPOINT).parent.parent / 'model_config.yaml')
model_cfg = get_config(model_cfg)
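
# Older configs lack BBOX_SHAPE; patch in the 192x256 crop shape the ViT backbone expects.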
if (model_cfg.MODEL.BACKBONE.TYPE == 'vit') and ('BBOX_SHAPE' not in model_cfg.MODEL):
    model_cfg.defrost()
    assert model_cfg.MODEL.IMAGE_SIZE == 256, f"MODEL.IMAGE_SIZE ({model_cfg.MODEL.IMAGE_SIZE}) should be 256 for ViT backbone"
    model_cfg.MODEL.BBOX_SHAPE = [192, 256]
    model_cfg.freeze()

model = HAMER.load_from_checkpoint(DEFAULT_CHECKPOINT, strict=False, cfg=model_cfg).to(device)
model.eval()

renderer = Renderer(model_cfg, faces=model.mano.faces)
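
# Detection cascade: YOLOv5 finds people, then ViTPose localizes their hand keypoints.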
det_model = torch.hub.load('ultralytics/yolov5', 'yolov5x6')
cpm = ViTPoseModel(device)


def infer(in_pil_img, in_threshold=0.4):
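    # PIL images are RGB, which the YOLOv5 hub model accepts directly.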
    open_cv_image = np.array(in_pil_img)
    det_out = det_model(open_cv_image)
    det_out = det_out.xyxy[0]

    # Flip to BGR: HaMeR and ViTPose consume OpenCV-style images.
    open_cv_image = open_cv_image[:, :, ::-1].copy()

    # Keep person detections (class 0) above the confidence threshold.
    scores = det_out[:, 4]
    det_instances = det_out[:, 5]
    valid_idx = (det_instances == 0) & (scores > in_threshold)
    pred_bboxes = det_out[valid_idx, :4].cpu().numpy()
    pred_scores = scores[valid_idx].cpu().numpy()

    # Estimate whole-body keypoints for each detected person.
    vitposes_out = cpm.predict_pose(
        open_cv_image,
        [np.concatenate([pred_bboxes, pred_scores[:, None]], axis=1)],
    )

    bboxes = []
    is_right = []
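
    # The whole-body keypoint layout ends with 21 left-hand then 21 right-hand points;
    # build a box around each hand that has more than three confident keypoints.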
    for vitposes in vitposes_out:
        left_hand_keyp = vitposes['keypoints'][-42:-21]
        right_hand_keyp = vitposes['keypoints'][-21:]

        keyp = left_hand_keyp
        valid = keyp[:, 2] > 0.5
        if sum(valid) > 3:
            bbox = [keyp[valid, 0].min(), keyp[valid, 1].min(),
                    keyp[valid, 0].max(), keyp[valid, 1].max()]
            bboxes.append(bbox)
            is_right.append(0)
        keyp = right_hand_keyp
        valid = keyp[:, 2] > 0.5
        if sum(valid) > 3:
            bbox = [keyp[valid, 0].min(), keyp[valid, 1].min(),
                    keyp[valid, 0].max(), keyp[valid, 1].max()]
            bboxes.append(bbox)
            is_right.append(1)

    if len(bboxes) == 0:
        return None, []

    boxes = np.stack(bboxes)
    right = np.stack(is_right)
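
    # Run HaMeR on every hand crop in batches.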
    dataset = ViTDetDataset(model_cfg, open_cv_image, boxes, right)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=False, num_workers=0)

    all_verts = []
    all_cam_t = []
    all_right = []
    all_mesh_paths = []

    # Random file stem for the exported meshes (uses a private tempfile helper).
    temp_name = next(tempfile._get_candidate_names())

    for batch in dataloader:
        batch = recursive_to(batch, device)
        with torch.no_grad():
            out = model(batch)
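
        # HaMeR predicts right hands; mirror the camera x-offset for left hands.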
        multiplier = (2 * batch['right'] - 1)
        pred_cam = out['pred_cam']
        pred_cam[:, 1] = multiplier * pred_cam[:, 1]
        box_center = batch["box_center"].float()
        box_size = batch["box_size"].float()
        img_size = batch["img_size"].float()
        render_size = img_size
        scaled_focal_length = model_cfg.EXTRA.FOCAL_LENGTH / model_cfg.MODEL.IMAGE_SIZE * img_size.max()
        pred_cam_t = cam_crop_to_full(pred_cam, box_center, box_size, render_size, scaled_focal_length).detach().cpu().numpy()

        batch_size = batch['img'].shape[0]
        for n in range(batch_size):
            person_id = int(batch['personid'][n])
            verts = out['pred_vertices'][n].detach().cpu().numpy()
            is_right = batch['right'][n].cpu().numpy()
            # Mirror the mesh along x for left hands.
            verts[:, 0] = (2 * is_right - 1) * verts[:, 0]
            cam_t = pred_cam_t[n]

            all_verts.append(verts)
            all_cam_t.append(cam_t)
            all_right.append(is_right)

            # Export each hand mesh as an .obj for download.
            camera_translation = cam_t.copy()
            tmesh = renderer.vertices_to_trimesh(verts, camera_translation, LIGHT_BLUE, is_right=is_right)
            temp_path = os.path.join(OUT_FOLDER, f'{temp_name}_{person_id}.obj')
            tmesh.export(temp_path)
            all_mesh_paths.append(temp_path)

    if len(all_verts) > 0:
        misc_args = dict(
            mesh_base_color=LIGHT_BLUE,
            scene_bg_color=(1, 1, 1),
            focal_length=scaled_focal_length,
        )
        # All crops come from the same input image, so any row of render_size is valid.
        cam_view = renderer.render_rgba_multiple(all_verts, cam_t=all_cam_t, render_res=render_size[0],
                                                 is_right=all_right, **misc_args)
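
        # Alpha-composite the rendered hands over the input image.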
        input_img = open_cv_image.astype(np.float32)[:, :, ::-1] / 255.0
        input_img = np.concatenate([input_img, np.ones_like(input_img[:, :, :1])], axis=2)
        input_img_overlay = input_img[:, :, :3] * (1 - cam_view[:, :, 3:]) + cam_view[:, :, :3] * cam_view[:, :, 3:]

        out_pil_img = Image.fromarray((input_img_overlay * 255).astype(np.uint8))
        return out_pil_img, all_mesh_paths
    else:
        return None, []


with gr.Blocks(title="HaMeR", css=".gradio-container") as demo:
gr.HTML("""<div style="font-weight:bold; text-align:center; font-size: 30px;">HaMeR</div>""") |
|
|
gr.HTML("""<div style="text-align:left; font-size: 20px;">Demo for HaMeR. You can drop an image at the top-left panel |
|
|
(or select one of the examples) and you will get the 3D reconstructions of the detected hands on the right. |
|
|
You can also download the .obj files for each hand reconstruction.</div>""") |
|
|
|
|
|

    with gr.Row():
        with gr.Column():
            input_image = gr.Image(label="Input image", type="pil")
        with gr.Column():
            output_image = gr.Image(label="Reconstructions", type="pil")
            output_meshes = gr.File(label="3D meshes")
gr.HTML("""<br/>""") |
|
|
|
|
|

    with gr.Row():
        threshold = gr.Slider(0, 1.0, value=0.6, label='Detection Threshold')
        send_btn = gr.Button("Infer")
        send_btn.click(fn=infer, inputs=[input_image, threshold], outputs=[output_image, output_meshes])

    example_images = gr.Examples(
        [
            ['/home/user/app/assets/test1.jpg'],
            ['/home/user/app/assets/test2.jpg'],
            ['/home/user/app/assets/test3.jpg'],
            ['/home/user/app/assets/test5.jpg'],
        ],
        inputs=input_image)


demo.launch(debug=True)