Spaces:
Build error
Build error
| import argparse | |
| import os | |
| from pathlib import Path | |
| import tempfile | |
| import tarfile | |
| import sys | |
| import cv2 | |
| import gradio as gr | |
| import numpy as np | |
| import torch | |
| from PIL import Image | |
# Report where this script is running from (useful when reading hosted logs).
print(os.path.abspath(__file__))

# OpenGL-related environment settings. They are assigned here, ahead of the
# `hamer.utils.renderer` import that follows, so the order matters.
# NOTE(review): presumably these select a headless EGL backend for pyrender -
# confirm against the deployment image.
os.environ["PYOPENGL_PLATFORM"] = "egl"
os.environ["MESA_GL_VERSION_OVERRIDE"] = "4.1"

# Install the vendored pyrender copy into the interpreter that is actually
# running this script. Calling `<python> -m pip` avoids picking up an unrelated
# `pip` executable from PATH, and the exit status is inspected instead of being
# dropped.
import subprocess

_PYRENDER_DIR = '/home/user/app/pyrender'
_pip_status = subprocess.run(
    [sys.executable, '-m', 'pip', 'install', _PYRENDER_DIR],
    check=False,
).returncode
if _pip_status != 0:
    # Still best effort: keep going and rely on the sys.path entry added below.
    print(f'WARNING: pip install of {_PYRENDER_DIR} exited with status {_pip_status}')

# Make the vendored sources importable directly as well.
sys.path.append(_PYRENDER_DIR)
| from hamer.configs import get_config | |
| from hamer.datasets.vitdet_dataset import (DEFAULT_MEAN, DEFAULT_STD, | |
| ViTDetDataset) | |
| from hamer.models import HAMER | |
| from hamer.utils import recursive_to | |
| from hamer.utils.renderer import Renderer, cam_crop_to_full | |
def extract_tar() -> None:
    """Unpack ``mmdet_configs/configs.tar`` into ``mmdet_configs`` once.

    Nothing happens when ``mmdet_configs/configs`` is already present;
    otherwise the whole archive is extracted. Paths are relative to the
    current working directory.
    """
    target_root = 'mmdet_configs'
    already_unpacked = Path(f'{target_root}/configs').exists()
    if not already_unpacked:
        with tarfile.open(f'{target_root}/configs.tar') as archive:
            archive.extractall(target_root)
| extract_tar() | |
| #from vitpose_model import DetModel | |
| #try: | |
| # import detectron2 | |
| #except: | |
| # import os | |
| # os.system('pip install --upgrade pip') | |
| # os.system('pip install git+https://github.com/facebookresearch/detectron2.git') | |
| #try: | |
| # from vitpose_model import ViTPoseModel | |
| #except: | |
| # os.system('pip install -v -e /home/user/app/vendor/ViTPose') | |
| # from vitpose_model import ViTPoseModel | |
| from vitpose_model import ViTPoseModel | |
# Folder that receives the exported .obj meshes.
OUT_FOLDER = 'demo_out'
os.makedirs(OUT_FOLDER, exist_ok=True)

# ---- HaMeR model ------------------------------------------------------------
LIGHT_BLUE = (0.65098039, 0.74117647, 0.85882353)
DEFAULT_CHECKPOINT = '_DATA/hamer_ckpts/checkpoints/hamer.ckpt'

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# model_config.yaml is looked up in the grandparent directory of the checkpoint.
model_cfg = get_config(str(Path(DEFAULT_CHECKPOINT).parent.parent / 'model_config.yaml'))

# Override some config values so the bbox is cropped correctly: a ViT backbone
# without an explicit BBOX_SHAPE gets the [192, 256] shape.
if model_cfg.MODEL.BACKBONE.TYPE == 'vit':
    if 'BBOX_SHAPE' not in model_cfg.MODEL:
        model_cfg.defrost()
        assert model_cfg.MODEL.IMAGE_SIZE == 256, f"MODEL.IMAGE_SIZE ({model_cfg.MODEL.IMAGE_SIZE}) should be 256 for ViT backbone"
        model_cfg.MODEL.BBOX_SHAPE = [192,256]
        model_cfg.freeze()

model = HAMER.load_from_checkpoint(DEFAULT_CHECKPOINT, strict=False, cfg=model_cfg)
model = model.to(device)
model.eval()

# ---- Renderer ---------------------------------------------------------------
renderer = Renderer(model_cfg, faces=model.mano.faces)

# ---- Detector ---------------------------------------------------------------
# A detectron2 ViTDet predictor and an mmdet `DetModel` were used here in
# earlier revisions; detections now come from a YOLOv5 model loaded via torch.hub.
det_model = torch.hub.load('ultralytics/yolov5', 'yolov5x6')

# ---- Keypoint detector ------------------------------------------------------
cpm = ViTPoseModel(device)
| import numpy as np | |
def _hand_boxes_from_keypoints(vitposes_out, min_conf=0.5, min_keypoints=3):
    """Turn pose-estimation results into per-hand bounding boxes.

    For every entry of ``vitposes_out`` the last 42 rows of its ``'keypoints'``
    array are read as two groups of 21 rows: first the left hand, then the
    right hand. Column 2 of each row is compared against ``min_conf``; a hand
    is kept only when more than ``min_keypoints`` of its rows pass.

    Returns:
        ``(bboxes, is_right)``: two parallel lists. Each box is
        ``[min col0, min col1, max col0, max col1]`` over the accepted rows
        (presumably x/y pixel coordinates); each flag is 0 for a left hand
        and 1 for a right hand.
    """
    bboxes = []
    is_right = []
    for vitposes in vitposes_out:
        keypoints = vitposes['keypoints']
        hands = ((0, keypoints[-42:-21]), (1, keypoints[-21:]))
        for right_flag, keyp in hands:
            # Rejecting not confident detections (this could be improved)
            valid = keyp[:, 2] > min_conf
            if valid.sum() > min_keypoints:
                bboxes.append([
                    keyp[valid, 0].min(),
                    keyp[valid, 1].min(),
                    keyp[valid, 0].max(),
                    keyp[valid, 1].max(),
                ])
                is_right.append(right_flag)
    return bboxes, is_right


def infer(in_pil_img, in_threshold=0.4, out_pil_img=None):
    """Reconstruct the hands in an image and render them over it.

    Pipeline: detector -> keypoint model -> hand boxes -> HaMeR -> mesh export
    and overlay rendering.

    Args:
        in_pil_img: Input image (a PIL image when called from the UI). ``None``
            is accepted and yields an empty result.
        in_threshold: Minimum detector score for a class-0 detection to be
            kept (class 0 is presumably "person" in the detector's label map).
        out_pil_img: Unused on input; kept for interface compatibility.

    Returns:
        ``(image, mesh_paths)``: the input image with the rendered hands
        composited on top, and the list of exported ``.obj`` file paths.
        ``(None, [])`` when there is no input, no detection above the
        threshold, or no sufficiently confident hand.
    """
    import uuid

    # Nothing was uploaded (e.g. the button was pressed on an empty panel).
    if in_pil_img is None:
        return None, []

    # The channel flips below assume exactly three channels, so normalise
    # grayscale / RGBA / palette inputs first.
    if isinstance(in_pil_img, Image.Image):
        in_pil_img = in_pil_img.convert('RGB')
    rgb_image = np.array(in_pil_img)

    # Detector output columns 0-3 are the box, 4 the score, 5 the class index.
    det_out = det_model(rgb_image).xyxy[0]
    scores = det_out[:, 4]
    classes = det_out[:, 5]
    valid_idx = (classes == 0) & (scores > in_threshold)
    pred_bboxes = det_out[valid_idx, :4].cpu().numpy()
    pred_scores = scores[valid_idx].cpu().numpy()
    if pred_bboxes.shape[0] == 0:
        # No detection survived the threshold, so there are no hands to find.
        return None, []

    # Convert RGB to BGR for the downstream models.
    bgr_image = rgb_image[:, :, ::-1].copy()

    # Detect human keypoints for each person
    vitposes_out = cpm.predict_pose(
        bgr_image,
        [np.concatenate([pred_bboxes, pred_scores[:, None]], axis=1)],
    )

    # Use hands based on hand keypoint detections
    bboxes, right_flags = _hand_boxes_from_keypoints(vitposes_out)
    if not bboxes:
        return None, []
    boxes = np.stack(bboxes)
    right = np.stack(right_flags)

    # Run HaMeR on all detected hands
    dataset = ViTDetDataset(model_cfg, bgr_image, boxes, right)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=False, num_workers=0)

    all_verts = []
    all_cam_t = []
    all_right = []
    all_mesh_paths = []
    # Random prefix so meshes from different requests do not overwrite each other.
    mesh_prefix = uuid.uuid4().hex
    render_res = None
    scaled_focal_length = None

    for batch in dataloader:
        batch = recursive_to(batch, device)
        with torch.no_grad():
            out = model(batch)

        # +1 for right hands, -1 for left hands: flips the second camera
        # component of left-hand predictions.
        multiplier = 2 * batch['right'] - 1
        pred_cam = out['pred_cam']
        pred_cam[:, 1] = multiplier * pred_cam[:, 1]

        box_center = batch["box_center"].float()
        box_size = batch["box_size"].float()
        img_size = batch["img_size"].float()
        scaled_focal_length = model_cfg.EXTRA.FOCAL_LENGTH / model_cfg.MODEL.IMAGE_SIZE * img_size.max()
        pred_cam_t = cam_crop_to_full(
            pred_cam, box_center, box_size, img_size, scaled_focal_length
        ).detach().cpu().numpy()
        # Every crop comes from the same source image; remember its size for
        # the final full-frame render (last entry of the last batch).
        render_res = img_size[-1]

        for n in range(batch['img'].shape[0]):
            person_id = int(batch['personid'][n])
            verts = out['pred_vertices'][n].detach().cpu().numpy()
            hand_is_right = batch['right'][n].cpu().numpy()
            # Mirror the first vertex coordinate for left hands.
            verts[:, 0] = (2 * hand_is_right - 1) * verts[:, 0]
            cam_t = pred_cam_t[n]
            all_verts.append(verts)
            all_cam_t.append(cam_t)
            all_right.append(hand_is_right)

            # Save the mesh to disk so it can be offered for download.
            tmesh = renderer.vertices_to_trimesh(verts, cam_t.copy(), LIGHT_BLUE, is_right=hand_is_right)
            temp_path = os.path.join(OUT_FOLDER, f'{mesh_prefix}_{person_id}.obj')
            tmesh.export(temp_path)
            all_mesh_paths.append(temp_path)

    if not all_verts:
        return None, []

    # Render front view
    misc_args = dict(
        mesh_base_color=LIGHT_BLUE,
        scene_bg_color=(1, 1, 1),
        focal_length=scaled_focal_length,
    )
    cam_view = renderer.render_rgba_multiple(
        all_verts, cam_t=all_cam_t, render_res=render_res, is_right=all_right, **misc_args
    )

    # Overlay image: alpha-blend the render (channel 3 is alpha) over the RGB input.
    input_img = rgb_image.astype(np.float32) / 255.0
    alpha = cam_view[:, :, 3:]
    input_img_overlay = input_img * (1 - alpha) + cam_view[:, :, :3] * alpha

    # convert to PIL image
    out_pil_img = Image.fromarray((input_img_overlay * 255).astype(np.uint8))
    return out_pil_img, all_mesh_paths
# Gradio page: a heading, an input/output row, a controls row and a few
# example images. The app is launched once the layout has been declared.
with gr.Blocks(title="HaMeR", css=".gradio-container") as demo:
    # Heading and short usage note.
    gr.HTML("""<div style="font-weight:bold; text-align:center; font-size: 30px;">HaMeR</div>""")
    gr.HTML("""<div style="text-align:left; font-size: 20px;">Demo for HaMeR. You can drop an image at the top-left panel
(or select one of the examples) and you will get the 3D reconstructions of the detected hands on the right.
You can also download the .obj files for each hand reconstruction.</div>""")

    # Left column: input image. Right column: overlay render and mesh files.
    with gr.Row():
        with gr.Column():
            input_image = gr.Image(label="Input image", type="pil")
        with gr.Column():
            output_image = gr.Image(label="Reconstructions", type="pil")
            output_meshes = gr.File(label="3D meshes")

    gr.HTML("""<br/>""")

    # Controls: detection threshold slider and the button that runs `infer`.
    with gr.Row():
        threshold = gr.Slider(0, 1.0, value=0.6, label='Detection Threshold')
        send_btn = gr.Button("Infer")
        send_btn.click(
            fn=infer,
            inputs=[input_image, threshold],
            outputs=[output_image, output_meshes],
        )

    # Clickable sample inputs; each row fills the single `input_image` component.
    example_paths = (
        '/home/user/app/assets/test1.jpg',
        '/home/user/app/assets/test2.jpg',
        '/home/user/app/assets/test3.jpg',
        '/home/user/app/assets/test5.jpg',
    )
    example_images = gr.Examples(
        [[path] for path in example_paths],
        inputs=input_image,
    )

# Request queuing is left disabled, as before.
demo.launch(debug=True)
| ### EOF ### |