Upload model files
Browse files. This view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +6 -0
- .gitignore +9 -0
- configs/__init__.py +0 -0
- configs/models/default.yaml +29 -0
- configs/paths.py +7 -0
- configs/run/demo.yaml +12 -0
- datasets/__init__.py +0 -0
- datasets/agora.py +111 -0
- datasets/base.py +521 -0
- datasets/bedlam.py +72 -0
- datasets/common.py +34 -0
- datasets/multiple_datasets.py +49 -0
- demo/img0.png +3 -0
- demo/img1.jpeg +0 -0
- demo/img2.jpg +3 -0
- docs/fix_chumpy.md +44 -0
- engines/__init__.py +0 -0
- engines/engine.py +347 -0
- engines/funcs/__init__.py +0 -0
- engines/funcs/eval_funcs.py +362 -0
- engines/funcs/infer_funcs.py +86 -0
- figures/pipeline.png +3 -0
- figures/qualitative_results.png +3 -0
- figures/results.png +3 -0
- figures/results_3d.gif +3 -0
- main.py +52 -0
- models/__init__.py +16 -0
- models/criterion.py +449 -0
- models/decoder.py +388 -0
- models/dn_components.py +193 -0
- models/encoders/__init__.py +52 -0
- models/encoders/dinov2/layers/__init__.py +11 -0
- models/encoders/dinov2/layers/attention.py +89 -0
- models/encoders/dinov2/layers/block.py +260 -0
- models/encoders/dinov2/layers/dino_head.py +58 -0
- models/encoders/dinov2/layers/drop_path.py +34 -0
- models/encoders/dinov2/layers/layer_scale.py +27 -0
- models/encoders/dinov2/layers/mlp.py +40 -0
- models/encoders/dinov2/layers/patch_embed.py +88 -0
- models/encoders/dinov2/layers/swiglu_ffn.py +72 -0
- models/encoders/dinov2/models/__init__.py +43 -0
- models/encoders/dinov2/models/vision_transformer.py +542 -0
- models/human_models/__init__.py +1 -0
- models/human_models/smpl_models.py +69 -0
- models/matcher.py +159 -0
- models/position_encoding.py +155 -0
- models/sat_model.py +767 -0
- requirements.txt +13 -0
- utils/__init__.py +1 -0
- utils/box_ops.py +139 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
demo/img0.png filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
demo/img2.jpg filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
figures/pipeline.png filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
figures/qualitative_results.png filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
figures/results_3d.gif filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
figures/results.png filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
debug_datas.py
|
| 2 |
+
outputs/
|
| 3 |
+
weights/
|
| 4 |
+
results/
|
| 5 |
+
tmps/
|
| 6 |
+
**.out
|
| 7 |
+
**/__pycache__/
|
| 8 |
+
datasets_visualization/
|
| 9 |
+
demo_results/
|
configs/__init__.py
ADDED
|
File without changes
|
configs/models/default.yaml
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
input_size: 1288
|
| 2 |
+
encoder: 'vitb'
|
| 3 |
+
|
| 4 |
+
# decoder
|
| 5 |
+
hidden_dim: 768
|
| 6 |
+
nheads: 4
|
| 7 |
+
dec_layers: 6
|
| 8 |
+
dim_feedforward: 2048
|
| 9 |
+
dropout: 0.0
|
| 10 |
+
num_queries: 50
|
| 11 |
+
transformer_activation: "relu"
|
| 12 |
+
|
| 13 |
+
sat_cfg:
|
| 14 |
+
use_sat: True
|
| 15 |
+
share_patch_embed: False
|
| 16 |
+
preprocess_pos_embed: False
|
| 17 |
+
num_lvls: 3
|
| 18 |
+
lvl_embed: True
|
| 19 |
+
get_map_layer: 3
|
| 20 |
+
use_additional_blocks: True
|
| 21 |
+
conf_thresh: 0.3
|
| 22 |
+
scale_thresh: 0.5
|
| 23 |
+
|
| 24 |
+
dn_cfg:
|
| 25 |
+
use_dn: True
|
| 26 |
+
dn_number: 10
|
| 27 |
+
tgt_embed_type: "params"
|
| 28 |
+
box_noise_scale: 0.4
|
| 29 |
+
tgt_noise_scale: 0.2
|
configs/paths.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Root directory that holds all datasets (relative to the repo root).
dataset_root = '../datasets'

# SMPL body-model assets (weights and the mean-parameter file).
smpl_model_path = './weights/smpl_data'
smpl_mean_path = './weights/smpl_data/smpl/smpl_mean_params.npz'

# Pretrained DINOv2 backbone checkpoints (ViT-B/14 and ViT-L/14).
dinov2_vitb14_path = './weights/dinov2/dinov2_vitb14_pretrain.pth'
dinov2_vitl14_path = './weights/dinov2/dinov2_vitl14_pretrain.pth'
|
configs/run/demo.yaml
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model: default
|
| 2 |
+
|
| 3 |
+
pretrain: True
|
| 4 |
+
pretrain_path: './weights/sat_hmr/sat_644.pth'
|
| 5 |
+
|
| 6 |
+
input_dir: './demo'
|
| 7 |
+
output_dir: './demo_results'
|
| 8 |
+
conf_thresh: [0.3]
|
| 9 |
+
infer_batch_size: 1
|
| 10 |
+
infer_num_workers: 8
|
| 11 |
+
distributed_infer: True
|
| 12 |
+
|
datasets/__init__.py
ADDED
|
File without changes
|
datasets/agora.py
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import torch
|
| 3 |
+
from torch.utils.data.dataset import Dataset
|
| 4 |
+
import os
|
| 5 |
+
from configs.paths import dataset_root
|
| 6 |
+
import copy
|
| 7 |
+
from tqdm import tqdm
|
| 8 |
+
from .base import BASE
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class AGORA(BASE):
    """AGORA dataset wrapper.

    Loads SMPL-neutral annotations for the train/validation splits; the test
    split has no annotations and is served in inference mode (image paths only).
    """

    def __init__(self, split='train', **kwargs):
        super(AGORA, self).__init__(**kwargs)
        assert split in ['train','test','validation']

        self.ds_name = 'agora'
        self.split = split
        self.dataset_path = os.path.join(dataset_root, 'agora')

        # no annotations are available for AGORA-test
        if split == 'test':
            self.mode = 'infer'
            self.img_names = os.listdir(os.path.join(self.dataset_path, self.split))
            return

        # train uses the fitted annotations; validation uses the plain ones
        template = 'annots_smpl_{}_fit.npz' if self.split == 'train' else 'annots_smpl_{}.npz'
        annots_path = os.path.join(self.dataset_path, 'smpl_neutral_annots', template.format(split))
        self.annots = np.load(annots_path, allow_pickle=True)['annots'][()]
        self.img_names = list(self.annots.keys())

    def __len__(self):
        return len(self.img_names)

    def get_raw_data(self, idx):
        """Return the raw annotation dict for the image at ``idx`` (wraps around)."""
        img_id = idx % len(self.img_names)
        img_name = self.img_names[img_id]
        img_path = os.path.join(self.dataset_path, self.split, img_name)

        if self.mode == 'infer':
            return {'img_path': img_path,
                    'img_name': img_name,
                    'ds': 'agora'}

        annots = copy.deepcopy(self.annots[img_name])

        valid_idx = np.where(annots['isValid'])[0]
        # this should not happen
        if len(valid_idx) == 0:
            print(img_name, 'lack valid person')
            exit(0)

        cam_intrinsics = torch.from_numpy(np.array(annots['cam_intrinsics']))
        cam_rot = torch.from_numpy(np.array(annots['cam_rot'])[valid_idx])
        cam_trans = torch.from_numpy(np.array(annots['cam_trans'])[valid_idx])

        betas_list, poses_list, transl_list = [], [], []
        kid = []
        eval_mode = (self.mode == 'eval')
        # per-person occlusion level (0-9 bucket), only needed for evaluation
        occ_levels = []

        for person_idx, is_valid in enumerate(annots['isValid']):
            if not is_valid:
                continue

            gt = annots['smpl_gt'][person_idx]
            betas_list.append(torch.from_numpy(gt['betas'].flatten()[:10]))
            # global orient followed by the body pose, flattened to one vector
            poses_list.append(torch.cat([torch.from_numpy(gt['global_orient'].flatten()),
                                         torch.from_numpy(gt['body_pose'].flatten())]))
            transl_list.append(torch.from_numpy(gt['transl'].flatten()))
            kid.append(annots['kid'][person_idx])
            if eval_mode:
                occ_levels.append(int(annots['occlusion'][person_idx] // 10))

        betas = torch.stack(betas_list)
        poses = torch.stack(poses_list)
        transl = torch.stack(transl_list)

        raw_data = {'img_path': img_path,
                    'ds': 'agora',
                    'pnum': len(betas),
                    'betas': betas.float(),
                    'poses': poses.float(),
                    'transl': transl.float(),
                    'kid': torch.tensor(kid),
                    'cam_rot': cam_rot.float(),
                    'cam_trans': cam_trans.float(),
                    'cam_intrinsics': cam_intrinsics.float(),
                    '3d_valid': True,
                    'age_valid': True,
                    'detect_all_people': True
                    }

        if eval_mode:
            raw_data['occ_level'] = torch.tensor(occ_levels)

        return raw_data
datasets/base.py
ADDED
|
@@ -0,0 +1,521 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import random
|
| 2 |
+
import os
|
| 3 |
+
import numpy as np
|
| 4 |
+
from torch.utils.data.dataset import Dataset
|
| 5 |
+
from torchvision import transforms
|
| 6 |
+
from utils.visualization import tensor_to_BGR, vis_meshes_img, vis_boxes, vis_scale_img, pad_img, get_colors_rgb, vis_sat
|
| 7 |
+
from utils.transforms import unNormalize, to_zorder
|
| 8 |
+
from PIL import Image
|
| 9 |
+
import math
|
| 10 |
+
from tqdm import tqdm
|
| 11 |
+
import cv2
|
| 12 |
+
import torch
|
| 13 |
+
import copy
|
| 14 |
+
from math import radians,sin,cos
|
| 15 |
+
from utils import constants
|
| 16 |
+
from utils.box_ops import box_cxcywh_to_xyxy, box_xyxy_to_cxcywh
|
| 17 |
+
from utils.constants import smpl_24_flip, smpl_root_idx
|
| 18 |
+
from utils.map import gen_scale_map, build_z_map
|
| 19 |
+
from configs.paths import smpl_model_path
|
| 20 |
+
from models.human_models import SMPL_Layer, smpl_gendered
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class BASE(Dataset):
|
| 24 |
+
def __init__(self, input_size = 1288, aug = True, mode = 'train',
|
| 25 |
+
human_type = 'smpl',
|
| 26 |
+
sat_cfg = None,
|
| 27 |
+
aug_cfg = None):
|
| 28 |
+
self.input_size = input_size
|
| 29 |
+
self.aug = aug
|
| 30 |
+
if mode not in ['train', 'eval', 'infer']:
|
| 31 |
+
raise NotImplementedError
|
| 32 |
+
if human_type not in ['smpl', 'no']:
|
| 33 |
+
raise NotImplementedError
|
| 34 |
+
self.mode = mode
|
| 35 |
+
self.human_type = human_type
|
| 36 |
+
assert sat_cfg is not None
|
| 37 |
+
self.use_sat = sat_cfg['use_sat']
|
| 38 |
+
self.sat_cfg = sat_cfg
|
| 39 |
+
|
| 40 |
+
if self.use_sat:
|
| 41 |
+
assert input_size % 56 == 0
|
| 42 |
+
|
| 43 |
+
if self.mode == 'train' and aug_cfg is None:
|
| 44 |
+
aug_cfg = {'rot_range': [-15, 15],
|
| 45 |
+
'scale_range': [0.8, 1.8],
|
| 46 |
+
'flip_ratio': 0.5,
|
| 47 |
+
'crop_ratio': 0.}
|
| 48 |
+
self.aug_cfg = aug_cfg
|
| 49 |
+
|
| 50 |
+
if human_type == 'smpl':
|
| 51 |
+
self.poses_flip = smpl_24_flip
|
| 52 |
+
self.num_poses = 24
|
| 53 |
+
self.num_betas = 10
|
| 54 |
+
self.num_kpts = 45
|
| 55 |
+
self.human_model = smpl_gendered
|
| 56 |
+
|
| 57 |
+
self.vis_thresh = 4 # least num visible kpts for a valid individual
|
| 58 |
+
|
| 59 |
+
self.img_keys = ['img_path', 'ds',
|
| 60 |
+
'pnum', 'img_size',
|
| 61 |
+
'resize_rate', 'cam_intrinsics',
|
| 62 |
+
'3d_valid', 'detect_all_people',
|
| 63 |
+
'scale_map', 'scale_map_pos', 'scale_map_hw']
|
| 64 |
+
self.human_keys = ['boxes', 'labels',
|
| 65 |
+
'poses', 'betas',
|
| 66 |
+
'transl', 'verts',
|
| 67 |
+
'j3ds', 'j2ds', 'j2ds_mask',
|
| 68 |
+
'depths', 'focals', 'genders']
|
| 69 |
+
|
| 70 |
+
z_depth = math.ceil(math.log2(self.input_size//28))
|
| 71 |
+
self.z_order_map, self.y_coords, self.x_coords = build_z_map(z_depth)
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def get_raw_data(self, idx):
|
| 76 |
+
raise NotImplementedError
|
| 77 |
+
|
| 78 |
+
def get_aug_dict(self):
|
| 79 |
+
if self.aug:
|
| 80 |
+
rot = random.uniform(*self.aug_cfg['rot_range'])
|
| 81 |
+
flip = random.random() <= self.aug_cfg['flip_ratio']
|
| 82 |
+
scale = random.uniform(*self.aug_cfg['scale_range'])
|
| 83 |
+
crop = random.random() <= self.aug_cfg['crop_ratio']
|
| 84 |
+
else:
|
| 85 |
+
rot = 0.
|
| 86 |
+
flip = False
|
| 87 |
+
scale = 1.
|
| 88 |
+
crop = False
|
| 89 |
+
|
| 90 |
+
return {'rot':rot, 'flip':flip, 'scale':scale, 'crop': crop}
|
| 91 |
+
|
| 92 |
+
def process_img(self, img, meta_data, rot = 0., flip = False, scale = 1.0, crop = False):
|
| 93 |
+
# randomly crop (similar to scale)
|
| 94 |
+
if self.mode == 'train' and crop:
|
| 95 |
+
|
| 96 |
+
h, w = img.shape[:2]
|
| 97 |
+
if h < w :
|
| 98 |
+
clip_ratio = random.uniform(0.5, 0.9)
|
| 99 |
+
tgt_h, tgt_w = int(h*clip_ratio), int(w*clip_ratio)
|
| 100 |
+
|
| 101 |
+
img = img[:tgt_h,(w-tgt_w)//2:(w+tgt_w)//2,:].copy()
|
| 102 |
+
cam_intrinsics = meta_data['cam_intrinsics']
|
| 103 |
+
cam_intrinsics[:,0,2] -= (w-tgt_w)//2
|
| 104 |
+
meta_data.update({'cam_intrinsics': cam_intrinsics})
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
# resize
|
| 108 |
+
img_size = torch.tensor(img.shape[:2])
|
| 109 |
+
if img_size[1] >= img_size[0]:
|
| 110 |
+
resize_rate = self.input_size/img_size[1]
|
| 111 |
+
img = cv2.resize(img,dsize=(self.input_size,int(resize_rate*img_size[0])))
|
| 112 |
+
img_size = torch.tensor([int(resize_rate*img_size[0]),self.input_size])
|
| 113 |
+
else:
|
| 114 |
+
resize_rate = self.input_size/img_size[0]
|
| 115 |
+
img = cv2.resize(img,dsize=(int(resize_rate*img_size[1]),self.input_size))
|
| 116 |
+
img_size = torch.tensor([self.input_size,int(resize_rate*img_size[1])])
|
| 117 |
+
meta_data.update({'img_size': img_size, 'resize_rate': resize_rate})
|
| 118 |
+
|
| 119 |
+
# flip
|
| 120 |
+
if flip:
|
| 121 |
+
img = np.flip(img, axis = 1)
|
| 122 |
+
rot = -rot
|
| 123 |
+
|
| 124 |
+
# rot and scale
|
| 125 |
+
img_valid = np.full((img.shape[0], img.shape[1]), 255, dtype = np.uint8)
|
| 126 |
+
M = cv2.getRotationMatrix2D((int(img_size[1]/2),int(img_size[0]/2)), rot, scale)
|
| 127 |
+
img = cv2.warpAffine(img, M, dsize = (img.shape[1],img.shape[0]))
|
| 128 |
+
img_valid = cv2.warpAffine(img_valid, M, dsize = (img.shape[1],img.shape[0]))
|
| 129 |
+
meta_data.update({'img_valid': img_valid})
|
| 130 |
+
|
| 131 |
+
return img
|
| 132 |
+
|
| 133 |
+
def occlusion_aug(self, meta_data):
|
| 134 |
+
occ_boxes = []
|
| 135 |
+
imght, imgwidth = meta_data['img_size']
|
| 136 |
+
for bbox in box_cxcywh_to_xyxy(meta_data['boxes']):
|
| 137 |
+
bbox = bbox.clone()
|
| 138 |
+
bbox *= self.input_size
|
| 139 |
+
xmin, ymin = bbox[:2]
|
| 140 |
+
xmax, ymax = bbox[2:]
|
| 141 |
+
|
| 142 |
+
if random.random() <= 0.6:
|
| 143 |
+
counter = 0
|
| 144 |
+
while True:
|
| 145 |
+
# force to break if no suitable occlusion
|
| 146 |
+
if counter > 5:
|
| 147 |
+
synth_ymin, synth_h, synth_xmin, synth_w = 0, 0, 0, 0
|
| 148 |
+
break
|
| 149 |
+
counter += 1
|
| 150 |
+
|
| 151 |
+
area_min = 0.0
|
| 152 |
+
area_max = 0.3
|
| 153 |
+
synth_area = (random.random() * (area_max - area_min) + area_min) * (xmax - xmin) * (ymax - ymin)
|
| 154 |
+
|
| 155 |
+
ratio_min = 0.5
|
| 156 |
+
ratio_max = 1 / 0.5
|
| 157 |
+
synth_ratio = (random.random() * (ratio_max - ratio_min) + ratio_min)
|
| 158 |
+
|
| 159 |
+
synth_h = math.sqrt(synth_area * synth_ratio)
|
| 160 |
+
synth_w = math.sqrt(synth_area / synth_ratio)
|
| 161 |
+
synth_xmin = random.random() * ((xmax - xmin) - synth_w - 1) + xmin
|
| 162 |
+
synth_ymin = random.random() * ((ymax - ymin) - synth_h - 1) + ymin
|
| 163 |
+
|
| 164 |
+
if synth_xmin >= 0 and synth_ymin >= 0 and synth_xmin + synth_w < imgwidth and synth_ymin + synth_h < imght:
|
| 165 |
+
synth_xmin = int(synth_xmin)
|
| 166 |
+
synth_ymin = int(synth_ymin)
|
| 167 |
+
synth_w = int(synth_w)
|
| 168 |
+
synth_h = int(synth_h)
|
| 169 |
+
break
|
| 170 |
+
else:
|
| 171 |
+
synth_ymin, synth_h, synth_xmin, synth_w = 0, 0, 0, 0
|
| 172 |
+
occ_boxes.append((synth_ymin, synth_h, synth_xmin, synth_w))
|
| 173 |
+
return occ_boxes
|
| 174 |
+
|
| 175 |
+
def get_boxes(self, meta_data):
|
| 176 |
+
j2ds = meta_data['j2ds']
|
| 177 |
+
j2ds_mask = meta_data['j2ds_mask']
|
| 178 |
+
pnum = meta_data['pnum']
|
| 179 |
+
|
| 180 |
+
bboxes_list = []
|
| 181 |
+
|
| 182 |
+
for i in range(pnum):
|
| 183 |
+
kpts = j2ds[i].clone()
|
| 184 |
+
min_xy = kpts.min(dim = 0)[0]
|
| 185 |
+
max_xy = kpts.max(dim = 0)[0]
|
| 186 |
+
bbox_xyxy = torch.cat([min_xy, max_xy], dim = 0)
|
| 187 |
+
bboxes_list.append(bbox_xyxy)
|
| 188 |
+
|
| 189 |
+
imght, imgwidth = meta_data['img_size']
|
| 190 |
+
boxes = box_xyxy_to_cxcywh(torch.stack(bboxes_list)) / self.input_size
|
| 191 |
+
boxes[...,2:] *= 1.2
|
| 192 |
+
boxes = box_cxcywh_to_xyxy(boxes)
|
| 193 |
+
boxes[...,[0,2]] = boxes[...,[0,2]].clamp(min=0.01,max=(imgwidth-1)/self.input_size)
|
| 194 |
+
boxes[...,[1,3]] = boxes[...,[1,3]].clamp(min=0.01,max=(imght-1)/self.input_size)
|
| 195 |
+
boxes = box_xyxy_to_cxcywh(boxes)
|
| 196 |
+
|
| 197 |
+
meta_data.update({'boxes': boxes})
|
| 198 |
+
|
| 199 |
+
def process_cam(self, meta_data, rot = 0., flip = False, scale = 1.):
|
| 200 |
+
img_size = meta_data['img_size']
|
| 201 |
+
resize_rate = meta_data['resize_rate']
|
| 202 |
+
rot_aug_mat = meta_data['rot_aug_mat']
|
| 203 |
+
cam_intrinsics = meta_data['cam_intrinsics']
|
| 204 |
+
# cam_int
|
| 205 |
+
# resize
|
| 206 |
+
cam_intrinsics[:,0:2,2] *= resize_rate * scale
|
| 207 |
+
cam_intrinsics[:,[0,1],[0,1]] *= resize_rate * scale
|
| 208 |
+
cam_intrinsics[:,0,2] += (1-scale)*img_size[1]/2
|
| 209 |
+
cam_intrinsics[:,1,2] += (1-scale)*img_size[0]/2
|
| 210 |
+
# rotation
|
| 211 |
+
princpt = cam_intrinsics[:,0:2,2].clone()
|
| 212 |
+
princpt[...,0] -= img_size[1]/2
|
| 213 |
+
princpt[...,1] -= img_size[0]/2
|
| 214 |
+
princpt = torch.matmul(princpt,rot_aug_mat[:2,:2].transpose(-1,-2))
|
| 215 |
+
princpt[...,0] += img_size[1]/2
|
| 216 |
+
princpt[...,1] += img_size[0]/2
|
| 217 |
+
cam_intrinsics[:,0:2,2] = princpt
|
| 218 |
+
# flip
|
| 219 |
+
if flip:
|
| 220 |
+
cam_intrinsics[:,0,2] = img_size[1]-cam_intrinsics[:,0,2]
|
| 221 |
+
meta_data.update({'cam_intrinsics': cam_intrinsics})
|
| 222 |
+
|
| 223 |
+
#cam_ext
|
| 224 |
+
new_cam_rot = torch.matmul(rot_aug_mat.unsqueeze(0),meta_data['cam_rot'])
|
| 225 |
+
new_cam_trans = torch.matmul(meta_data['cam_trans'],rot_aug_mat.transpose(-1,-2))
|
| 226 |
+
meta_data.update({'cam_rot': new_cam_rot,'cam_trans':new_cam_trans})
|
| 227 |
+
|
| 228 |
+
    def process_smpl(self, meta_data, rot = 0., flip = False, scale = 1.):
        """Transform the SMPL annotations into the augmented camera frame.

        Folds the per-person camera rotation into the global orient, applies
        the horizontal flip to the pose parameters, runs the SMPL layer, and
        recomputes the translation / vertices / joints in camera coordinates.
        Updates 'poses', 'transl', 'verts' and 'j3ds' in ``meta_data``.
        Note: ``rot`` and ``scale`` are unused here; the augmentation rotation
        reaches this method via ``meta_data['cam_rot']`` (see process_cam).
        """
        poses = meta_data['poses']
        bs = poses.shape[0]
        assert poses.ndim == 2
        assert tuple(poses.shape) == (bs, self.num_poses*3)
        # Merge rotation to smpl global_orient
        global_orient = poses[:,:3].clone()
        cam_rot = meta_data['cam_rot'].numpy()
        for i in range(global_orient.shape[0]):
            root_pose = global_orient[i].view(1, 3).numpy()
            R = cam_rot[i].reshape(3,3)
            # axis-angle -> matrix, pre-multiply by camera rotation, -> axis-angle
            root_pose, _ = cv2.Rodrigues(root_pose)
            root_pose, _ = cv2.Rodrigues(np.dot(R, root_pose))
            root_pose = torch.from_numpy(root_pose).flatten()
            global_orient[i] = root_pose
        poses[:,:3] = global_orient

        # Flip smpl parameters
        if flip:
            poses = poses.reshape(bs, self.num_poses, 3)
            # swap left/right joints via the flip permutation
            poses = poses[:, self.poses_flip, :]
            poses[..., 1:3] *= -1 # multiply -1 to y and z axis of axis-angle
            poses = poses.reshape(bs, -1)

        # Update all pose params
        meta_data.update({'poses': poses})

        # Get vertices and joints in cam_coords
        with torch.no_grad():
            smpl_kwargs = {'poses': meta_data['poses'], 'betas': meta_data['betas']}
            if 'genders' in meta_data:
                smpl_kwargs.update({'genders': meta_data['genders']})
            verts, j3ds = self.human_model(**smpl_kwargs)

        j3ds = j3ds[:, :self.num_kpts, :]
        root = j3ds[:,smpl_root_idx,:].clone() # smpl root
        # new translation in cam_coords
        # rotate the world-frame root position into the camera frame, then
        # subtract the (rotation-invariant) body-frame root offset
        transl = torch.bmm((root+meta_data['transl']).reshape(-1,1,3),meta_data['cam_rot'].transpose(-1,-2)).reshape(-1,3)\
                 +meta_data['cam_trans']-root
        if flip:
            transl[...,0] = -transl[...,0]

        meta_data.update({'transl': transl})

        verts = verts + transl.reshape(-1,1,3)
        j3ds = j3ds + transl.reshape(-1,1,3)
        meta_data.update({'verts': verts, 'j3ds': j3ds})
|
| 275 |
+
|
| 276 |
+
def project_joints(self, meta_data):
|
| 277 |
+
j3ds = meta_data['j3ds']
|
| 278 |
+
cam_intrinsics = meta_data['cam_intrinsics']
|
| 279 |
+
j2ds_homo = torch.matmul(j3ds,cam_intrinsics.transpose(-1,-2))
|
| 280 |
+
j2ds = j2ds_homo[...,:2]/(j2ds_homo[...,2,None])
|
| 281 |
+
|
| 282 |
+
meta_data.update({'j3ds': j3ds, 'j2ds': j2ds})
|
| 283 |
+
|
| 284 |
+
def check_visibility(self, meta_data):
|
| 285 |
+
img_valid = meta_data['img_valid']
|
| 286 |
+
img_size = meta_data['img_size']
|
| 287 |
+
|
| 288 |
+
j2ds = meta_data['j2ds']
|
| 289 |
+
j2ds_mask = meta_data['j2ds_mask'] if 'j2ds_mask' in meta_data else torch.ones_like(j2ds, dtype=bool)
|
| 290 |
+
|
| 291 |
+
|
| 292 |
+
j2ds_vis = torch.from_numpy(img_valid[j2ds[...,1].int().clip(0,img_size[0]-1), j2ds[...,0].int().clip(0,img_size[1]-1)] > 0)
|
| 293 |
+
j2ds_vis &= (j2ds[...,1] >= 0) & (j2ds[...,1] < img_size[0])
|
| 294 |
+
j2ds_vis &= (j2ds[...,0] >= 0) & (j2ds[...,0] < img_size[1])
|
| 295 |
+
|
| 296 |
+
j2ds_invalid = ~j2ds_vis
|
| 297 |
+
j2ds_mask[j2ds_invalid] = False
|
| 298 |
+
meta_data.update({'j2ds_mask': j2ds_mask})
|
| 299 |
+
|
| 300 |
+
vis_cnt = j2ds_mask[...,0].sum(dim = -1) # num of visible joints per person
|
| 301 |
+
valid_msk = (vis_cnt >= self.vis_thresh)
|
| 302 |
+
|
| 303 |
+
pnum = valid_msk.sum().item()
|
| 304 |
+
|
| 305 |
+
if pnum == 0:
|
| 306 |
+
meta_data['pnum'] = pnum
|
| 307 |
+
return
|
| 308 |
+
|
| 309 |
+
if pnum < meta_data['pnum']:
|
| 310 |
+
meta_data['pnum'] = pnum
|
| 311 |
+
for key in self.human_keys:
|
| 312 |
+
if key in meta_data:
|
| 313 |
+
if isinstance(meta_data[key], list):
|
| 314 |
+
meta_data[key] = np.array(meta_data[key])[valid_msk].tolist()
|
| 315 |
+
else:
|
| 316 |
+
meta_data[key] = meta_data[key][valid_msk]
|
| 317 |
+
if 'cam_intrinsics' in meta_data and len(meta_data['cam_intrinsics']) > 1:
|
| 318 |
+
meta_data['cam_intrinsics'] = meta_data['cam_intrinsics'][valid_msk]
|
| 319 |
+
|
| 320 |
+
return
|
| 321 |
+
|
| 322 |
+
|
| 323 |
+
    def process_data(self, img, raw_data, rot = 0., flip = False, scale = 1., crop = False):
        """Run the full per-sample pipeline on a deep copy of ``raw_data``:
        image warping, camera/SMPL transforms, joint projection, visibility
        filtering, depth/box targets, occlusion augmentation and (optionally)
        the SAT scale map. Returns ``(img, meta_data)``; a sample that loses
        all its people in train mode comes back with ``pnum == 0``.
        """
        meta_data = copy.deepcopy(raw_data)
        # prepare rotation augmentation mat.
        rot_aug_mat = torch.tensor([[cos(radians(-rot)), -sin(radians(-rot)), 0.],
                                    [sin(radians(-rot)), cos(radians(-rot)), 0.],
                                    [0., 0., 1.]])
        meta_data.update({'rot_aug_mat': rot_aug_mat})

        img = self.process_img(img, meta_data, rot, flip, scale, crop)

        # order matters: cameras first, then SMPL (uses cam_rot), then projection
        self.process_cam(meta_data, rot, flip, scale)
        self.process_smpl(meta_data, rot, flip, scale)
        self.project_joints(meta_data)
        self.check_visibility(meta_data)
        matcher_vis = meta_data['j2ds_mask'][:,:22,0].sum(dim = -1) # num of visible joints used in Hungarian Matcher
        # reject the sample if nobody is left or any person has zero matcher joints
        if meta_data['pnum'] == 0 or not torch.all(matcher_vis):
            if self.mode == 'train':
                meta_data['pnum'] = 0
            return img, meta_data

        # depth targets: root depth plus focal-normalized depth
        j3ds = meta_data['j3ds']
        depths = j3ds[:, smpl_root_idx, [2]].clone()
        if len(meta_data['cam_intrinsics']) == 1:
            focals = torch.full_like(depths, meta_data['cam_intrinsics'][0,0,0])
        else:
            focals = meta_data['cam_intrinsics'][:,0,0][:, None]
        depths = torch.cat([depths, depths/focals],dim=-1)
        meta_data.update({'depths': depths, 'focals': focals})

        self.get_boxes(meta_data)

        # single-class detection: every person gets label 0
        meta_data.update({'labels': torch.zeros(meta_data['pnum'], dtype=int)})

        # VI. Occlusion augmentation
        if self.aug:
            occ_boxes = self.occlusion_aug(meta_data)
            for (synth_ymin, synth_h, synth_xmin, synth_w) in occ_boxes:
                img[synth_ymin:synth_ymin + synth_h, synth_xmin:synth_xmin + synth_w, :] = np.random.rand(synth_h, synth_w, 3) * 255

        if self.use_sat:
            # scale map
            # render per-patch scale targets back-to-front (farthest person first)
            boxes = meta_data['boxes']
            scales = boxes[:,2:].norm(p=2,dim=1)
            v3ds = meta_data['verts']
            depths_norm = meta_data['depths'][:,1]
            cam_intrinsics = meta_data['cam_intrinsics']
            sorted_idx = torch.argsort(depths_norm, descending=True)
            map_size = (meta_data['img_size'] + 27)//28  # ceil-divide into 28-px patches

            scale_map = gen_scale_map(scales[sorted_idx], v3ds[sorted_idx],
                                      faces = self.human_model.faces,
                                      cam_intrinsics = cam_intrinsics[sorted_idx] if len(cam_intrinsics) > 1 else cam_intrinsics,
                                      map_size = map_size,
                                      patch_size = 28,
                                      pad = True)
            # flatten the map along a z-order curve; keep positions for unflattening
            scale_map_z, _, pos_y, pos_x = to_zorder(scale_map,
                                                     z_order_map = self.z_order_map,
                                                     y_coords = self.y_coords,
                                                     x_coords = self.x_coords)
            meta_data['scale_map'] = scale_map_z
            meta_data['scale_map_pos'] = {'pos_y': pos_y, 'pos_x': pos_x}
            meta_data['scale_map_hw'] = scale_map.shape[:2]

        return img, meta_data
|
| 390 |
+
|
| 391 |
+
    def __getitem__(self, index):
        """Load, augment and normalize one sample.

        Returns ``(norm_img, meta_data)`` where ``norm_img`` is a normalized
        CHW tensor padded to a multiple of the patch size. In train mode the
        augmentation is resampled until at least one person survives; samples
        that still fail fall through to the next index.
        """
        raw_data = self.get_raw_data(index)

        # Load original image
        ori_img = cv2.imread(raw_data['img_path'])
        # BEDLAM close-up frames are stored rotated; undo that here
        if raw_data['ds'] == 'bedlam' and 'closeup' in raw_data['img_path']:
            ori_img = cv2.rotate(ori_img, cv2.ROTATE_90_CLOCKWISE)
        img_size = torch.tensor(ori_img.shape[:2])
        raw_data.update({'img_size': img_size})

        if self.mode == 'train':
            cnt = 0
            while (True):
                aug_dict = self.get_aug_dict()
                img, meta_data = self.process_data(ori_img, raw_data, **aug_dict)
                if meta_data['pnum'] > 0:
                    break
                cnt+=1
                if cnt >= 10:
                    # last resort: retry with geometry-neutral augmentation
                    aug_dict.update({'rot':0., 'scale':1., 'crop': False})
                    img, meta_data = self.process_data(ori_img, raw_data, **aug_dict)
                    if meta_data['pnum'] == 0:
                        print('skipping: ' + meta_data['img_path'])
                        return self.__getitem__(index + 1)
                    # NOTE(review): on a successful neutral retry the loop still
                    # resamples a fresh augmentation — confirm this is intended

        elif self.mode == 'eval':
            assert not self.aug, f'No need to use augmentation when mode is {self.mode}!'
            aug_dict = self.get_aug_dict()
            img, meta_data = self.process_data(ori_img, raw_data, **aug_dict)

        else:
            # infer mode: no targets, only the resized image
            assert not self.aug, f'No need to use augmentation when mode is {self.mode}!'
            meta_data = raw_data
            img = self.process_img(ori_img, meta_data)

        # delete unwanted keys
        if self.mode == 'train':
            for key in list(meta_data.keys()):
                if key not in self.img_keys and key not in self.human_keys:
                    del meta_data[key]

        # color jitter only under augmentation; ImageNet normalization always
        if self.aug:
            array2tensor = transforms.Compose([
                transforms.ColorJitter(0.2, 0.2, 0.2),
                transforms.ToTensor(),
                transforms.Normalize([0.485,0.456,0.406],[0.229,0.224,0.225])
            ])
        else:
            array2tensor = transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize([0.485,0.456,0.406],[0.229,0.224,0.225])
            ])

        # pad both sides up to a multiple of the patch size (56 with SAT, else 14)
        patch_size = 14
        if self.use_sat:
            patch_size = 56
        # NOTE(review): this local shadows the imported `pad_img` helper
        pad_img = np.zeros((math.ceil(img.shape[0]/patch_size)*patch_size, math.ceil(img.shape[1]/patch_size)*patch_size, 3), dtype=img.dtype)
        pad_img[:img.shape[0], :img.shape[1]] = img
        assert max(pad_img.shape[:2]) == self.input_size
        # BGR (OpenCV) -> RGB before the torchvision transforms
        pad_img = Image.fromarray(pad_img[:,:,::-1].copy())
        norm_img = array2tensor(pad_img)

        if 'j2ds_mask' in meta_data:
            # NOTE(review): this overwrites the visibility mask computed in
            # check_visibility so every joint is supervised — confirm intended
            meta_data['j2ds_mask'][:,:,:] = True

        return norm_img, meta_data
|
| 460 |
+
|
| 461 |
+
    def visualize(self, results_save_dir = None, vis_num = 100):
        """Dump roughly `vis_num` evenly-spaced annotated samples to disk.

        For each selected index, the normalized tensor from ``__getitem__`` is
        converted back to a BGR image and overlaid with whichever annotations
        are present in the targets dict (SMPL meshes, boxes, scale maps).

        Args:
            results_save_dir: output folder; defaults to
                ``datasets_visualization/<ds_name>_<split>``.
            vis_num: approximate number of samples to visualize.
        """
        if results_save_dir is None:
            results_save_dir = os.path.join('datasets_visualization',f'{self.ds_name}_{self.split}')
        os.makedirs(results_save_dir, exist_ok=True)

        # visualize every vis_interval-th sample so ~vis_num images are written
        vis_interval = len(self)//vis_num

        for idx in tqdm(range(len(self))):
            if idx % vis_interval != 0:
                continue

            norm_img, targets = self.__getitem__(idx)

            # undo ImageNet normalization and convert the tensor back to BGR
            ori_img = tensor_to_BGR(unNormalize(norm_img).cpu())
            # basename without extension, e.g. "img0" from ".../img0.png"
            img_name = targets['img_path'].split('/')[-1].split('.')[-2]
            pnum = targets['pnum']  # NOTE(review): currently unused below

            if 'verts' in targets:
                # render GT meshes over the image, one distinct color per person
                colors = get_colors_rgb(len(targets['verts']))
                mesh_img = vis_meshes_img(img = ori_img.copy(),
                                verts = targets['verts'],
                                smpl_faces = self.human_model.faces,
                                cam_intrinsics = targets['cam_intrinsics'].cpu(),
                                colors=colors,
                                padding=False)
                cv2.imwrite(os.path.join(results_save_dir,f'{idx}_{img_name}_mesh.jpg'), mesh_img)


            if 'boxes' in targets:
                gt_img = ori_img.copy()
                # boxes are stored as normalized cxcywh; convert to pixel xyxy
                boxes = box_cxcywh_to_xyxy(targets['boxes']) * self.input_size
                for i, bbox in enumerate(boxes):
                    bbox = bbox.int().tolist()
                    cv2.rectangle(gt_img, (bbox[0], bbox[1]), (bbox[2], bbox[3]),
                                  color=(0,0,255), thickness = 2 )

                cv2.imwrite(os.path.join(results_save_dir,f'{idx}_{img_name}_boxes.jpg'), gt_img)

            if 'scale_map' in targets:
                gt_img = ori_img.copy()
                # scale_map is stored flattened with explicit (y, x) positions;
                # scatter it back onto a dense (h, w, 2) grid before drawing
                flatten_map = targets['scale_map']
                ys, xs = targets['scale_map_pos']['pos_y'], targets['scale_map_pos']['pos_x']
                h, w = targets['scale_map_hw']
                scale_map = torch.zeros((h,w,2))
                scale_map[ys,xs] = flatten_map
                img = vis_scale_img(gt_img, scale_map, patch_size=28)

                cv2.imwrite(os.path.join(results_save_dir,f'{idx}_{img_name}_scales.jpg'), img)


            # if 'j2ds' in targets:
            #     gt_img = ori_img.copy()
            #     j2ds = targets['j2ds']
            #     j2ds_mask = targets['j2ds_mask']
            #     for kpts, valids in zip(j2ds, j2ds_mask):
            #         for kpt, valid in zip(kpts, valids):
            #             if not valid.all():
            #                 continue
            #             kpt_int = kpt.numpy().astype(int)
            #             cv2.circle(gt_img, kpt_int, 2, (0, 0, 255), -1)
            #     cv2.imwrite(os.path.join(results_save_dir,f'{idx}_{img_name}_joints.png'), np.hstack([ori_img, gt_img]))
datasets/bedlam.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import torch
|
| 3 |
+
from torch.utils.data.dataset import Dataset
|
| 4 |
+
import os
|
| 5 |
+
from configs.paths import dataset_root
|
| 6 |
+
import copy
|
| 7 |
+
from tqdm import tqdm
|
| 8 |
+
from .base import BASE
|
| 9 |
+
|
| 10 |
+
class BEDLAM(BASE):
    """BEDLAM synthetic-human dataset; SMPL annotations come from one npz file."""

    def __init__(self, split='train_6fps', **kwargs):
        super(BEDLAM, self).__init__(**kwargs)
        valid_splits = ('train_1fps', 'train_3fps', 'train_6fps', 'validation_6fps')
        assert split in valid_splits
        # BEDLAM annotations carry no kid shape offset.
        assert not self.kid_offset

        self.ds_name = 'bedlam'
        self.dataset_path = os.path.join(dataset_root, 'bedlam')
        annots_file = os.path.join(self.dataset_path, f'bedlam_smpl_{split}.npz')
        # npz stores a single pickled dict under the 'annots' key
        self.annots = np.load(annots_file, allow_pickle=True)['annots'][()]
        self.img_names = list(self.annots.keys())
        # image folders on disk are named by the coarse split only
        self.split = 'train' if 'train' in split else 'validation'

    def __len__(self):
        return len(self.img_names)

    def cnt_instances(self):
        """Print the total number of annotated person instances in this split."""
        total = 0
        for i in tqdm(range(len(self))):
            name = self.img_names[i]
            total += len(self.annots[name]['shape'])

        print(f'TOTAL: {total}')

    def get_raw_data(self, idx):
        """Assemble per-image raw annotations (world-frame SMPL params + camera)."""
        img_name = self.img_names[idx % len(self.img_names)]

        # deep copy so downstream augmentation cannot mutate the cached annots
        annots = copy.deepcopy(self.annots[img_name])
        img_path = os.path.join(self.dataset_path, self.split, img_name)

        cam_intrinsics = torch.from_numpy(annots['cam_int']).unsqueeze(0)
        cam_rot = torch.from_numpy(np.stack(annots['cam_rot']))
        cam_trans = torch.from_numpy(np.stack(annots['cam_trans']))

        betas = torch.from_numpy(np.stack(annots['shape']))
        poses = torch.from_numpy(np.stack(annots['pose_world']))
        transl = torch.from_numpy(np.stack(annots['trans_world']))

        raw_data = {
            'img_path': img_path,
            'ds': 'bedlam',
            'pnum': len(betas),
            'betas': betas.float(),
            'poses': poses.float(),
            'transl': transl.float(),
            'cam_rot': cam_rot.float(),
            'cam_trans': cam_trans.float(),
            'cam_intrinsics': cam_intrinsics.float(),
            '3d_valid': True,
            'age_valid': False,
            'detect_all_people': True,
        }

        if self.mode == 'eval':
            # synthetic data: every instance treated as occlusion level 0
            raw_data['occ_level'] = torch.zeros(len(betas), dtype=int)

        return raw_data
datasets/common.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import torch
|
| 3 |
+
from torch.utils.data.dataset import Dataset
|
| 4 |
+
import os
|
| 5 |
+
from configs.paths import dataset_root
|
| 6 |
+
import copy
|
| 7 |
+
from .base import BASE
|
| 8 |
+
|
| 9 |
+
# Plain image-folder dataset used for inference only (no annotations).
class COMMON(BASE):
    def __init__(self, img_folder, **kwargs):
        super(COMMON, self).__init__(**kwargs)
        self.dataset_path = img_folder
        image_exts = ('.png', '.jpg', '.jpeg')
        self.img_names = sorted(
            name for name in os.listdir(self.dataset_path)
            if name.endswith(image_exts)
        )
        # this dataset only supplies images, so it is inference-only
        assert self.mode == 'infer'

    def __len__(self):
        return len(self.img_names)

    def get_raw_data(self, idx):
        """Return a minimal record: image path/name only, no GT annotations."""
        name = self.img_names[idx % len(self.img_names)]
        return {
            'img_path': os.path.join(self.dataset_path, name),
            'img_name': name,
            'ds': 'common',
        }
datasets/multiple_datasets.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import random
|
| 2 |
+
from torch.utils.data.dataset import Dataset
|
| 3 |
+
import numpy as np
|
| 4 |
+
from .agora import AGORA
|
| 5 |
+
from .bedlam import BEDLAM
|
| 6 |
+
|
| 7 |
+
datasets_dict = {'bedlam': BEDLAM, 'agora': AGORA}
|
| 8 |
+
|
| 9 |
+
class MultipleDatasets(Dataset):
    """Expose several sub-datasets behind one ``torch.utils.data.Dataset``.

    Two indexing schemes:
      * ``make_same_len=False`` (default): plain concatenation; the global
        index is mapped onto the sub-dataset containing it via cumulative
        lengths.
      * ``make_same_len=True``: every sub-dataset is virtually repeated to
        the length of the longest one, so each contributes equally per epoch;
        the uneven tail is filled by random sampling.
    """

    def __init__(self, datasets_used, datasets_split = None, make_same_len = False, **kwargs):
        """Instantiate every requested sub-dataset.

        Args:
            datasets_used: iterable of dataset names (keys of ``datasets_dict``).
            datasets_split: optional per-dataset split names, same order/length
                as ``datasets_used``.
            make_same_len: see class docstring.
            kwargs: forwarded verbatim to every sub-dataset constructor.
        """
        if datasets_split is None:
            self.dbs = [datasets_dict[ds](**kwargs) for ds in datasets_used]
        else:
            self.dbs = [datasets_dict[ds](split, **kwargs)
                        for ds, split in zip(datasets_used, datasets_split)]

        self.db_num = len(self.dbs)
        self.max_db_data_num = max(len(db) for db in self.dbs)
        self.db_len_cumsum = np.cumsum([len(db) for db in self.dbs])
        self.make_same_len = make_same_len
        # all sub-datasets share the same parametric human model; expose the first
        self.human_model = self.dbs[0].human_model

    def __len__(self):
        if self.make_same_len:
            # all dbs padded to the length of the longest one
            return self.max_db_data_num * self.db_num
        # plain concatenation: sum of the real lengths
        return sum(len(db) for db in self.dbs)

    def __getitem__(self, index):
        """Map a global index to (sub-dataset, local index) and fetch the item."""
        if self.make_same_len:
            db_idx = index // self.max_db_data_num
            data_idx = index % self.max_db_data_num
            if data_idx >= len(self.dbs[db_idx]) * (self.max_db_data_num // len(self.dbs[db_idx])):
                # last partial repetition: no full copy covers it, sample randomly
                data_idx = random.randint(0, len(self.dbs[db_idx]) - 1)
            else:
                # within a full repetition: wrap around with modulo
                data_idx = data_idx % len(self.dbs[db_idx])
        else:
            # Fix: an index past the end used to fall through the cumsum scan
            # below and raise UnboundLocalError on db_idx; raise the proper
            # IndexError instead (negative indices keep their old behavior).
            if index >= self.db_len_cumsum[-1]:
                raise IndexError(
                    f'index {index} out of range for dataset of length {self.db_len_cumsum[-1]}')
            db_idx = 0
            for i in range(self.db_num):
                if index < self.db_len_cumsum[i]:
                    db_idx = i
                    break
            data_idx = index if db_idx == 0 else index - self.db_len_cumsum[db_idx - 1]

        norm_img, meta_data = self.dbs[db_idx][data_idx]
        return norm_img, meta_data
demo/img0.png
ADDED
|
Git LFS Details
|
demo/img1.jpeg
ADDED
|
demo/img2.jpg
ADDED
|
Git LFS Details
|
docs/fix_chumpy.md
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
You may need to modify the installed `chumpy` package to avoid errors when it is used with recent Python/NumPy versions.
|
| 2 |
+
|
| 3 |
+
* Comment line 11 in `${Your_Conda_Environment}/lib/python3.11/site-packages/chumpy/__init__.py`:
|
| 4 |
+
```
|
| 5 |
+
from .ch import *
|
| 6 |
+
from .logic import *
|
| 7 |
+
|
| 8 |
+
from .optimization import minimize
|
| 9 |
+
from . import extras
|
| 10 |
+
from . import testing
|
| 11 |
+
from .version import version as __version__
|
| 12 |
+
|
| 13 |
+
from .version import version as __version__
|
| 14 |
+
|
| 15 |
+
# from numpy import bool, int, float, complex, object, unicode, str, nan, inf
|
| 16 |
+
```
|
| 17 |
+
* Add *"inspect.getargspec = inspect.getfullargspec"* in `${Your_Conda_Environment}/lib/python3.11/site-packages/chumpy/ch.py` (line 25). Now it should look like:
|
| 18 |
+
```
|
| 19 |
+
#!/usr/bin/env python
|
| 20 |
+
# encoding: utf-8
|
| 21 |
+
"""
|
| 22 |
+
Author(s): Matthew Loper
|
| 23 |
+
|
| 24 |
+
See LICENCE.txt for licensing and contact information.
|
| 25 |
+
"""
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
__all__ = ['Ch', 'depends_on', 'MatVecMult', 'ChHandle', 'ChLambda']
|
| 29 |
+
|
| 30 |
+
import os, sys, time
|
| 31 |
+
import inspect
|
| 32 |
+
import scipy.sparse as sp
|
| 33 |
+
import numpy as np
|
| 34 |
+
import numbers
|
| 35 |
+
import weakref
|
| 36 |
+
import copy as external_copy
|
| 37 |
+
from functools import wraps
|
| 38 |
+
from scipy.sparse.linalg.interface import LinearOperator
|
| 39 |
+
from .utils import row, col, timer, convert_inputs_to_sparse_if_necessary
|
| 40 |
+
import collections
|
| 41 |
+
from copy import deepcopy
|
| 42 |
+
from functools import reduce
|
| 43 |
+
inspect.getargspec = inspect.getfullargspec
|
| 44 |
+
```
|
engines/__init__.py
ADDED
|
File without changes
|
engines/engine.py
ADDED
|
@@ -0,0 +1,347 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from accelerate import Accelerator
|
| 2 |
+
from tqdm.auto import tqdm
|
| 3 |
+
import torch
|
| 4 |
+
from torch.utils.data import DataLoader
|
| 5 |
+
from datasets.multiple_datasets import MultipleDatasets, datasets_dict
|
| 6 |
+
from datasets.common import COMMON
|
| 7 |
+
from transformers import get_scheduler
|
| 8 |
+
from safetensors.torch import load_file
|
| 9 |
+
import os
|
| 10 |
+
import re
|
| 11 |
+
import time
|
| 12 |
+
import datetime
|
| 13 |
+
from models import build_sat_model
|
| 14 |
+
from .funcs.eval_funcs import *
|
| 15 |
+
from .funcs.infer_funcs import inference
|
| 16 |
+
from utils import misc
|
| 17 |
+
from utils.misc import get_world_size
|
| 18 |
+
import torch.multiprocessing
|
| 19 |
+
import numpy as np
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class Engine():
    """Top-level train / eval / infer driver built on Hugging Face Accelerate.

    Owns the accelerator, model + criterion, dataloaders, optimizer and
    lr scheduler, and dispatches to the per-dataset evaluation functions in
    ``self.eval_func_maps`` or the generic ``inference`` function.
    """

    def __init__(self, args, mode='train'):
        """Set up output paths for `mode`, then build accelerator/model/data.

        Args:
            args: parsed experiment config (attribute access).
            mode: one of 'train', 'eval', 'infer'.
        """
        self.exp_name = args.exp_name
        self.mode = mode
        assert mode in ['train','eval','infer']
        self.conf_thresh = args.conf_thresh
        # maps "<dataset>_<split>" keys to their evaluation entry point
        self.eval_func_maps = {'agora_validation': evaluate_agora,
                               'bedlam_validation_6fps': evaluate_agora,
                               'agora_test': test_agora}
        self.inference_func = inference

        if self.mode == 'train':
            self.output_dir = os.path.join('./outputs')
            self.log_dir = os.path.join(self.output_dir,'logs')
            self.ckpt_dir = os.path.join(self.output_dir,'ckpts')
            self.distributed_eval = args.distributed_eval
            self.eval_vis_num = args.eval_vis_num
        elif self.mode == 'eval':
            self.output_dir = os.path.join('./results')
            self.distributed_eval = args.distributed_eval
            self.eval_vis_num = args.eval_vis_num
        elif self.mode == 'infer':
            output_dir = getattr(args, 'output_dir', None)
            if output_dir is not None:
                self.output_dir = output_dir
            else:
                # default inference output folder gets a unique timestamp suffix
                now = datetime.datetime.now()
                timestamp = now.strftime("%Y%m%d_%H%M%S")
                self.output_dir = os.path.join('./results',f'{self.exp_name}_infer_{timestamp}')
            self.distributed_infer = args.distributed_infer

        self.prepare_accelerator()
        self.prepare_models(args)
        self.prepare_datas(args)
        if self.mode == 'train':
            self.prepare_training(args)

        total_cnt = sum(p.numel() for p in self.model.parameters())
        trainable_cnt = sum(p.numel() for p in self.model.parameters() if p.requires_grad)
        self.accelerator.print(f'Initialization finished.\n{trainable_cnt} trainable parameters({total_cnt} total).')

    def prepare_accelerator(self):
        """Create the Accelerator; in train mode also set up tensorboard logging."""
        if self.mode == 'train':
            self.accelerator = Accelerator(
                log_with="tensorboard",
                project_dir=os.path.join(self.log_dir)
            )
            if self.accelerator.is_main_process:
                os.makedirs(self.log_dir, exist_ok=True)
                os.makedirs(os.path.join(self.ckpt_dir,self.exp_name),exist_ok=True)
                self.accelerator.init_trackers(self.exp_name)
        else:
            self.accelerator = Accelerator()
            if self.accelerator.is_main_process:
                os.makedirs(self.output_dir, exist_ok=True)

    def prepare_models(self, args):
        """Build the model (and criterion in train mode), load weights, wrap for device."""
        # load model and criterion
        self.accelerator.print('Preparing models...')
        self.unwrapped_model, self.criterion = build_sat_model(args, set_criterion = (self.mode == 'train'))
        if self.criterion is not None:
            self.weight_dict = self.criterion.weight_dict
        # load weights (strict=False: pretrain checkpoints may be partial)
        if args.pretrain:
            self.accelerator.print(f'Loading pretrained weights: {args.pretrain_path}')
            state_dict = torch.load(args.pretrain_path)
            self.unwrapped_model.load_state_dict(state_dict,strict=False)

        # to gpu
        self.model = self.accelerator.prepare(self.unwrapped_model)

    def prepare_datas(self, args):
        """Build the dataloaders required by the current mode.

        train: a MultipleDatasets training loader plus eval loaders;
        eval: eval loaders only; infer: a COMMON image-folder loader.
        """
        # load dataset and dataloader
        if self.mode == 'train':
            self.accelerator.print('Loading training datasets:\n',
                    [f'{d}_{s}' for d,s in zip(args.train_datasets_used, args.train_datasets_split)])
            self.train_batch_size = args.train_batch_size
            train_dataset = MultipleDatasets(args.train_datasets_used, args.train_datasets_split,
                                             make_same_len=False, input_size=args.input_size, aug=True,
                                             mode = 'train', sat_cfg=args.sat_cfg,
                                             aug_cfg=args.aug_cfg)
            self.train_dataloader = DataLoader(dataset=train_dataset, batch_size=self.train_batch_size,
                                               shuffle=True,collate_fn=misc.collate_fn,
                                               num_workers=args.train_num_workers,pin_memory=True)
            self.train_dataloader = self.accelerator.prepare(self.train_dataloader)

        if self.mode != 'infer':
            self.accelerator.print('Loading evaluation datasets:',
                    [f'{d}_{s}' for d,s in zip(args.eval_datasets_used, args.eval_datasets_split)])
            self.eval_batch_size = args.eval_batch_size
            # one dataset + loader per "<dataset>_<split>" pair
            eval_ds = {f'{ds}_{split}': datasets_dict[ds](split = split,
                                                          mode = 'eval',
                                                          input_size = args.input_size,
                                                          aug = False,
                                                          sat_cfg=args.sat_cfg)\
                       for (ds, split) in zip(args.eval_datasets_used, args.eval_datasets_split)}
            self.eval_dataloaders = {k: DataLoader(dataset=v, batch_size=self.eval_batch_size,
                                                   shuffle=False,collate_fn=misc.collate_fn,
                                                   num_workers=args.eval_num_workers,pin_memory=True)\
                                     for (k,v) in eval_ds.items()}
            if self.distributed_eval:
                # shard eval loaders across processes only when distributed eval is on
                for (k,v) in self.eval_dataloaders.items():
                    self.eval_dataloaders.update({k: self.accelerator.prepare(v)})

        else:
            img_folder = args.input_dir
            self.accelerator.print(f'Loading inference images from {img_folder}')
            self.infer_batch_size = args.infer_batch_size
            infer_ds = COMMON(img_folder = img_folder, input_size=args.input_size,aug=False,
                              mode = 'infer', sat_cfg=args.sat_cfg)
            self.infer_dataloader = DataLoader(dataset=infer_ds, batch_size=self.infer_batch_size,
                                               shuffle=False,collate_fn=misc.collate_fn,
                                               num_workers=args.infer_num_workers,pin_memory=True)

            if self.distributed_infer:
                self.infer_dataloader = self.accelerator.prepare(self.infer_dataloader)

    def prepare_training(self, args):
        """Set up training state: epochs/steps, optimizer, lr scheduler, resume."""
        self.start_epoch = 0
        self.num_epochs = args.num_epochs
        self.global_step = 0
        if hasattr(args, 'sat_gt_epoch'):
            # warm-up phase where SAT uses ground truth instead of predictions
            self.sat_gt_epoch = args.sat_gt_epoch
            self.accelerator.print(f'Use GT for the first {self.sat_gt_epoch} epoch(s)...')
        else:
            self.sat_gt_epoch = -1
        self.save_and_eval_epoch = args.save_and_eval_epoch
        self.least_eval_epoch = args.least_eval_epoch

        self.detach_j3ds = args.detach_j3ds

        self.accelerator.print('Preparing optimizer and lr_scheduler...')
        # two parameter groups: encoder params get their own learning rate
        param_dicts = [
            {
                "params":
                    [p for n, p in self.unwrapped_model.named_parameters()
                     if not misc.match_name_keywords(n, args.lr_encoder_names) and p.requires_grad],
                "lr": args.lr,
            },
            {
                "params":
                    [p for n, p in self.unwrapped_model.named_parameters()
                     if misc.match_name_keywords(n, args.lr_encoder_names) and p.requires_grad],
                "lr": args.lr_encoder,
            }
        ]

        # optimizer
        if args.optimizer == 'adamw':
            self.optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
                                               weight_decay=args.weight_decay)
        else:
            raise NotImplementedError

        # lr_scheduler (stepped per iteration in train(), hence the total-step count)
        if args.lr_scheduler == 'cosine':
            self.lr_scheduler = get_scheduler(name="cosine", optimizer=self.optimizer,
                                              num_warmup_steps=args.num_warmup_steps,
                                              num_training_steps=get_world_size() * self.num_epochs * len(self.train_dataloader))
        elif args.lr_scheduler == 'multistep':
            self.lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(self.optimizer, args.milestones, gamma=args.gamma)
        else:
            raise NotImplementedError

        self.optimizer, self.lr_scheduler = self.accelerator.prepare(self.optimizer, self.lr_scheduler)

        # resume
        if args.resume: #load model, optimizer, lr_scheduler and random_state
            if hasattr(args, 'ckpt_epoch'):
                self.load_ckpt(args.ckpt_epoch,args.ckpt_step)
            else:
                # no explicit checkpoint requested: pick the highest epoch on disk
                self.accelerator.print('Auto resume from latest ckpt...')
                epoch, step = -1, -1
                pattern = re.compile(r'epoch_(\d+)_step_(\d+)')
                for folder_name in os.listdir(os.path.join(self.output_dir,'ckpts',self.exp_name)):
                    match = pattern.match(folder_name)
                    if match:
                        i, j = int(match.group(1)), int(match.group(2))
                        if i > epoch:
                            epoch, step = i, j
                if epoch >= 0:
                    self.load_ckpt(epoch, step)
                else:
                    self.accelerator.print('No existing ckpts! Train from scratch.')

    def load_ckpt(self, epoch, step):
        """Restore accelerator state (model/optimizer/scheduler/RNG) from a checkpoint dir."""
        self.accelerator.print(f'Loading checkpoint: epoch_{epoch}_step_{step}')
        ckpts_save_path = os.path.join(self.output_dir,'ckpts',self.exp_name, f'epoch_{epoch}_step_{step}')
        # resume from the step AFTER the saved one
        self.start_epoch = epoch + 1
        self.global_step = step + 1
        self.accelerator.load_state(ckpts_save_path)

    def train(self):
        """Main training loop: forward, weighted loss, backward, step, log, eval."""
        # torch.autograd.set_detect_anomaly(True)
        self.accelerator.print('Start training!')
        for epoch in range(self.start_epoch, self.num_epochs):
            torch.cuda.empty_cache()
            progress_bar = tqdm(total=len(self.train_dataloader), disable=not self.accelerator.is_local_main_process)
            progress_bar.set_description(f"Epoch {epoch}")

            self.model.train()
            self.criterion.train()

            # feed GT to the SAT module during the warm-up epochs
            sat_use_gt = (epoch < self.sat_gt_epoch)

            for step, (samples,targets) in enumerate(self.train_dataloader):

                outputs = self.model(samples, targets, sat_use_gt = sat_use_gt, detach_j3ds = self.detach_j3ds)
                loss_dict = self.criterion(outputs, targets)

                # total loss = per-term losses scaled by the criterion's weights
                loss = sum(loss_dict[k] * self.weight_dict[k] for k in loss_dict.keys())


                self.accelerator.backward(loss)

                if self.accelerator.sync_gradients:
                    self.accelerator.clip_grad_norm_(self.model.parameters(), 1.0)

                self.optimizer.step()

                # scheduler is stepped per iteration (matches cosine step count above)
                self.lr_scheduler.step()
                self.optimizer.zero_grad()

                # average losses across processes; drop per-layer ('.'-named) terms
                reduced_dict = self.accelerator.reduce(loss_dict,reduction='mean')
                simplified_logs = {k: v.item() for k, v in reduced_dict.items() if '.' not in k}

                # logs.update({"lr": self.lr_scheduler.get_last_lr()[0], "step": self.global_step})
                if self.accelerator.is_main_process:
                    tqdm.write(f'[{epoch}-{step+1}/{len(self.train_dataloader)}]: ' + str(simplified_logs))

                    if step % 10 == 0:
                        self.accelerator.log({('train/'+k):v for k,v in simplified_logs.items()},
                                             step=self.global_step)

                progress_bar.update(1)
                progress_bar.set_postfix(**{"lr": self.lr_scheduler.get_last_lr()[0], "step": self.global_step})

                self.global_step += 1
            self.accelerator.wait_for_everyone()

            # self.lr_scheduler.step()

            if epoch % self.save_and_eval_epoch == 0 or epoch == self.num_epochs-1:
                self.save_and_eval(epoch, save_ckpt=True)

        self.accelerator.end_training()

    def eval(self, results_save_path = None, epoch = -1):
        """Run every configured eval dataloader through its evaluation function.

        Args:
            results_save_path: where metric files/visualizations are written;
                defaults to ``<output_dir>/<exp_name>/evaluation``.
            epoch: used as the tracker step when logging metrics in train mode.
        """
        if results_save_path is None:
            results_save_path = os.path.join(self.output_dir,self.exp_name,'evaluation')
        # preparing
        self.model.eval()
        unwrapped_model = self.unwrapped_model # self.accelerator.unwrap_model(self.model)
        if self.accelerator.is_main_process:
            os.makedirs(results_save_path,exist_ok=True)
        # evaluate
        for i, (key, eval_dataloader) in enumerate(self.eval_dataloaders.items()):
            assert key in self.eval_func_maps
            img_cnt = len(eval_dataloader) * self.eval_batch_size
            if self.distributed_eval:
                img_cnt *= self.accelerator.num_processes
            self.accelerator.print(f'Evaluate on {key}: {img_cnt} images')
            self.accelerator.print('Using following threshold(s): ', self.conf_thresh)
            # non-agora/bedlam datasets are evaluated at a single fixed threshold
            conf_thresh = self.conf_thresh if 'agora' in key or 'bedlam' in key else [0.2]
            for thresh in conf_thresh:
                # without distributed eval only the main process evaluates
                if self.accelerator.is_main_process or self.distributed_eval:
                    error_dict = self.eval_func_maps[key](model = unwrapped_model,
                                          eval_dataloader = eval_dataloader,
                                          conf_thresh = thresh,
                                          vis_step = img_cnt // self.eval_vis_num,
                                          results_save_path = os.path.join(results_save_path,key,f'thresh_{thresh}'),
                                          distributed = self.distributed_eval,
                                          accelerator = self.accelerator,
                                          vis=True)
                    if isinstance(error_dict,dict) and self.mode == 'train':
                        # flatten nested metric dicts before sending to the tracker
                        log_dict = flatten_dict(error_dict)
                        self.accelerator.log({(f'{key}_thresh_{thresh}/'+k):v for k,v in log_dict.items()}, step=epoch)

                    self.accelerator.print(f'thresh_{thresh}: ',error_dict)
                self.accelerator.wait_for_everyone()

    def save_and_eval(self, epoch, save_ckpt=False):
        """Optionally checkpoint the full training state, then evaluate.

        Evaluation is skipped while ``epoch < self.least_eval_epoch``.
        """
        torch.cuda.empty_cache()
        # save current state and model
        if self.accelerator.is_main_process and save_ckpt:
            ckpts_save_path = os.path.join(self.output_dir,'ckpts',self.exp_name, f'epoch_{epoch}_step_{self.global_step-1}')
            os.makedirs(ckpts_save_path,exist_ok=True)
            self.accelerator.save_state(ckpts_save_path, safe_serialization=False)
        self.accelerator.wait_for_everyone()

        if epoch < self.least_eval_epoch:
            return
        results_save_path = os.path.join(self.output_dir,'results',self.exp_name, f'epoch_{epoch}_step_{self.global_step-1}')
        self.eval(results_save_path, epoch=epoch)

    def infer(self):
        """Run inference over the COMMON image-folder loader at each confidence threshold."""
        self.model.eval()
        # unwrapped_model = self.accelerator.unwrap_model(self.model)
        unwrapped_model = self.unwrapped_model

        results_save_path = self.output_dir
        if self.accelerator.is_main_process:
            os.makedirs(results_save_path,exist_ok=True)

        self.accelerator.print('Using following threshold(s): ', self.conf_thresh)
        for thresh in self.conf_thresh:
            # without distributed inference only the main process runs
            if self.accelerator.is_main_process or self.distributed_infer:
                self.inference_func(model = unwrapped_model,
                                    infer_dataloader = self.infer_dataloader,
                                    conf_thresh = thresh,
                                    results_save_path = os.path.join(results_save_path,f'thresh_{thresh}'),
                                    distributed = self.distributed_infer,
                                    accelerator = self.accelerator)
            self.accelerator.wait_for_everyone()
|
| 338 |
+
def flatten_dict(d, parent_key='', sep='-'):
    """Flatten a nested dict into one level, joining key paths with `sep`.

    Example: {'a': 1, 'b': {'c': 2}} -> {'a': 1, 'b-c': 2}.
    """
    flat = {}
    for key, value in d.items():
        compound_key = f"{parent_key}{sep}{key}" if parent_key else key
        if isinstance(value, dict):
            # recurse, carrying the accumulated key prefix down
            flat.update(flatten_dict(value, compound_key, sep=sep))
        else:
            flat[compound_key] = value
    return flat
engines/funcs/__init__.py
ADDED
|
File without changes
|
engines/funcs/eval_funcs.py
ADDED
|
@@ -0,0 +1,362 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from tqdm.auto import tqdm
|
| 3 |
+
import torch
|
| 4 |
+
import numpy as np
|
| 5 |
+
from utils.evaluation import cal_3d_position_error, match_2d_greedy, get_matching_dict, compute_prf1, vectorize_distance, calculate_iou
|
| 6 |
+
from utils.transforms import pelvis_align, root_align, unNormalize
|
| 7 |
+
from utils.visualization import tensor_to_BGR, pad_img
|
| 8 |
+
from utils.visualization import vis_meshes_img, vis_boxes, vis_sat, vis_scale_img, get_colors_rgb
|
| 9 |
+
from utils.box_ops import box_cxcywh_to_xyxy, box_xyxy_to_cxcywh
|
| 10 |
+
from utils.constants import human36_eval_joint, J24_TO_H36M, H36M_TO_MPII
|
| 11 |
+
import time
|
| 12 |
+
import datetime
|
| 13 |
+
import scipy.io as sio
|
| 14 |
+
import cv2
|
| 15 |
+
import zipfile
|
| 16 |
+
import pickle
|
| 17 |
+
|
| 18 |
+
# for agora evaluation
|
# for agora evaluation
def select_and_align(smpl_joints, smpl_verts, body_verts_ind):
    """Select the 24 SMPL body joints and the body-only vertices, then pelvis-align both.

    Args:
        smpl_joints: per-person joint array; only the first 24 rows (SMPL body joints) are kept.
        smpl_verts: full SMPL vertex array, indexed down to body vertices via `body_verts_ind`.
        body_verts_ind: index array selecting body (non-hand/face) vertices.

    Returns:
        (joints, verts): both translated so the pelvis (derived from `joints`) is at the origin.
        NOTE(review): exact pelvis definition lives in utils.transforms.pelvis_align — the
        two-arg form aligns `verts` using the pelvis computed from `joints`.
    """
    joints = smpl_joints[:24, :]
    verts = smpl_verts[body_verts_ind, :]
    assert len(verts.shape) == 2
    # Align verts first, while `joints` is still un-shifted, so both use the same pelvis.
    verts = pelvis_align(joints, verts)
    joints = pelvis_align(joints)
    return joints, verts
| 26 |
+
|
| 27 |
+
|
| 28 |
+
# Modified from agora_evaluation
|
| 29 |
+
def evaluate_agora(model, eval_dataloader, conf_thresh,
|
| 30 |
+
vis = True, vis_step = 40, results_save_path = None,
|
| 31 |
+
distributed = False, accelerator = None):
|
| 32 |
+
assert results_save_path is not None
|
| 33 |
+
assert accelerator is not None
|
| 34 |
+
num_processes = accelerator.num_processes
|
| 35 |
+
|
| 36 |
+
has_kid = ('train' in eval_dataloader.dataset.split and eval_dataloader.dataset.ds_name == 'agora')
|
| 37 |
+
|
| 38 |
+
os.makedirs(results_save_path,exist_ok=True)
|
| 39 |
+
if vis:
|
| 40 |
+
imgs_save_dir = os.path.join(results_save_path, 'imgs')
|
| 41 |
+
os.makedirs(imgs_save_dir, exist_ok = True)
|
| 42 |
+
|
| 43 |
+
step = 0
|
| 44 |
+
total_miss_count = 0
|
| 45 |
+
total_count = 0
|
| 46 |
+
total_fp = 0
|
| 47 |
+
mve, mpjpe = [0.], [0.]
|
| 48 |
+
|
| 49 |
+
if has_kid:
|
| 50 |
+
kid_total_miss_count = 0
|
| 51 |
+
kid_total_count = 0
|
| 52 |
+
kid_mve, kid_mpjpe = [0.], [0.]
|
| 53 |
+
|
| 54 |
+
cur_device = next(model.parameters()).device
|
| 55 |
+
smpl_layer = model.human_model
|
| 56 |
+
body_verts_ind = smpl_layer.body_vertex_idx
|
| 57 |
+
|
| 58 |
+
progress_bar = tqdm(total=len(eval_dataloader), disable=not accelerator.is_local_main_process)
|
| 59 |
+
progress_bar.set_description('evaluate')
|
| 60 |
+
for itr, (samples, targets) in enumerate(eval_dataloader):
|
| 61 |
+
samples=[sample.to(device = cur_device, non_blocking = True) for sample in samples]
|
| 62 |
+
with torch.no_grad():
|
| 63 |
+
outputs = model(samples, targets)
|
| 64 |
+
bs = len(targets)
|
| 65 |
+
for idx in range(bs):
|
| 66 |
+
#gt
|
| 67 |
+
gt_j2ds = targets[idx]['j2ds'].cpu().numpy()[:,:24,:]
|
| 68 |
+
gt_j3ds = targets[idx]['j3ds'].cpu().numpy()[:,:24,:]
|
| 69 |
+
gt_verts = targets[idx]['verts'].cpu().numpy()
|
| 70 |
+
|
| 71 |
+
#pred
|
| 72 |
+
select_queries_idx = torch.where(outputs['pred_confs'][idx] > conf_thresh)[0]
|
| 73 |
+
pred_j2ds = outputs['pred_j2ds'][idx][select_queries_idx].detach().cpu().numpy()[:,:24,:]
|
| 74 |
+
pred_j3ds = outputs['pred_j3ds'][idx][select_queries_idx].detach().cpu().numpy()[:,:24,:]
|
| 75 |
+
pred_verts = outputs['pred_verts'][idx][select_queries_idx].detach().cpu().numpy()
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
matched_verts_idx = []
|
| 79 |
+
assert len(gt_j2ds.shape) == 3 and len(pred_j2ds.shape) == 3
|
| 80 |
+
#matching
|
| 81 |
+
greedy_match = match_2d_greedy(pred_j2ds, gt_j2ds) # tuples are (idx_pred_kps, idx_gt_kps)
|
| 82 |
+
matchDict, falsePositive_count = get_matching_dict(greedy_match)
|
| 83 |
+
|
| 84 |
+
#align with matching result
|
| 85 |
+
gt_verts_list, pred_verts_list, gt_joints_list, pred_joints_list = [], [], [], []
|
| 86 |
+
gtIdxs = np.arange(len(gt_j3ds))
|
| 87 |
+
miss_flag = []
|
| 88 |
+
for gtIdx in gtIdxs:
|
| 89 |
+
gt_verts_list.append(gt_verts[gtIdx])
|
| 90 |
+
gt_joints_list.append(gt_j3ds[gtIdx])
|
| 91 |
+
if matchDict[str(gtIdx)] == 'miss' or matchDict[str(
|
| 92 |
+
gtIdx)] == 'invalid':
|
| 93 |
+
miss_flag.append(1)
|
| 94 |
+
pred_verts_list.append([])
|
| 95 |
+
pred_joints_list.append([])
|
| 96 |
+
else:
|
| 97 |
+
miss_flag.append(0)
|
| 98 |
+
pred_joints_list.append(pred_j3ds[matchDict[str(gtIdx)]])
|
| 99 |
+
pred_verts_list.append(pred_verts[matchDict[str(gtIdx)]])
|
| 100 |
+
matched_verts_idx.append(matchDict[str(gtIdx)])
|
| 101 |
+
|
| 102 |
+
if has_kid:
|
| 103 |
+
gt_kid_list = targets[idx]['kid']
|
| 104 |
+
|
| 105 |
+
#calculating 3d errors
|
| 106 |
+
for i, (gt3d, pred) in enumerate(zip(gt_joints_list, pred_joints_list)):
|
| 107 |
+
total_count += 1
|
| 108 |
+
if has_kid and gt_kid_list[i]:
|
| 109 |
+
kid_total_count += 1
|
| 110 |
+
|
| 111 |
+
# Get corresponding ground truth and predicted 3d joints and verts
|
| 112 |
+
if miss_flag[i] == 1:
|
| 113 |
+
total_miss_count += 1
|
| 114 |
+
if has_kid and gt_kid_list[i]:
|
| 115 |
+
kid_total_miss_count += 1
|
| 116 |
+
continue
|
| 117 |
+
|
| 118 |
+
gt3d = gt3d.reshape(-1, 3)
|
| 119 |
+
pred3d = pred.reshape(-1, 3)
|
| 120 |
+
gt3d_verts = gt_verts_list[i].reshape(-1, 3)
|
| 121 |
+
pred3d_verts = pred_verts_list[i].reshape(-1, 3)
|
| 122 |
+
|
| 123 |
+
gt3d, gt3d_verts = select_and_align(gt3d, gt3d_verts, body_verts_ind)
|
| 124 |
+
pred3d, pred3d_verts = select_and_align(pred3d, pred3d_verts, body_verts_ind)
|
| 125 |
+
|
| 126 |
+
#joints
|
| 127 |
+
error_j, pa_error_j = cal_3d_position_error(pred3d, gt3d)
|
| 128 |
+
mpjpe.append(error_j)
|
| 129 |
+
if has_kid and gt_kid_list[i]:
|
| 130 |
+
kid_mpjpe.append(error_j)
|
| 131 |
+
#vertices
|
| 132 |
+
error_v,pa_error_v = cal_3d_position_error(pred3d_verts, gt3d_verts)
|
| 133 |
+
mve.append(error_v)
|
| 134 |
+
if has_kid and gt_kid_list[i]:
|
| 135 |
+
kid_mve.append(error_v)
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
#counting
|
| 139 |
+
step += 1
|
| 140 |
+
total_fp += falsePositive_count
|
| 141 |
+
|
| 142 |
+
img_idx = step + accelerator.process_index*len(eval_dataloader)*bs
|
| 143 |
+
|
| 144 |
+
if vis and (img_idx%vis_step == 0):
|
| 145 |
+
img_name = targets[idx]['img_path'].split('/')[-1].split('.')[0]
|
| 146 |
+
ori_img = tensor_to_BGR(unNormalize(samples[idx]).cpu())
|
| 147 |
+
|
| 148 |
+
# render mesh
|
| 149 |
+
colors = [(1.0, 1.0, 0.9)] * len(gt_verts)
|
| 150 |
+
gt_mesh_img = vis_meshes_img(img = ori_img.copy(),
|
| 151 |
+
verts = gt_verts,
|
| 152 |
+
smpl_faces = smpl_layer.faces,
|
| 153 |
+
cam_intrinsics = targets[idx]['cam_intrinsics'].reshape(3,3).detach().cpu(),
|
| 154 |
+
colors = colors)
|
| 155 |
+
|
| 156 |
+
colors = [(1.0, 0.6, 0.6)] * len(pred_verts)
|
| 157 |
+
for i in matched_verts_idx:
|
| 158 |
+
colors[i] = (0.7, 1.0, 0.4)
|
| 159 |
+
|
| 160 |
+
# colors = get_colors_rgb(len(pred_verts))
|
| 161 |
+
pred_mesh_img = vis_meshes_img(img = ori_img.copy(),
|
| 162 |
+
verts = pred_verts,
|
| 163 |
+
smpl_faces = smpl_layer.faces,
|
| 164 |
+
cam_intrinsics = outputs['pred_intrinsics'][idx].reshape(3,3).detach().cpu(),
|
| 165 |
+
colors = colors,
|
| 166 |
+
)
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
if 'enc_outputs' not in outputs:
|
| 170 |
+
pred_scale_img = np.zeros_like(pred_mesh_img)
|
| 171 |
+
else:
|
| 172 |
+
enc_out = outputs['enc_outputs']
|
| 173 |
+
h, w = enc_out['hw'][idx]
|
| 174 |
+
flatten_map = enc_out['scale_map'].split(enc_out['lens'])[idx].detach().cpu()
|
| 175 |
+
|
| 176 |
+
ys = enc_out['pos_y'].split(enc_out['lens'])[idx]
|
| 177 |
+
xs = enc_out['pos_x'].split(enc_out['lens'])[idx]
|
| 178 |
+
scale_map = torch.zeros((h,w,2))
|
| 179 |
+
scale_map[ys,xs] = flatten_map
|
| 180 |
+
|
| 181 |
+
pred_scale_img = vis_scale_img(img = ori_img.copy(),
|
| 182 |
+
scale_map = scale_map,
|
| 183 |
+
conf_thresh = model.sat_cfg['conf_thresh'],
|
| 184 |
+
patch_size=28)
|
| 185 |
+
|
| 186 |
+
pred_boxes = outputs['pred_boxes'][idx][select_queries_idx].detach().cpu()
|
| 187 |
+
pred_boxes = box_cxcywh_to_xyxy(pred_boxes) * model.input_size
|
| 188 |
+
pred_box_img = vis_boxes(ori_img.copy(), pred_boxes, color = (255,0,255))
|
| 189 |
+
|
| 190 |
+
# sat
|
| 191 |
+
sat_img = vis_sat(ori_img.copy(),
|
| 192 |
+
input_size = model.input_size,
|
| 193 |
+
patch_size = 14,
|
| 194 |
+
sat_dict = outputs['sat'],
|
| 195 |
+
bid = idx)
|
| 196 |
+
|
| 197 |
+
ori_img = pad_img(ori_img, model.input_size)
|
| 198 |
+
|
| 199 |
+
full_img = np.vstack([np.hstack([ori_img, sat_img]),
|
| 200 |
+
np.hstack([pred_scale_img, pred_box_img]),
|
| 201 |
+
np.hstack([gt_mesh_img, pred_mesh_img])])
|
| 202 |
+
|
| 203 |
+
cv2.imwrite(os.path.join(imgs_save_dir, f'{img_idx}_{img_name}.png'), full_img)
|
| 204 |
+
|
| 205 |
+
progress_bar.update(1)
|
| 206 |
+
|
| 207 |
+
if distributed:
|
| 208 |
+
mve = accelerator.gather_for_metrics(mve)
|
| 209 |
+
mpjpe = accelerator.gather_for_metrics(mpjpe)
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
total_miss_count = sum(accelerator.gather_for_metrics([total_miss_count]))
|
| 213 |
+
total_count = sum(accelerator.gather_for_metrics([total_count]))
|
| 214 |
+
total_fp = sum(accelerator.gather_for_metrics([total_fp]))
|
| 215 |
+
|
| 216 |
+
if has_kid:
|
| 217 |
+
kid_mve = accelerator.gather_for_metrics(kid_mve)
|
| 218 |
+
kid_mpjpe = accelerator.gather_for_metrics(kid_mpjpe)
|
| 219 |
+
kid_total_miss_count = sum(accelerator.gather_for_metrics([kid_total_miss_count]))
|
| 220 |
+
kid_total_count = sum(accelerator.gather_for_metrics([kid_total_count]))
|
| 221 |
+
|
| 222 |
+
if len(mpjpe) <= num_processes:
|
| 223 |
+
return "Failed to evaluate. Keep training!"
|
| 224 |
+
if has_kid and len(kid_mpjpe) <= num_processes:
|
| 225 |
+
return "Failed to evaluate. Keep training!"
|
| 226 |
+
|
| 227 |
+
precision, recall, f1 = compute_prf1(total_count,total_miss_count,total_fp)
|
| 228 |
+
error_dict = {}
|
| 229 |
+
error_dict['precision'] = precision
|
| 230 |
+
error_dict['recall'] = recall
|
| 231 |
+
error_dict['f1'] = f1
|
| 232 |
+
|
| 233 |
+
error_dict['MPJPE'] = round(sum(mpjpe)/(len(mpjpe)-num_processes), 1)
|
| 234 |
+
error_dict['NMJE'] = round(error_dict['MPJPE'] / (f1), 1)
|
| 235 |
+
error_dict['MVE'] = round(sum(mve)/(len(mve)-num_processes), 1)
|
| 236 |
+
error_dict['NMVE'] = round(error_dict['MVE'] / (f1), 1)
|
| 237 |
+
|
| 238 |
+
if has_kid:
|
| 239 |
+
kid_precision, kid_recall, kid_f1 = compute_prf1(kid_total_count,kid_total_miss_count,total_fp)
|
| 240 |
+
error_dict['kid_precision'] = kid_precision
|
| 241 |
+
error_dict['kid_recall'] = kid_recall
|
| 242 |
+
error_dict['kid_f1'] = kid_f1
|
| 243 |
+
|
| 244 |
+
error_dict['kid-MPJPE'] = round(sum(kid_mpjpe)/(len(kid_mpjpe)-num_processes), 1)
|
| 245 |
+
error_dict['kid-NMJE'] = round(error_dict['kid-MPJPE'] / (kid_f1), 1)
|
| 246 |
+
error_dict['kid-MVE'] = round(sum(kid_mve)/(len(kid_mve)-num_processes), 1)
|
| 247 |
+
error_dict['kid-NMVE'] = round(error_dict['kid-MVE'] / (kid_f1), 1)
|
| 248 |
+
|
| 249 |
+
|
| 250 |
+
if accelerator.is_main_process:
|
| 251 |
+
with open(os.path.join(results_save_path,'results.txt'),'w') as f:
|
| 252 |
+
for k,v in error_dict.items():
|
| 253 |
+
f.write(f'{k}: {v}\n')
|
| 254 |
+
|
| 255 |
+
return error_dict
|
| 256 |
+
|
def test_agora(model, eval_dataloader, conf_thresh,
               vis = True, vis_step = 400, results_save_path = None,
               distributed = False, accelerator = None):
    """Run the model on the AGORA test split (no GT) and write a submission zip.

    Writes one pickle per detected person under `results_save_path/predictions`
    in the format expected by the AGORA evaluation server, optionally dumps
    visualization images, and zips the predictions folder with a timestamped name.

    Args:
        model: SAT-HMR model; must expose `human_model`, `input_size`, `sat_cfg`.
        eval_dataloader: yields (samples, targets); targets carry 'img_name'.
        conf_thresh: confidence threshold for keeping predicted queries.
        vis / vis_step: enable and stride qualitative image dumps.
        results_save_path: required output directory.
        distributed: unused here; kept for a uniform engine interface.
        accelerator: required HF `accelerate` Accelerator.

    Returns:
        str message with the predictions directory.
    """
    assert results_save_path is not None
    assert accelerator is not None

    os.makedirs(os.path.join(results_save_path,'predictions'),exist_ok=True)
    if vis:
        imgs_save_dir = os.path.join(results_save_path, 'imgs')
        os.makedirs(imgs_save_dir, exist_ok = True)
    step = 0
    cur_device = next(model.parameters()).device
    smpl_layer = model.human_model

    progress_bar = tqdm(total=len(eval_dataloader), disable=not accelerator.is_local_main_process)
    progress_bar.set_description('testing')
    for itr, (samples, targets) in enumerate(eval_dataloader):
        samples=[sample.to(device = cur_device, non_blocking = True) for sample in samples]
        with torch.no_grad():
            outputs = model(samples, targets)
        bs = len(targets)
        for idx in range(bs):
            # gt (only the image name is needed on the test split)
            img_name = targets[idx]['img_name'].split('.')[0]
            # pred: keep only queries above the confidence threshold.
            select_queries_idx = torch.where(outputs['pred_confs'][idx] > conf_thresh)[0]
            # 2D joints are rescaled from model input resolution to the 3840px
            # originals expected by the AGORA server — TODO confirm all test
            # images are 3840 wide.
            pred_j2ds = np.array(outputs['pred_j2ds'][idx][select_queries_idx].detach().to('cpu'))[:,:24,:]*(3840/model.input_size)
            pred_j3ds = np.array(outputs['pred_j3ds'][idx][select_queries_idx].detach().to('cpu'))[:,:24,:]
            pred_verts = np.array(outputs['pred_verts'][idx][select_queries_idx].detach().to('cpu'))
            pred_poses = np.array(outputs['pred_poses'][idx][select_queries_idx].detach().to('cpu'))
            pred_betas = np.array(outputs['pred_betas'][idx][select_queries_idx].detach().to('cpu'))

            # visualization
            step+=1
            # Globally-unique image index across processes for vis sampling.
            img_idx = step + accelerator.process_index*len(eval_dataloader)*bs
            if vis and (img_idx%vis_step == 0):
                ori_img = tensor_to_BGR(unNormalize(samples[idx]).cpu())
                ori_img = pad_img(ori_img, model.input_size)

                sat_img = vis_sat(ori_img.copy(),
                                  input_size = model.input_size,
                                  patch_size = 14,
                                  sat_dict = outputs['sat'],
                                  bid = idx)

                colors = get_colors_rgb(len(pred_verts))
                mesh_img = vis_meshes_img(img = ori_img.copy(),
                                          verts = pred_verts,
                                          smpl_faces = smpl_layer.faces,
                                          colors = colors,
                                          cam_intrinsics = outputs['pred_intrinsics'][idx].detach().cpu())

                if 'enc_outputs' not in outputs:
                    pred_scale_img = np.zeros_like(ori_img)
                else:
                    # Scatter the flattened per-token scale map back onto its (h, w) grid.
                    enc_out = outputs['enc_outputs']
                    h, w = enc_out['hw'][idx]
                    flatten_map = enc_out['scale_map'].split(enc_out['lens'])[idx].detach().cpu()

                    ys = enc_out['pos_y'].split(enc_out['lens'])[idx]
                    xs = enc_out['pos_x'].split(enc_out['lens'])[idx]
                    scale_map = torch.zeros((h,w,2))
                    scale_map[ys,xs] = flatten_map
                    pred_scale_img = vis_scale_img(img = ori_img.copy(),
                                                   scale_map = scale_map,
                                                   conf_thresh = model.sat_cfg['conf_thresh'],
                                                   patch_size=28)

                # 2x2 tile: original/mesh, scale/sat
                full_img = np.vstack([np.hstack([ori_img, mesh_img]),
                                      np.hstack([pred_scale_img, sat_img])])
                cv2.imwrite(os.path.join(imgs_save_dir, f'{img_idx}_{img_name}.jpg'), full_img)

            # submit: one pickle per detected person, AGORA submission schema
            for pnum in range(len(pred_j2ds)):
                smpl_dict = {}
                # smpl_dict['age'] = 'kid'
                smpl_dict['joints'] = pred_j2ds[pnum].reshape(24,2)
                smpl_dict['params'] = {'transl': np.zeros((1,3)),
                                       'betas': pred_betas[pnum].reshape(1,10),
                                       'global_orient': pred_poses[pnum][:3].reshape(1,1,3),
                                       'body_pose': pred_poses[pnum][3:].reshape(1,23,3)}
                # smpl_dict['verts'] = pred_verts[pnum].reshape(6890,3)
                # smpl_dict['allSmplJoints3d'] = pred_j3ds[pnum].reshape(24,3)
                with open(os.path.join(results_save_path,'predictions',f'{img_name}_personId_{pnum}.pkl'), 'wb') as f:
                    pickle.dump(smpl_dict, f)

        progress_bar.update(1)

    accelerator.print('Packing...')

    # Zip the predictions folder (timestamped) for upload to the AGORA server.
    folder_path = os.path.join(results_save_path,'predictions')
    now = datetime.datetime.now()
    timestamp = now.strftime("%Y%m%d_%H%M%S")
    output_path = os.path.join(results_save_path,f'pred_{timestamp}.zip')
    with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, dirs, files in os.walk(folder_path):
            for file in files:
                file_path = os.path.join(root, file)
                # Keep the 'predictions/...' prefix inside the archive.
                arcname = os.path.relpath(file_path, os.path.dirname(folder_path))
                zipf.write(file_path, arcname)

    return 'Results saved at: ' + os.path.join(results_save_path,'predictions')
engines/funcs/infer_funcs.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import pickle
|
| 3 |
+
from tqdm.auto import tqdm
|
| 4 |
+
import torch
|
| 5 |
+
import numpy as np
|
| 6 |
+
from utils.transforms import unNormalize
|
| 7 |
+
from utils.visualization import tensor_to_BGR, pad_img
|
| 8 |
+
from utils.visualization import vis_meshes_img, vis_boxes, vis_sat, vis_scale_img, get_colors_rgb
|
| 9 |
+
from utils.box_ops import box_cxcywh_to_xyxy, box_xyxy_to_cxcywh
|
| 10 |
+
import time
|
| 11 |
+
import cv2
|
| 12 |
+
import trimesh
|
| 13 |
+
|
def inference(model, infer_dataloader, conf_thresh, results_save_path = None,
              distributed = False, accelerator = None):
    """Run the model on arbitrary images and save composite visualizations.

    For every image, saves a 2x2 tile (boxes / mesh / scale map / sat map)
    cropped back to the original image size under `results_save_path`.

    Args:
        model: SAT-HMR model; must expose `human_model`, `input_size`, `sat_cfg`.
        infer_dataloader: yields (samples, targets); targets carry 'img_size', 'img_path'.
        conf_thresh: confidence threshold for keeping predicted queries.
        results_save_path: required output directory.
        distributed: unused here; kept for a uniform engine interface.
        accelerator: required HF `accelerate` Accelerator.
    """
    assert results_save_path is not None
    assert accelerator is not None

    accelerator.print(f'Results will be saved at: {results_save_path}')
    os.makedirs(results_save_path,exist_ok=True)
    cur_device = next(model.parameters()).device
    smpl_layer = model.human_model

    progress_bar = tqdm(total=len(infer_dataloader), disable=not accelerator.is_local_main_process)
    progress_bar.set_description('inference')

    for itr, (samples, targets) in enumerate(infer_dataloader):
        samples=[sample.to(device = cur_device, non_blocking = True) for sample in samples]
        with torch.no_grad():
            outputs = model(samples, targets)
        bs = len(targets)
        for idx in range(bs):
            # (h, w) of the original image before padding to model.input_size
            img_size = targets[idx]['img_size'].detach().cpu().int().numpy()
            img_name = targets[idx]['img_path'].split('/')[-1].split('.')[0]

            # pred: keep only queries above the confidence threshold
            select_queries_idx = torch.where(outputs['pred_confs'][idx] > conf_thresh)[0]
            pred_verts = outputs['pred_verts'][idx][select_queries_idx].detach().cpu().numpy()

            # Whiten the padded region so the crop borders look clean.
            ori_img = tensor_to_BGR(unNormalize(samples[idx]).cpu())
            ori_img[img_size[0]:,:,:] = 255
            ori_img[:,img_size[1]:,:] = 255
            ori_img[img_size[0]:,img_size[1]:,:] = 255
            ori_img = pad_img(ori_img, model.input_size, pad_color_offset=255)

            # Each panel is cropped back to the original (h, w).
            sat_img = vis_sat(ori_img.copy(),
                              input_size = model.input_size,
                              patch_size = 14,
                              sat_dict = outputs['sat'],
                              bid = idx)[:img_size[0],:img_size[1]]

            colors = get_colors_rgb(len(pred_verts))
            pred_mesh_img = vis_meshes_img(img = ori_img.copy(),
                                           verts = pred_verts,
                                           smpl_faces = smpl_layer.faces,
                                           cam_intrinsics = outputs['pred_intrinsics'][idx].reshape(3,3).detach().cpu(),
                                           colors=colors)[:img_size[0],:img_size[1]]

            if 'enc_outputs' not in outputs:
                pred_scale_img = np.zeros_like(ori_img)[:img_size[0],:img_size[1]]
            else:
                # Scatter the flattened per-token scale map back onto its (h, w) grid.
                enc_out = outputs['enc_outputs']
                h, w = enc_out['hw'][idx]
                flatten_map = enc_out['scale_map'].split(enc_out['lens'])[idx].detach().cpu()

                ys = enc_out['pos_y'].split(enc_out['lens'])[idx]
                xs = enc_out['pos_x'].split(enc_out['lens'])[idx]
                scale_map = torch.zeros((h,w,2))
                scale_map[ys,xs] = flatten_map

                pred_scale_img = vis_scale_img(img = ori_img.copy(),
                                               scale_map = scale_map,
                                               conf_thresh = model.sat_cfg['conf_thresh'],
                                               patch_size=28)[:img_size[0],:img_size[1]]

            # Boxes are predicted in normalized cxcywh; scale to input pixels.
            pred_boxes = outputs['pred_boxes'][idx][select_queries_idx].detach().cpu()
            pred_boxes = box_cxcywh_to_xyxy(pred_boxes) * model.input_size
            pred_box_img = vis_boxes(ori_img.copy(), pred_boxes, color = (255,0,255))[:img_size[0],:img_size[1]]

            cv2.imwrite(os.path.join(results_save_path, f'{img_name}.png'), np.vstack([np.hstack([pred_box_img, pred_mesh_img]),
                                                                                       np.hstack([pred_scale_img, sat_img])]))

        progress_bar.update(1)
figures/pipeline.png
ADDED
|
Git LFS Details
|
figures/qualitative_results.png
ADDED
|
Git LFS Details
|
figures/results.png
ADDED
|
Git LFS Details
|
figures/results_3d.gif
ADDED
|
Git LFS Details
|
main.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import os
|
| 3 |
+
import yaml
|
| 4 |
+
import numpy as np
|
| 5 |
+
from engines.engine import Engine
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def get_args_parser():
|
| 9 |
+
parser = argparse.ArgumentParser('SAT-HMR', add_help=False)
|
| 10 |
+
parser.add_argument('--cfg', default=None, type=str)
|
| 11 |
+
parser.add_argument('--mode',default='train',type=str)
|
| 12 |
+
|
| 13 |
+
return parser
|
| 14 |
+
|
| 15 |
+
def update_args(args, cfg_path):
|
| 16 |
+
with open(cfg_path) as f:
|
| 17 |
+
config = yaml.safe_load(f)
|
| 18 |
+
args_dict = vars(args)
|
| 19 |
+
args_dict.update(config)
|
| 20 |
+
args = argparse.Namespace(**args_dict)
|
| 21 |
+
return args
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
if __name__ == '__main__':
|
| 25 |
+
parser = argparse.ArgumentParser('SAT-HMR training and evaluation script', parents=[get_args_parser()])
|
| 26 |
+
args = parser.parse_args()
|
| 27 |
+
assert args.cfg is not None
|
| 28 |
+
args = update_args(args, os.path.join('configs', 'run', f'{args.cfg}.yaml'))
|
| 29 |
+
args.exp_name = args.cfg
|
| 30 |
+
args = update_args(args, os.path.join('configs', 'models', f'{args.model}.yaml'))
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
if args.mode.lower() == 'train':
|
| 34 |
+
raise NotImplementedError
|
| 35 |
+
from accelerate.utils import set_seed
|
| 36 |
+
seed = args.seed
|
| 37 |
+
set_seed(args.seed)
|
| 38 |
+
engine = Engine(args, mode='train')
|
| 39 |
+
engine.train()
|
| 40 |
+
|
| 41 |
+
elif args.mode.lower() == 'eval':
|
| 42 |
+
raise NotImplementedError
|
| 43 |
+
engine = Engine(args, mode='eval')
|
| 44 |
+
engine.eval()
|
| 45 |
+
|
| 46 |
+
elif args.mode.lower() == 'infer':
|
| 47 |
+
engine = Engine(args, mode='infer')
|
| 48 |
+
engine.infer()
|
| 49 |
+
|
| 50 |
+
else:
|
| 51 |
+
print('Wrong mode!')
|
| 52 |
+
exit(1)
|
models/__init__.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ------------------------------------------------------------------------
|
| 2 |
+
# Modified from DAB-DETR (https://github.com/IDEA-Research/DAB-DETR)
|
| 3 |
+
# Copyright (c) 2022 IDEA. All Rights Reserved.
|
| 4 |
+
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
|
| 5 |
+
# ------------------------------------------------------------------------
|
| 6 |
+
# Modified from Conditional DETR (https://github.com/Atten4Vis/ConditionalDETR)
|
| 7 |
+
# Copyright (c) 2021 Microsoft. All Rights Reserved.
|
| 8 |
+
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
|
| 9 |
+
# ------------------------------------------------------------------------
|
| 10 |
+
# Modified from DETR (https://github.com/facebookresearch/detr)
|
| 11 |
+
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
|
| 12 |
+
# ------------------------------------------------------------------------
|
| 13 |
+
|
| 14 |
+
from .sat_model import build_sat_model
|
| 15 |
+
|
| 16 |
+
|
models/criterion.py
ADDED
|
@@ -0,0 +1,449 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Modified from DAB-DETR (https://github.com/IDEA-Research/DAB-DETR)
|
| 2 |
+
import os
|
| 3 |
+
import torch
|
| 4 |
+
from torch import nn
|
| 5 |
+
import torch.nn.functional as F
|
| 6 |
+
from utils import box_ops
|
| 7 |
+
from utils.misc import (NestedTensor, nested_tensor_from_tensor_list,
|
| 8 |
+
accuracy, get_world_size, interpolate,
|
| 9 |
+
is_dist_avail_and_initialized, inverse_sigmoid)
|
| 10 |
+
|
| 11 |
+
def focal_loss(inputs, targets, valid_mask = None, alpha: float = 0.25, gamma: float = 2):
|
| 12 |
+
"""
|
| 13 |
+
Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002.
|
| 14 |
+
Args:
|
| 15 |
+
inputs: A float tensor of arbitrary shape.
|
| 16 |
+
The predictions for each example.
|
| 17 |
+
targets: A float tensor with the same shape as inputs. Stores the binary
|
| 18 |
+
classification label for each element in inputs
|
| 19 |
+
(0 for the negative class and 1 for the positive class).
|
| 20 |
+
alpha: (optional) Weighting factor in range (0,1) to balance
|
| 21 |
+
positive vs negative examples. Default = -1 (no weighting).
|
| 22 |
+
gamma: Exponent of the modulating factor (1 - p_t) to
|
| 23 |
+
balance easy vs hard examples.
|
| 24 |
+
Returns:
|
| 25 |
+
Loss tensor
|
| 26 |
+
"""
|
| 27 |
+
# prob = inputs.sigmoid()
|
| 28 |
+
# ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
|
| 29 |
+
prob = inputs
|
| 30 |
+
ce_loss = F.binary_cross_entropy(inputs, targets, reduction="none")
|
| 31 |
+
p_t = prob * targets + (1 - prob) * (1 - targets)
|
| 32 |
+
loss = ce_loss * ((1 - p_t) ** gamma)
|
| 33 |
+
|
| 34 |
+
if alpha >= 0:
|
| 35 |
+
alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
|
| 36 |
+
loss = alpha_t * loss
|
| 37 |
+
|
| 38 |
+
# if valid_mask is not None:
|
| 39 |
+
# loss = loss * valid_mask
|
| 40 |
+
|
| 41 |
+
return loss.mean()
|
| 42 |
+
|
| 43 |
+
class SetCriterion(nn.Module):
|
| 44 |
+
""" This class computes the loss for DETR.
|
| 45 |
+
The process happens in two steps:
|
| 46 |
+
1) we compute hungarian assignment between ground truth boxes and the outputs of the model
|
| 47 |
+
2) we supervise each pair of matched ground-truth / prediction (supervise class and box)
|
| 48 |
+
"""
|
| 49 |
+
def __init__(self, matcher, weight_dict, losses = ['confs','boxes', 'poses','betas', 'j3ds','j2ds', 'depths', 'kid_offsets'],
|
| 50 |
+
focal_alpha=0.25, focal_gamma = 2.0, j2ds_norm_scale = 518):
|
| 51 |
+
""" Create the criterion.
|
| 52 |
+
Parameters:
|
| 53 |
+
num_classes: number of object categories, omitting the special no-object category
|
| 54 |
+
matcher: module able to compute a matching between targets and proposals
|
| 55 |
+
weight_dict: dict containing as key the names of the losses and as values their relative weight.
|
| 56 |
+
losses: list of all the losses to be applied. See get_loss for list of available losses.
|
| 57 |
+
focal_alpha: alpha in Focal Loss
|
| 58 |
+
"""
|
| 59 |
+
super().__init__()
|
| 60 |
+
self.matcher = matcher
|
| 61 |
+
self.losses = losses
|
| 62 |
+
if 'boxes' in losses and 'giou' not in weight_dict:
|
| 63 |
+
weight_dict.update({'giou': weight_dict['boxes']})
|
| 64 |
+
self.weight_dict = weight_dict
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
self.betas_weight = torch.tensor([2.56, 1.28, 0.64, 0.64, 0.32, 0.32, 0.32, 0.32, 0.32, 0.32]).unsqueeze(0).float()
|
| 68 |
+
self.focal_alpha = focal_alpha
|
| 69 |
+
self.focal_gamma = focal_gamma
|
| 70 |
+
self.j2ds_norm_scale = j2ds_norm_scale
|
| 71 |
+
self.device = None
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def loss_boxes(self, loss, outputs, targets, indices, num_instances, **kwargs):
|
| 75 |
+
"""Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss
|
| 76 |
+
targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]
|
| 77 |
+
The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size.
|
| 78 |
+
"""
|
| 79 |
+
assert 'pred_boxes' in outputs
|
| 80 |
+
assert loss == 'boxes'
|
| 81 |
+
idx = self._get_src_permutation_idx(indices)
|
| 82 |
+
valid_idx = torch.where(torch.cat([torch.ones(len(i), dtype=bool, device = self.device)*(loss in t) for t, (_, i) in zip(targets, indices)]))[0]
|
| 83 |
+
|
| 84 |
+
if len(valid_idx) == 0:
|
| 85 |
+
return {loss: torch.tensor(0.).to(self.device)}
|
| 86 |
+
|
| 87 |
+
src = outputs['pred_'+loss][idx][valid_idx]
|
| 88 |
+
target = torch.cat([t[loss][i] for t, (_, i) in zip(targets, indices) if loss in t], dim=0)
|
| 89 |
+
assert src.shape == target.shape
|
| 90 |
+
|
| 91 |
+
src_boxes = src
|
| 92 |
+
target_boxes = target
|
| 93 |
+
|
| 94 |
+
loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction='none')
|
| 95 |
+
|
| 96 |
+
losses = {}
|
| 97 |
+
losses['boxes'] = loss_bbox.sum() / num_instances
|
| 98 |
+
|
| 99 |
+
loss_giou = 1 - torch.diag(box_ops.generalized_box_iou(
|
| 100 |
+
box_ops.box_cxcywh_to_xyxy(src_boxes),
|
| 101 |
+
box_ops.box_cxcywh_to_xyxy(target_boxes)))
|
| 102 |
+
losses['giou'] = loss_giou.sum() / num_instances
|
| 103 |
+
|
| 104 |
+
# # calculate the x,y and h,w loss
|
| 105 |
+
# with torch.no_grad():
|
| 106 |
+
# losses['loss_xy'] = loss_bbox[..., :2].sum() / num_boxes
|
| 107 |
+
# losses['loss_hw'] = loss_bbox[..., 2:].sum() / num_boxes
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
return losses
|
| 111 |
+
|
| 112 |
+
def loss_boxes_enc(self, loss, outputs, targets, indices, num_instances, **kwargs):
|
| 113 |
+
"""Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss
|
| 114 |
+
targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]
|
| 115 |
+
The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size.
|
| 116 |
+
"""
|
| 117 |
+
assert 'pred_boxes' in outputs
|
| 118 |
+
assert loss == 'boxes_enc'
|
| 119 |
+
loss = 'boxes'
|
| 120 |
+
|
| 121 |
+
valid_idx = torch.where(torch.cat([torch.ones(len(i), dtype=bool, device = self.device)*(loss in t) for t, (_, i) in zip(targets, indices)]))[0]
|
| 122 |
+
|
| 123 |
+
if len(valid_idx) == 0:
|
| 124 |
+
return {loss: torch.tensor(0.).to(self.device)}
|
| 125 |
+
|
| 126 |
+
lens = outputs['lens']
|
| 127 |
+
pred_boxes = outputs['pred_boxes']
|
| 128 |
+
src = torch.cat([s[i] for s, (i, _) in zip(pred_boxes.split(lens), indices)], dim=0)[valid_idx]
|
| 129 |
+
target = torch.cat([t[loss][i] for t, (_, i) in zip(targets, indices) if loss in t], dim=0)
|
| 130 |
+
assert src.shape == target.shape
|
| 131 |
+
|
| 132 |
+
src_boxes = src
|
| 133 |
+
target_boxes = target
|
| 134 |
+
|
| 135 |
+
loss_bbox = F.l1_loss(src_boxes, target_boxes, reduction='none')
|
| 136 |
+
|
| 137 |
+
losses = {}
|
| 138 |
+
losses['boxes'] = loss_bbox.sum() / num_instances
|
| 139 |
+
|
| 140 |
+
loss_giou = 1 - torch.diag(box_ops.generalized_box_iou(
|
| 141 |
+
box_ops.box_cxcywh_to_xyxy(src_boxes),
|
| 142 |
+
box_ops.box_cxcywh_to_xyxy(target_boxes)))
|
| 143 |
+
losses['giou'] = loss_giou.sum() / num_instances
|
| 144 |
+
|
| 145 |
+
# # calculate the x,y and h,w loss
|
| 146 |
+
# with torch.no_grad():
|
| 147 |
+
# losses['loss_xy'] = loss_bbox[..., :2].sum() / num_boxes
|
| 148 |
+
# losses['loss_hw'] = loss_bbox[..., 2:].sum() / num_boxes
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
return losses
|
| 152 |
+
|
| 153 |
+
# For computing ['boxes', 'poses', 'betas', 'j3ds', 'j2ds'] losses
|
| 154 |
+
    # Shared L1 loss used for 'boxes'(not here), 'poses', 'betas', 'j3ds', 'j2ds', 'kid_offsets'.
    def loss_L1(self, loss, outputs, targets, indices, num_instances, **kwargs):
        """Generic per-element L1 loss between matched predictions and targets.

        `loss` names both the prediction key ('pred_'+loss in outputs) and the target key.
        Instances whose target dict lacks the key (e.g. 2D-only datasets without 3D
        annotations) are dropped via valid_idx. Returns {loss: scalar} normalized by
        num_instances.
        """
        idx = self._get_src_permutation_idx(indices)
        # Boolean per matched instance: does its image carry this annotation at all?
        valid_idx = torch.where(torch.cat([torch.ones(len(i), dtype=bool, device = self.device)*(loss in t) for t, (_, i) in zip(targets, indices)]))[0]

        if len(valid_idx) == 0:
            # No supervised instance for this term in the batch.
            return {loss: torch.tensor(0.).to(self.device)}

        src = outputs['pred_'+loss][idx][valid_idx]
        target = torch.cat([t[loss][i] for t, (_, i) in zip(targets, indices) if loss in t], dim=0)
        assert src.shape == target.shape

        losses = {}
        loss_mask = None

        if loss == 'j3ds':
            # Root aligned: subtract joint 0 so only the relative pose is penalized.
            src = src - src[...,[0],:].clone()
            target = target - target[...,[0],:].clone()
            # Use 54 smpl joints
            src = src[:,:54,:]
            target = target[:,:54,:]
        elif loss == 'j2ds':
            # Normalize pixel coordinates to roughly [0, 1] by j2ds_norm_scale.
            src = src / self.j2ds_norm_scale
            target = target / self.j2ds_norm_scale
            # Need to exclude invalid kpts in 2d datasets
            loss_mask = torch.cat([t['j2ds_mask'][i] for t, (_, i) in zip(targets, indices) if 'j2ds' in t], dim=0)
            # Use 54 smpl joints
            src = src[:,:54,:]
            target = target[:,:54,:]
            loss_mask = loss_mask[:,:54,:]

        valid_loss = torch.abs(src-target)

        if loss_mask is not None:
            # Zero out invisible/unsupervised keypoints.
            valid_loss = valid_loss * loss_mask
        if loss == 'betas':
            # Weight leading shape coefficients more heavily (see __init__).
            valid_loss = valid_loss*self.betas_weight.to(src.device)

        # Mean over per-instance elements, sum over instances, normalize by batch count.
        losses[loss] = valid_loss.flatten(1).mean(-1).sum()/num_instances

        return losses
|
| 203 |
+
|
| 204 |
+
    def loss_scale_map(self, loss, outputs, targets, indices, num_instances, **kwargs):
        """Loss on the encoder's dense scale map.

        Channel 0 of each map holds detection confidence labels, channel 1 holds a
        per-location scale value. Emits 'map_confs' (focal loss on channel 0) and
        'map_scales' (L1 on channel 1, only where a positive label exists).
        NOTE(review): matcher `indices` are unused here — the map is supervised densely.
        """
        assert loss == 'scale_map'

        pred_map = outputs['enc_outputs']['scale_map']
        tgt_map = torch.cat([t['scale_map'] for t in targets], dim=0)
        assert pred_map.shape == tgt_map.shape

        labels = tgt_map[:,0]
        pred_scales = pred_map[:,1]
        tgt_scales = tgt_map[:, 1]

        # Positives are always valid; negatives are only valid when the image is
        # annotated with all people present ('detect_all_people').
        detection_valid_mask = labels.bool()
        cur = 0
        lens = [len(t['scale_map']) for t in targets]
        for i, tgt in enumerate(targets):
            if tgt['detect_all_people']:
                detection_valid_mask[cur:cur+lens[i]] = True
            cur += lens[i]

        losses = {}
        # NOTE(review): the visible focal_loss implementation has its valid_mask use
        # commented out, so the mask may currently be a no-op — confirm upstream.
        losses['map_confs'] = focal_loss(pred_map[:,0], labels, valid_mask=detection_valid_mask)/1.
        losses['map_scales'] = torch.abs((pred_scales - tgt_scales)[torch.where(labels)[0]]).sum()/num_instances

        return losses
|
| 230 |
+
|
| 231 |
+
    def loss_confs(self, loss, outputs, targets, indices, num_instances, is_dn=False, **kwargs):
        """Focal classification loss on per-query confidence.

        Matched queries get label 1, all others 0. For non-dn training, negatives only
        count for images flagged 'detect_all_people' (otherwise unlabeled people would
        be wrongly penalized as background).
        """
        assert loss == 'confs'
        idx = self._get_src_permutation_idx(indices)
        pred_confs = outputs['pred_'+loss]

        # Labels/masks are supervision targets — keep them out of the graph.
        with torch.no_grad():
            labels = torch.zeros_like(pred_confs)
            labels[idx] = 1
            detection_valid_mask = torch.zeros_like(pred_confs,dtype=bool)
            detection_valid_mask[idx] = True
            valid_batch_idx = torch.where(torch.tensor([t['detect_all_people'] for t in targets]))[0]
            detection_valid_mask[valid_batch_idx] = True

        losses = {}
        if is_dn:
            # Denoising queries are all synthesized from GT, so every query is valid.
            losses[loss] = focal_loss(pred_confs, labels) / num_instances
        else:
            # NOTE(review): the visible focal_loss has the valid_mask application
            # commented out — confirm whether the mask is actually honored.
            losses[loss] = focal_loss(pred_confs, labels, valid_mask = detection_valid_mask) / num_instances

        return losses
|
| 252 |
+
|
| 253 |
+
    def loss_confs_enc(self, loss, outputs, targets, indices, num_instances, **kwargs):
        """Encoder-side confidence focal loss over flat (variable-length) predictions.

        Predictions for all images are concatenated; outputs['lens'] gives per-image
        lengths, so matcher indices are offset by a running cursor to index the flat
        tensor. Returns {'confs': scalar}.
        """
        assert loss == 'confs_enc'
        # Reported under the same key as the decoder confidence loss.
        loss = 'confs'

        lens = outputs['lens']
        pred_confs = outputs['pred_confs']
        detection_valid_mask = torch.zeros_like(pred_confs,dtype=bool)
        labels = torch.zeros_like(pred_confs)

        cur = 0
        idx = []
        for i, (src, tgt) in enumerate(indices):
            # Shift this image's matched indices into the flat coordinate space.
            idx += (src + cur).tolist()
            if targets[i]['detect_all_people']:
                detection_valid_mask[cur:cur+lens[i]] = True
            cur += lens[i]
        detection_valid_mask[idx] = True
        labels[idx] = 1

        # Add a leading batch dim so focal_loss sees a (1, N) batch.
        pred_confs = pred_confs.unsqueeze(0)
        labels = labels.unsqueeze(0)
        detection_valid_mask = detection_valid_mask.unsqueeze(0)

        losses = {}
        # NOTE(review): the valid_mask variant is deliberately disabled here; the mask
        # above is computed but unused.
        losses[loss] = focal_loss(pred_confs, labels)
        return losses
|
| 280 |
+
|
| 281 |
+
    def loss_L2(self, loss, outputs, targets, indices, num_instances, **kwargs):
        # Unused stub: not registered in get_loss()'s dispatch table; kept only for
        # interface symmetry with loss_L1.
        pass
|
| 283 |
+
|
| 284 |
+
    def loss_absolute_depths(self, loss, outputs, targets, indices, num_instances, **kwargs):
        """Loss on absolute person depth, compared in inverse-depth space.

        Predictions carry [d, d/f] per instance (channel 1 is focal-normalized depth);
        multiplying by the GT focal length recovers an absolute depth, which is then
        compared to the GT depth via |1/d_pred - 1/d_gt| (emphasizes near-range error).
        """
        assert loss == 'depths'
        losses = {}
        idx = self._get_src_permutation_idx(indices)
        valid_idx = torch.where(torch.cat([torch.ones(len(i), dtype=bool, device = self.device)*(loss in t) for t, (_, i) in zip(targets, indices)]))[0]

        if len(valid_idx) == 0:
            return {loss: torch.tensor(0.).to(self.device)}

        src = outputs['pred_'+loss][idx][valid_idx][...,[1]] # [d d/f]
        target = torch.cat([t[loss][i] for t, (_, i) in zip(targets, indices) if loss in t], dim=0)[...,[0]]
        target_focals = torch.cat([t['focals'][i] for t, (_, i) in zip(targets, indices) if loss in t], dim=0)

        # Denormalize: absolute depth = focal * (d/f).
        src = target_focals * src

        assert src.shape == target.shape

        # Inverse-depth L1; epsilon guards against division by zero.
        valid_loss = torch.abs(1./(src + 1e-8) - 1./(target + 1e-8))
        losses[loss] = valid_loss.flatten(1).mean(-1).sum()/num_instances
        return losses
|
| 306 |
+
|
| 307 |
+
|
| 308 |
+
def _get_src_permutation_idx(self, indices):
|
| 309 |
+
# permute predictions following indices
|
| 310 |
+
batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(indices)])
|
| 311 |
+
src_idx = torch.cat([src for (src, _) in indices])
|
| 312 |
+
return batch_idx, src_idx
|
| 313 |
+
|
| 314 |
+
def _get_tgt_permutation_idx(self, indices):
|
| 315 |
+
# permute targets following indices
|
| 316 |
+
batch_idx = torch.cat([torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)])
|
| 317 |
+
tgt_idx = torch.cat([tgt for (_, tgt) in indices])
|
| 318 |
+
return batch_idx, tgt_idx
|
| 319 |
+
|
| 320 |
+
    def get_loss(self, loss, outputs, targets, indices, num_instances, **kwargs):
        """Dispatch a loss name to its implementation. Raises KeyError for unknown names
        (e.g. 'kid_offsets' is handled by loss_L1 only if it is ever added here —
        NOTE(review): 'kid_offsets' appears in the default losses list but has no entry
        in this table; confirm intended)."""
        loss_map = {
            'confs': self.loss_confs,
            'boxes': self.loss_boxes,
            'confs_enc': self.loss_confs_enc,
            'boxes_enc': self.loss_boxes_enc,
            'poses': self.loss_L1,
            'betas': self.loss_L1,
            'j3ds': self.loss_L1,
            'j2ds': self.loss_L1,
            'depths': self.loss_absolute_depths,
            'scale_map': self.loss_scale_map,
        }
        return loss_map[loss](loss, outputs, targets, indices, num_instances, **kwargs)
|
| 335 |
+
|
| 336 |
+
|
| 337 |
+
    def get_valid_instances(self, targets):
        """Per-loss normalizers: average number of supervised instances across all
        distributed workers (for batch-size-invariant loss scaling).

        For 'scale_map' the count is the number of positive map locations; for every
        other loss it is the summed person count of targets carrying that annotation.
        'confs' is forced to 1 at the end, i.e. the confidence focal loss is not
        instance-normalized here (it divides by num_instances at the call site).
        """
        num_valid_instances = {}
        for loss in self.losses:
            num_instances = 0
            if loss != 'scale_map':
                for t in targets:
                    num_instances += t['pnum'] if loss in t else 0
                num_instances = torch.as_tensor([num_instances], dtype=torch.float, device=self.device)
            else:
                for t in targets:
                    num_instances += t['scale_map'][...,0].sum().item()
                num_instances = torch.as_tensor([num_instances], dtype=torch.float, device=self.device)
            # Average the count over workers; clamp so we never divide by zero.
            if is_dist_avail_and_initialized():
                torch.distributed.all_reduce(num_instances)
            num_instances = torch.clamp(num_instances / get_world_size(), min=1).item()
            num_valid_instances[loss] = num_instances
        num_valid_instances['confs'] = 1.
        return num_valid_instances
|
| 357 |
+
|
| 358 |
+
def prep_for_dn(self, dn_meta):
|
| 359 |
+
output_known = dn_meta['output_known']
|
| 360 |
+
num_dn_groups, pad_size = dn_meta['num_dn_group'], dn_meta['pad_size']
|
| 361 |
+
assert pad_size % num_dn_groups == 0
|
| 362 |
+
single_pad = pad_size//num_dn_groups
|
| 363 |
+
|
| 364 |
+
return output_known, single_pad, num_dn_groups
|
| 365 |
+
|
| 366 |
+
def forward(self, outputs, targets):
|
| 367 |
+
""" This performs the loss computation.
|
| 368 |
+
Parameters:
|
| 369 |
+
outputs: dict of tensors, see the output specification of the model for the format
|
| 370 |
+
targets: list of dicts, such that len(targets) == batch_size.
|
| 371 |
+
The expected keys in each dict depends on the losses applied, see each loss' doc
|
| 372 |
+
"""
|
| 373 |
+
# remove invalid information in targets
|
| 374 |
+
for t in targets:
|
| 375 |
+
if not t['3d_valid']:
|
| 376 |
+
for key in ['betas', 'kid_offsets', 'poses', 'j3ds', 'depths', 'focals']:
|
| 377 |
+
if key in t:
|
| 378 |
+
del t[key]
|
| 379 |
+
|
| 380 |
+
outputs_without_aux = {k: v for k, v in outputs.items() if k != 'aux_outputs' and k != 'enc_outputs' and k != 'sat'}
|
| 381 |
+
# Retrieve the matching between the outputs of the last layer and the targets
|
| 382 |
+
indices = self.matcher(outputs_without_aux, targets)
|
| 383 |
+
self.device = outputs['pred_poses'].device
|
| 384 |
+
num_valid_instances = self.get_valid_instances(targets)
|
| 385 |
+
|
| 386 |
+
# Compute all the requested losses
|
| 387 |
+
losses = {}
|
| 388 |
+
|
| 389 |
+
# prepare for dn loss
|
| 390 |
+
if 'dn_meta' in outputs:
|
| 391 |
+
dn_meta = outputs['dn_meta']
|
| 392 |
+
output_known, single_pad, scalar = self.prep_for_dn(dn_meta)
|
| 393 |
+
|
| 394 |
+
dn_pos_idx = []
|
| 395 |
+
dn_neg_idx = []
|
| 396 |
+
for i in range(len(targets)):
|
| 397 |
+
assert len(targets[i]['boxes']) > 0
|
| 398 |
+
# t = torch.range(0, len(targets[i]['labels']) - 1).long().to(self.device)
|
| 399 |
+
t = torch.arange(0, len(targets[i]['labels'])).long().to(self.device)
|
| 400 |
+
t = t.unsqueeze(0).repeat(scalar, 1)
|
| 401 |
+
tgt_idx = t.flatten()
|
| 402 |
+
output_idx = (torch.tensor(range(scalar)) * single_pad).long().to(self.device).unsqueeze(1) + t
|
| 403 |
+
output_idx = output_idx.flatten()
|
| 404 |
+
|
| 405 |
+
dn_pos_idx.append((output_idx, tgt_idx))
|
| 406 |
+
dn_neg_idx.append((output_idx + single_pad // 2, tgt_idx))
|
| 407 |
+
|
| 408 |
+
l_dict = {}
|
| 409 |
+
for loss in self.losses:
|
| 410 |
+
if loss == 'scale_map':
|
| 411 |
+
continue
|
| 412 |
+
l_dict.update(self.get_loss(loss, output_known, targets, dn_pos_idx, num_valid_instances[loss]*scalar, is_dn=True))
|
| 413 |
+
|
| 414 |
+
l_dict = {k + f'_dn': v for k, v in l_dict.items()}
|
| 415 |
+
losses.update(l_dict)
|
| 416 |
+
|
| 417 |
+
for loss in self.losses:
|
| 418 |
+
losses.update(self.get_loss(loss, outputs, targets, indices, num_valid_instances[loss]))
|
| 419 |
+
|
| 420 |
+
# In case of auxiliary losses, we repeat this process with the output of each intermediate layer.
|
| 421 |
+
if 'aux_outputs' in outputs:
|
| 422 |
+
for i, aux_outputs in enumerate(outputs['aux_outputs']):
|
| 423 |
+
indices = self.matcher(aux_outputs, targets)
|
| 424 |
+
for loss in self.losses:
|
| 425 |
+
if loss == 'scale_map':
|
| 426 |
+
continue
|
| 427 |
+
l_dict = self.get_loss(loss, aux_outputs, targets, indices, num_valid_instances[loss])
|
| 428 |
+
l_dict = {f'{k}.{i}': v for k, v in l_dict.items()}
|
| 429 |
+
losses.update(l_dict)
|
| 430 |
+
|
| 431 |
+
if 'dn_meta' in outputs:
|
| 432 |
+
if loss == 'scale_map':
|
| 433 |
+
continue
|
| 434 |
+
aux_outputs_known = output_known['aux_outputs'][i]
|
| 435 |
+
l_dict={}
|
| 436 |
+
for loss in self.losses:
|
| 437 |
+
l_dict.update(self.get_loss(loss, aux_outputs_known, targets, dn_pos_idx, num_valid_instances[loss]*scalar, is_dn=True))
|
| 438 |
+
l_dict = {k + f'_dn.{i}': v for k, v in l_dict.items()}
|
| 439 |
+
losses.update(l_dict)
|
| 440 |
+
|
| 441 |
+
# if 'scale_map' in outputs:
|
| 442 |
+
# enc_outputs = outputs['enc_outputs']
|
| 443 |
+
# indices = self.matcher.forward_enc(enc_outputs, targets)
|
| 444 |
+
# for loss in ['confs_enc', 'boxes_enc']:
|
| 445 |
+
# l_dict = self.get_loss(loss, enc_outputs, targets, indices, num_valid_instances[loss.replace('_enc','')])
|
| 446 |
+
# l_dict = {k + f'_enc': v for k, v in l_dict.items()}
|
| 447 |
+
# losses.update(l_dict)
|
| 448 |
+
|
| 449 |
+
return losses
|
models/decoder.py
ADDED
|
@@ -0,0 +1,388 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Modified from DAB-DETR (https://github.com/IDEA-Research/DAB-DETR)
|
| 2 |
+
|
| 3 |
+
import math
|
| 4 |
+
import copy
|
| 5 |
+
import os
|
| 6 |
+
from typing import Optional, List
|
| 7 |
+
from utils.misc import inverse_sigmoid
|
| 8 |
+
|
| 9 |
+
import torch
|
| 10 |
+
import torch.nn.functional as F
|
| 11 |
+
from torch.nn.functional import scaled_dot_product_attention
|
| 12 |
+
from torch import nn, Tensor
|
| 13 |
+
from torch.nn.init import constant_
|
| 14 |
+
|
| 15 |
+
from .position_encoding import position_encoding_xy
|
| 16 |
+
|
| 17 |
+
from xformers.ops import memory_efficient_attention, fmha
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class MLP(nn.Module):
    """A plain multi-layer perceptron (FFN): Linear layers with ReLU between all but
    the last, which stays linear."""

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.num_layers = num_layers
        # Layer sizes: input -> hidden * (num_layers - 1) -> output.
        dims = [input_dim] + [hidden_dim] * (num_layers - 1) + [output_dim]
        self.layers = nn.ModuleList(nn.Linear(d_in, d_out) for d_in, d_out in zip(dims[:-1], dims[1:]))

    def forward(self, x):
        last = self.num_layers - 1
        for i, layer in enumerate(self.layers):
            x = layer(x)
            if i < last:
                x = F.relu(x)
        return x
|
| 34 |
+
|
| 35 |
+
class TransformerDecoder(nn.Module):
    """Wrapper around the xformers-based DAB-style decoder stack: builds the layer
    stack, converts boolean attention masks into additive biases, and forwards
    flat (length-packed) memory/query tensors through the decoder."""

    def __init__(self, d_model=512, nhead=8, num_queries=300,
                 num_decoder_layers=6, dim_feedforward=2048, dropout=0.0,
                 activation="relu",
                 return_intermediate_dec=False, query_dim=4,
                 keep_query_pos=False, query_scale_type='cond_elewise',
                 modulate_hw_attn=True,
                 bbox_embed_diff_each_layer=True,
                 ):

        super().__init__()

        # One shared prototype layer; XformerDecoder deep-copies it num_decoder_layers times.
        decoder_layer = XformerDecoderLayer(d_model, nhead, dim_feedforward,
                                            dropout, activation, keep_query_pos=keep_query_pos)
        decoder_norm = nn.LayerNorm(d_model)
        self.decoder = XformerDecoder(decoder_layer, num_decoder_layers, decoder_norm,
                                      return_intermediate=return_intermediate_dec,
                                      d_model=d_model, query_dim=query_dim, keep_query_pos=keep_query_pos, query_scale_type=query_scale_type,
                                      modulate_hw_attn=modulate_hw_attn,
                                      bbox_embed_diff_each_layer=bbox_embed_diff_each_layer)

        self._reset_parameters()
        assert query_scale_type in ['cond_elewise', 'cond_scalar', 'fix_elewise']

        self.d_model = d_model
        self.nhead = nhead
        self.dec_layers = num_decoder_layers
        self.num_queries = num_queries

    def _reset_parameters(self):
        # Xavier-init every weight matrix (biases and 1-D params keep their defaults).
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def mask2bias(self, mask, batch_size):
        """Convert a boolean (L, S) attention mask (True = blocked) into an additive
        float bias of shape (batch, nhead, L, S) with -inf at blocked positions.

        The key dim is first allocated padded to a multiple of 8, then sliced back to
        S — presumably an alignment requirement of xformers' memory layout (TODO confirm).
        """
        if mask is None:
            return None

        assert mask.dtype == torch.bool
        assert mask.ndim == 2
        L, S = mask.shape[0], mask.shape[1]
        pad_size = (S + 7) // 8 * 8
        bias = torch.zeros((batch_size, self.nhead, L, pad_size), device = mask.device)[:,:,:,:S]
        bias.masked_fill_(mask.unsqueeze(0).unsqueeze(0), float('-inf'))
        return bias

    def forward(self, memory, memory_lens, tgt, tgt_lens, refpoint_embed, pos_embed, self_attn_mask):
        """Run the decoder; returns (hidden_states, reference_points).
        self_attn_mask is only used during denoising training (see XformerDecoderLayer).
        """
        self_attn_bias = self.mask2bias(self_attn_mask, batch_size=len(memory_lens))
        hs, references = self.decoder(memory=memory, memory_lens=memory_lens,
                                      tgt=tgt, tgt_lens=tgt_lens,
                                      pos=pos_embed, refpoints_unsigmoid=refpoint_embed,
                                      self_attn_bias = self_attn_bias)
        return hs, references
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
class XformerDecoder(nn.Module):
    """DAB-DETR-style decoder stack operating on flat (length-packed) tensors.

    Each layer attends queries to memory, conditioned on sine embeddings of the
    current 4-D reference boxes (cx, cy, w, h); when `bbox_embed` is attached
    externally, references are iteratively refined layer by layer.
    """

    def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=True,
                 d_model=512, query_dim=4, keep_query_pos=False, query_scale_type='cond_elewise',
                 modulate_hw_attn=False,
                 bbox_embed_diff_each_layer=False,
                 ):
        super().__init__()
        self.layers = _get_clones(decoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm
        self.return_intermediate = return_intermediate
        # Only the intermediate-returning path is supported (see forward's tail).
        assert return_intermediate
        self.query_dim = query_dim

        assert query_scale_type in ['cond_elewise', 'cond_scalar', 'fix_elewise']
        self.query_scale_type = query_scale_type
        if query_scale_type == 'cond_elewise':
            # Per-channel transformation conditioned on the decoder output.
            self.query_scale = MLP(d_model, d_model, d_model, 2)
        elif query_scale_type == 'cond_scalar':
            # Single scalar scale conditioned on the decoder output.
            self.query_scale = MLP(d_model, d_model, 1, 2)
        elif query_scale_type == 'fix_elewise':
            # Learned fixed per-layer scale vectors.
            self.query_scale = nn.Embedding(num_layers, d_model)
        else:
            raise NotImplementedError("Unknown query_scale_type: {}".format(query_scale_type))

        # Maps the (query_dim/2 * d_model)-dim sine embedding to a query positional embedding.
        self.ref_point_head = MLP(query_dim // 2 * d_model, d_model, d_model, 2)

        # Set externally (by the model) to enable iterative box refinement.
        self.bbox_embed = None
        self.d_model = d_model
        self.modulate_hw_attn = modulate_hw_attn
        self.bbox_embed_diff_each_layer = bbox_embed_diff_each_layer

        if modulate_hw_attn:
            # Predicts a (w, h) pair used to modulate the positional attention.
            self.ref_anchor_head = MLP(d_model, d_model, 2, 2)

        # Without keep_query_pos, layers after the first drop their cross-attention
        # query-position projection (DAB-DETR design).
        if not keep_query_pos:
            for layer_id in range(num_layers - 1):
                self.layers[layer_id + 1].ca_qpos_proj = None

    def forward(self, memory, memory_lens, tgt, tgt_lens,
                pos: Optional[Tensor] = None,
                refpoints_unsigmoid: Optional[Tensor] = None, # L_tgt, 4
                self_attn_bias = None):
        # Queries are packed flat; every image is assumed to have tgt_lens[0] queries
        # (NOTE(review): num_queries is taken from the first entry — verify all equal).
        B, num_queries = len(tgt_lens), tgt_lens[0]
        output = tgt

        intermediate = []
        reference_points = refpoints_unsigmoid.sigmoid()
        ref_points = [reference_points.view(B, num_queries, self.query_dim)]

        for layer_id, layer in enumerate(self.layers):
            obj_center = reference_points[:, :self.query_dim] # [L_tgt, 4]
            # get sine embedding for the query vector (cx,cy and w,h halves)
            xy_embed = position_encoding_xy(obj_center[:,0], obj_center[:,1], self.d_model)
            wh_embed = position_encoding_xy(obj_center[:,2], obj_center[:,3], self.d_model)
            query_sine_embed = torch.cat([xy_embed,wh_embed],dim=1) #[L_tgt, 2*d_model]
            query_pos = self.ref_point_head(query_sine_embed)

            # For the first decoder layer, we do not apply transformation over p_s
            if self.query_scale_type != 'fix_elewise':
                if layer_id == 0:
                    pos_transformation = 1
                else:
                    pos_transformation = self.query_scale(output)
            else:
                pos_transformation = self.query_scale.weight[layer_id]

            # apply transformation; only the xy half of the sine embedding is kept.
            query_sine_embed = query_sine_embed[:,:self.d_model] * pos_transformation

            # modulated HW attentions: rescale the positional embedding by the ratio of
            # the predicted (w, h) to the current reference box's (w, h).
            if self.modulate_hw_attn:
                refHW_cond = self.ref_anchor_head(output).sigmoid() # nq, bs, 2
                query_sine_embed[..., self.d_model // 2:] *= (refHW_cond[..., 0] / obj_center[..., 2]).unsqueeze(-1)
                query_sine_embed[..., :self.d_model // 2] *= (refHW_cond[..., 1] / obj_center[..., 3]).unsqueeze(-1)

            output = layer(memory=memory, memory_lens=memory_lens,
                           tgt=output, tgt_lens=tgt_lens,
                           pos=pos, query_pos=query_pos, query_sine_embed=query_sine_embed,
                           is_first=(layer_id == 0),
                           self_attn_bias = self_attn_bias)

            # iter update: refine reference boxes via the (externally attached) bbox head;
            # detach so gradients don't flow through the refinement chain.
            if self.bbox_embed is not None:
                if self.bbox_embed_diff_each_layer:
                    tmp = self.bbox_embed[layer_id](self.norm(output))
                else:
                    tmp = self.bbox_embed(self.norm(output))
                tmp[..., :self.query_dim] += inverse_sigmoid(reference_points)
                new_reference_points = tmp[..., :self.query_dim].sigmoid()
                if layer_id != self.num_layers - 1:
                    ref_points.append(new_reference_points.view(B, num_queries, self.query_dim))
                reference_points = new_reference_points.detach()

            if self.return_intermediate:
                intermediate.append(self.norm(output).view(B, num_queries, self.d_model))

        if self.return_intermediate:
            if self.bbox_embed is not None:
                return [
                    torch.stack(intermediate),
                    torch.stack(ref_points),
                ]
            else:
                return [
                    torch.stack(intermediate),
                    reference_points.unsqueeze(0)
                ]

        # Unreachable in practice: return_intermediate is asserted True in __init__.
        return output.unsqueeze(0)
|
| 214 |
+
|
| 215 |
+
class XformerDecoderLayer(nn.Module):
|
| 216 |
+
def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.0,
|
| 217 |
+
activation="relu", keep_query_pos=False):
|
| 218 |
+
super().__init__()
|
| 219 |
+
# Decoder Self-Attention
|
| 220 |
+
self.sa_qcontent_proj = nn.Linear(d_model, d_model)
|
| 221 |
+
self.sa_qpos_proj = nn.Linear(d_model, d_model)
|
| 222 |
+
self.sa_kcontent_proj = nn.Linear(d_model, d_model)
|
| 223 |
+
self.sa_kpos_proj = nn.Linear(d_model, d_model)
|
| 224 |
+
self.sa_v_proj = nn.Linear(d_model, d_model)
|
| 225 |
+
self.sa_out_proj = nn.Linear(d_model, d_model)
|
| 226 |
+
constant_(self.sa_out_proj.bias, 0.)
|
| 227 |
+
|
| 228 |
+
self.norm1 = nn.LayerNorm(d_model)
|
| 229 |
+
self.dropout1 = nn.Dropout(dropout)
|
| 230 |
+
|
| 231 |
+
# Decoder Cross-Attention
|
| 232 |
+
self.ca_qcontent_proj = nn.Linear(d_model, d_model)
|
| 233 |
+
self.ca_qpos_proj = nn.Linear(d_model, d_model)
|
| 234 |
+
self.ca_kcontent_proj = nn.Linear(d_model, d_model)
|
| 235 |
+
self.ca_kpos_proj = nn.Linear(d_model, d_model)
|
| 236 |
+
self.ca_v_proj = nn.Linear(d_model, d_model)
|
| 237 |
+
self.ca_qpos_sine_proj = nn.Linear(d_model, d_model)
|
| 238 |
+
self.ca_out_proj = nn.Linear(d_model, d_model)
|
| 239 |
+
constant_(self.ca_out_proj.bias, 0.)
|
| 240 |
+
|
| 241 |
+
self.d_model = d_model
|
| 242 |
+
self.nhead = nhead
|
| 243 |
+
assert self.d_model%self.nhead == 0
|
| 244 |
+
|
| 245 |
+
# Implementation of Feedforward model
|
| 246 |
+
self.linear1 = nn.Linear(d_model, dim_feedforward)
|
| 247 |
+
self.dropout = nn.Dropout(dropout)
|
| 248 |
+
self.linear2 = nn.Linear(dim_feedforward, d_model)
|
| 249 |
+
|
| 250 |
+
|
| 251 |
+
self.norm2 = nn.LayerNorm(d_model)
|
| 252 |
+
self.norm3 = nn.LayerNorm(d_model)
|
| 253 |
+
self.dropout2 = nn.Dropout(dropout)
|
| 254 |
+
self.dropout3 = nn.Dropout(dropout)
|
| 255 |
+
|
| 256 |
+
self.activation = _get_activation_fn(activation)
|
| 257 |
+
self.keep_query_pos = keep_query_pos
|
| 258 |
+
|
| 259 |
+
def with_pos_embed(self, tensor, pos: Optional[Tensor]):
|
| 260 |
+
return tensor if pos is None else tensor + pos
|
| 261 |
+
|
| 262 |
+
def forward(self, memory, memory_lens, pos,
            tgt, tgt_lens, query_pos, query_sine_embed,
            is_first=False,
            self_attn_bias=None):
    """One decoder layer: query self-attention, cross-attention to the
    encoder memory, then a feed-forward block (all pre-norm, residual).

    Sequences arrive "packed": `memory` is (L_mem, C) with per-sample
    lengths in `memory_lens`, and `tgt` is (L_tgt, C) with per-sample
    lengths in `tgt_lens`.

    Args:
        memory: packed encoder features, shape (L_mem, C).
        memory_lens: list of per-sample memory lengths (sums to L_mem).
        pos: positional embeddings for `memory`, same packed layout.
        tgt: packed decoder queries, shape (L_tgt, C).
        tgt_lens: list of per-sample query counts (sums to L_tgt).
        query_pos: learned positional embedding for the queries.
        query_sine_embed: sine embedding derived from the reference boxes;
            concatenated head-wise onto q in cross-attention (conditional
            DETR style).
        is_first: True for the first decoder layer, where query_pos is also
            injected into the cross-attention q/k.
        self_attn_bias: optional attention bias for self-attention.
    """
    # self_attn_bias is only used for dn_training
    # 'True' indicates that the element should take part in attention

    # NOTE(review): num_queries is taken from tgt_lens[0] and the packed
    # tensors are viewed as (B, num_queries, ...) below — this assumes every
    # sample has the same number of queries; confirm against callers.
    B, num_queries = len(tgt_lens), tgt_lens[0]
    L_mem, C_mem = memory.shape
    L_tgt, C_tgt = tgt.shape
    assert C_mem == C_tgt

    # ========== Begin of Self-Attention =============
    # Pre-norm residual: keep the un-normalized input for the skip connection.
    tgt_b4n = tgt
    tgt = self.norm1(tgt)

    # Separate content/position projections (DAB/conditional-DETR style).
    q_content = self.sa_qcontent_proj(tgt)
    q_pos = self.sa_qpos_proj(query_pos)
    k_content = self.sa_kcontent_proj(tgt)
    k_pos = self.sa_kpos_proj(query_pos)
    v = self.sa_v_proj(tgt)

    q = q_content + q_pos
    k = k_content + k_pos

    # Reshape to (B, num_queries, nhead, head_dim) for xformers.
    q = q.view(B, num_queries, self.nhead, self.d_model // self.nhead)
    k = k.view(B, num_queries, self.nhead, self.d_model // self.nhead)
    v = v.view(B, num_queries, self.nhead, self.d_model // self.nhead)

    tgt2 = memory_efficient_attention(q, k, v, attn_bias=self_attn_bias)
    tgt2 = self.sa_out_proj(tgt2.view(L_tgt, self.d_model))

    tgt = tgt_b4n + self.dropout1(tgt2)
    # ========== End of Self-Attention =============

    # ========== Begin of Cross-Attention =============
    tgt_b4n = tgt
    tgt = self.norm2(tgt)

    q_content = self.ca_qcontent_proj(tgt)
    k_content = self.ca_kcontent_proj(memory)
    v = self.ca_v_proj(memory)

    k_pos = self.ca_kpos_proj(pos)

    # For the first decoder layer, we concatenate the positional embedding predicted from
    # the object query (the positional embedding) into the original query (key) in DETR.
    if is_first or self.keep_query_pos:
        q_pos = self.ca_qpos_proj(query_pos)
        q = q_content + q_pos
        k = k_content + k_pos
    else:
        q = q_content
        k = k_content

    # Queries of all samples are flattened into one "batch-1" sequence;
    # the block-diagonal mask below restores per-sample isolation.
    q = q.view(1, L_tgt, self.nhead, self.d_model//self.nhead)
    query_sine_embed = self.ca_qpos_sine_proj(query_sine_embed)
    query_sine_embed = query_sine_embed.view(1, L_tgt, self.nhead, self.d_model//self.nhead)
    # Concatenate content and sine-position features along the head dim,
    # doubling the per-head width for cross-attention.
    q = torch.cat([q, query_sine_embed], dim=3)

    k = k.view(1, L_mem, self.nhead, self.d_model//self.nhead)
    k_pos = k_pos.view(1, L_mem, self.nhead, self.d_model//self.nhead)
    k = torch.cat([k, k_pos], dim=3)

    v = v.view(1, L_mem, self.nhead, self.d_model//self.nhead)

    # Block-diagonal bias so each sample's queries only attend to that
    # sample's memory tokens.
    attn_bias = fmha.attn_bias.BlockDiagonalMask.from_seqlens(q_seqlen = tgt_lens, kv_seqlen = memory_lens)
    tgt2 = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
    tgt2 = self.ca_out_proj(tgt2.view(L_tgt, self.d_model))

    tgt = tgt_b4n + self.dropout2(tgt2)
    # ========== End of Cross-Attention =============

    # FFN (pre-norm residual).
    tgt2 = self.linear2(self.dropout(self.activation(self.linear1(self.norm3(tgt)))))
    tgt = tgt + self.dropout3(tgt2)

    return tgt
|
| 342 |
+
|
| 343 |
+
|
| 344 |
+
def _get_activation_fn(activation):
|
| 345 |
+
"""Return an activation function given a string"""
|
| 346 |
+
if activation == "relu":
|
| 347 |
+
return F.relu
|
| 348 |
+
if activation == "gelu":
|
| 349 |
+
return F.gelu
|
| 350 |
+
if activation == "glu":
|
| 351 |
+
return F.glu
|
| 352 |
+
if activation == "prelu":
|
| 353 |
+
return nn.PReLU()
|
| 354 |
+
if activation == "selu":
|
| 355 |
+
return F.selu
|
| 356 |
+
raise RuntimeError(F"activation should be relu/gelu, not {activation}.")
|
| 357 |
+
|
| 358 |
+
def _get_clones(module, N):
|
| 359 |
+
return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
|
| 360 |
+
|
| 361 |
+
|
| 362 |
+
def build_decoder(args):
    """Construct the TransformerDecoder from the parsed run configuration."""
    decoder_kwargs = dict(
        d_model=args.hidden_dim,
        dropout=args.dropout,
        nhead=args.nheads,
        num_queries=args.num_queries,
        dim_feedforward=args.dim_feedforward,
        num_decoder_layers=args.dec_layers,
        return_intermediate_dec=True,   # keep per-layer outputs for aux losses
        query_dim=4,                    # (cx, cy, w, h) reference boxes
        activation=args.transformer_activation,
    )
    return TransformerDecoder(**decoder_kwargs)
|
| 374 |
+
|
| 375 |
+
|
| 376 |
+
def torch_attention(query, key, value, attn_bias = None):
    """Plain-PyTorch scaled dot-product attention.

    Inputs are laid out ``(B, M, H, K)`` (heads on dim 2, matching xformers'
    ``memory_efficient_attention``); the result is returned in the same layout.
    """
    scale = 1.0 / query.shape[-1] ** 0.5
    # Move heads next to the batch dim: (B, H, M, K).
    q = (query * scale).transpose(1, 2)
    k = key.transpose(1, 2)
    v = value.transpose(1, 2)

    scores = q @ k.transpose(-2, -1)
    if attn_bias is not None:
        scores = scores + attn_bias
    weights = scores.softmax(-1)
    # (no dropout here; kept inference-deterministic)

    out = weights @ v
    return out.transpose(1, 2)
|
models/dn_components.py
ADDED
|
@@ -0,0 +1,193 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Modified from DINO (https://github.com/IDEA-Research/DINO)
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
import torch
|
| 5 |
+
from utils.misc import (NestedTensor, nested_tensor_from_tensor_list,
|
| 6 |
+
accuracy, get_world_size, interpolate,
|
| 7 |
+
is_dist_avail_and_initialized, inverse_sigmoid)
|
| 8 |
+
from utils import box_ops
|
| 9 |
+
import torch.nn.functional as F
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def prepare_for_cdn(targets, dn_cfg, num_queries, hidden_dim, dn_enc):
    """Build the contrastive-denoising (CDN) query groups, DINO-style.

    A major difference of DINO from DN-DETR is that the author process pattern
    embedding in its detector forward function and use learnable tgt embedding,
    so we change this function a little bit.

    For each ground-truth instance, `dn_number` groups of one positive
    (lightly noised) and one negative (heavily noised) query are created,
    padded per-batch to `pad_size = 2 * dn_number * max_gt_per_image`.

    :param targets: per-image dicts; uses 'boxes', 'labels' and (for the
        'params' embed type) 'poses'/'betas'.
    :param dn_cfg: dict with 'dn_number', 'box_noise_scale',
        'tgt_noise_scale', 'tgt_embed_type', 'dn_labelbook_size'.
    :param num_queries: number of matching (non-denoising) queries.
    :param hidden_dim: transformer hidden dim.
    :param dn_enc: module that embeds noised labels/params into query features.
    :return: (input_query_tgt, input_query_bbox, attn_mask, dn_meta)
    """
    device = targets[0]['boxes'].device

    dn_number = dn_cfg['dn_number']
    box_noise_scale = dn_cfg['box_noise_scale']
    tgt_noise_scale = dn_cfg['tgt_noise_scale']
    # One "known" flag per GT instance, per image.
    known = [(torch.ones_like(t['labels'])) for t in targets]
    batch_size = len(known)
    known_num = [sum(k) for k in known]

    # Adapt the number of DN groups to the GT count so the total number of
    # denoising queries stays roughly constant.
    if int(max(known_num)) == 0:
        dn_number = 1
    else:
        if dn_number >= 100:
            dn_number = dn_number // (int(max(known_num) * 2))
        elif dn_number < 1:
            dn_number = 1
    if dn_number == 0:
        dn_number = 1

    unmask_bbox = torch.cat(known)

    boxes = torch.cat([t['boxes'] for t in targets])
    assert boxes.ndim == 2
    batch_idx = torch.cat([torch.full_like(t['labels'].long(), i) for i, t in enumerate(targets)])
    known_indice = torch.nonzero(unmask_bbox)
    known_indice = known_indice.view(-1)
    # Replicate indices for the 2*dn_number (positive+negative) groups.
    known_indice = known_indice.repeat(2 * dn_number, 1).view(-1)
    known_bid = batch_idx.repeat(2 * dn_number, 1).view(-1)

    single_pad = int(max(known_num))
    pad_size = int(single_pad * 2 * dn_number)
    # Within each group, the first len(boxes) entries are positives and the
    # next len(boxes) are negatives.
    positive_idx = torch.tensor(range(len(boxes))).long().to(device=device).unsqueeze(0).repeat(dn_number, 1)
    positive_idx += (torch.tensor(range(dn_number)) * len(boxes) * 2).long().to(device=device).unsqueeze(1)
    positive_idx = positive_idx.flatten()
    negative_idx = positive_idx + len(boxes)

    # box queries
    known_bboxs = boxes.repeat(2 * dn_number, 1)
    known_bbox_expand = known_bboxs.clone()
    if box_noise_scale > 0:
        # cxcywh -> xyxy for corner-wise jitter.
        known_bbox_ = torch.zeros_like(known_bboxs)
        known_bbox_[:, :2] = known_bboxs[:, :2] - known_bboxs[:, 2:] / 2
        known_bbox_[:, 2:] = known_bboxs[:, :2] + known_bboxs[:, 2:] / 2

        # Jitter magnitude: half of width/height for each corner coordinate.
        diff = torch.zeros_like(known_bboxs)
        diff[:, :2] = known_bboxs[:, 2:] / 2
        diff[:, 2:] = known_bboxs[:, 2:] / 2

        rand_sign = torch.randint_like(known_bboxs, low=0, high=2, dtype=torch.float32) * 2.0 - 1.0
        rand_part = torch.rand_like(known_bboxs)
        # Negatives get noise in [1, 2) x scale — pushed further from the GT.
        rand_part[negative_idx] += 1.0
        rand_part *= rand_sign
        known_bbox_ = known_bbox_ + torch.mul(rand_part,
                                              diff).to(device=device) * box_noise_scale
        known_bbox_ = known_bbox_.clamp(min=0.0, max=1.0)
        # xyxy -> cxcywh.
        known_bbox_expand[:, :2] = (known_bbox_[:, :2] + known_bbox_[:, 2:]) / 2
        known_bbox_expand[:, 2:] = known_bbox_[:, 2:] - known_bbox_[:, :2]
    # Boxes are fed to the decoder in unsigmoided (logit) space.
    input_bbox_embed = inverse_sigmoid(known_bbox_expand)

    # tgt queries
    # NOTE(review): if dn_cfg['tgt_embed_type'] is neither 'labels' nor
    # 'params', input_tgt_embed is never assigned and the function raises
    # UnboundLocalError below — confirm config validation happens upstream.
    if dn_cfg['tgt_embed_type'] == 'labels':
        labels = torch.cat([t['labels'] for t in targets])
        known_labels = labels.repeat(2 * dn_number, 1).view(-1)
        known_labels_expaned = known_labels.clone()
        if tgt_noise_scale > 0:
            # Flip a random subset of labels to a random class.
            p = torch.rand_like(known_labels_expaned.float())
            chosen_indice = torch.nonzero(p < tgt_noise_scale).view(-1)
            new_label = torch.randint_like(chosen_indice, 0, dn_cfg['dn_labelbook_size'])  # randomly put a new one here
            known_labels_expaned.scatter_(0, chosen_indice, new_label)
        m = known_labels_expaned.long().to(device=device)
        input_tgt_embed = dn_enc(m)
    elif dn_cfg['tgt_embed_type'] == 'params':
        # Noise the SMPL(-X) pose/shape parameters instead of class labels.
        poses = torch.cat([t['poses'] for t in targets])
        betas = torch.cat([t['betas'] for t in targets])
        params = torch.cat([poses, betas], dim=-1)
        assert params.ndim == 2
        known_params = params.repeat(2 * dn_number, 1)
        known_params_expaned = known_params.clone()
        if tgt_noise_scale > 0:
            rand_sign = torch.randint_like(known_params, low=0, high=2, dtype=torch.float32) * 2.0 - 1.0
            rand_part = torch.rand_like(known_params)
            rand_part[negative_idx] += 1.0
            rand_part *= rand_sign
            known_params_expaned = known_params_expaned + rand_part * tgt_noise_scale
        m = known_params_expaned.to(device=device)
        input_tgt_embed = dn_enc(m)

    # Scatter the (variable-count) noised queries into fixed-size padded slots.
    padding_tgt = torch.zeros((pad_size, hidden_dim), device=device)
    padding_bbox = torch.zeros((pad_size, 4), device=device)

    input_query_tgt = padding_tgt.repeat(batch_size, 1, 1)
    input_query_bbox = padding_bbox.repeat(batch_size, 1, 1)

    map_known_indice = torch.tensor([]).to(device=device)
    if len(known_num):
        map_known_indice = torch.cat([torch.tensor(range(num)) for num in known_num])  # [1,2, 1,2,3]
        map_known_indice = torch.cat([map_known_indice + single_pad * i for i in range(2 * dn_number)]).long()
    if len(known_bid):
        input_query_tgt[(known_bid.long(), map_known_indice)] = input_tgt_embed
        input_query_bbox[(known_bid.long(), map_known_indice)] = input_bbox_embed

    # prepare attn_mask — True entries are *blocked* from attending.
    tgt_size = pad_size + num_queries
    attn_mask = torch.zeros((tgt_size, tgt_size), dtype=bool, device=device)
    # match query cannot see the reconstruct
    attn_mask[pad_size:, :pad_size] = True
    # reconstruct cannot see each other (groups are mutually masked)
    for i in range(dn_number):
        if i == 0:
            attn_mask[single_pad * 2 * i:single_pad * 2 * (i + 1), single_pad * 2 * (i + 1):pad_size] = True
        if i == dn_number - 1:
            attn_mask[single_pad * 2 * i:single_pad * 2 * (i + 1), :single_pad * i * 2] = True
        else:
            attn_mask[single_pad * 2 * i:single_pad * 2 * (i + 1), single_pad * 2 * (i + 1):pad_size] = True
            attn_mask[single_pad * 2 * i:single_pad * 2 * (i + 1), :single_pad * 2 * i] = True

    dn_meta = {
        'pad_size': pad_size,
        'num_dn_group': dn_number,
    }

    return input_query_tgt, input_query_bbox, attn_mask, dn_meta
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
def dn_post_process(pred_poses, pred_betas,
                    pred_boxes, pred_confs,
                    pred_j3ds, pred_j2ds, pred_depths,
                    pred_verts, pred_transl,
                    dn_meta, aux_loss, _set_aux_loss):
    """
    post process of dn after output from the transformer

    Splits every prediction tensor into its denoising ("known") slice and its
    matching-query slice along the query axis, stores the denoising outputs in
    ``dn_meta['output_known']`` and returns only the matching-query parts.
    """
    assert dn_meta['pad_size'] > 0
    split = dn_meta['pad_size']

    def _split_stacked(t):
        # layer-stacked tensors: queries live on dim 2
        return t[:, :, :split], t[:, :, split:]

    def _split_flat(t):
        # flat tensors: queries live on dim 1
        return t[:, :split], t[:, split:]

    known_poses, pred_poses = _split_stacked(pred_poses)
    known_betas, pred_betas = _split_stacked(pred_betas)
    known_boxes, pred_boxes = _split_stacked(pred_boxes)
    known_confs, pred_confs = _split_stacked(pred_confs)
    known_j3ds, pred_j3ds = _split_stacked(pred_j3ds)
    known_j2ds, pred_j2ds = _split_stacked(pred_j2ds)
    known_depths, pred_depths = _split_stacked(pred_depths)

    known_verts, pred_verts = _split_flat(pred_verts)
    known_transl, pred_transl = _split_flat(pred_transl)

    # Last decoder layer's denoising outputs feed the DN losses.
    out = {'pred_poses': known_poses[-1], 'pred_betas': known_betas[-1],
           'pred_boxes': known_boxes[-1], 'pred_confs': known_confs[-1],
           'pred_j3ds': known_j3ds[-1], 'pred_j2ds': known_j2ds[-1],
           'pred_depths': known_depths[-1]}

    if aux_loss:
        out['aux_outputs'] = _set_aux_loss(known_poses, known_betas,
                                           known_boxes, known_confs,
                                           known_j3ds, known_j2ds, known_depths)

    dn_meta['output_known'] = out

    return (pred_poses, pred_betas,
            pred_boxes, pred_confs,
            pred_j3ds, pred_j2ds,
            pred_depths, pred_verts,
            pred_transl)
|
| 192 |
+
|
| 193 |
+
|
models/encoders/__init__.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Modified from DINOv2 (https://github.com/facebookresearch/dinov2)
|
| 2 |
+
from models.encoders.dinov2.models.vision_transformer import vit_base, vit_large
|
| 3 |
+
import torch
|
| 4 |
+
from configs.paths import dinov2_vitb14_path, dinov2_vitl14_path
|
| 5 |
+
import copy
|
| 6 |
+
|
| 7 |
+
def build_encoder(args):
    """Build the DINOv2 ViT backbone selected by ``args.encoder``.

    Supports 'vitb' (ViT-B/14) and 'vitl' (ViT-L/14). When SAT with additional
    blocks is enabled, the first ``get_map_layer`` blocks are duplicated into
    an ``additional_blocks`` branch, initialized from the same pretrained
    weights. Pretrained DINOv2 weights are only loaded in 'train' mode.

    Raises:
        NotImplementedError: for any other ``args.encoder`` value.
    """
    num_additional_blocks = 0
    if args.sat_cfg['use_sat'] and args.sat_cfg['use_additional_blocks']:
        num_additional_blocks = args.sat_cfg['get_map_layer']

    weights = None
    if args.encoder == 'vitb':
        # Hyperparameters match the official DINOv2 ViT-B/14 release.
        model = vit_base(img_size = 518,
                         patch_size = 14,
                         init_values = 1.0,
                         ffn_layer = "mlp",
                         block_chunks = 0,
                         num_register_tokens = 0,
                         interpolate_antialias = False,
                         interpolate_offset = 0.1,
                         num_additional_blocks = num_additional_blocks)
        if args.mode.lower() == 'train':
            # NOTE(review): torch.load without map_location restores tensors
            # to the device they were saved on — confirm checkpoints are CPU.
            weights = torch.load(dinov2_vitb14_path)
    elif args.encoder == 'vitl':
        model = vit_large(img_size = 518,
                          patch_size = 14,
                          init_values = 1.0,
                          ffn_layer = "mlp",
                          block_chunks = 0,
                          num_register_tokens = 0,
                          interpolate_antialias = False,
                          interpolate_offset = 0.1,
                          num_additional_blocks = num_additional_blocks)
        if args.mode.lower() == 'train':
            weights = torch.load(dinov2_vitl14_path)
    else:
        raise NotImplementedError

    if weights is not None:
        if args.sat_cfg['use_sat'] and args.sat_cfg['use_additional_blocks']:
            # Mirror the first blocks' weights into the additional branch so
            # strict=True loading succeeds.
            add_blocks_weights(weights, args.sat_cfg['get_map_layer'])
        print('Loading pretrained DINOv2...')
        model.load_state_dict(weights,strict=True)

    return model
|
| 47 |
+
|
| 48 |
+
def add_blocks_weights(weights, num_layers):
    """Copy the first *num_layers* 'blocks.*' entries of a state dict to
    matching 'additional_blocks.*' keys (mutates *weights* in place)."""
    for key in list(weights.keys()):
        if not key.startswith('blocks'):
            continue
        layer_idx = int(key.split('.')[1])
        if layer_idx < num_layers:
            mirror_key = key.replace('blocks', 'additional_blocks')
            weights[mirror_key] = copy.deepcopy(weights[key])
|
models/encoders/dinov2/layers/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
#
|
| 3 |
+
# This source code is licensed under the Apache License, Version 2.0
|
| 4 |
+
# found in the LICENSE file in the root directory of this source tree.
|
| 5 |
+
|
| 6 |
+
from .dino_head import DINOHead
|
| 7 |
+
from .mlp import Mlp
|
| 8 |
+
from .patch_embed import PatchEmbed
|
| 9 |
+
from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused
|
| 10 |
+
from .block import NestedTensorBlock
|
| 11 |
+
from .attention import MemEffAttention
|
models/encoders/dinov2/layers/attention.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
#
|
| 3 |
+
# This source code is licensed under the Apache License, Version 2.0
|
| 4 |
+
# found in the LICENSE file in the root directory of this source tree.
|
| 5 |
+
|
| 6 |
+
# References:
|
| 7 |
+
# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
|
| 8 |
+
# https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
|
| 9 |
+
|
| 10 |
+
import logging
|
| 11 |
+
import os
|
| 12 |
+
import warnings
|
| 13 |
+
|
| 14 |
+
from torch import Tensor
|
| 15 |
+
from torch import nn
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
logger = logging.getLogger("dinov2")
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None
|
| 22 |
+
try:
|
| 23 |
+
if XFORMERS_ENABLED:
|
| 24 |
+
from xformers.ops import memory_efficient_attention, unbind
|
| 25 |
+
|
| 26 |
+
XFORMERS_AVAILABLE = True
|
| 27 |
+
warnings.warn("xFormers is available (Attention)")
|
| 28 |
+
else:
|
| 29 |
+
warnings.warn("xFormers is disabled (Attention)")
|
| 30 |
+
raise ImportError
|
| 31 |
+
except ImportError:
|
| 32 |
+
XFORMERS_AVAILABLE = False
|
| 33 |
+
warnings.warn("xFormers is not available (Attention)")
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
class Attention(nn.Module):
    """Standard ViT multi-head self-attention with a fused qkv projection."""

    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = False,
        proj_bias: bool = True,
        attn_drop: float = 0.0,
        proj_drop: float = 0.0,
    ) -> None:
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim**-0.5

        # Module creation order is kept stable for RNG-reproducible init.
        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim, bias=proj_bias)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x: Tensor) -> Tensor:
        batch, seq_len, channels = x.shape
        head_dim = channels // self.num_heads

        # (B, N, 3C) -> (3, B, heads, N, head_dim)
        fused = self.qkv(x).reshape(batch, seq_len, 3, self.num_heads, head_dim)
        fused = fused.permute(2, 0, 3, 1, 4)
        q, k, v = fused[0] * self.scale, fused[1], fused[2]

        scores = q @ k.transpose(-2, -1)
        weights = self.attn_drop(scores.softmax(dim=-1))

        out = (weights @ v).transpose(1, 2).reshape(batch, seq_len, channels)
        out = self.proj(out)
        return self.proj_drop(out)
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
class MemEffAttention(Attention):
    """Attention variant dispatching to xFormers' memory-efficient kernel.

    Falls back to the plain ``Attention.forward`` when xFormers is absent;
    ``attn_bias`` is only supported through the xFormers path.
    """

    def forward(self, x: Tensor, attn_bias=None) -> Tensor:
        if not XFORMERS_AVAILABLE:
            if attn_bias is not None:
                raise AssertionError("xFormers is required for using nested tensors")
            return super().forward(x)

        batch, seq_len, channels = x.shape
        fused = self.qkv(x).reshape(batch, seq_len, 3, self.num_heads, channels // self.num_heads)
        q, k, v = unbind(fused, 2)

        out = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
        out = out.reshape([batch, seq_len, channels])

        out = self.proj(out)
        return self.proj_drop(out)
|
models/encoders/dinov2/layers/block.py
ADDED
|
@@ -0,0 +1,260 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
#
|
| 3 |
+
# This source code is licensed under the Apache License, Version 2.0
|
| 4 |
+
# found in the LICENSE file in the root directory of this source tree.
|
| 5 |
+
|
| 6 |
+
# References:
|
| 7 |
+
# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
|
| 8 |
+
# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
|
| 9 |
+
|
| 10 |
+
import logging
|
| 11 |
+
import os
|
| 12 |
+
from typing import Callable, List, Any, Tuple, Dict
|
| 13 |
+
import warnings
|
| 14 |
+
|
| 15 |
+
import torch
|
| 16 |
+
from torch import nn, Tensor
|
| 17 |
+
|
| 18 |
+
from .attention import Attention, MemEffAttention
|
| 19 |
+
from .drop_path import DropPath
|
| 20 |
+
from .layer_scale import LayerScale
|
| 21 |
+
from .mlp import Mlp
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
logger = logging.getLogger("dinov2")
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None
|
| 28 |
+
try:
|
| 29 |
+
if XFORMERS_ENABLED:
|
| 30 |
+
from xformers.ops import fmha, scaled_index_add, index_select_cat
|
| 31 |
+
|
| 32 |
+
XFORMERS_AVAILABLE = True
|
| 33 |
+
warnings.warn("xFormers is available (Block)")
|
| 34 |
+
else:
|
| 35 |
+
warnings.warn("xFormers is disabled (Block)")
|
| 36 |
+
raise ImportError
|
| 37 |
+
except ImportError:
|
| 38 |
+
XFORMERS_AVAILABLE = False
|
| 39 |
+
|
| 40 |
+
warnings.warn("xFormers is not available (Block)")
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
class Block(nn.Module):
    """Pre-norm transformer block (attention + MLP) with LayerScale and
    optional batch-wise stochastic depth (DINOv2 style)."""

    def __init__(
        self,
        dim: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        qkv_bias: bool = False,
        proj_bias: bool = True,
        ffn_bias: bool = True,
        drop: float = 0.0,
        attn_drop: float = 0.0,
        init_values=None,
        drop_path: float = 0.0,
        act_layer: Callable[..., nn.Module] = nn.GELU,
        norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
        attn_class: Callable[..., nn.Module] = Attention,
        ffn_layer: Callable[..., nn.Module] = Mlp,
    ) -> None:
        super().__init__()
        # print(f"biases: qkv: {qkv_bias}, proj: {proj_bias}, ffn: {ffn_bias}")
        self.norm1 = norm_layer(dim)
        self.attn = attn_class(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            proj_bias=proj_bias,
            attn_drop=attn_drop,
            proj_drop=drop,
        )
        # LayerScale is only enabled when init_values is truthy.
        self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
        self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = ffn_layer(
            in_features=dim,
            hidden_features=mlp_hidden_dim,
            act_layer=act_layer,
            drop=drop,
            bias=ffn_bias,
        )
        self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
        self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()

        # Rate for the sample-level (whole-batch-row) stochastic-depth path.
        self.sample_drop_ratio = drop_path

    def forward(self, x: Tensor) -> Tensor:
        def attn_residual_func(x: Tensor) -> Tensor:
            return self.ls1(self.attn(self.norm1(x)))

        def ffn_residual_func(x: Tensor) -> Tensor:
            return self.ls2(self.mlp(self.norm2(x)))

        if self.training and self.sample_drop_ratio > 0.1:
            # the overhead is compensated only for a drop path rate larger than 0.1
            x = drop_add_residual_stochastic_depth(
                x,
                residual_func=attn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
            )
            x = drop_add_residual_stochastic_depth(
                x,
                residual_func=ffn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
            )
        elif self.training and self.sample_drop_ratio > 0.0:
            x = x + self.drop_path1(attn_residual_func(x))
            x = x + self.drop_path1(ffn_residual_func(x))  # FIXME: drop_path2
        else:
            # Inference / no drop-path: plain residual composition.
            x = x + attn_residual_func(x)
            x = x + ffn_residual_func(x)
        return x
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def drop_add_residual_stochastic_depth(
    x: Tensor,
    residual_func: Callable[[Tensor], Tensor],
    sample_drop_ratio: float = 0.0,
) -> Tensor:
    """Batch-wise stochastic depth: apply *residual_func* to a random subset
    of batch rows and add the rescaled residual back into *x*."""
    # 1) pick the surviving rows via a random permutation
    batch = x.shape[0]
    keep = max(int(batch * (1 - sample_drop_ratio)), 1)
    kept_rows = (torch.randperm(batch, device=x.device))[:keep]

    # 2) residual for the surviving samples only
    residual = residual_func(x[kept_rows]).flatten(1)

    # 3) rescale so the expected residual magnitude matches full-batch training
    scale = batch / keep
    out = torch.index_add(x.flatten(1), 0, kept_rows, residual.to(dtype=x.dtype), alpha=scale)
    return out.view_as(x)
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
def get_branges_scales(x, sample_drop_ratio=0.0):
    """Pick the random subset of batch rows to keep for stochastic depth and
    the matching residual rescale factor."""
    batch = x.shape[0]
    keep = max(int(batch * (1 - sample_drop_ratio)), 1)
    brange = (torch.randperm(batch, device=x.device))[:keep]
    return brange, batch / keep
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
def add_residual(x, brange, residual, residual_scale_factor, scaling_vector=None):
    """Add the rescaled *residual* into the rows of *x* selected by *brange*.

    Without a scaling vector the result comes back flattened over the
    non-batch dims (callers ``.view_as(x)`` it); with one, xFormers'
    ``scaled_index_add`` keeps the original shape.
    """
    if scaling_vector is None:
        return torch.index_add(
            x.flatten(1),
            0,
            brange,
            residual.flatten(1).to(dtype=x.dtype),
            alpha=residual_scale_factor,
        )
    return scaled_index_add(
        x, brange, residual.to(dtype=x.dtype), scaling=scaling_vector, alpha=residual_scale_factor
    )
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
# Process-wide cache of block-diagonal attention biases, keyed by the tuple
# of per-tensor (batch_size, seq_len) shapes — see get_attn_bias_and_cat.
attn_bias_cache: Dict[Tuple, Any] = {}
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
def get_attn_bias_and_cat(x_list, branges=None):
    """
    this will perform the index select, cat the tensors, and provide the attn_bias from cache

    Concatenates a list of (B_i, N_i, C) tensors into a single "batch-1"
    sequence and returns the matching xFormers block-diagonal attention bias
    (memoized in the module-level attn_bias_cache). When `branges` is given,
    only those batch rows of each tensor are selected before concatenation.
    """
    batch_sizes = [b.shape[0] for b in branges] if branges is not None else [x.shape[0] for x in x_list]
    all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list))
    if all_shapes not in attn_bias_cache.keys():
        # One sequence of length N_i per surviving batch row of each tensor.
        seqlens = []
        for b, x in zip(batch_sizes, x_list):
            for _ in range(b):
                seqlens.append(x.shape[1])
        attn_bias = fmha.attn_bias.BlockDiagonalMask.from_seqlens(seqlens)
        # Stash the grouping so the bias' split() can undo the concatenation.
        attn_bias._batch_sizes = batch_sizes
        attn_bias_cache[all_shapes] = attn_bias

    if branges is not None:
        # Fused select+cat from xFormers; result viewed as (1, total_tokens, C).
        cat_tensors = index_select_cat([x.flatten(1) for x in x_list], branges).view(1, -1, x_list[0].shape[-1])
    else:
        tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in x_list)
        cat_tensors = torch.cat(tensors_bs1, dim=1)

    return attn_bias_cache[all_shapes], cat_tensors
|
| 186 |
+
|
| 187 |
+
|
| 188 |
+
def drop_add_residual_stochastic_depth_list(
    x_list: List[Tensor],
    residual_func: Callable[[Tensor, Any], Tensor],
    sample_drop_ratio: float = 0.0,
    scaling_vector=None,
) -> Tensor:
    """Stochastic-depth residual update applied jointly to a list of tensors.

    Subsamples each tensor's batch, runs `residual_func` once over the nested
    concatenation of the kept rows, then scatter-adds the rescaled residuals
    back into the corresponding inputs.
    """
    # Per-tensor subset indices and the matching residual rescale factors.
    subsets = [get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) for x in x_list]
    branges = [kept for kept, _ in subsets]
    scale_factors = [scale for _, scale in subsets]

    # Nest the kept rows into one sequence guarded by a block-diagonal mask.
    attn_bias, x_cat = get_attn_bias_and_cat(x_list, branges)

    # Single pass of the residual branch over the nested tensor, then un-nest.
    residual_list = attn_bias.split(residual_func(x_cat, attn_bias=attn_bias))  # type: ignore

    return [
        add_residual(x, kept, res, scale, scaling_vector).view_as(x)
        for x, kept, res, scale in zip(x_list, branges, residual_list, scale_factors)
    ]
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
class NestedTensorBlock(Block):
    """Transformer block that can also process a *list* of tensors by nesting
    them into one sequence with a block-diagonal attention mask (xFormers)."""

    def forward_nested(self, x_list: List[Tensor]) -> List[Tensor]:
        """
        x_list contains a list of tensors to nest together and run
        """
        # The nested path relies on MemEffAttention's attn_bias support.
        assert isinstance(self.attn, MemEffAttention)

        if self.training and self.sample_drop_ratio > 0.0:
            # Stochastic-depth path: the LayerScale gammas are applied inside
            # add_residual (via scaling_vector), so they are omitted here.

            def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
                return self.attn(self.norm1(x), attn_bias=attn_bias)

            def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
                return self.mlp(self.norm2(x))

            x_list = drop_add_residual_stochastic_depth_list(
                x_list,
                residual_func=attn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
                scaling_vector=self.ls1.gamma if isinstance(self.ls1, LayerScale) else None,
            )
            x_list = drop_add_residual_stochastic_depth_list(
                x_list,
                residual_func=ffn_residual_func,
                sample_drop_ratio=self.sample_drop_ratio,
                # BUGFIX: guard on ls2 — the attribute actually dereferenced.
                # The original tested `isinstance(self.ls1, LayerScale)` here,
                # which would raise AttributeError if ls1 were a LayerScale
                # while ls2 was not.
                scaling_vector=self.ls2.gamma if isinstance(self.ls2, LayerScale) else None,
            )
            return x_list
        else:
            # Plain path: standard pre-norm residual block over the nested sequence.

            def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
                return self.ls1(self.attn(self.norm1(x), attn_bias=attn_bias))

            def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
                return self.ls2(self.mlp(self.norm2(x)))

            attn_bias, x = get_attn_bias_and_cat(x_list)
            x = x + attn_residual_func(x, attn_bias=attn_bias)
            x = x + ffn_residual_func(x)
            return attn_bias.split(x)

    def forward(self, x_or_x_list):
        """Dispatch: plain Block forward for a Tensor, nested path for a list."""
        if isinstance(x_or_x_list, Tensor):
            return super().forward(x_or_x_list)
        elif isinstance(x_or_x_list, list):
            if not XFORMERS_AVAILABLE:
                raise AssertionError("xFormers is required for using nested tensors")
            return self.forward_nested(x_or_x_list)
        else:
            raise AssertionError
|
models/encoders/dinov2/layers/dino_head.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
#
|
| 3 |
+
# This source code is licensed under the Apache License, Version 2.0
|
| 4 |
+
# found in the LICENSE file in the root directory of this source tree.
|
| 5 |
+
|
| 6 |
+
import torch
|
| 7 |
+
import torch.nn as nn
|
| 8 |
+
from torch.nn.init import trunc_normal_
|
| 9 |
+
from torch.nn.utils import weight_norm
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class DINOHead(nn.Module):
    """DINO projection head: a bottleneck MLP followed by a weight-normalised
    prototype layer whose weight magnitude is pinned to 1 at construction."""

    def __init__(
        self,
        in_dim,
        out_dim,
        use_bn=False,
        nlayers=3,
        hidden_dim=2048,
        bottleneck_dim=256,
        mlp_bias=True,
    ):
        super().__init__()
        nlayers = max(nlayers, 1)
        self.mlp = _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=hidden_dim, use_bn=use_bn, bias=mlp_bias)
        # Initialise the MLP *before* creating the prototype layer so the
        # weight-norm parametrisation below is not overwritten.
        self.apply(self._init_weights)
        self.last_layer = weight_norm(nn.Linear(bottleneck_dim, out_dim, bias=False))
        self.last_layer.weight_g.data.fill_(1)

    def _init_weights(self, m):
        # Truncated-normal weights and zero biases, linear layers only.
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=0.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)

    def forward(self, x):
        projected = self.mlp(x)
        # fp16 cannot represent 1e-12, so loosen the normalisation eps there.
        eps = 1e-6 if projected.dtype == torch.float16 else 1e-12
        normalized = nn.functional.normalize(projected, dim=-1, p=2, eps=eps)
        return self.last_layer(normalized)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=None, use_bn=False, bias=True):
|
| 45 |
+
if nlayers == 1:
|
| 46 |
+
return nn.Linear(in_dim, bottleneck_dim, bias=bias)
|
| 47 |
+
else:
|
| 48 |
+
layers = [nn.Linear(in_dim, hidden_dim, bias=bias)]
|
| 49 |
+
if use_bn:
|
| 50 |
+
layers.append(nn.BatchNorm1d(hidden_dim))
|
| 51 |
+
layers.append(nn.GELU())
|
| 52 |
+
for _ in range(nlayers - 2):
|
| 53 |
+
layers.append(nn.Linear(hidden_dim, hidden_dim, bias=bias))
|
| 54 |
+
if use_bn:
|
| 55 |
+
layers.append(nn.BatchNorm1d(hidden_dim))
|
| 56 |
+
layers.append(nn.GELU())
|
| 57 |
+
layers.append(nn.Linear(hidden_dim, bottleneck_dim, bias=bias))
|
| 58 |
+
return nn.Sequential(*layers)
|
models/encoders/dinov2/layers/drop_path.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
#
|
| 3 |
+
# This source code is licensed under the Apache License, Version 2.0
|
| 4 |
+
# found in the LICENSE file in the root directory of this source tree.
|
| 5 |
+
|
| 6 |
+
# References:
|
| 7 |
+
# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
|
| 8 |
+
# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
from torch import nn
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def drop_path(x, drop_prob: float = 0.0, training: bool = False):
    """Randomly zero whole samples of x (stochastic depth), rescaling the
    survivors by 1/keep_prob so the expected value is unchanged. Acts as the
    identity when drop_prob == 0 or outside training mode."""
    if not training or drop_prob == 0.0:
        return x
    keep_prob = 1.0 - drop_prob
    # One Bernoulli draw per sample, broadcast over all remaining dims.
    mask_shape = (x.shape[0],) + (1,) * (x.ndim - 1)
    mask = x.new_empty(mask_shape).bernoulli_(keep_prob)
    if keep_prob > 0.0:
        mask.div_(keep_prob)
    return x * mask
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob: float = 0.0):
        # BUGFIX: the default used to be None, which made `1 - drop_prob`
        # raise a TypeError the first time the module ran in training mode.
        # 0.0 preserves the same (identity) behaviour everywhere else and is
        # safe in training mode.
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        # Only active when self.training is True and drop_prob > 0.
        return drop_path(x, self.drop_prob, self.training)
|
models/encoders/dinov2/layers/layer_scale.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
#
|
| 3 |
+
# This source code is licensed under the Apache License, Version 2.0
|
| 4 |
+
# found in the LICENSE file in the root directory of this source tree.
|
| 5 |
+
|
| 6 |
+
# Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110
|
| 7 |
+
|
| 8 |
+
from typing import Union
|
| 9 |
+
|
| 10 |
+
import torch
|
| 11 |
+
from torch import Tensor
|
| 12 |
+
from torch import nn
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class LayerScale(nn.Module):
    """Learnable per-channel scaling of a residual branch (timm-style)."""

    def __init__(
        self,
        dim: int,
        init_values: Union[float, Tensor] = 1e-5,
        inplace: bool = False,
    ) -> None:
        super().__init__()
        self.inplace = inplace
        # gamma starts near zero (default 1e-5) so the branch opens gradually.
        self.gamma = nn.Parameter(init_values * torch.ones(dim))

    def forward(self, x: Tensor) -> Tensor:
        if self.inplace:
            return x.mul_(self.gamma)
        return x * self.gamma
|
models/encoders/dinov2/layers/mlp.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
#
|
| 3 |
+
# This source code is licensed under the Apache License, Version 2.0
|
| 4 |
+
# found in the LICENSE file in the root directory of this source tree.
|
| 5 |
+
|
| 6 |
+
# References:
|
| 7 |
+
# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
|
| 8 |
+
# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
from typing import Callable, Optional
|
| 12 |
+
|
| 13 |
+
from torch import Tensor, nn
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class Mlp(nn.Module):
    """Standard transformer MLP: Linear -> activation -> dropout -> Linear -> dropout."""

    def __init__(
        self,
        in_features: int,
        hidden_features: Optional[int] = None,
        out_features: Optional[int] = None,
        act_layer: Callable[..., nn.Module] = nn.GELU,
        drop: float = 0.0,
        bias: bool = True,
    ) -> None:
        super().__init__()
        # Missing widths default to the input width.
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features, bias=bias)
        self.drop = nn.Dropout(drop)

    def forward(self, x: Tensor) -> Tensor:
        hidden = self.drop(self.act(self.fc1(x)))
        return self.drop(self.fc2(hidden))
|
models/encoders/dinov2/layers/patch_embed.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
#
|
| 3 |
+
# This source code is licensed under the Apache License, Version 2.0
|
| 4 |
+
# found in the LICENSE file in the root directory of this source tree.
|
| 5 |
+
|
| 6 |
+
# References:
|
| 7 |
+
# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
|
| 8 |
+
# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
|
| 9 |
+
|
| 10 |
+
from typing import Callable, Optional, Tuple, Union
|
| 11 |
+
|
| 12 |
+
from torch import Tensor
|
| 13 |
+
import torch.nn as nn
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def make_2tuple(x):
    """Normalise x to a pair: an int becomes (x, x); a 2-tuple passes through.

    Anything else (wrong tuple length or wrong type) fails an assertion.
    """
    if isinstance(x, int):
        return (x, x)
    assert isinstance(x, tuple) and len(x) == 2
    return x
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class PatchEmbed(nn.Module):
    """
    2D image to patch embedding: (B,C,H,W) -> (B,N,D)

    Args:
        img_size: Image size.
        patch_size: Patch token size.
        in_chans: Number of input image channels.
        embed_dim: Number of linear projection output channels.
        norm_layer: Normalization layer.
        flatten_embedding: If False, keep the (B, H', W', D) grid layout.
    """

    def __init__(
        self,
        img_size: Union[int, Tuple[int, int]] = 224,
        patch_size: Union[int, Tuple[int, int]] = 16,
        in_chans: int = 3,
        embed_dim: int = 768,
        norm_layer: Optional[Callable] = None,
        flatten_embedding: bool = True,
    ) -> None:
        super().__init__()

        image_hw = make_2tuple(img_size)
        patch_hw = make_2tuple(patch_size)
        grid_hw = (image_hw[0] // patch_hw[0], image_hw[1] // patch_hw[1])

        self.img_size = image_hw
        self.patch_size = patch_hw
        self.patches_resolution = grid_hw
        self.num_patches = grid_hw[0] * grid_hw[1]

        self.in_chans = in_chans
        self.embed_dim = embed_dim
        self.flatten_embedding = flatten_embedding

        # A non-overlapping convolution is exactly a per-patch linear projection.
        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_hw, stride=patch_hw)
        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()

    def forward(self, x: Tensor) -> Tensor:
        _, _, H, W = x.shape
        patch_H, patch_W = self.patch_size

        assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}"
        assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}"

        x = self.proj(x)  # B C H' W'
        H, W = x.size(2), x.size(3)
        x = x.flatten(2).transpose(1, 2)  # B H'W' C
        x = self.norm(x)
        if not self.flatten_embedding:
            x = x.reshape(-1, H, W, self.embed_dim)  # B H' W' C
        return x

    def flops(self) -> float:
        grid_h, grid_w = self.patches_resolution
        n_patches = grid_h * grid_w
        total = n_patches * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1])
        # self.norm is nn.Identity when no norm_layer is given, so this branch
        # is always taken; kept for parity with the original accounting.
        if self.norm is not None:
            total += n_patches * self.embed_dim
        return total
|
models/encoders/dinov2/layers/swiglu_ffn.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
#
|
| 3 |
+
# This source code is licensed under the Apache License, Version 2.0
|
| 4 |
+
# found in the LICENSE file in the root directory of this source tree.
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
from typing import Callable, Optional
|
| 8 |
+
import warnings
|
| 9 |
+
|
| 10 |
+
from torch import Tensor, nn
|
| 11 |
+
import torch.nn.functional as F
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class SwiGLUFFN(nn.Module):
    """SwiGLU feed-forward: one fused Linear yields both the gate and value
    halves; the output is w3(silu(gate) * value)."""

    def __init__(
        self,
        in_features: int,
        hidden_features: Optional[int] = None,
        out_features: Optional[int] = None,
        act_layer: Callable[..., nn.Module] = None,
        drop: float = 0.0,
        bias: bool = True,
    ) -> None:
        super().__init__()
        # act_layer and drop are accepted for interface parity but unused.
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias)
        self.w3 = nn.Linear(hidden_features, out_features, bias=bias)

    def forward(self, x: Tensor) -> Tensor:
        gate, value = self.w12(x).chunk(2, dim=-1)
        return self.w3(F.silu(gate) * value)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
# Prefer xFormers' fused SwiGLU when the library is importable and not
# explicitly disabled via the XFORMERS_DISABLED environment variable;
# otherwise fall back to the pure-PyTorch SwiGLUFFN defined above.
XFORMERS_ENABLED = os.environ.get("XFORMERS_DISABLED") is None
try:
    if XFORMERS_ENABLED:
        from xformers.ops import SwiGLU

        XFORMERS_AVAILABLE = True
        warnings.warn("xFormers is available (SwiGLU)")
    else:
        # Disabled by the environment: route into the fallback branch below.
        warnings.warn("xFormers is disabled (SwiGLU)")
        raise ImportError
except ImportError:
    SwiGLU = SwiGLUFFN
    XFORMERS_AVAILABLE = False

    warnings.warn("xFormers is not available (SwiGLU)")
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
class SwiGLUFFNFused(SwiGLU):
    """SwiGLU variant whose hidden width is scaled by 2/3 (keeping the
    parameter count comparable to a plain 4x MLP) and rounded up to a
    multiple of 8 for hardware-friendly shapes."""

    def __init__(
        self,
        in_features: int,
        hidden_features: Optional[int] = None,
        out_features: Optional[int] = None,
        act_layer: Callable[..., nn.Module] = None,
        drop: float = 0.0,
        bias: bool = True,
    ) -> None:
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        # 2/3 scaling, then round up to the next multiple of 8.
        scaled = int(hidden_features * 2 / 3)
        hidden_features = ((scaled + 7) // 8) * 8
        super().__init__(
            in_features=in_features,
            hidden_features=hidden_features,
            out_features=out_features,
            bias=bias,
        )
|
models/encoders/dinov2/models/__init__.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 2 |
+
#
|
| 3 |
+
# This source code is licensed under the Apache License, Version 2.0
|
| 4 |
+
# found in the LICENSE file in the root directory of this source tree.
|
| 5 |
+
|
| 6 |
+
import logging
|
| 7 |
+
|
| 8 |
+
from . import vision_transformer as vits
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
logger = logging.getLogger("dinov2")
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def build_model(args, only_teacher=False, img_size=224):
    """Instantiate DINOv2 ViT(s) from a config namespace.

    Returns (teacher, embed_dim) when only_teacher is set, otherwise
    (student, teacher, embed_dim). Non-ViT arch names fall through and
    return None, as in the original.
    """
    # Strip the legacy "_memeff" suffix before looking the arch up.
    args.arch = args.arch.removesuffix("_memeff")
    if "vit" not in args.arch:
        return None

    shared_kwargs = dict(
        img_size=img_size,
        patch_size=args.patch_size,
        init_values=args.layerscale,
        ffn_layer=args.ffn_layer,
        block_chunks=args.block_chunks,
        qkv_bias=args.qkv_bias,
        proj_bias=args.proj_bias,
        ffn_bias=args.ffn_bias,
        num_register_tokens=args.num_register_tokens,
        interpolate_offset=args.interpolate_offset,
        interpolate_antialias=args.interpolate_antialias,
    )
    model_cls = vits.__dict__[args.arch]

    teacher = model_cls(**shared_kwargs)
    if only_teacher:
        return teacher, teacher.embed_dim

    # The student additionally gets stochastic depth.
    student = model_cls(
        **shared_kwargs,
        drop_path_rate=args.drop_path_rate,
        drop_path_uniform=args.drop_path_uniform,
    )
    return student, teacher, student.embed_dim
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def build_model_from_cfg(cfg, only_teacher=False):
    """Build model(s) from a config object exposing `student` and `crops` sections."""
    student_args = cfg.student
    crop_size = cfg.crops.global_crops_size
    return build_model(student_args, only_teacher=only_teacher, img_size=crop_size)
|
models/encoders/dinov2/models/vision_transformer.py
ADDED
|
@@ -0,0 +1,542 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Modified from DINOv2 (https://github.com/facebookresearch/dinov2)
|
| 2 |
+
# ------------------------------------------------------------------------
|
| 3 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
| 4 |
+
#
|
| 5 |
+
# This source code is licensed under the Apache License, Version 2.0
|
| 6 |
+
# found in the LICENSE file in the root directory of this source tree.
|
| 7 |
+
|
| 8 |
+
# References:
|
| 9 |
+
# https://github.com/facebookresearch/dino/blob/main/vision_transformer.py
|
| 10 |
+
# https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
|
| 11 |
+
|
| 12 |
+
from functools import partial
|
| 13 |
+
import math
|
| 14 |
+
import logging
|
| 15 |
+
from typing import Sequence, Tuple, Union, Callable
|
| 16 |
+
|
| 17 |
+
import torch
|
| 18 |
+
import torch.nn as nn
|
| 19 |
+
import torch.utils.checkpoint
|
| 20 |
+
from torch.nn.init import trunc_normal_
|
| 21 |
+
|
| 22 |
+
import copy
|
| 23 |
+
|
| 24 |
+
from models.encoders.dinov2.layers import Mlp, PatchEmbed, SwiGLUFFNFused, MemEffAttention, NestedTensorBlock as Block
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
logger = logging.getLogger("dinov2")
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def named_apply(fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False) -> nn.Module:
    """Recursively invoke fn(module=..., name=...) over a module tree.

    depth_first controls whether fn sees children before their parent;
    include_root controls whether fn is called on `module` itself. Names are
    dot-joined paths as in Module.named_modules(). Returns `module`.
    """
    if include_root and not depth_first:
        fn(module=module, name=name)
    for child_name, child in module.named_children():
        qualified = f"{name}.{child_name}" if name else child_name
        named_apply(fn=fn, module=child, name=qualified, depth_first=depth_first, include_root=True)
    if include_root and depth_first:
        fn(module=module, name=name)
    return module
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
class BlockChunk(nn.ModuleList):
    """A ModuleList that is itself callable: applies its blocks in sequence."""

    def forward(self, x):
        out = x
        for block in self:
            out = block(out)
        return out
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
class DinoVisionTransformer(nn.Module):
|
| 49 |
+
    def __init__(
        self,
        img_size=224,
        patch_size=16,
        in_chans=3,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4.0,
        qkv_bias=True,
        ffn_bias=True,
        proj_bias=True,
        drop_path_rate=0.0,
        drop_path_uniform=False,
        init_values=None,  # for layerscale: None or 0 => no layerscale
        embed_layer=PatchEmbed,
        act_layer=nn.GELU,
        block_fn=Block,
        ffn_layer="mlp",
        block_chunks=1,
        num_register_tokens=0,
        interpolate_antialias=False,
        interpolate_offset=0.1,
        num_additional_blocks = 0,
    ):
        """
        Args:
            img_size (int, tuple): input image size
            patch_size (int, tuple): patch size
            in_chans (int): number of input channels
            embed_dim (int): embedding dimension
            depth (int): depth of transformer
            num_heads (int): number of attention heads
            mlp_ratio (int): ratio of mlp hidden dim to embedding dim
            qkv_bias (bool): enable bias for qkv if True
            proj_bias (bool): enable bias for proj in attn if True
            ffn_bias (bool): enable bias for ffn if True
            drop_path_rate (float): stochastic depth rate
            drop_path_uniform (bool): apply uniform drop rate across blocks
            weight_init (str): weight init scheme
            init_values (float): layer-scale init values
            embed_layer (nn.Module): patch embedding layer
            act_layer (nn.Module): MLP activation layer
            block_fn (nn.Module): transformer block class
            ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity"
            block_chunks: (int) split block sequence into block_chunks units for FSDP wrap
            num_register_tokens: (int) number of extra cls tokens (so-called "registers")
            interpolate_antialias: (str) flag to apply anti-aliasing when interpolating positional embeddings
            interpolate_offset: (float) work-around offset to apply when interpolating positional embeddings
            num_additional_blocks: (int) number of leading blocks to duplicate
                into `self.additional_blocks` (deep copies; incompatible with
                block chunking)
        """
        super().__init__()
        norm_layer = partial(nn.LayerNorm, eps=1e-6)

        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
        self.num_tokens = 1  # the single [CLS] token
        self.n_blocks = depth
        self.num_heads = num_heads
        self.patch_size = patch_size
        self.num_register_tokens = num_register_tokens
        self.interpolate_antialias = interpolate_antialias
        self.interpolate_offset = interpolate_offset
        self.img_size = img_size

        self.patch_embed = embed_layer(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
        num_patches = self.patch_embed.num_patches

        # Learned [CLS] token and position embeddings (patches + cls slot).
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim))
        assert num_register_tokens >= 0
        # Optional register tokens; they carry no positional embedding.
        self.register_tokens = (
            nn.Parameter(torch.zeros(1, num_register_tokens, embed_dim)) if num_register_tokens else None
        )

        # Per-block stochastic-depth rates: constant or linearly increasing.
        if drop_path_uniform is True:
            dpr = [drop_path_rate] * depth
        else:
            dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule

        # Resolve the FFN implementation from its string name.
        if ffn_layer == "mlp":
            logger.info("using MLP layer as FFN")
            ffn_layer = Mlp
        elif ffn_layer == "swiglufused" or ffn_layer == "swiglu":
            logger.info("using SwiGLU layer as FFN")
            ffn_layer = SwiGLUFFNFused
        elif ffn_layer == "identity":
            logger.info("using Identity layer as FFN")

            def f(*args, **kwargs):
                return nn.Identity()

            ffn_layer = f
        else:
            raise NotImplementedError

        blocks_list = [
            block_fn(
                dim=embed_dim,
                num_heads=num_heads,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                proj_bias=proj_bias,
                ffn_bias=ffn_bias,
                drop_path=dpr[i],
                norm_layer=norm_layer,
                act_layer=act_layer,
                ffn_layer=ffn_layer,
                init_values=init_values,
            )
            for i in range(depth)
        ]
        if block_chunks > 0:
            # Group blocks into chunks (for FSDP wrapping), padding each chunk
            # with nn.Identity so block indices stay globally consistent.
            self.chunked_blocks = True
            chunked_blocks = []
            chunksize = depth // block_chunks
            for i in range(0, depth, chunksize):
                # this is to keep the block index consistent if we chunk the block list
                chunked_blocks.append([nn.Identity()] * i + blocks_list[i : i + chunksize])
            self.blocks = nn.ModuleList([BlockChunk(p) for p in chunked_blocks])
        else:
            self.chunked_blocks = False
            self.blocks = nn.ModuleList(blocks_list)

        if num_additional_blocks > 0:
            # Deep-copied duplicates of the first N blocks; not supported with
            # chunked block lists.
            assert not self.chunked_blocks
            self.additional_blocks = copy.deepcopy(self.blocks[:num_additional_blocks])

        self.norm = norm_layer(embed_dim)
        self.head = nn.Identity()

        # Token substituted for masked patches during masked-image modelling.
        self.mask_token = nn.Parameter(torch.zeros(1, embed_dim))

        self.init_weights()
|
| 181 |
+
|
| 182 |
+
    def init_weights(self):
        """Initialize the learned tokens/position embeddings, then apply
        timm-style weight init to every submodule."""
        trunc_normal_(self.pos_embed, std=0.02)
        nn.init.normal_(self.cls_token, std=1e-6)
        if self.register_tokens is not None:
            nn.init.normal_(self.register_tokens, std=1e-6)
        # NOTE(review): init_weights_vit_timm is defined elsewhere in this
        # module (not visible in this chunk).
        named_apply(init_weights_vit_timm, self)
|
| 188 |
+
|
| 189 |
+
def interpolate_pos_encoding(self, x, w, h, with_cls_token = True):
|
| 190 |
+
previous_dtype = x.dtype
|
| 191 |
+
# npatch = x.shape[1] - 1 if with_cls_token else x.shape[1]
|
| 192 |
+
N = self.pos_embed.shape[1] - 1
|
| 193 |
+
# if npatch == N and w == h:
|
| 194 |
+
# return self.pos_embed
|
| 195 |
+
pos_embed = self.pos_embed.float()
|
| 196 |
+
class_pos_embed = pos_embed[:, 0]
|
| 197 |
+
patch_pos_embed = pos_embed[:, 1:]
|
| 198 |
+
dim = x.shape[-1]
|
| 199 |
+
w0 = w // self.patch_size
|
| 200 |
+
h0 = h // self.patch_size
|
| 201 |
+
M = int(math.sqrt(N)) # Recover the number of patches in each dimension
|
| 202 |
+
assert N == M * M
|
| 203 |
+
kwargs = {}
|
| 204 |
+
if self.interpolate_offset:
|
| 205 |
+
# Historical kludge: add a small number to avoid floating point error in the interpolation, see https://github.com/facebookresearch/dino/issues/8
|
| 206 |
+
# Note: still needed for backward-compatibility, the underlying operators are using both output size and scale factors
|
| 207 |
+
sx = float(w0 + self.interpolate_offset) / M
|
| 208 |
+
sy = float(h0 + self.interpolate_offset) / M
|
| 209 |
+
kwargs["scale_factor"] = (sx, sy)
|
| 210 |
+
else:
|
| 211 |
+
# Simply specify an output size instead of a scale factor
|
| 212 |
+
kwargs["size"] = (w0, h0)
|
| 213 |
+
patch_pos_embed = nn.functional.interpolate(
|
| 214 |
+
patch_pos_embed.reshape(1, M, M, dim).permute(0, 3, 1, 2),
|
| 215 |
+
mode="bicubic",
|
| 216 |
+
antialias=self.interpolate_antialias,
|
| 217 |
+
**kwargs,
|
| 218 |
+
)
|
| 219 |
+
assert (w0, h0) == patch_pos_embed.shape[-2:]
|
| 220 |
+
patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
|
| 221 |
+
if with_cls_token:
|
| 222 |
+
return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to(previous_dtype)
|
| 223 |
+
else:
|
| 224 |
+
return patch_pos_embed.to(previous_dtype)
|
| 225 |
+
|
| 226 |
+
def interpolate_pos_encoding2(self, x, input_size, feature_h, feature_w):
|
| 227 |
+
previous_dtype = x.dtype
|
| 228 |
+
N = self.pos_embed.shape[1] - 1
|
| 229 |
+
|
| 230 |
+
pos_embed = self.pos_embed.float()
|
| 231 |
+
patch_pos_embed = pos_embed[:, 1:]
|
| 232 |
+
dim = x.shape[-1]
|
| 233 |
+
w0 = input_size // self.patch_size
|
| 234 |
+
h0 = input_size // self.patch_size
|
| 235 |
+
M = int(math.sqrt(N)) # Recover the number of patches in each dimension
|
| 236 |
+
assert N == M * M
|
| 237 |
+
kwargs = {}
|
| 238 |
+
if self.interpolate_offset:
|
| 239 |
+
# Historical kludge: add a small number to avoid floating point error in the interpolation, see https://github.com/facebookresearch/dino/issues/8
|
| 240 |
+
# Note: still needed for backward-compatibility, the underlying operators are using both output size and scale factors
|
| 241 |
+
sx = float(w0 + self.interpolate_offset) / M
|
| 242 |
+
sy = float(h0 + self.interpolate_offset) / M
|
| 243 |
+
kwargs["scale_factor"] = (sx, sy)
|
| 244 |
+
else:
|
| 245 |
+
# Simply specify an output size instead of a scale factor
|
| 246 |
+
kwargs["size"] = (w0, h0)
|
| 247 |
+
patch_pos_embed = nn.functional.interpolate(
|
| 248 |
+
patch_pos_embed.reshape(1, M, M, dim).permute(0, 3, 1, 2),
|
| 249 |
+
mode="bicubic",
|
| 250 |
+
antialias=self.interpolate_antialias,
|
| 251 |
+
**kwargs,
|
| 252 |
+
)
|
| 253 |
+
assert (w0, h0) == patch_pos_embed.shape[-2:]
|
| 254 |
+
patch_pos_embed = patch_pos_embed[...,:feature_h,:feature_w].permute(0, 2, 3, 1).reshape(1, -1, dim)
|
| 255 |
+
return patch_pos_embed.to(previous_dtype)
|
| 256 |
+
|
| 257 |
+
def interpolate_pos_encoding3(self, target_size):
|
| 258 |
+
# previous_dtype = x.dtype
|
| 259 |
+
N = self.pos_embed.shape[1] - 1
|
| 260 |
+
pos_embed = self.pos_embed.float()
|
| 261 |
+
patch_pos_embed = pos_embed[:, 1:]
|
| 262 |
+
dim = self.embed_dim
|
| 263 |
+
w0 = target_size
|
| 264 |
+
h0 = target_size
|
| 265 |
+
M = int(math.sqrt(N)) # Recover the number of patches in each dimension
|
| 266 |
+
assert N == M * M
|
| 267 |
+
kwargs = {}
|
| 268 |
+
if self.interpolate_offset:
|
| 269 |
+
# Historical kludge: add a small number to avoid floating point error in the interpolation, see https://github.com/facebookresearch/dino/issues/8
|
| 270 |
+
# Note: still needed for backward-compatibility, the underlying operators are using both output size and scale factors
|
| 271 |
+
sx = float(w0 + self.interpolate_offset) / M
|
| 272 |
+
sy = float(h0 + self.interpolate_offset) / M
|
| 273 |
+
kwargs["scale_factor"] = (sx, sy)
|
| 274 |
+
else:
|
| 275 |
+
# Simply specify an output size instead of a scale factor
|
| 276 |
+
kwargs["size"] = (w0, h0)
|
| 277 |
+
patch_pos_embed = nn.functional.interpolate(
|
| 278 |
+
patch_pos_embed.reshape(1, M, M, dim).permute(0, 3, 1, 2),
|
| 279 |
+
mode="bicubic",
|
| 280 |
+
antialias=self.interpolate_antialias,
|
| 281 |
+
**kwargs,
|
| 282 |
+
)
|
| 283 |
+
assert (w0, h0) == patch_pos_embed.shape[-2:]
|
| 284 |
+
patch_pos_embed = patch_pos_embed.squeeze(0).permute(1,2,0)
|
| 285 |
+
|
| 286 |
+
return patch_pos_embed
|
| 287 |
+
|
| 288 |
+
def prepare_tokens_with_masks(self, x, masks=None, with_pos_embed = True):
|
| 289 |
+
B, nc, w, h = x.shape
|
| 290 |
+
x = self.patch_embed(x)
|
| 291 |
+
if masks is not None:
|
| 292 |
+
x = torch.where(masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x)
|
| 293 |
+
|
| 294 |
+
x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
|
| 295 |
+
|
| 296 |
+
if with_pos_embed:
|
| 297 |
+
x = x + self.interpolate_pos_encoding(x, w, h)
|
| 298 |
+
|
| 299 |
+
if self.register_tokens is not None:
|
| 300 |
+
x = torch.cat(
|
| 301 |
+
(
|
| 302 |
+
x[:, :1],
|
| 303 |
+
self.register_tokens.expand(x.shape[0], -1, -1),
|
| 304 |
+
x[:, 1:],
|
| 305 |
+
),
|
| 306 |
+
dim=1,
|
| 307 |
+
)
|
| 308 |
+
|
| 309 |
+
return x
|
| 310 |
+
|
| 311 |
+
def prepare_tokens_with_masks2(self, x, masks):
|
| 312 |
+
assert masks.ndim == 3
|
| 313 |
+
B, nc, w, h = x.shape
|
| 314 |
+
token_lens = masks.flatten(1).sum(1).tolist()
|
| 315 |
+
patched_x = x.view(B, 3, w//self.patch_size, self.patch_size, h//self.patch_size, self.patch_size)
|
| 316 |
+
patched_x = patched_x.permute(0, 2, 4, 1, 3, 5)
|
| 317 |
+
x = self.patch_embed.norm(self.patch_embed.proj(patched_x[masks]).flatten(1))
|
| 318 |
+
pos_embed = self.interpolate_pos_encoding(x, w, h, with_cls_token=False).repeat(B, 1, 1)
|
| 319 |
+
x = x + pos_embed[masks.flatten(1)]
|
| 320 |
+
|
| 321 |
+
cr_token = self.cls_token.view(1,-1) + self.pos_embed.float()[:,0].view(1,-1)
|
| 322 |
+
if self.register_tokens is not None:
|
| 323 |
+
cr_token = torch.cat([cr_token, self.register_tokens.view(self.num_register_tokens, -1)])
|
| 324 |
+
|
| 325 |
+
# x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
|
| 326 |
+
|
| 327 |
+
# if self.register_tokens is not None:
|
| 328 |
+
# x = torch.cat(
|
| 329 |
+
# (
|
| 330 |
+
# x[:, :1],
|
| 331 |
+
# self.register_tokens.expand(x.shape[0], -1, -1),
|
| 332 |
+
# x[:, 1:],
|
| 333 |
+
# ),
|
| 334 |
+
# dim=1,
|
| 335 |
+
# )
|
| 336 |
+
|
| 337 |
+
return x, token_lens, cr_token
|
| 338 |
+
|
| 339 |
+
def forward_features_list(self, x_list, masks_list):
|
| 340 |
+
x = [self.prepare_tokens_with_masks(x, masks) for x, masks in zip(x_list, masks_list)]
|
| 341 |
+
for blk in self.blocks:
|
| 342 |
+
x = blk(x)
|
| 343 |
+
|
| 344 |
+
all_x = x
|
| 345 |
+
output = []
|
| 346 |
+
for x, masks in zip(all_x, masks_list):
|
| 347 |
+
x_norm = self.norm(x)
|
| 348 |
+
output.append(
|
| 349 |
+
{
|
| 350 |
+
"x_norm_clstoken": x_norm[:, 0],
|
| 351 |
+
"x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
|
| 352 |
+
"x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
|
| 353 |
+
"x_prenorm": x,
|
| 354 |
+
"masks": masks,
|
| 355 |
+
}
|
| 356 |
+
)
|
| 357 |
+
return output
|
| 358 |
+
|
| 359 |
+
def forward_features(self, x, masks=None):
|
| 360 |
+
if isinstance(x, list):
|
| 361 |
+
return self.forward_features_list(x, masks)
|
| 362 |
+
|
| 363 |
+
x = self.prepare_tokens_with_masks(x, masks)
|
| 364 |
+
|
| 365 |
+
for blk in self.blocks:
|
| 366 |
+
x = blk(x)
|
| 367 |
+
|
| 368 |
+
x_norm = self.norm(x)
|
| 369 |
+
return {
|
| 370 |
+
"x_norm_clstoken": x_norm[:, 0],
|
| 371 |
+
"x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
|
| 372 |
+
"x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
|
| 373 |
+
"x_prenorm": x,
|
| 374 |
+
"masks": masks,
|
| 375 |
+
}
|
| 376 |
+
|
| 377 |
+
def forward_specific_layers(self, x, start=0, end=None, norm=True):
|
| 378 |
+
assert not self.chunked_blocks
|
| 379 |
+
if end is None:
|
| 380 |
+
end = len(self.blocks)
|
| 381 |
+
for blk in self.blocks[start:end]:
|
| 382 |
+
x = blk(x)
|
| 383 |
+
out = x[:, 1 + self.num_register_tokens :]
|
| 384 |
+
if norm:
|
| 385 |
+
out = self.norm(out)
|
| 386 |
+
return x, out
|
| 387 |
+
|
| 388 |
+
def forward_specific_layers_list(self, x_list, start=0, end=None, norm=True, get_feature=True):
|
| 389 |
+
assert not self.chunked_blocks
|
| 390 |
+
if end is None:
|
| 391 |
+
end = len(self.blocks)
|
| 392 |
+
for blk in self.blocks[start:end]:
|
| 393 |
+
x_list = blk(x_list)
|
| 394 |
+
|
| 395 |
+
if get_feature:
|
| 396 |
+
out_list = [x[:, 1 + self.num_register_tokens:, :] for x in x_list]
|
| 397 |
+
if norm:
|
| 398 |
+
out_list = [self.norm(out) for out in out_list]
|
| 399 |
+
return x_list, out_list
|
| 400 |
+
else:
|
| 401 |
+
return x_list
|
| 402 |
+
|
| 403 |
+
def forward_additional_layers_list(self, x_list, start=0, end=None, norm=True, get_feature=True):
|
| 404 |
+
assert not self.chunked_blocks
|
| 405 |
+
if end is None:
|
| 406 |
+
end = len(self.additional_blocks)
|
| 407 |
+
for blk in self.additional_blocks[start:end]:
|
| 408 |
+
x_list = blk(x_list)
|
| 409 |
+
|
| 410 |
+
if get_feature:
|
| 411 |
+
out_list = [x[:, 1 + self.num_register_tokens:, :] for x in x_list]
|
| 412 |
+
if norm:
|
| 413 |
+
out_list = [self.norm(out) for out in out_list]
|
| 414 |
+
return x_list, out_list
|
| 415 |
+
else:
|
| 416 |
+
return x_list
|
| 417 |
+
|
| 418 |
+
def _get_intermediate_layers_not_chunked(self, x, n=1):
|
| 419 |
+
x = self.prepare_tokens_with_masks(x)
|
| 420 |
+
# If n is an int, take the n last blocks. If it's a list, take them
|
| 421 |
+
output, total_block_len = [], len(self.blocks)
|
| 422 |
+
blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
|
| 423 |
+
for i, blk in enumerate(self.blocks):
|
| 424 |
+
x = blk(x)
|
| 425 |
+
if i in blocks_to_take:
|
| 426 |
+
output.append(x)
|
| 427 |
+
assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
|
| 428 |
+
return output
|
| 429 |
+
|
| 430 |
+
def _get_intermediate_layers_chunked(self, x, n=1):
|
| 431 |
+
x = self.prepare_tokens_with_masks(x)
|
| 432 |
+
output, i, total_block_len = [], 0, len(self.blocks[-1])
|
| 433 |
+
# If n is an int, take the n last blocks. If it's a list, take them
|
| 434 |
+
blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
|
| 435 |
+
for block_chunk in self.blocks:
|
| 436 |
+
for blk in block_chunk[i:]: # Passing the nn.Identity()
|
| 437 |
+
x = blk(x)
|
| 438 |
+
if i in blocks_to_take:
|
| 439 |
+
output.append(x)
|
| 440 |
+
i += 1
|
| 441 |
+
assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
|
| 442 |
+
return output
|
| 443 |
+
|
| 444 |
+
def get_intermediate_layers(
|
| 445 |
+
self,
|
| 446 |
+
x: torch.Tensor,
|
| 447 |
+
n: Union[int, Sequence] = 1, # Layers or n last layers to take
|
| 448 |
+
reshape: bool = False,
|
| 449 |
+
return_class_token: bool = False,
|
| 450 |
+
norm=True,
|
| 451 |
+
) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]:
|
| 452 |
+
if self.chunked_blocks:
|
| 453 |
+
outputs = self._get_intermediate_layers_chunked(x, n)
|
| 454 |
+
else:
|
| 455 |
+
outputs = self._get_intermediate_layers_not_chunked(x, n)
|
| 456 |
+
if norm:
|
| 457 |
+
outputs = [self.norm(out) for out in outputs]
|
| 458 |
+
class_tokens = [out[:, 0] for out in outputs]
|
| 459 |
+
outputs = [out[:, 1 + self.num_register_tokens :] for out in outputs]
|
| 460 |
+
if reshape:
|
| 461 |
+
B, _, w, h = x.shape
|
| 462 |
+
outputs = [
|
| 463 |
+
out.reshape(B, w // self.patch_size, h // self.patch_size, -1).permute(0, 3, 1, 2).contiguous()
|
| 464 |
+
for out in outputs
|
| 465 |
+
]
|
| 466 |
+
if return_class_token:
|
| 467 |
+
return tuple(zip(outputs, class_tokens))
|
| 468 |
+
return tuple(outputs)
|
| 469 |
+
|
| 470 |
+
def forward(self, *args, is_training=False, **kwargs):
|
| 471 |
+
ret = self.forward_features(*args, **kwargs)
|
| 472 |
+
if is_training:
|
| 473 |
+
return ret
|
| 474 |
+
else:
|
| 475 |
+
return self.head(ret["x_norm_clstoken"])
|
| 476 |
+
|
| 477 |
+
|
| 478 |
+
def init_weights_vit_timm(module: nn.Module, name: str = ""):
|
| 479 |
+
"""ViT weight initialization, original timm impl (for reproducibility)"""
|
| 480 |
+
if isinstance(module, nn.Linear):
|
| 481 |
+
trunc_normal_(module.weight, std=0.02)
|
| 482 |
+
if module.bias is not None:
|
| 483 |
+
nn.init.zeros_(module.bias)
|
| 484 |
+
|
| 485 |
+
|
| 486 |
+
def vit_small(patch_size=16, num_register_tokens=0, **kwargs):
|
| 487 |
+
model = DinoVisionTransformer(
|
| 488 |
+
patch_size=patch_size,
|
| 489 |
+
embed_dim=384,
|
| 490 |
+
depth=12,
|
| 491 |
+
num_heads=6,
|
| 492 |
+
mlp_ratio=4,
|
| 493 |
+
block_fn=partial(Block, attn_class=MemEffAttention),
|
| 494 |
+
num_register_tokens=num_register_tokens,
|
| 495 |
+
**kwargs,
|
| 496 |
+
)
|
| 497 |
+
return model
|
| 498 |
+
|
| 499 |
+
|
| 500 |
+
def vit_base(patch_size=16, num_register_tokens=0, **kwargs):
|
| 501 |
+
model = DinoVisionTransformer(
|
| 502 |
+
patch_size=patch_size,
|
| 503 |
+
embed_dim=768,
|
| 504 |
+
depth=12,
|
| 505 |
+
num_heads=12,
|
| 506 |
+
mlp_ratio=4,
|
| 507 |
+
block_fn=partial(Block, attn_class=MemEffAttention),
|
| 508 |
+
num_register_tokens=num_register_tokens,
|
| 509 |
+
**kwargs,
|
| 510 |
+
)
|
| 511 |
+
return model
|
| 512 |
+
|
| 513 |
+
|
| 514 |
+
def vit_large(patch_size=16, num_register_tokens=0, **kwargs):
|
| 515 |
+
model = DinoVisionTransformer(
|
| 516 |
+
patch_size=patch_size,
|
| 517 |
+
embed_dim=1024,
|
| 518 |
+
depth=24,
|
| 519 |
+
num_heads=16,
|
| 520 |
+
mlp_ratio=4,
|
| 521 |
+
block_fn=partial(Block, attn_class=MemEffAttention),
|
| 522 |
+
num_register_tokens=num_register_tokens,
|
| 523 |
+
**kwargs,
|
| 524 |
+
)
|
| 525 |
+
return model
|
| 526 |
+
|
| 527 |
+
|
| 528 |
+
def vit_giant2(patch_size=16, num_register_tokens=0, **kwargs):
|
| 529 |
+
"""
|
| 530 |
+
Close to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per head 64
|
| 531 |
+
"""
|
| 532 |
+
model = DinoVisionTransformer(
|
| 533 |
+
patch_size=patch_size,
|
| 534 |
+
embed_dim=1536,
|
| 535 |
+
depth=40,
|
| 536 |
+
num_heads=24,
|
| 537 |
+
mlp_ratio=4,
|
| 538 |
+
block_fn=partial(Block, attn_class=MemEffAttention),
|
| 539 |
+
num_register_tokens=num_register_tokens,
|
| 540 |
+
**kwargs,
|
| 541 |
+
)
|
| 542 |
+
return model
|
models/human_models/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
from .smpl_models import SMPL_Layer, smpl_gendered
|
models/human_models/smpl_models.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
from torch import nn
|
| 3 |
+
import smplx
|
| 4 |
+
import numpy as np
|
| 5 |
+
import pickle
|
| 6 |
+
import os.path as osp
|
| 7 |
+
from configs.paths import smpl_model_path
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class SMPL_Layer(nn.Module):
|
| 11 |
+
def __init__(self, model_path, with_genders = True, **kwargs):
|
| 12 |
+
"""
|
| 13 |
+
Extension of the SMPL Layer with gendered inputs.
|
| 14 |
+
"""
|
| 15 |
+
super().__init__()
|
| 16 |
+
smpl_kwargs = {'create_global_orient': False, 'create_body_pose': False,
|
| 17 |
+
'create_betas': False, 'create_transl': False}
|
| 18 |
+
smpl_kwargs.update(kwargs)
|
| 19 |
+
self.with_genders = with_genders
|
| 20 |
+
if self.with_genders:
|
| 21 |
+
self.layer_n = smplx.create(model_path, 'smpl', gender='neutral', **smpl_kwargs)
|
| 22 |
+
self.layer_m = smplx.create(model_path, 'smpl', gender='male', **smpl_kwargs)
|
| 23 |
+
self.layer_f = smplx.create(model_path, 'smpl', gender='female', **smpl_kwargs)
|
| 24 |
+
self.layers = {'neutral': self.layer_n, 'male': self.layer_m, 'female': self.layer_f}
|
| 25 |
+
else:
|
| 26 |
+
self.layer_n = smplx.create(model_path, 'smpl', gender='neutral', **smpl_kwargs)
|
| 27 |
+
self.layers = {'neutral': self.layer_n}
|
| 28 |
+
|
| 29 |
+
self.vertex_num = 6890
|
| 30 |
+
self.faces = self.layer_n.faces
|
| 31 |
+
|
| 32 |
+
self.body_vertex_idx = np.load(osp.join(model_path, 'smpl', 'body_verts_smpl.npy'))
|
| 33 |
+
self.smpl2h36m_regressor = np.load(osp.join(model_path, 'smpl', 'J_regressor_h36m_correct.npy'))
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def forward_single_gender(self, poses, betas, gender='neutral'):
|
| 37 |
+
bs = poses.shape[0]
|
| 38 |
+
if poses.ndim == 2:
|
| 39 |
+
poses = poses.view(bs, -1, 3)
|
| 40 |
+
|
| 41 |
+
assert poses.shape[1] == 24
|
| 42 |
+
pose_params = {'global_orient': poses[:, :1, :],
|
| 43 |
+
'body_pose': poses[:, 1:, :]}
|
| 44 |
+
|
| 45 |
+
smpl_output = self.layers[gender](betas=betas, **pose_params)
|
| 46 |
+
return smpl_output.vertices, smpl_output.joints
|
| 47 |
+
|
| 48 |
+
def forward(self, poses, betas, genders = None):
|
| 49 |
+
bs = poses.shape[0]
|
| 50 |
+
assert poses.shape[0] == betas.shape[0]
|
| 51 |
+
if genders is None:
|
| 52 |
+
return self.forward_single_gender(poses, betas)
|
| 53 |
+
else:
|
| 54 |
+
assert len(genders) == bs
|
| 55 |
+
assert set(genders) <= {'male', 'female'}
|
| 56 |
+
assert self.with_genders
|
| 57 |
+
|
| 58 |
+
male_idx = [i for i, gender in enumerate(genders) if gender == 'male']
|
| 59 |
+
if len(male_idx) == bs:
|
| 60 |
+
return self.forward_single_gender(poses, betas, gender='male')
|
| 61 |
+
elif len(male_idx) == 0:
|
| 62 |
+
return self.forward_single_gender(poses, betas, gender='female')
|
| 63 |
+
else:
|
| 64 |
+
vertices, joints = self.forward_single_gender(poses, betas, gender='female')
|
| 65 |
+
vertices[male_idx], joints[male_idx] =\
|
| 66 |
+
self.forward_single_gender(poses[male_idx], betas[male_idx], gender='male')
|
| 67 |
+
return vertices, joints
|
| 68 |
+
|
| 69 |
+
smpl_gendered = SMPL_Layer(smpl_model_path, with_genders = True)
|
models/matcher.py
ADDED
|
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Modified from DAB-DETR (https://github.com/IDEA-Research/DAB-DETR)
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
from scipy.optimize import linear_sum_assignment
|
| 5 |
+
from torch import nn
|
| 6 |
+
|
| 7 |
+
from utils.box_ops import box_cxcywh_to_xyxy, generalized_box_iou
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class HungarianMatcher(nn.Module):
|
| 11 |
+
"""This class computes an assignment between the targets and the predictions of the network
|
| 12 |
+
For efficiency reasons, the targets don't include the no_object. Because of this, in general,
|
| 13 |
+
there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions,
|
| 14 |
+
while the others are un-matched (and thus treated as non-objects).
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
def __init__(self,
|
| 18 |
+
cost_conf: float = 1,
|
| 19 |
+
cost_bbox: float = 1,
|
| 20 |
+
cost_giou: float = 1,
|
| 21 |
+
cost_kpts: float = 10,
|
| 22 |
+
j2ds_norm_scale: float = 518,
|
| 23 |
+
):
|
| 24 |
+
"""Creates the matcher
|
| 25 |
+
Params:
|
| 26 |
+
cost_class: This is the relative weight of the classification error in the matching cost
|
| 27 |
+
cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost
|
| 28 |
+
cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost
|
| 29 |
+
"""
|
| 30 |
+
super().__init__()
|
| 31 |
+
self.cost_conf = cost_conf
|
| 32 |
+
self.cost_bbox = cost_bbox
|
| 33 |
+
self.cost_giou = cost_giou
|
| 34 |
+
self.cost_kpts = cost_kpts
|
| 35 |
+
self.j2ds_norm_scale = j2ds_norm_scale
|
| 36 |
+
assert cost_conf != 0 or cost_bbox != 0 or cost_giou != 0 or cost_kpts != 0, "all costs cant be 0"
|
| 37 |
+
|
| 38 |
+
# self.focal_alpha = focal_alpha
|
| 39 |
+
|
| 40 |
+
@torch.no_grad()
|
| 41 |
+
def forward_enc(self, outputs, targets):
|
| 42 |
+
""" Performs the matching
|
| 43 |
+
Params:
|
| 44 |
+
outputs: This is a dict that contains at least these entries:
|
| 45 |
+
"pred_confs": Tensor of flattened confidence score, [total_lens, 1]
|
| 46 |
+
"pred_boxes": Tensor of flattened boxes, [total_lens, 4]
|
| 47 |
+
"lens": num of predictions for each sample in the batch, sum(lens) == total_lens
|
| 48 |
+
targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
|
| 49 |
+
"boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates
|
| 50 |
+
Returns:
|
| 51 |
+
A list of size batch_size, containing tuples of (index_i, index_j) where:
|
| 52 |
+
- index_i is the indices of the selected predictions (in order)
|
| 53 |
+
- index_j is the indices of the corresponding selected targets (in order)
|
| 54 |
+
For each batch element, it holds:
|
| 55 |
+
len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
|
| 56 |
+
"""
|
| 57 |
+
out_conf = outputs['pred_confs']
|
| 58 |
+
out_bbox = outputs["pred_boxes"]
|
| 59 |
+
lens = outputs['lens']
|
| 60 |
+
assert len(lens) == len(targets)
|
| 61 |
+
assert tuple(out_conf.shape) == (sum(lens),1)
|
| 62 |
+
assert tuple(out_bbox.shape) == (sum(lens),4)
|
| 63 |
+
|
| 64 |
+
# Also concat the target labels and boxes
|
| 65 |
+
tgt_bbox = torch.cat([v["boxes"] for v in targets])
|
| 66 |
+
|
| 67 |
+
# Compute the confidence cost.
|
| 68 |
+
alpha = 0.25
|
| 69 |
+
gamma = 2.0
|
| 70 |
+
cost_conf = alpha * ((1 - out_conf) ** gamma) * (-(out_conf + 1e-8).log())
|
| 71 |
+
# cost_conf = -(out_conf+1e-8).log()
|
| 72 |
+
|
| 73 |
+
# Compute the L1 cost between boxes
|
| 74 |
+
cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1)
|
| 75 |
+
|
| 76 |
+
# Compute the giou cost betwen boxes
|
| 77 |
+
# import ipdb; ipdb.set_trace()
|
| 78 |
+
cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox))
|
| 79 |
+
|
| 80 |
+
# Final cost matrix
|
| 81 |
+
C = self.cost_conf*cost_conf + self.cost_bbox * cost_bbox + self.cost_giou * cost_giou
|
| 82 |
+
C = C.cpu()
|
| 83 |
+
|
| 84 |
+
sizes = [len(v["boxes"]) for v in targets]
|
| 85 |
+
idx=0
|
| 86 |
+
indices = []
|
| 87 |
+
for i, c in enumerate(C.split(sizes, -1)):
|
| 88 |
+
indices.append(linear_sum_assignment(c[idx:idx+lens[i]]))
|
| 89 |
+
idx += lens[i]
|
| 90 |
+
return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
|
| 91 |
+
|
| 92 |
+
@torch.no_grad()
|
| 93 |
+
def forward(self, outputs, targets):
|
| 94 |
+
""" Performs the matching
|
| 95 |
+
Params:
|
| 96 |
+
outputs: This is a dict that contains at least these entries:
|
| 97 |
+
"pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
|
| 98 |
+
"pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates
|
| 99 |
+
targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
|
| 100 |
+
"labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
|
| 101 |
+
objects in the target) containing the class labels
|
| 102 |
+
"boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates
|
| 103 |
+
Returns:
|
| 104 |
+
A list of size batch_size, containing tuples of (index_i, index_j) where:
|
| 105 |
+
- index_i is the indices of the selected predictions (in order)
|
| 106 |
+
- index_j is the indices of the corresponding selected targets (in order)
|
| 107 |
+
For each batch element, it holds:
|
| 108 |
+
len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
|
| 109 |
+
"""
|
| 110 |
+
assert outputs['pred_confs'].shape[0]==len(targets)
|
| 111 |
+
bs, num_queries, _ = outputs["pred_confs"].shape
|
| 112 |
+
|
| 113 |
+
# We flatten to compute the cost matrices in a batch
|
| 114 |
+
out_conf = outputs['pred_confs'].flatten(0,1) # [batch_size * num_queries, 1]
|
| 115 |
+
out_bbox = outputs["pred_boxes"].flatten(0,1) # [batch_size * num_queries, 4]
|
| 116 |
+
out_kpts = outputs['pred_j2ds'][...,:22,:].flatten(2).flatten(0,1) / self.j2ds_norm_scale
|
| 117 |
+
|
| 118 |
+
# Also concat the target labels and boxes
|
| 119 |
+
tgt_bbox = torch.cat([v["boxes"] for v in targets])
|
| 120 |
+
tgt_kpts = torch.cat([v['j2ds'][:,:22,:].flatten(1) for v in targets]) / self.j2ds_norm_scale
|
| 121 |
+
tgt_kpts_mask = torch.cat([v['j2ds_mask'][:,:22,:].flatten(1) for v in targets])
|
| 122 |
+
tgt_kpts_vis_cnt = tgt_kpts_mask.sum(-1)
|
| 123 |
+
assert (torch.all(tgt_kpts_vis_cnt))
|
| 124 |
+
|
| 125 |
+
# Compute the confidence cost.
|
| 126 |
+
alpha = 0.25
|
| 127 |
+
gamma = 2.0
|
| 128 |
+
cost_conf = alpha * ((1 - out_conf) ** gamma) * (-(out_conf + 1e-8).log())
|
| 129 |
+
# cost_conf = -(out_conf+1e-8).log()
|
| 130 |
+
|
| 131 |
+
# Compute the L1 cost between boxes
|
| 132 |
+
cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1)
|
| 133 |
+
|
| 134 |
+
# Compute the giou cost betwen boxes
|
| 135 |
+
# import ipdb; ipdb.set_trace()
|
| 136 |
+
cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox), box_cxcywh_to_xyxy(tgt_bbox))
|
| 137 |
+
|
| 138 |
+
# Compute the mean L1 cost between visible joints
|
| 139 |
+
all_dist = torch.abs(out_kpts[:,None,:] - tgt_kpts[None,:,:])
|
| 140 |
+
mean_dist = (all_dist * tgt_kpts_mask[None,:,:]).sum(-1) / tgt_kpts_vis_cnt[None,:]
|
| 141 |
+
cost_kpts = mean_dist
|
| 142 |
+
|
| 143 |
+
# Final cost matrix
|
| 144 |
+
C = self.cost_conf*cost_conf + self.cost_kpts*cost_kpts + self.cost_bbox * cost_bbox + self.cost_giou * cost_giou
|
| 145 |
+
C = C.view(bs, num_queries, -1).cpu()
|
| 146 |
+
|
| 147 |
+
sizes = [len(v["boxes"]) for v in targets]
|
| 148 |
+
indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))]
|
| 149 |
+
return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
def build_matcher(args):
|
| 153 |
+
return HungarianMatcher(
|
| 154 |
+
cost_conf=args.set_cost_conf,
|
| 155 |
+
cost_bbox=args.set_cost_bbox,
|
| 156 |
+
cost_giou=args.set_cost_giou,
|
| 157 |
+
cost_kpts=args.set_cost_kpts,
|
| 158 |
+
j2ds_norm_scale=args.input_size
|
| 159 |
+
)
|
models/position_encoding.py
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Modified from DAB-DETR (https://github.com/IDEA-Research/DAB-DETR)
|
| 2 |
+
"""
|
| 3 |
+
Various positional encodings for the transformer.
|
| 4 |
+
"""
|
| 5 |
+
import math
|
| 6 |
+
import torch
|
| 7 |
+
from torch import nn
|
| 8 |
+
|
| 9 |
+
from utils.misc import NestedTensor
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def position_encoding_xy(pos_x, pos_y, embedding_dim, temperature = 20, scale = 2*math.pi):
|
| 13 |
+
assert embedding_dim % 2 == 0
|
| 14 |
+
assert pos_x.ndim == 1 and pos_y.ndim == 1
|
| 15 |
+
dim_t = torch.arange(embedding_dim // 2, dtype=torch.float32, device=pos_x.device)
|
| 16 |
+
dim_t = temperature ** (2 * (dim_t // 2) / (embedding_dim // 2))
|
| 17 |
+
x_embed = pos_x * scale
|
| 18 |
+
y_embed = pos_y * scale
|
| 19 |
+
pos_x = x_embed[:, None] / dim_t
|
| 20 |
+
pos_y = y_embed[:, None] / dim_t
|
| 21 |
+
pos_x = torch.stack((pos_x[:, 0::2].sin(), pos_x[:, 1::2].cos()), dim=2).flatten(1)
|
| 22 |
+
pos_y = torch.stack((pos_y[:, 0::2].sin(), pos_y[:, 1::2].cos()), dim=2).flatten(1)
|
| 23 |
+
pos = torch.cat([pos_y,pos_x], dim=1)
|
| 24 |
+
return pos
|
| 25 |
+
|
| 26 |
+
class PositionEmbeddingSine(nn.Module):
|
| 27 |
+
"""
|
| 28 |
+
This is a more standard version of the position embedding, very similar to the one
|
| 29 |
+
used by the Attention is all you need paper, generalized to work on images.
|
| 30 |
+
"""
|
| 31 |
+
def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
|
| 32 |
+
super().__init__()
|
| 33 |
+
self.num_pos_feats = num_pos_feats
|
| 34 |
+
self.temperature = temperature
|
| 35 |
+
self.normalize = normalize
|
| 36 |
+
if scale is not None and normalize is False:
|
| 37 |
+
raise ValueError("normalize should be True if scale is passed")
|
| 38 |
+
if scale is None:
|
| 39 |
+
scale = 2 * math.pi
|
| 40 |
+
self.scale = scale
|
| 41 |
+
|
| 42 |
+
def forward(self, tensor_list: NestedTensor):
|
| 43 |
+
x = tensor_list.tensors
|
| 44 |
+
mask = tensor_list.mask
|
| 45 |
+
assert mask is not None
|
| 46 |
+
not_mask = ~mask
|
| 47 |
+
y_embed = not_mask.cumsum(1, dtype=torch.float32)
|
| 48 |
+
x_embed = not_mask.cumsum(2, dtype=torch.float32)
|
| 49 |
+
if self.normalize:
|
| 50 |
+
eps = 1e-6
|
| 51 |
+
y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
|
| 52 |
+
x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale
|
| 53 |
+
|
| 54 |
+
dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
|
| 55 |
+
dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)
|
| 56 |
+
|
| 57 |
+
pos_x = x_embed[:, :, :, None] / dim_t
|
| 58 |
+
pos_y = y_embed[:, :, :, None] / dim_t
|
| 59 |
+
pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
|
| 60 |
+
pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
|
| 61 |
+
pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
|
| 62 |
+
return pos
|
| 63 |
+
|
| 64 |
+
class PositionEmbeddingSineHW(nn.Module):
    """Sinusoidal 2-D position embedding with separate temperatures per axis.

    Same construction as :class:`PositionEmbeddingSine`, but the height and
    width axes use independent temperatures (``temperatureH`` / ``temperatureW``),
    as in DAB-/DINO-style detectors.
    """
    def __init__(self, num_pos_feats=64, temperatureH=10000, temperatureW=10000, normalize=False, scale=None):
        super().__init__()
        self.num_pos_feats = num_pos_feats  # channels per axis; output has 2*num_pos_feats channels
        self.temperatureH = temperatureH
        self.temperatureW = temperatureW
        self.normalize = normalize
        if scale is not None and normalize is False:
            raise ValueError("normalize should be True if scale is passed")
        if scale is None:
            scale = 2 * math.pi
        self.scale = scale

    def forward(self, tensor_list: NestedTensor):
        """Return positional encodings of shape (B, 2*num_pos_feats, H, W)."""
        x = tensor_list.tensors
        mask = tensor_list.mask
        assert mask is not None
        not_mask = ~mask
        # Cumulative count of valid (unpadded) pixels gives each pixel its coordinate.
        y_embed = not_mask.cumsum(1, dtype=torch.float32)
        x_embed = not_mask.cumsum(2, dtype=torch.float32)

        if self.normalize:
            eps = 1e-6
            y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
            x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale

        # Explicit floor division: `//` on float tensors is deprecated in PyTorch;
        # torch.div(..., rounding_mode='floor') yields the same exponent pattern.
        dim_tx = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
        dim_tx = self.temperatureW ** (2 * torch.div(dim_tx, 2, rounding_mode='floor') / self.num_pos_feats)
        pos_x = x_embed[:, :, :, None] / dim_tx

        dim_ty = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
        dim_ty = self.temperatureH ** (2 * torch.div(dim_ty, 2, rounding_mode='floor') / self.num_pos_feats)
        pos_y = y_embed[:, :, :, None] / dim_ty

        # Interleave sin (even channels) and cos (odd channels).
        pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
        pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)

        return pos
|
| 111 |
+
|
| 112 |
+
class PositionEmbeddingLearned(nn.Module):
    """Absolute 2-D position embedding, learned.

    A row table and a column table are learned independently and concatenated
    per pixel, giving an output of shape (B, 2*num_pos_feats, H, W).

    Parameters:
        num_pos_feats: channels per axis (output has twice this many).
        max_size: maximum supported height/width of the feature map
            (previously a hard-coded 50; kept as the default for
            backward compatibility).
    """
    def __init__(self, num_pos_feats=256, max_size=50):
        super().__init__()
        self.row_embed = nn.Embedding(max_size, num_pos_feats)
        self.col_embed = nn.Embedding(max_size, num_pos_feats)
        self.reset_parameters()

    def reset_parameters(self):
        """Re-initialize both embedding tables with U(0, 1)."""
        nn.init.uniform_(self.row_embed.weight)
        nn.init.uniform_(self.col_embed.weight)

    def forward(self, tensor_list: NestedTensor):
        """Return learned positional encodings of shape (B, 2*num_pos_feats, H, W)."""
        x = tensor_list.tensors
        h, w = x.shape[-2:]
        i = torch.arange(w, device=x.device)
        j = torch.arange(h, device=x.device)
        x_emb = self.col_embed(i)  # (w, num_pos_feats)
        y_emb = self.row_embed(j)  # (h, num_pos_feats)
        # Broadcast to (h, w, 2*num_pos_feats), then to channel-first per batch item.
        pos = torch.cat([
            x_emb.unsqueeze(0).repeat(h, 1, 1),
            y_emb.unsqueeze(1).repeat(1, w, 1),
        ], dim=-1).permute(2, 0, 1).unsqueeze(0).repeat(x.shape[0], 1, 1, 1)
        return pos
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
def build_position_encoding(args):
    """Construct the positional-encoding module selected by ``args.position_embedding``.

    ``'v2'``/``'sine'`` builds a normalized :class:`PositionEmbeddingSineHW`
    with the configured per-axis temperatures; ``'v3'``/``'learned'`` builds a
    :class:`PositionEmbeddingLearned`. Any other value raises ``ValueError``.
    """
    N_steps = args.hidden_dim // 2
    kind = args.position_embedding

    if kind in ('v2', 'sine'):
        # TODO find a better way of exposing other arguments
        return PositionEmbeddingSineHW(
            N_steps,
            temperatureH=args.pe_temperatureH,
            temperatureW=args.pe_temperatureW,
            normalize=True
        )

    if kind in ('v3', 'learned'):
        return PositionEmbeddingLearned(N_steps)

    raise ValueError(f"not supported {kind}")
|
models/sat_model.py
ADDED
|
@@ -0,0 +1,767 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Modified from DAB-DETR (https://github.com/IDEA-Research/DAB-DETR)
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
|
| 5 |
+
import math
|
| 6 |
+
from math import tan,pi
|
| 7 |
+
from typing import Dict
|
| 8 |
+
import torch
|
| 9 |
+
import torch.nn.functional as F
|
| 10 |
+
from torch import nn
|
| 11 |
+
from torchvision.transforms import Resize
|
| 12 |
+
import numpy as np
|
| 13 |
+
import time
|
| 14 |
+
import random
|
| 15 |
+
|
| 16 |
+
from utils.misc import (NestedTensor, nested_tensor_from_tensor_list,
|
| 17 |
+
accuracy, get_world_size, interpolate,
|
| 18 |
+
is_dist_avail_and_initialized, inverse_sigmoid)
|
| 19 |
+
|
| 20 |
+
from utils.transforms import rot6d_to_axis_angle, img2patch_flat, img2patch, to_zorder
|
| 21 |
+
from utils.map import build_z_map
|
| 22 |
+
from utils import constants
|
| 23 |
+
from configs.paths import smpl_mean_path
|
| 24 |
+
|
| 25 |
+
from models.encoders import build_encoder
|
| 26 |
+
from .matcher import build_matcher
|
| 27 |
+
from .decoder import build_decoder
|
| 28 |
+
from .position_encoding import position_encoding_xy
|
| 29 |
+
from .criterion import SetCriterion
|
| 30 |
+
from .dn_components import prepare_for_cdn, dn_post_process
|
| 31 |
+
import copy
|
| 32 |
+
|
| 33 |
+
from configs.paths import smpl_model_path
|
| 34 |
+
from models.human_models import SMPL_Layer
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def _get_clones(module, N):
|
| 38 |
+
return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
|
| 39 |
+
|
| 40 |
+
class Model(nn.Module):
|
| 41 |
+
""" One-stage Multi-person Human Mesh Estimation via Scale-adaptive Tokens """
|
| 42 |
+
def __init__(self, encoder, decoder,
|
| 43 |
+
num_queries,
|
| 44 |
+
input_size,
|
| 45 |
+
sat_cfg = {'use_sat': False},
|
| 46 |
+
dn_cfg = {'use_dn': False},
|
| 47 |
+
train_pos_embed = True,
|
| 48 |
+
aux_loss=True,
|
| 49 |
+
iter_update=True,
|
| 50 |
+
query_dim=4,
|
| 51 |
+
bbox_embed_diff_each_layer=True,
|
| 52 |
+
random_refpoints_xy=False,
|
| 53 |
+
num_poses=24,
|
| 54 |
+
dim_shape=10,
|
| 55 |
+
FOV=pi/3
|
| 56 |
+
):
|
| 57 |
+
""" Initializes the model.
|
| 58 |
+
Parameters:
|
| 59 |
+
encoder: torch module of the encoder to be used. See ./encoders.
|
| 60 |
+
decoder: torch module of the decoder architecture. See decoder.py
|
| 61 |
+
num_queries: number of object queries, ie detection slot. This is the maximal number of objects
|
| 62 |
+
DETR can detect in a single image.
|
| 63 |
+
aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used.
|
| 64 |
+
iter_update: iterative update of boxes
|
| 65 |
+
query_dim: query dimension. 2 for point and 4 for box.
|
| 66 |
+
bbox_embed_diff_each_layer: dont share weights of prediction heads. Default for False. (shared weights.)
|
| 67 |
+
random_refpoints_xy: random init the x,y of anchor boxes and freeze them. (It sometimes helps to improve the performance)
|
| 68 |
+
"""
|
| 69 |
+
super().__init__()
|
| 70 |
+
|
| 71 |
+
# ========== Start of common settings =============
|
| 72 |
+
self.input_size = input_size
|
| 73 |
+
hidden_dim = decoder.d_model
|
| 74 |
+
num_dec_layers = decoder.dec_layers
|
| 75 |
+
self.hidden_dim = hidden_dim
|
| 76 |
+
# camera model
|
| 77 |
+
self.focal = input_size/(2*tan(FOV/2))
|
| 78 |
+
self.FOV = FOV
|
| 79 |
+
cam_intrinsics = torch.tensor([[self.focal,0.,self.input_size/2],
|
| 80 |
+
[0.,self.focal,self.input_size/2],
|
| 81 |
+
[0.,0.,1.]])
|
| 82 |
+
self.register_buffer('cam_intrinsics', cam_intrinsics)
|
| 83 |
+
# human model
|
| 84 |
+
self.num_poses = num_poses
|
| 85 |
+
self.dim_shape = dim_shape
|
| 86 |
+
self.human_model = SMPL_Layer(model_path = smpl_model_path, with_genders = False)
|
| 87 |
+
# init params (following multi-hmr)
|
| 88 |
+
smpl_mean_params = np.load(smpl_mean_path, allow_pickle = True)
|
| 89 |
+
self.register_buffer('mean_pose', torch.from_numpy(smpl_mean_params['pose']))
|
| 90 |
+
self.register_buffer('mean_shape', torch.from_numpy(smpl_mean_params['shape']))
|
| 91 |
+
# ========== End of common settings =============
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
# ========== Start of SAT-encoder settings =============
|
| 95 |
+
self.encoder = encoder
|
| 96 |
+
|
| 97 |
+
self.patch_size = encoder.patch_size
|
| 98 |
+
assert self.patch_size == 14
|
| 99 |
+
|
| 100 |
+
self.use_sat = sat_cfg['use_sat']
|
| 101 |
+
self.sat_cfg = sat_cfg
|
| 102 |
+
|
| 103 |
+
if self.use_sat:
|
| 104 |
+
assert sat_cfg['num_lvls'] >= 2
|
| 105 |
+
assert self.input_size % (self.patch_size<<2) == 0
|
| 106 |
+
|
| 107 |
+
self.feature_size = []
|
| 108 |
+
for lvl in range(sat_cfg['num_lvls']):
|
| 109 |
+
patch_size = self.patch_size<<lvl
|
| 110 |
+
self.feature_size.append(self.input_size / patch_size)
|
| 111 |
+
|
| 112 |
+
# build z_order curve
|
| 113 |
+
z_depth = math.ceil(math.log2(self.feature_size[1]))
|
| 114 |
+
z_map, ys, xs = build_z_map(z_depth)
|
| 115 |
+
self.register_buffer('z_order_map', z_map)
|
| 116 |
+
self.register_buffer('y_coords', ys)
|
| 117 |
+
self.register_buffer('x_coords', xs)
|
| 118 |
+
|
| 119 |
+
self.enc_inter_norm = copy.deepcopy(encoder.norm)
|
| 120 |
+
self.scale_head = MLP(encoder.embed_dim, encoder.embed_dim, 2, 4)
|
| 121 |
+
self.encoder_patch_proj = _get_clones(encoder.patch_embed.proj, 2)
|
| 122 |
+
self.encoder_patch_norm = _get_clones(encoder.patch_embed.norm, 2)
|
| 123 |
+
|
| 124 |
+
if sat_cfg['lvl_embed']:
|
| 125 |
+
# same as level_embed in Deformable-DETR
|
| 126 |
+
self.level_embed = nn.Parameter(torch.Tensor(sat_cfg['num_lvls'],hidden_dim))
|
| 127 |
+
nn.init.normal_(self.level_embed)
|
| 128 |
+
else:
|
| 129 |
+
assert self.input_size % self.patch_size == 0
|
| 130 |
+
self.feature_size = [self.input_size // self.patch_size]
|
| 131 |
+
self.encoder_patch_proj = copy.deepcopy(encoder.patch_embed.proj)
|
| 132 |
+
self.encoder_patch_norm = copy.deepcopy(encoder.patch_embed.norm)
|
| 133 |
+
|
| 134 |
+
# cls_token and register tokens
|
| 135 |
+
encoder_cr_token = self.encoder.cls_token.view(1,-1) + self.encoder.pos_embed.float()[:,0].view(1,-1)
|
| 136 |
+
if self.encoder.register_tokens is not None:
|
| 137 |
+
encoder_cr_token = torch.cat([encoder_cr_token, self.encoder.register_tokens.view(self.encoder.num_register_tokens,-1)], dim=0)
|
| 138 |
+
self.encoder_cr_token = nn.Parameter(encoder_cr_token)
|
| 139 |
+
|
| 140 |
+
self.encoder_pos_embeds = nn.Parameter(self.encoder.interpolate_pos_encoding3(self.feature_size[0]).detach())
|
| 141 |
+
if not train_pos_embed:
|
| 142 |
+
self.encoder_pos_embeds.requires_grad = False
|
| 143 |
+
|
| 144 |
+
self.preprocessed_pos_lvl1 = None
|
| 145 |
+
|
| 146 |
+
# delete unwanted params
|
| 147 |
+
del(self.encoder.mask_token)
|
| 148 |
+
del(self.encoder.pos_embed)
|
| 149 |
+
del(self.encoder.patch_embed)
|
| 150 |
+
del(self.encoder.cls_token)
|
| 151 |
+
del(self.encoder.register_tokens)
|
| 152 |
+
# ========== End of SAT-encoder settings =============
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
# ========== Start of decoder settings =============
|
| 157 |
+
self.num_queries = num_queries
|
| 158 |
+
self.decoder = decoder
|
| 159 |
+
|
| 160 |
+
# embed_dim between encoder and decoder can be different
|
| 161 |
+
self.feature_proj = nn.Linear(encoder.embed_dim, hidden_dim)
|
| 162 |
+
|
| 163 |
+
# bbox
|
| 164 |
+
self.bbox_embed_diff_each_layer = bbox_embed_diff_each_layer
|
| 165 |
+
if bbox_embed_diff_each_layer:
|
| 166 |
+
self.bbox_embed = nn.ModuleList([MLP(hidden_dim, hidden_dim, 4, 3) for i in range(num_dec_layers)])
|
| 167 |
+
else:
|
| 168 |
+
self.bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3)
|
| 169 |
+
# poses (use 6D rotation)
|
| 170 |
+
self.pose_head = MLP(hidden_dim, hidden_dim, num_poses*6, 6)
|
| 171 |
+
# shape
|
| 172 |
+
self.shape_head = MLP(hidden_dim, hidden_dim, dim_shape, 5)
|
| 173 |
+
# cam_trans
|
| 174 |
+
self.cam_head = MLP(hidden_dim, hidden_dim//2, 3, 3)
|
| 175 |
+
# confidence score
|
| 176 |
+
self.conf_head = nn.Linear(hidden_dim, 1)
|
| 177 |
+
# init prior_prob setting for focal loss
|
| 178 |
+
prior_prob = 0.01
|
| 179 |
+
bias_value = -math.log((1 - prior_prob) / prior_prob)
|
| 180 |
+
self.conf_head.bias.data = torch.ones(1) * bias_value
|
| 181 |
+
|
| 182 |
+
# for iter update
|
| 183 |
+
self.pose_head = _get_clones(self.pose_head, num_dec_layers)
|
| 184 |
+
self.shape_head = _get_clones(self.shape_head, num_dec_layers)
|
| 185 |
+
|
| 186 |
+
# setting query dim (bboxes as queries)
|
| 187 |
+
self.query_dim = query_dim
|
| 188 |
+
assert query_dim == 4
|
| 189 |
+
self.refpoint_embed = nn.Embedding(num_queries, query_dim)
|
| 190 |
+
self.tgt_embed = nn.Embedding(num_queries, hidden_dim)
|
| 191 |
+
|
| 192 |
+
self.random_refpoints_xy = random_refpoints_xy
|
| 193 |
+
if random_refpoints_xy:
|
| 194 |
+
# import ipdb; ipdb.set_trace()
|
| 195 |
+
self.refpoint_embed.weight.data[:, :2].uniform_(0,1)
|
| 196 |
+
self.refpoint_embed.weight.data[:, :2] = inverse_sigmoid(self.refpoint_embed.weight.data[:, :2])
|
| 197 |
+
self.refpoint_embed.weight.data[:, :2].requires_grad = False
|
| 198 |
+
|
| 199 |
+
self.aux_loss = aux_loss
|
| 200 |
+
self.iter_update = iter_update
|
| 201 |
+
assert iter_update
|
| 202 |
+
if self.iter_update:
|
| 203 |
+
self.decoder.decoder.bbox_embed = self.bbox_embed
|
| 204 |
+
|
| 205 |
+
assert bbox_embed_diff_each_layer
|
| 206 |
+
if bbox_embed_diff_each_layer:
|
| 207 |
+
for bbox_embed in self.bbox_embed:
|
| 208 |
+
nn.init.constant_(bbox_embed.layers[-1].weight.data, 0)
|
| 209 |
+
nn.init.constant_(bbox_embed.layers[-1].bias.data, 0)
|
| 210 |
+
else:
|
| 211 |
+
nn.init.constant_(self.bbox_embed.layers[-1].weight.data, 0)
|
| 212 |
+
nn.init.constant_(self.bbox_embed.layers[-1].bias.data, 0)
|
| 213 |
+
# ========== End of decoder settings =============
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
# for dn training
|
| 217 |
+
self.use_dn = dn_cfg['use_dn']
|
| 218 |
+
self.dn_cfg = dn_cfg
|
| 219 |
+
if self.use_dn:
|
| 220 |
+
assert dn_cfg['dn_number'] > 0
|
| 221 |
+
if dn_cfg['tgt_embed_type'] == 'labels':
|
| 222 |
+
self.dn_enc = nn.Embedding(dn_cfg['dn_labelbook_size'], hidden_dim)
|
| 223 |
+
elif dn_cfg['tgt_embed_type'] == 'params':
|
| 224 |
+
self.dn_enc = nn.Linear(num_poses*3 + dim_shape, hidden_dim)
|
| 225 |
+
else:
|
| 226 |
+
raise NotImplementedError
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
def lvl_pooling(self, tokens):
|
| 230 |
+
assert len(tokens)%4 == 0
|
| 231 |
+
C = tokens.shape[-1]
|
| 232 |
+
return torch.max(tokens.view(-1, 4, C), dim=1)[0]
|
| 233 |
+
|
| 234 |
+
    def get_scale_map(self, x_list):
        """Run the early encoder layers and predict a per-token (confidence, scale) map.

        x_list: per-image token sequences, each of shape (1, 1+R+L, C) where the
            first 1+R tokens are the cls/register tokens and the rest are patch
            tokens.
        Returns:
            scale_map: (sum(L), 2) sigmoid outputs of the scale head over all
                patch tokens of the batch (column semantics defined by the
                downstream thresholds: [:,0] confidence, [:,1] scale).
            cr_token_list: updated cls/register tokens per image.
            x_tokens: the (pre-norm, pre-head) updated patch tokens, concatenated
                over the batch.
        """
        # Advance tokens through encoder layers up to 'get_map_layer', either via
        # dedicated additional blocks or the shared encoder blocks.
        if self.sat_cfg['use_additional_blocks']:
            x_list = self.encoder.forward_additional_layers_list(x_list, end=self.sat_cfg['get_map_layer'], get_feature=False)
        else:
            x_list = self.encoder.forward_specific_layers_list(x_list, end=self.sat_cfg['get_map_layer'], get_feature=False)

        # Split off cls/register tokens; pool all patch tokens into one flat tensor.
        cr_token_list = [x[:, :1 + self.encoder.num_register_tokens, :].squeeze(0) for x in x_list]
        x_tokens = torch.cat([x[:, 1 + self.encoder.num_register_tokens:, :].squeeze(0) for x in x_list], dim=0)
        scale_map = self.scale_head(self.enc_inter_norm(x_tokens)).sigmoid()
        return scale_map, cr_token_list, x_tokens
|
| 244 |
+
|
| 245 |
+
def pad_mask(self, mask):
|
| 246 |
+
mask = mask.reshape(-1,4)
|
| 247 |
+
mask[torch.any(mask, dim=1)] = True
|
| 248 |
+
return mask.flatten()
|
| 249 |
+
|
| 250 |
+
    def forward_encoder(self, samples, targets, use_gt = False):
        """Tokenize the input images and run the (optionally scale-adaptive) encoder.

        samples: list of per-image tensors (C, H, W) — assumed already resized so
            H and W are multiples of the relevant patch size; TODO confirm against
            the dataloader.
        targets: per-image target dicts; only ``tgt['scale_map']`` is read here,
            and only when ``use_gt`` is True.
        use_gt: if True, replace the predicted scale map with the ground-truth one
            (teacher-forcing for SAT training).

        Returns:
            final_features: (sum(L), hidden_dim) projected encoder tokens.
            pos_embeds: matching positional (+ optional level) embeddings.
            token_lens: tokens per image.
            scale_map_dict: visualization info for the predicted scale map
                (None when SAT is disabled).
            sat_dict: per-image token positions/levels/lengths.
        """
        B = len(samples)
        C = self.encoder.embed_dim
        # Shared cls/register tokens, one copy per image.
        cr_token_list = [self.encoder_cr_token]*len(samples)

        if not self.use_sat:
            # ---- Plain single-level tokenization (no scale-adaptive tokens) ----
            # img2token
            lvl0_feature_hw = [(img.shape[1]//self.patch_size, img.shape[2]//self.patch_size) for img in samples]
            lvl0_token_lens = [h*w for (h,w) in lvl0_feature_hw]
            lvl0_img_patches = torch.cat([img2patch_flat(img, patch_size = self.patch_size)\
                                          for img in samples], dim=0)
            lvl0_tokens = self.encoder_patch_norm(self.encoder_patch_proj(lvl0_img_patches).flatten(1))

            # token position information (row-major grid coordinates per image)
            full_grids = torch.meshgrid(torch.arange(self.feature_size[0]), torch.arange(self.feature_size[0]), indexing='ij')
            lvl0_pos_y = torch.cat([full_grids[0][:h,:w].flatten() for (h,w) in lvl0_feature_hw]).to(device = lvl0_tokens.device)
            lvl0_pos_x = torch.cat([full_grids[1][:h,:w].flatten() for (h,w) in lvl0_feature_hw]).to(device = lvl0_tokens.device)

            # pos_embed
            full_pos_embed = self.encoder_pos_embeds
            lvl0_pos_embed = torch.cat([full_pos_embed[:h,:w].flatten(0,1)\
                                        for (h,w) in lvl0_feature_hw], dim=0)
            lvl0_tokens = lvl0_tokens + lvl0_pos_embed

            # convert to list for DINOv2 input
            x_list = [torch.cat([cr, lvl0],dim=0).unsqueeze(0)\
                      for (cr, lvl0) \
                      in zip(cr_token_list, lvl0_tokens.split(lvl0_token_lens))]

            # Normalized token-center coordinates in [0, 1].
            lvl0_pos_y_norm = (lvl0_pos_y.to(dtype=lvl0_tokens.dtype) + 0.5) / self.feature_size[0]
            lvl0_pos_x_norm = (lvl0_pos_x.to(dtype=lvl0_tokens.dtype) + 0.5) / self.feature_size[0]
            # NOTE(review): pos_x_list is filled from the *y* norms and pos_y_list
            # from the *x* norms here, the opposite of the SAT branch below —
            # looks like a swapped x/y bug; confirm against position_encoding_xy
            # and any checkpoint trained with this code before changing.
            pos_x_list = list(lvl0_pos_y_norm.split(lvl0_token_lens))
            pos_y_list = list(lvl0_pos_x_norm.split(lvl0_token_lens))
            scale_map_dict = None
            # also create lvl_list for patch visualization
            lvl_list = [torch.zeros_like(pos,dtype=int) for pos in pos_x_list]

        else:
            # ---- Scale-adaptive tokens: start at level 1 (2x patch size) ----
            lvl1_feature_hw = [(img.shape[1]//(2*self.patch_size), img.shape[2]//(2*self.patch_size)) for img in samples]
            lvl1_token_lens = [h*w for (h,w) in lvl1_feature_hw]

            lvl1_img_patches_28, lvl1_zorders = [], []
            lvl1_pos_y, lvl1_pos_x = [], []
            lvl1_bids = []

            # Extract 28x28 patches per image, reordered along the z-order curve
            # so sibling groups of 4 are contiguous.
            for i, img in enumerate(samples):
                z_patches, z_order, pos_y, pos_x = to_zorder(img2patch(img, patch_size = 2*self.patch_size),
                                                             z_order_map = self.z_order_map,
                                                             y_coords = self.y_coords,
                                                             x_coords = self.x_coords)

                lvl1_img_patches_28.append(z_patches)

                lvl1_zorders.append(z_order)
                lvl1_pos_y.append(pos_y)
                lvl1_pos_x.append(pos_x)
                # batch id per token, needed to avoid cross-image sibling groups
                lvl1_bids.append(torch.full_like(pos_y, i, dtype=torch.int64))

            lvl1_img_patches_28 = torch.cat(lvl1_img_patches_28, dim=0)
            lvl1_zorders = torch.cat(lvl1_zorders, dim=0)
            lvl1_pos_y = torch.cat(lvl1_pos_y, dim=0)
            lvl1_pos_x = torch.cat(lvl1_pos_x, dim=0)
            lvl1_bids = torch.cat(lvl1_bids, dim=0)

            # (L1, 3, 28, 28)
            assert len(lvl1_img_patches_28) == sum(lvl1_token_lens)
            # Downsample 28x28 patches to the encoder's native 14x14 before projecting.
            lvl1_img_patches = F.interpolate(lvl1_img_patches_28, size = (14,14), mode='bilinear', align_corners=False)
            # (L1, 3, 14, 14)
            lvl1_tokens = self.encoder_patch_norm[1](self.encoder_patch_proj[1](lvl1_img_patches).flatten(1))
            # (L1, C)

            assert len(lvl1_pos_y) == len(lvl1_tokens)
            # Level-1 positional embeddings: precomputed cache at eval time,
            # recomputed by bicubic interpolation during training.
            full_pos_embed = self.preprocessed_pos_lvl1 if not self.training\
                             else F.interpolate(self.encoder_pos_embeds.unsqueeze(0).permute(0, 3, 1, 2),
                                                mode="bicubic",
                                                antialias=self.encoder.interpolate_antialias,
                                                size = (int(self.feature_size[1]),int(self.feature_size[1]))).squeeze(0).permute(1,2,0)
            lvl1_pos_embed = torch.cat([full_pos_embed[ys,xs]\
                                        for (ys,xs) in zip(lvl1_pos_y.split(lvl1_token_lens), lvl1_pos_x.split(lvl1_token_lens))], dim=0)
            lvl1_tokens = lvl1_tokens + lvl1_pos_embed

            # get scale map (flattened)
            x_list = [torch.cat([cr, lvl1],dim=0).unsqueeze(0)\
                      for (cr, lvl1) \
                      in zip(cr_token_list, lvl1_tokens.split(lvl1_token_lens))]
            scale_map, updated_cr_list, updated_lvl1 = self.get_scale_map(x_list)
            # for visualization
            scale_map_dict = {'scale_map': scale_map, 'lens': lvl1_token_lens, 'hw': lvl1_feature_hw,
                              'pos_y': lvl1_pos_y, 'pos_x': lvl1_pos_x}

            # get sat masks: a token is subdivided (SAT) when it is confident
            # enough AND its predicted scale is fine enough
            conf_thresh = self.sat_cfg['conf_thresh']
            scale_thresh = self.sat_cfg['scale_thresh']
            if use_gt:
                scale_map = torch.cat([tgt['scale_map'].view(-1,2) for tgt in targets], dim=0)

            lvl1_valid_mask = scale_map[:,0] > conf_thresh
            lvl1_sat_mask = lvl1_valid_mask & (scale_map[:,1] < scale_thresh)

            # prepare sat tokens (lvl0): each selected lvl1 token splits into 4
            lvl0_token_lens = [msk.sum().item()<<2 for msk in lvl1_sat_mask.split(lvl1_token_lens)]
            lvl1_sat_patches_28 = lvl1_img_patches_28[lvl1_sat_mask] # (L0//4, 3, 28, 28)
            lvl0_tokens = self.encoder_patch_norm[0](self.encoder_patch_proj[0](lvl1_sat_patches_28).permute(0, 2, 3, 1).flatten(0,2))

            assert len(lvl0_tokens) == sum(lvl0_token_lens)
            # lvl0 positions: each lvl1 coordinate doubles, then the 4 children
            # get offsets (0,0), (0,1), (1,0), (1,1)
            lvl0_pos_y, lvl0_pos_x = lvl1_pos_y[lvl1_sat_mask], lvl1_pos_x[lvl1_sat_mask]
            lvl0_pos_y = (lvl0_pos_y<<1)[:,None].repeat(1,4).flatten()
            lvl0_pos_x = (lvl0_pos_x<<1)[:,None].repeat(1,4).flatten()
            lvl0_pos_y[2::4] += 1
            lvl0_pos_y[3::4] += 1
            lvl0_pos_x[1::2] += 1
            assert len(lvl0_pos_x) == len(lvl0_tokens)

            # lvl0 pos_embed
            full_pos_embed = self.encoder_pos_embeds
            lvl0_pos_embed = torch.cat([full_pos_embed[ys,xs]\
                                        for (ys,xs) in zip(lvl0_pos_y.split(lvl0_token_lens), lvl0_pos_x.split(lvl0_token_lens))], dim=0)
            lvl0_tokens = lvl0_tokens + lvl0_pos_embed

            # update tokens: run the fresh lvl0 tokens through the same early
            # layers the lvl1 tokens already passed in get_scale_map
            x_list = [torch.cat([cr, lvl0],dim=0).unsqueeze(0)\
                      for (cr, lvl0) \
                      in zip(cr_token_list, lvl0_tokens.split(lvl0_token_lens))]
            x_list = self.encoder.forward_specific_layers_list(x_list, end=self.sat_cfg['get_map_layer'], get_feature=False)
            lvl0_tokens = torch.cat([x[:, 1 + self.encoder.num_register_tokens:, :].squeeze(0) for x in x_list], dim=0)
            assert len(lvl0_pos_x) == len(lvl0_tokens)
            # also update lvl1 and crs
            lvl1_tokens = updated_lvl1
            cr_token_list = updated_cr_list

            if self.sat_cfg['num_lvls'] == 2:
                # drop corresponding lvl1 tokens
                lvl1_keep = ~lvl1_sat_mask
                lvl1_token_lens = [msk.sum().item() for msk in lvl1_keep.split(lvl1_token_lens)]
                lvl1_tokens = lvl1_tokens[lvl1_keep]
                lvl1_pos_y = lvl1_pos_y[lvl1_keep]
                lvl1_pos_x = lvl1_pos_x[lvl1_keep]

                # normalize positions
                lvl0_pos_y_norm = (lvl0_pos_y.to(dtype=lvl0_tokens.dtype) + 0.5) / self.feature_size[0]
                lvl0_pos_x_norm = (lvl0_pos_x.to(dtype=lvl0_tokens.dtype) + 0.5) / self.feature_size[0]
                lvl1_pos_y_norm = (lvl1_pos_y.to(dtype=lvl1_tokens.dtype) + 0.5) / self.feature_size[1]
                lvl1_pos_x_norm = (lvl1_pos_x.to(dtype=lvl1_tokens.dtype) + 0.5) / self.feature_size[1]

                # merge all
                x_list = [torch.cat([cr, lvl0, lvl1]).unsqueeze(0) \
                          for cr, lvl0, lvl1 \
                          in zip(cr_token_list, lvl0_tokens.split(lvl0_token_lens), lvl1_tokens.split(lvl1_token_lens))]
                pos_y_list = [torch.cat([lvl0, lvl1]) \
                              for lvl0, lvl1 \
                              in zip(lvl0_pos_y_norm.split(lvl0_token_lens), lvl1_pos_y_norm.split(lvl1_token_lens))]
                pos_x_list = [torch.cat([lvl0, lvl1]) \
                              for lvl0, lvl1 \
                              in zip(lvl0_pos_x_norm.split(lvl0_token_lens), lvl1_pos_x_norm.split(lvl1_token_lens))]
                lvl_list = [torch.cat([torch.zeros_like(lvl0, dtype=int), torch.ones_like(lvl1, dtype=int)]) \
                            for lvl0, lvl1 \
                            in zip(lvl0_pos_x_norm.split(lvl0_token_lens), lvl1_pos_x_norm.split(lvl1_token_lens))]

            else:
                # prune lvl1 correspond to lvl0
                # pad_mask keeps whole sibling groups valid; unconfident groups
                # are pooled up to coarser levels below
                lvl1_valid_mask = self.pad_mask(lvl1_valid_mask)
                lvl1_keep = lvl1_valid_mask & (~lvl1_sat_mask)
                lvl1_to_lvl2 = ~lvl1_valid_mask

                token_lvls = [lvl0_tokens, lvl1_tokens]
                token_lens_lvls = [lvl0_token_lens, lvl1_token_lens]
                pos_y_lvls = [lvl0_pos_y, lvl1_pos_y]
                pos_x_lvls = [lvl0_pos_x, lvl1_pos_x]

                to_next_lvl = lvl1_to_lvl2
                keep = lvl1_keep
                lvl_zorders = lvl1_zorders
                lvl_bids = lvl1_bids
                # Sentinel padding so the z-order sibling check below can index
                # past the end without bounds errors.
                pad_vals = torch.full((3,), -1, dtype=lvl_zorders.dtype, device=lvl_zorders.device)
                for lvl in range(self.sat_cfg['num_lvls']-2):
                    if to_next_lvl.sum() == 0:
                        break
                    # pool each group of 4 siblings into one coarser token
                    next_tokens = self.lvl_pooling(token_lvls[-1][to_next_lvl])
                    next_pos_y = pos_y_lvls[-1][to_next_lvl][::4]>>1
                    next_pos_x = pos_x_lvls[-1][to_next_lvl][::4]>>1
                    next_lens = [msk.sum().item()//4 for msk in to_next_lvl.split(token_lens_lvls[-1])]

                    # retain only the kept tokens at the current finest level
                    token_lvls[-1] = token_lvls[-1][keep]
                    pos_y_lvls[-1] = pos_y_lvls[-1][keep]
                    pos_x_lvls[-1] = pos_x_lvls[-1][keep]
                    token_lens_lvls[-1] = [msk.sum().item() for msk in keep.split(token_lens_lvls[-1])]

                    token_lvls.append(next_tokens)
                    token_lens_lvls.append(next_lens)
                    pos_y_lvls.append(next_pos_y)
                    pos_x_lvls.append(next_pos_x)

                    if lvl < self.sat_cfg['num_lvls']-3:
                        # advance z-order bookkeeping one level up
                        lvl_zorders = lvl_zorders[to_next_lvl][::4]>>2
                        lvl_bids = lvl_bids[to_next_lvl][::4]

                        # a group of 4 can be pooled further only if its z-order
                        # codes are consecutive AND all 4 come from the same image
                        z_starts_idx = torch.where((lvl_zorders&3)==0)[0]
                        padded_z = torch.cat([lvl_zorders, pad_vals])
                        padded_bids = torch.cat([lvl_bids, pad_vals])
                        valids = (padded_z[z_starts_idx] + 3 == padded_z[z_starts_idx + 3]) & (padded_bids[z_starts_idx] == padded_bids[z_starts_idx + 3])
                        valid_starts = z_starts_idx[valids]

                        to_next_lvl = torch.zeros_like(lvl_zorders, dtype=bool)
                        to_next_lvl[valid_starts] = True
                        to_next_lvl[valid_starts+1] = True
                        to_next_lvl[valid_starts+2] = True
                        to_next_lvl[valid_starts+3] = True

                        keep = ~to_next_lvl

                norm_pos_y_lvls = [(pos_y.to(dtype=lvl0_tokens.dtype) + 0.5)/self.feature_size[i] for i, pos_y in enumerate(pos_y_lvls)]
                norm_pos_x_lvls = [(pos_x.to(dtype=lvl0_tokens.dtype) + 0.5)/self.feature_size[i] for i, pos_x in enumerate(pos_x_lvls)]

                # merge all levels per image: [cls/reg | lvl0 | lvl1 | ...]
                x_list = [torch.cat([cr, *lvls]).unsqueeze(0) \
                          for cr, *lvls \
                          in zip(cr_token_list, *[tokens.split(lens) for (tokens, lens) in zip(token_lvls, token_lens_lvls)])]
                pos_y_list = [torch.cat([*lvls]) \
                              for lvls \
                              in zip(*[pos_y.split(lens) for (pos_y, lens) in zip(norm_pos_y_lvls, token_lens_lvls)])]
                pos_x_list = [torch.cat([*lvls]) \
                              for lvls \
                              in zip(*[pos_x.split(lens) for (pos_x, lens) in zip(norm_pos_x_lvls, token_lens_lvls)])]
                lvl_list = [torch.cat([torch.full_like(lvl, i, dtype=torch.int64) for i, lvl in enumerate(lvls)]) \
                            for lvls \
                            in zip(*[pos_x.split(lens) for (pos_x, lens) in zip(norm_pos_x_lvls, token_lens_lvls)])]

        # Run the remaining encoder layers (all layers when SAT is off).
        start = self.sat_cfg['get_map_layer'] if self.use_sat else 0
        _, final_feature_list = self.encoder.forward_specific_layers_list(x_list, start = start, norm=True)

        # proj
        token_lens = [feature.shape[1] for feature in final_feature_list]
        final_features = self.feature_proj(torch.cat(final_feature_list,dim=1).squeeze(0)) # (sum(L), C)
        assert tuple(final_features.shape) == (sum(token_lens), self.hidden_dim)
        # positional encoding
        pos_embeds = position_encoding_xy(torch.cat(pos_x_list,dim=0), torch.cat(pos_y_list,dim=0), embedding_dim=self.hidden_dim)
        if self.use_sat and self.sat_cfg['lvl_embed']:
            lvl_embeds = self.level_embed[torch.cat(lvl_list,dim=0)]
            pos_embeds = pos_embeds + lvl_embeds

        sat_dict = {'pos_y': pos_y_list, 'pos_x': pos_x_list, 'lvl': lvl_list,
                    'lens': token_lens}

        return final_features, pos_embeds, token_lens, scale_map_dict, sat_dict
|
| 510 |
+
|
| 511 |
+
def process_smpl(self, poses, shapes, cam_xys, cam_intrinsics, detach_j3ds = False):
    """Decode per-query SMPL parameters into camera-space meshes/joints and 2D projections.

    Args:
        poses: (bs, num_queries, num_poses*3) axis-angle SMPL poses.
        shapes: (bs, num_queries, 10) SMPL shape coefficients (betas).
        cam_xys: (bs, num_queries, 3) raw camera parameters; the last channel
            is squashed into a scale, the first two become x/y translation.
        cam_intrinsics: camera intrinsic matrices; assumed (bs, 1, 3, 3) so
            they broadcast over queries — TODO confirm against caller.
        detach_j3ds: if True, stop gradients through the 3D joints when
            projecting to 2D (only the translation stays differentiable).

    Returns:
        verts_cam: (bs, num_queries, num_verts, 3) vertices in camera space.
        j3ds_cam: (bs, num_queries, num_joints, 3) joints in camera space.
        j2ds_img: (bs, num_queries, num_joints, 2) projected 2D joints.
        depths: (bs, num_queries, 2) root depth and focal-normalized depth.
        transl: (bs, num_queries, 3) per-query camera translation.
    """
    bs, num_queries, _ = poses.shape # should be (bs,n_q,num_poses*3)

    # flatten batch and query dims so the SMPL layer sees one flat batch
    poses = poses.flatten(0,1) # (bs*n_q,24*3)
    shapes = shapes.flatten(0,1) # (bs*n_q,10)
    verts, joints = self.human_model(poses=poses,
                                     betas=shapes)
    num_verts = verts.shape[1]
    num_joints = joints.shape[1]
    verts = verts.reshape(bs,num_queries,num_verts,3)
    joints = joints.reshape(bs,num_queries,num_joints,3)

    # apply cam_trans and projection:
    # sigmoid keeps the scale in (0, 2); +1e-6 guards the divisions below
    scale = 2*cam_xys[:,:,2:].sigmoid() + 1e-6
    t_xy = cam_xys[:,:,:2]/scale
    # depth from the weak-perspective-style scale and the fixed focal length
    t_z = (2*self.focal)/(scale*self.input_size) # (bs,num_queries,1)
    transl = torch.cat([t_xy,t_z],dim=2)[:,:,None,:] # (bs,nq,1,3)

    verts_cam = verts + transl # only for visualization and evaluation
    j3ds_cam = joints + transl

    # perspective projection; optionally detach the 3D joints so the 2D
    # reprojection loss does not back-propagate into the pose/shape heads
    if detach_j3ds:
        j2ds_homo = torch.matmul(joints.detach() + transl, cam_intrinsics.transpose(2,3))
    else:
        j2ds_homo = torch.matmul(j3ds_cam, cam_intrinsics.transpose(2,3))
    # homogeneous divide; +1e-6 avoids division by zero at the camera plane
    j2ds_img = (j2ds_homo[..., :2] / (j2ds_homo[..., 2, None] + 1e-6)).reshape(bs,num_queries,num_joints,2)

    # root-joint (index 0) depth, raw and normalized by the focal length
    depths = j3ds_cam[:,:,0,2:] # (bs, n_q, 1)
    depths = torch.cat([depths, depths/self.focal], dim=-1) # (bs, n_q, 2)

    return verts_cam, j3ds_cam, j2ds_img, depths, transl.flatten(2)
|
| 545 |
+
def forward(self, samples: NestedTensor, targets, sat_use_gt = False, detach_j3ds = False):
    """Full detection + human-mesh-recovery forward pass.

    Args:
        samples: NestedTensor (or list / plain tensor) of batched images,
            shape [batch_size x 3 x H x W]; samples.mask marks padded pixels.
        targets: list of per-image target dicts; 'img_size' is read here,
            the rest is consumed by the encoder and the denoising branch.
        sat_use_gt: forwarded to the encoder — use ground truth instead of
            predictions for scale-adaptive token (SAT) selection.
        detach_j3ds: stop gradients through 3D joints when projecting to 2D
            (see process_smpl).

    Returns:
        dict with last-decoder-layer predictions ('pred_poses', 'pred_betas',
        'pred_boxes', 'pred_confs', 'pred_j3ds', 'pred_j2ds', 'pred_verts',
        'pred_intrinsics', 'pred_depths', 'pred_transl'), plus optionally
        'aux_outputs' (training with aux loss), 'enc_outputs' and 'sat'
        (when SAT is enabled) and 'dn_meta' (training with denoising).
    """

    assert isinstance(samples, (list, torch.Tensor))

    # cache the interpolated level-1 positional embedding for inference;
    # invalidate during training since the embeddings may be updated
    if self.training:
        self.preprocessed_pos_lvl1 = None

    elif self.preprocessed_pos_lvl1 is None and self.use_sat:
        self.preprocessed_pos_lvl1 = F.interpolate(self.encoder_pos_embeds.unsqueeze(0).permute(0, 3, 1, 2),
                                        mode="bicubic",
                                        antialias=self.encoder.interpolate_antialias,
                                        size = (int(self.feature_size[1]),int(self.feature_size[1]))).squeeze(0).permute(1,2,0)


    bs = len(targets)

    # get cam_intrinsics: scale the principal point by each image's valid
    # (unpadded) fraction of the square input
    img_size = torch.stack([t['img_size'].flip(0) for t in targets])
    valid_ratio = img_size/self.input_size

    cam_intrinsics = self.cam_intrinsics.repeat(bs, 1, 1, 1)
    cam_intrinsics[...,:2,2] = cam_intrinsics[...,:2,2] * valid_ratio[:, None, :]


    final_features, pos_embeds, token_lens, scale_map_dict, sat_dict\
        = self.forward_encoder(samples, targets, use_gt = sat_use_gt)

    # default dab-detr pipeline: learned content queries + anchor boxes
    embedweight = (self.refpoint_embed.weight).unsqueeze(0).repeat(bs,1,1)
    tgt = (self.tgt_embed.weight).unsqueeze(0).repeat(bs,1,1)

    # contrastive denoising (DINO-style): prepend noised GT queries and
    # build the attention mask that isolates them from the matching queries
    if self.training and self.use_dn:
        input_query_tgt, input_query_bbox, attn_mask, dn_meta =\
            prepare_for_cdn(targets = targets, dn_cfg = self.dn_cfg,
                            num_queries = self.num_queries, hidden_dim = self.hidden_dim, dn_enc = self.dn_enc)
        tgt = torch.cat([input_query_tgt, tgt], dim=1)
        embedweight = torch.cat([input_query_bbox, embedweight], dim=1)
    else:
        attn_mask = None

    tgt_lens = [tgt.shape[1]]*bs

    hs, reference = self.decoder(memory=final_features, memory_lens=token_lens,
                                 tgt=tgt.flatten(0,1), tgt_lens=tgt_lens,
                                 refpoint_embed=embedweight.flatten(0,1),
                                 pos_embed=pos_embeds,
                                 self_attn_mask = attn_mask)

    # per-layer box refinement: predict deltas in logit space on top of the
    # (inverse-sigmoided) reference points, then squash back to [0, 1]
    reference_before_sigmoid = inverse_sigmoid(reference)
    outputs_coords = []
    for lvl in range(hs.shape[0]):
        tmp = self.bbox_embed[lvl](hs[lvl])
        tmp[..., :self.query_dim] += reference_before_sigmoid[lvl]
        outputs_coord = tmp.sigmoid()
        outputs_coords.append(outputs_coord)
    pred_boxes = torch.stack(outputs_coords)

    outputs_poses = []
    outputs_shapes = []
    outputs_confs = []
    outputs_j3ds = []
    outputs_j2ds = []
    outputs_depths = []

    # shape of hs: (lvl, bs, num_queries, dim)
    # SMPL params are predicted iteratively: each decoder layer adds a
    # residual on top of the mean pose/shape
    outputs_pose_6d = self.mean_pose.view(1, 1, -1)
    outputs_shape = self.mean_shape.view(1, 1, -1)
    for lvl in range(hs.shape[0]):

        outputs_pose_6d = outputs_pose_6d + self.pose_head[lvl](hs[lvl])
        outputs_shape = outputs_shape + self.shape_head[lvl](hs[lvl])

        # during training every layer is decoded (for aux losses);
        # at eval time only the last layer is decoded to save compute
        if self.training or lvl == hs.shape[0] - 1:
            outputs_pose = rot6d_to_axis_angle(outputs_pose_6d)

            outputs_conf = self.conf_head(hs[lvl]).sigmoid()

            # cam
            cam_xys = self.cam_head(hs[lvl])

            outputs_vert, outputs_j3d, outputs_j2d, depth, transl\
                = self.process_smpl(poses = outputs_pose,
                                    shapes = outputs_shape,
                                    cam_xys = cam_xys,
                                    cam_intrinsics = cam_intrinsics,
                                    detach_j3ds = detach_j3ds)

            outputs_poses.append(outputs_pose)
            outputs_shapes.append(outputs_shape)
            outputs_confs.append(outputs_conf)
            # outputs_verts.append(outputs_vert)
            outputs_j3ds.append(outputs_j3d)
            outputs_j2ds.append(outputs_j2d)
            outputs_depths.append(depth)

    pred_poses = torch.stack(outputs_poses)
    pred_betas = torch.stack(outputs_shapes)
    pred_confs = torch.stack(outputs_confs)
    # verts/transl/intrinsics are kept only for the last decoded layer
    pred_verts = outputs_vert
    pred_transl = transl
    pred_intrinsics = cam_intrinsics
    pred_j3ds = torch.stack(outputs_j3ds)
    pred_j2ds = torch.stack(outputs_j2ds)
    pred_depths = torch.stack(outputs_depths)



    # split denoising queries back out of the prediction tensors
    # NOTE(review): self.training is a bool, so the `> 0` is redundant
    if self.training > 0 and self.use_dn:
        pred_poses, pred_betas,\
        pred_boxes, pred_confs,\
        pred_j3ds, pred_j2ds, pred_depths,\
        pred_verts, pred_transl =\
            dn_post_process(pred_poses, pred_betas,
                            pred_boxes, pred_confs,
                            pred_j3ds, pred_j2ds, pred_depths,
                            pred_verts, pred_transl,
                            dn_meta, self.aux_loss, self._set_aux_loss)


    # main outputs come from the last decoder layer (index -1)
    out = {'pred_poses': pred_poses[-1], 'pred_betas': pred_betas[-1],
           'pred_boxes': pred_boxes[-1], 'pred_confs': pred_confs[-1],
           'pred_j3ds': pred_j3ds[-1], 'pred_j2ds': pred_j2ds[-1],
           'pred_verts': pred_verts, 'pred_intrinsics': pred_intrinsics,
           'pred_depths': pred_depths[-1], 'pred_transl': pred_transl}

    if self.aux_loss and self.training:
        out['aux_outputs'] = self._set_aux_loss(pred_poses, pred_betas,
                                                pred_boxes, pred_confs,
                                                pred_j3ds, pred_j2ds, pred_depths)

    if self.use_sat:
        out['enc_outputs'] = scale_map_dict

        out['sat'] = sat_dict

    if self.training > 0 and self.use_dn:
        out['dn_meta'] = dn_meta

    return out
+
@torch.jit.unused
|
| 700 |
+
def _set_aux_loss(self, pred_poses, pred_betas, pred_boxes,
|
| 701 |
+
pred_confs, pred_j3ds,
|
| 702 |
+
pred_j2ds, pred_depths):
|
| 703 |
+
# this is a workaround to make torchscript happy, as torchscript
|
| 704 |
+
# doesn't support dictionary with non-homogeneous values, such
|
| 705 |
+
# as a dict having both a Tensor and a list.
|
| 706 |
+
return [{'pred_poses': a, 'pred_betas': b,
|
| 707 |
+
'pred_boxes': c, 'pred_confs': d,
|
| 708 |
+
'pred_j3ds': e, 'pred_j2ds': f, 'pred_depths': g}
|
| 709 |
+
for a, b, c, d, e, f, g in zip(pred_poses[:-1], pred_betas[:-1],
|
| 710 |
+
pred_boxes[:-1], pred_confs[:-1], pred_j3ds[:-1], pred_j2ds[:-1], pred_depths[:-1])]
|
| 711 |
+
|
| 712 |
+
|
| 713 |
+
|
| 714 |
+
class MLP(nn.Module):
    """Very simple multi-layer perceptron (also called FFN): a stack of
    Linear layers with ReLU between them and no activation after the last."""

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.num_layers = num_layers
        # dims = [in, hidden, ..., hidden, out]; consecutive pairs define layers
        dims = [input_dim] + [hidden_dim] * (num_layers - 1) + [output_dim]
        self.layers = nn.ModuleList(
            nn.Linear(d_in, d_out) for d_in, d_out in zip(dims[:-1], dims[1:])
        )

    def forward(self, x):
        last = self.num_layers - 1
        for idx, layer in enumerate(self.layers):
            x = layer(x)
            if idx < last:  # no activation on the output layer
                x = F.relu(x)
        return x
|
| 729 |
+
def build_sat_model(args, set_criterion=True):
    """Build the SAT model and, optionally, its training criterion.

    Args:
        args: config namespace with model hyper-parameters, loss weights
            (args.weight_dict), loss names (args.losses) and the sat/dn configs.
        set_criterion: when False, skip matcher/criterion construction
            (e.g. for inference) and return (model, None).

    Returns:
        (model, criterion) — criterion is None when set_criterion is False.
    """
    model = Model(
        build_encoder(args),
        build_decoder(args),
        num_queries=args.num_queries,
        input_size=args.input_size,
        sat_cfg=args.sat_cfg,
        dn_cfg=args.dn_cfg,
        train_pos_embed=getattr(args, 'train_pos_embed', True)
    )

    if not set_criterion:
        return model, None

    matcher = build_matcher(args)
    weight_dict = args.weight_dict  # NOTE: extended in place below
    losses = args.losses

    # denoising losses reuse the base weights under '<key>_dn' names
    if args.dn_cfg['use_dn']:
        weight_dict.update({f'{k}_dn': v for k, v in weight_dict.items()})

    # per-decoder-layer auxiliary losses ('<key>.<i>');
    # dn keys are intentionally already in weight_dict and get replicated too
    aux_weight_dict = {}
    for layer_idx in range(args.dec_layers - 1):
        aux_weight_dict.update({f'{k}.{layer_idx}': v for k, v in weight_dict.items()})
    weight_dict.update(aux_weight_dict)

    # the SAT scale-map confidence loss defaults to the query-confidence weight
    if args.sat_cfg['use_sat']:
        if 'map_confs' not in weight_dict:
            weight_dict.update({'map_confs': weight_dict['confs']})
        # weight_dict.update({'map_scales': })

    criterion = SetCriterion(matcher, weight_dict, losses = losses, j2ds_norm_scale = args.input_size)
    return model, criterion
|
requirements.txt
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
accelerate==0.26.1
|
| 2 |
+
chumpy
|
| 3 |
+
smplx
|
| 4 |
+
opencv-python
|
| 5 |
+
trimesh
|
| 6 |
+
tensorboard
|
| 7 |
+
scipy
|
| 8 |
+
pyrender==0.1.45
|
| 9 |
+
joblib
|
| 10 |
+
termcolor
|
| 11 |
+
transformers
|
| 12 |
+
matplotlib
|
| 13 |
+
scikit-learn
|
utils/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
|
utils/box_ops.py
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
|
| 2 |
+
"""
|
| 3 |
+
Utilities for bounding box manipulation and GIoU.
|
| 4 |
+
"""
|
| 5 |
+
import torch, os
|
| 6 |
+
from torchvision.ops.boxes import box_area
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def box_cxcywh_to_xyxy(x):
    """Convert boxes from (cx, cy, w, h) to (x0, y0, x1, y1) along the last dim."""
    cx, cy, w, h = x.unbind(-1)
    half_w = 0.5 * w
    half_h = 0.5 * h
    corners = [cx - half_w, cy - half_h, cx + half_w, cy + half_h]
    return torch.stack(corners, dim=-1)
|
| 16 |
+
def box_xyxy_to_cxcywh(x):
    """Convert boxes from (x0, y0, x1, y1) to (cx, cy, w, h) along the last dim."""
    x0, y0, x1, y1 = x.unbind(-1)
    center_and_size = [(x0 + x1) / 2, (y0 + y1) / 2, x1 - x0, y1 - y0]
    return torch.stack(center_and_size, dim=-1)
|
| 23 |
+
# modified from torchvision to also return the union
|
| 24 |
+
def box_iou(boxes1, boxes2):
    """Pairwise IoU between two sets of xyxy boxes (modified from torchvision
    to also return the union).

    Returns:
        iou: [N, M] IoU matrix; the union gets a 1e-6 epsilon for stability.
        union: [N, M] union areas.
    """
    area1 = box_area(boxes1)
    area2 = box_area(boxes2)

    top_left = torch.max(boxes1[:, None, :2], boxes2[:, :2])      # [N,M,2]
    bottom_right = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # [N,M,2]

    # clamp to zero for non-overlapping pairs
    inter_wh = (bottom_right - top_left).clamp(min=0)             # [N,M,2]
    inter = inter_wh[..., 0] * inter_wh[..., 1]                   # [N,M]

    union = area1[:, None] + area2 - inter
    return inter / (union + 1e-6), union
|
| 41 |
+
def generalized_box_iou(boxes1, boxes2):
    """
    Generalized IoU from https://giou.stanford.edu/

    The boxes should be in [x0, y0, x1, y1] format.

    Returns a [N, M] pairwise matrix, where N = len(boxes1)
    and M = len(boxes2).
    """
    # degenerate boxes give inf / nan results, so do an early check
    assert (boxes1[:, 2:] >= boxes1[:, :2]).all()
    assert (boxes2[:, 2:] >= boxes2[:, :2]).all()
    iou, union = box_iou(boxes1, boxes2)

    # smallest enclosing box of every pair
    enclose_tl = torch.min(boxes1[:, None, :2], boxes2[:, :2])
    enclose_br = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])
    enclose_wh = (enclose_br - enclose_tl).clamp(min=0)  # [N,M,2]
    enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1]

    # GIoU = IoU - (enclosing area not covered by the union) / enclosing area
    return iou - (enclose_area - union) / (enclose_area + 1e-6)
| 67 |
+
|
| 68 |
+
# modified from torchvision to also return the union
|
| 69 |
+
def box_iou_pairwise(boxes1, boxes2):
    """Element-wise IoU between two equal-length sets of xyxy boxes
    (modified from torchvision to also return the union).

    Args:
        boxes1, boxes2: [N, 4] boxes, matched index by index.

    Returns:
        iou: [N] IoU per pair; union gets a 1e-6 epsilon, consistent with
            box_iou, so degenerate (zero-area) pairs yield 0 instead of NaN.
        union: [N] union areas.
    """
    area1 = box_area(boxes1)
    area2 = box_area(boxes2)

    lt = torch.max(boxes1[:, :2], boxes2[:, :2])  # [N,2]
    rb = torch.min(boxes1[:, 2:], boxes2[:, 2:])  # [N,2]

    wh = (rb - lt).clamp(min=0)  # [N,2]
    inter = wh[:, 0] * wh[:, 1]  # [N]

    union = area1 + area2 - inter

    # epsilon matches box_iou above and avoids 0/0 for degenerate boxes
    iou = inter / (union + 1e-6)
    return iou, union
|
| 85 |
+
def generalized_box_iou_pairwise(boxes1, boxes2):
    """
    Generalized IoU from https://giou.stanford.edu/, element-wise.

    Input:
        - boxes1, boxes2: [N, 4] boxes in [x0, y0, x1, y1] format, matched
          index by index.
    Output:
        - giou: [N] GIoU per pair.
    """
    # degenerate boxes give inf / nan results, so do an early check
    assert (boxes1[:, 2:] >= boxes1[:, :2]).all()
    assert (boxes2[:, 2:] >= boxes2[:, :2]).all()
    assert boxes1.shape == boxes2.shape
    iou, union = box_iou_pairwise(boxes1, boxes2)  # [N]

    # smallest enclosing box of each pair
    lt = torch.min(boxes1[:, :2], boxes2[:, :2])
    rb = torch.max(boxes1[:, 2:], boxes2[:, 2:])

    wh = (rb - lt).clamp(min=0)  # [N,2]
    area = wh[:, 0] * wh[:, 1]

    # epsilon matches generalized_box_iou above and avoids 0/0 when both
    # boxes are degenerate (zero enclosing area)
    return iou - (area - union) / (area + 1e-6)
| 109 |
+
def masks_to_boxes(masks):
    """Compute the bounding boxes around the provided masks.

    The masks should be in format [N, H, W] where N is the number of masks,
    (H, W) are the spatial dimensions.

    Returns a [N, 4] tensor, with the boxes in xyxy format.
    """
    if masks.numel() == 0:
        return torch.zeros((0, 4), device=masks.device)

    h, w = masks.shape[-2:]

    # build coordinate grids on the masks' device so this also works for
    # CUDA tensors; indexing="ij" keeps the (row=y, col=x) layout and
    # silences the torch.meshgrid deprecation warning
    y = torch.arange(0, h, dtype=torch.float, device=masks.device)
    x = torch.arange(0, w, dtype=torch.float, device=masks.device)
    y, x = torch.meshgrid(y, x, indexing="ij")

    x_mask = (masks * x.unsqueeze(0))
    x_max = x_mask.flatten(1).max(-1)[0]
    # fill background with a large value so it never wins the min
    x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0]

    y_mask = (masks * y.unsqueeze(0))
    y_max = y_mask.flatten(1).max(-1)[0]
    y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0]

    return torch.stack([x_min, y_min, x_max, y_max], 1)
| 135 |
+
if __name__ == '__main__':
    # quick smoke test of the pairwise IoU helper.
    # Replaced the leftover `import ipdb; ipdb.set_trace()` debugger hook —
    # ipdb is not a declared dependency and a breakpoint must not ship.
    x = torch.rand(5, 4)
    y = torch.rand(3, 4)
    iou, union = box_iou(x, y)
    print('iou shape:', tuple(iou.shape), 'union shape:', tuple(union.shape))
|