Spaces: AniMerPlus
Build error
Commit · 1966925
Parent(s): 7a90a56
Init Commit
This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full change set.
- .gitattributes +4 -0
- README.md +11 -0
- amr/__init__.py +0 -0
- amr/__pycache__/__init__.cpython-310.pyc +0 -0
- amr/configs/__init__.py +112 -0
- amr/configs/__pycache__/__init__.cpython-310.pyc +0 -0
- amr/datasets/__init__.py +0 -0
- amr/datasets/__pycache__/__init__.cpython-310.pyc +0 -0
- amr/datasets/__pycache__/utils.cpython-310.pyc +0 -0
- amr/datasets/__pycache__/vitdet_dataset.cpython-310.pyc +0 -0
- amr/datasets/utils.py +1098 -0
- amr/datasets/vitdet_dataset.py +102 -0
- amr/models/__init__.py +26 -0
- amr/models/__pycache__/__init__.cpython-310.pyc +0 -0
- amr/models/__pycache__/__init__.cpython-312.pyc +0 -0
- amr/models/__pycache__/__init__.cpython-39.pyc +0 -0
- amr/models/__pycache__/amr.cpython-310.pyc +0 -0
- amr/models/__pycache__/amr.cpython-312.pyc +0 -0
- amr/models/__pycache__/amr.cpython-39.pyc +0 -0
- amr/models/__pycache__/animerpp.cpython-310.pyc +0 -0
- amr/models/__pycache__/animerpp.cpython-312.pyc +0 -0
- amr/models/__pycache__/aves_hmr.cpython-310.pyc +0 -0
- amr/models/__pycache__/aves_hmr.cpython-312.pyc +0 -0
- amr/models/__pycache__/aves_warapper.cpython-310.pyc +0 -0
- amr/models/__pycache__/aves_warapper.cpython-312.pyc +0 -0
- amr/models/__pycache__/discriminator.cpython-310.pyc +0 -0
- amr/models/__pycache__/discriminator.cpython-312.pyc +0 -0
- amr/models/__pycache__/discriminator.cpython-39.pyc +0 -0
- amr/models/__pycache__/dyamr.cpython-310.pyc +0 -0
- amr/models/__pycache__/dyamr.cpython-312.pyc +0 -0
- amr/models/__pycache__/losses.cpython-310.pyc +0 -0
- amr/models/__pycache__/losses.cpython-312.pyc +0 -0
- amr/models/__pycache__/losses.cpython-39.pyc +0 -0
- amr/models/__pycache__/predictor.cpython-310.pyc +0 -0
- amr/models/__pycache__/smal_warapper.cpython-310.pyc +0 -0
- amr/models/__pycache__/smal_warapper.cpython-312.pyc +0 -0
- amr/models/__pycache__/smal_warapper.cpython-39.pyc +0 -0
- amr/models/__pycache__/smooth_amr.cpython-310.pyc +0 -0
- amr/models/__pycache__/smooth_amr.cpython-312.pyc +0 -0
- amr/models/__pycache__/smooth_netv2.cpython-310.pyc +0 -0
- amr/models/__pycache__/stamr.cpython-310.pyc +0 -0
- amr/models/__pycache__/stamr.cpython-312.pyc +0 -0
- amr/models/animerpp.py +508 -0
- amr/models/aves_warapper.py +136 -0
- amr/models/backbones/__init__.py +9 -0
- amr/models/backbones/__pycache__/__init__.cpython-310.pyc +0 -0
- amr/models/backbones/__pycache__/__init__.cpython-312.pyc +0 -0
- amr/models/backbones/__pycache__/__init__.cpython-39.pyc +0 -0
- amr/models/backbones/__pycache__/rope_deit.cpython-310.pyc +0 -0
- amr/models/backbones/__pycache__/vit.cpython-310.pyc +0 -0
.gitattributes
CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.jpeg filter=lfs diff=lfs merge=lfs -text
+*.JPEG filter=lfs diff=lfs merge=lfs -text
README.md
CHANGED
@@ -1,5 +1,6 @@
 ---
 title: AniMerPlus
+<<<<<<< HEAD
 emoji: 📊
 colorFrom: gray
 colorTo: indigo
@@ -7,6 +8,16 @@ sdk: gradio
 sdk_version: 5.39.0
 app_file: app.py
 pinned: false
+=======
+emoji: 🔥
+colorFrom: pink
+colorTo: blue
+sdk: gradio
+sdk_version: 5.1.0
+app_file: app.py
+pinned: false
+license: mit
+>>>>>>> Init Commit
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
amr/__init__.py
ADDED
File without changes

amr/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (146 Bytes).
amr/configs/__init__.py
ADDED
@@ -0,0 +1,112 @@
import os
from typing import Dict
from yacs.config import CfgNode as CN

CACHE_DIR_AniMer = "./_DATA"


def to_lower(x: Dict) -> Dict:
    """
    Convert all dictionary keys to lowercase
    Args:
        x (dict): Input dictionary
    Returns:
        dict: Output dictionary with all keys converted to lowercase
    """
    return {k.lower(): v for k, v in x.items()}


_C = CN(new_allowed=True)

_C.GENERAL = CN(new_allowed=True)
_C.GENERAL.RESUME = True
_C.GENERAL.TIME_TO_RUN = 3300
_C.GENERAL.VAL_STEPS = 100
_C.GENERAL.LOG_STEPS = 100
_C.GENERAL.CHECKPOINT_STEPS = 20000
_C.GENERAL.CHECKPOINT_DIR = "checkpoints"
_C.GENERAL.SUMMARY_DIR = "tensorboard"
_C.GENERAL.NUM_GPUS = 1
_C.GENERAL.NUM_WORKERS = 4
_C.GENERAL.MIXED_PRECISION = True
_C.GENERAL.ALLOW_CUDA = True
_C.GENERAL.PIN_MEMORY = False
_C.GENERAL.DISTRIBUTED = False
_C.GENERAL.LOCAL_RANK = 0
_C.GENERAL.USE_SYNCBN = False
_C.GENERAL.WORLD_SIZE = 1

_C.TRAIN = CN(new_allowed=True)
_C.TRAIN.NUM_EPOCHS = 100
_C.TRAIN.SHUFFLE = True
_C.TRAIN.WARMUP = False
_C.TRAIN.NORMALIZE_PER_IMAGE = False
_C.TRAIN.CLIP_GRAD = False
_C.TRAIN.CLIP_GRAD_VALUE = 1.0
_C.LOSS_WEIGHTS = CN(new_allowed=True)

_C.DATASETS = CN(new_allowed=True)

_C.MODEL = CN(new_allowed=True)
_C.MODEL.IMAGE_SIZE = 224

_C.EXTRA = CN(new_allowed=True)
_C.EXTRA.FOCAL_LENGTH = 5000

_C.DATASETS.CONFIG = CN(new_allowed=True)
_C.DATASETS.CONFIG.SCALE_FACTOR = 0.3
_C.DATASETS.CONFIG.ROT_FACTOR = 30
_C.DATASETS.CONFIG.TRANS_FACTOR = 0.02
_C.DATASETS.CONFIG.COLOR_SCALE = 0.2
_C.DATASETS.CONFIG.ROT_AUG_RATE = 0.6
_C.DATASETS.CONFIG.TRANS_AUG_RATE = 0.5
_C.DATASETS.CONFIG.DO_FLIP = False
_C.DATASETS.CONFIG.FLIP_AUG_RATE = 0.5
_C.DATASETS.CONFIG.EXTREME_CROP_AUG_RATE = 0.10


def default_config() -> CN:
    """
    Get a yacs CfgNode object with the default config values.
    """
    # Return a clone so that the defaults will not be altered
    # This is for the "local variable" use pattern
    return _C.clone()


def dataset_config() -> CN:
    """
    Get dataset config file
    Returns:
        CfgNode: Dataset config as a yacs CfgNode object.
    """
    cfg = CN(new_allowed=True)
    config_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'datasets_tar.yaml')
    cfg.merge_from_file(config_file)
    cfg.freeze()
    return cfg


def get_config(config_file: str, merge: bool = True, update_cachedir: bool = False) -> CN:
    """
    Read a config file and optionally merge it with the default config file.
    Args:
        config_file (str): Path to config file.
        merge (bool): Whether to merge with the default config or not.
    Returns:
        CfgNode: Config as a yacs CfgNode object.
    """
    if merge:
        cfg = default_config()
    else:
        cfg = CN(new_allowed=True)
    cfg.merge_from_file(config_file)

    if update_cachedir:
        def update_path(path: str) -> str:
            if os.path.isabs(path):
                return path
            return os.path.join(CACHE_DIR_AniMer, path)

    cfg.freeze()
    return cfg
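Note (illustration only, not part of the commit): a minimal sketch of how this config module is typically consumed elsewhere in the repo; the YAML path below is hypothetical and only the functions defined above are assumed.

from amr.configs import default_config, get_config

cfg = default_config()                      # fresh clone of the defaults defined above
print(cfg.MODEL.IMAGE_SIZE)                 # 224
# Merge a (hypothetical) experiment YAML on top of the defaults.
exp_cfg = get_config("path/to/experiment.yaml", merge=True)
print(exp_cfg.EXTRA.FOCAL_LENGTH)           # 5000 unless overridden by the YAML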
amr/configs/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (3.21 kB).

amr/datasets/__init__.py
ADDED
File without changes

amr/datasets/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (155 Bytes).

amr/datasets/__pycache__/utils.cpython-310.pyc
ADDED
Binary file (36 kB).

amr/datasets/__pycache__/vitdet_dataset.cpython-310.pyc
ADDED
Binary file (3.19 kB).
amr/datasets/utils.py
ADDED
@@ -0,0 +1,1098 @@
"""
Parts of the code are taken or adapted from
https://github.com/mkocabas/EpipolarPose/blob/master/lib/utils/img_utils.py
"""
import torch
import numpy as np
from skimage.transform import rotate, resize
from skimage.filters import gaussian
import random
import cv2
from typing import List, Dict, Tuple
from yacs.config import CfgNode
from typing import Union


def expand_to_aspect_ratio(input_shape, target_aspect_ratio=None):
    """Increase the size of the bounding box to match the target shape."""
    if target_aspect_ratio is None:
        return input_shape

    try:
        w, h = input_shape
    except (ValueError, TypeError):
        return input_shape

    w_t, h_t = target_aspect_ratio
    if h / w < h_t / w_t:
        h_new = max(w * h_t / w_t, h)
        w_new = w
    else:
        h_new = h
        w_new = max(h * w_t / h_t, w)
    if h_new < h or w_new < w:
        breakpoint()
    return np.array([w_new, h_new])


def do_augmentation(aug_config: CfgNode) -> Tuple:
    """
    Compute random augmentation parameters.
    Args:
        aug_config (CfgNode): Config containing augmentation parameters.
    Returns:
        scale (float): Box rescaling factor.
        rot (float): Random image rotation.
        do_flip (bool): Whether to flip image or not.
        do_extreme_crop (bool): Whether to apply extreme cropping (as proposed in EFT).
        color_scale (List): Color rescaling factor
        tx (float): Random translation along the x axis.
        ty (float): Random translation along the y axis.
    """

    tx = np.clip(np.random.randn(), -1.0, 1.0) * aug_config.TRANS_FACTOR
    ty = np.clip(np.random.randn(), -1.0, 1.0) * aug_config.TRANS_FACTOR
    scale = np.clip(np.random.randn(), -1.0, 1.0) * aug_config.SCALE_FACTOR + 1.0
    rot = np.clip(np.random.randn(), -2.0,
                  2.0) * aug_config.ROT_FACTOR if random.random() <= aug_config.ROT_AUG_RATE else 0
    do_flip = aug_config.DO_FLIP and random.random() <= aug_config.FLIP_AUG_RATE
    do_extreme_crop = random.random() <= aug_config.EXTREME_CROP_AUG_RATE
    extreme_crop_lvl = aug_config.get('EXTREME_CROP_AUG_LEVEL', 0)
    # extreme_crop_lvl = 0
    c_up = 1.0 + aug_config.COLOR_SCALE
    c_low = 1.0 - aug_config.COLOR_SCALE
    color_scale = [random.uniform(c_low, c_up), random.uniform(c_low, c_up), random.uniform(c_low, c_up)]
    return scale, rot, do_flip, do_extreme_crop, extreme_crop_lvl, color_scale, tx, ty
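Note (illustration only, not part of the committed file): do_augmentation only reads the augmentation keys, so it can be exercised with an ad-hoc CfgNode whose keys mirror the DATASETS.CONFIG defaults from amr/configs/__init__.py.

from yacs.config import CfgNode as CN

aug_cfg = CN()
aug_cfg.TRANS_FACTOR = 0.02
aug_cfg.SCALE_FACTOR = 0.3
aug_cfg.ROT_FACTOR = 30
aug_cfg.ROT_AUG_RATE = 0.6
aug_cfg.DO_FLIP = False
aug_cfg.FLIP_AUG_RATE = 0.5
aug_cfg.EXTREME_CROP_AUG_RATE = 0.10
aug_cfg.COLOR_SCALE = 0.2
scale, rot, do_flip, do_extreme_crop, lvl, color_scale, tx, ty = do_augmentation(aug_cfg)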
def rotate_2d(pt_2d: np.array, rot_rad: float) -> np.array:
    """
    Rotate a 2D point on the x-y plane.
    Args:
        pt_2d (np.array): Input 2D point with shape (2,).
        rot_rad (float): Rotation angle
    Returns:
        np.array: Rotated 2D point.
    """
    x = pt_2d[0]
    y = pt_2d[1]
    sn, cs = np.sin(rot_rad), np.cos(rot_rad)
    xx = x * cs - y * sn
    yy = x * sn + y * cs
    return np.array([xx, yy], dtype=np.float32)


def gen_trans_from_patch_cv(c_x: float, c_y: float,
                            src_width: float, src_height: float,
                            dst_width: float, dst_height: float,
                            scale: float, rot: float) -> np.array:
    """
    Create transformation matrix for the bounding box crop.
    Args:
        c_x (float): Bounding box center x coordinate in the original image.
        c_y (float): Bounding box center y coordinate in the original image.
        src_width (float): Bounding box width.
        src_height (float): Bounding box height.
        dst_width (float): Output box width.
        dst_height (float): Output box height.
        scale (float): Rescaling factor for the bounding box (augmentation).
        rot (float): Random rotation applied to the box.
    Returns:
        trans (np.array): Target geometric transformation.
    """
    # augment size with scale
    src_w = src_width * scale
    src_h = src_height * scale
    src_center = np.zeros(2)
    src_center[0] = c_x
    src_center[1] = c_y
    # augment rotation
    rot_rad = np.pi * rot / 180
    src_downdir = rotate_2d(np.array([0, src_h * 0.5], dtype=np.float32), rot_rad)
    src_rightdir = rotate_2d(np.array([src_w * 0.5, 0], dtype=np.float32), rot_rad)

    dst_w = dst_width
    dst_h = dst_height
    dst_center = np.array([dst_w * 0.5, dst_h * 0.5], dtype=np.float32)
    dst_downdir = np.array([0, dst_h * 0.5], dtype=np.float32)
    dst_rightdir = np.array([dst_w * 0.5, 0], dtype=np.float32)

    src = np.zeros((3, 2), dtype=np.float32)
    src[0, :] = src_center
    src[1, :] = src_center + src_downdir
    src[2, :] = src_center + src_rightdir

    dst = np.zeros((3, 2), dtype=np.float32)
    dst[0, :] = dst_center
    dst[1, :] = dst_center + dst_downdir
    dst[2, :] = dst_center + dst_rightdir

    trans = cv2.getAffineTransform(np.float32(src), np.float32(dst))

    return trans


def trans_point2d(pt_2d: np.array, trans: np.array):
    """
    Transform a 2D point using translation matrix trans.
    Args:
        pt_2d (np.array): Input 2D point with shape (2,).
        trans (np.array): Transformation matrix.
    Returns:
        np.array: Transformed 2D point.
    """
    src_pt = np.array([pt_2d[0], pt_2d[1], 1.]).T
    dst_pt = np.dot(trans, src_pt)
    return dst_pt[0:2]


def get_transform(center, scale, res, rot=0):
    """Generate transformation matrix."""
    """Taken from PARE: https://github.com/mkocabas/PARE/blob/6e0caca86c6ab49ff80014b661350958e5b72fd8/pare/utils/image_utils.py"""
    h = 200 * scale
    t = np.zeros((3, 3))
    t[0, 0] = float(res[1]) / h
    t[1, 1] = float(res[0]) / h
    t[0, 2] = res[1] * (-float(center[0]) / h + .5)
    t[1, 2] = res[0] * (-float(center[1]) / h + .5)
    t[2, 2] = 1
    if not rot == 0:
        rot = -rot  # To match direction of rotation from cropping
        rot_mat = np.zeros((3, 3))
        rot_rad = rot * np.pi / 180
        sn, cs = np.sin(rot_rad), np.cos(rot_rad)
        rot_mat[0, :2] = [cs, -sn]
        rot_mat[1, :2] = [sn, cs]
        rot_mat[2, 2] = 1
        # Need to rotate around center
        t_mat = np.eye(3)
        t_mat[0, 2] = -res[1] / 2
        t_mat[1, 2] = -res[0] / 2
        t_inv = t_mat.copy()
        t_inv[:2, 2] *= -1
        t = np.dot(t_inv, np.dot(rot_mat, np.dot(t_mat, t)))
    return t


def transform(pt, center, scale, res, invert=0, rot=0, as_int=True):
    """Transform pixel location to different reference."""
    """Taken from PARE: https://github.com/mkocabas/PARE/blob/6e0caca86c6ab49ff80014b661350958e5b72fd8/pare/utils/image_utils.py"""
    t = get_transform(center, scale, res, rot=rot)
    if invert:
        t = np.linalg.inv(t)
    new_pt = np.array([pt[0] - 1, pt[1] - 1, 1.]).T
    new_pt = np.dot(t, new_pt)
    if as_int:
        new_pt = new_pt.astype(int)
    return new_pt[:2] + 1


def crop_img(img, ul, br, border_mode=cv2.BORDER_CONSTANT, border_value=0):
    c_x = (ul[0] + br[0]) / 2
    c_y = (ul[1] + br[1]) / 2
    bb_width = patch_width = br[0] - ul[0]
    bb_height = patch_height = br[1] - ul[1]
    trans = gen_trans_from_patch_cv(c_x, c_y, bb_width, bb_height, patch_width, patch_height, 1.0, 0)
    img_patch = cv2.warpAffine(img, trans, (int(patch_width), int(patch_height)),
                               flags=cv2.INTER_LINEAR,
                               borderMode=border_mode,
                               borderValue=border_value
                               )

    # Force borderValue=cv2.BORDER_CONSTANT for alpha channel
    if (img.shape[2] == 4) and (border_mode != cv2.BORDER_CONSTANT):
        img_patch[:, :, 3] = cv2.warpAffine(img[:, :, 3], trans, (int(patch_width), int(patch_height)),
                                            flags=cv2.INTER_LINEAR,
                                            borderMode=cv2.BORDER_CONSTANT,
                                            )

    return img_patch


def generate_image_patch_skimage(img: np.array, c_x: float, c_y: float,
                                 bb_width: float, bb_height: float,
                                 patch_width: float, patch_height: float,
                                 do_flip: bool, scale: float, rot: float,
                                 border_mode=cv2.BORDER_CONSTANT, border_value=0) -> Tuple[np.array, np.array]:
    """
    Crop image according to the supplied bounding box.
    Args:
        img (np.array): Input image of shape (H, W, 3)
        c_x (float): Bounding box center x coordinate in the original image.
        c_y (float): Bounding box center y coordinate in the original image.
        bb_width (float): Bounding box width.
        bb_height (float): Bounding box height.
        patch_width (float): Output box width.
        patch_height (float): Output box height.
        do_flip (bool): Whether to flip image or not.
        scale (float): Rescaling factor for the bounding box (augmentation).
        rot (float): Random rotation applied to the box.
    Returns:
        img_patch (np.array): Cropped image patch of shape (patch_height, patch_height, 3)
        trans (np.array): Transformation matrix.
    """

    img_height, img_width, img_channels = img.shape
    if do_flip:
        img = img[:, ::-1, :]
        c_x = img_width - c_x - 1

    trans = gen_trans_from_patch_cv(c_x, c_y, bb_width, bb_height, patch_width, patch_height, scale, rot)

    # img_patch = cv2.warpAffine(img, trans, (int(patch_width), int(patch_height)), flags=cv2.INTER_LINEAR)

    # skimage
    center = np.zeros(2)
    center[0] = c_x
    center[1] = c_y
    res = np.zeros(2)
    res[0] = patch_width
    res[1] = patch_height
    # assumes bb_width = bb_height
    # assumes patch_width = patch_height
    assert bb_width == bb_height, f'{bb_width=} != {bb_height=}'
    assert patch_width == patch_height, f'{patch_width=} != {patch_height=}'
    scale1 = scale * bb_width / 200.

    # Upper left point
    ul = np.array(transform([1, 1], center, scale1, res, invert=1, as_int=False)) - 1
    # Bottom right point
    br = np.array(transform([res[0] + 1,
                             res[1] + 1], center, scale1, res, invert=1, as_int=False)) - 1

    # Padding so that when rotated proper amount of context is included
    try:
        pad = int(np.linalg.norm(br - ul) / 2 - float(br[1] - ul[1]) / 2) + 1
    except:
        breakpoint()
    if not rot == 0:
        ul -= pad
        br += pad

    if False:
        # Old way of cropping image
        ul_int = ul.astype(int)
        br_int = br.astype(int)
        new_shape = [br_int[1] - ul_int[1], br_int[0] - ul_int[0]]
        if len(img.shape) > 2:
            new_shape += [img.shape[2]]
        new_img = np.zeros(new_shape)

        # Range to fill new array
        new_x = max(0, -ul_int[0]), min(br_int[0], len(img[0])) - ul_int[0]
        new_y = max(0, -ul_int[1]), min(br_int[1], len(img)) - ul_int[1]
        # Range to sample from original image
        old_x = max(0, ul_int[0]), min(len(img[0]), br_int[0])
        old_y = max(0, ul_int[1]), min(len(img), br_int[1])
        new_img[new_y[0]:new_y[1], new_x[0]:new_x[1]] = img[old_y[0]:old_y[1],
                                                            old_x[0]:old_x[1]]

    # New way of cropping image
    new_img = crop_img(img, ul, br, border_mode=border_mode, border_value=border_value).astype(np.float32)

    # print(f'{new_img.shape=}')
    # print(f'{new_img1.shape=}')
    # print(f'{np.allclose(new_img, new_img1)=}')
    # print(f'{img.dtype=}')

    if not rot == 0:
        # Remove padding

        new_img = rotate(new_img, rot)  # scipy.misc.imrotate(new_img, rot)
        new_img = new_img[pad:-pad, pad:-pad]

    if new_img.shape[0] < 1 or new_img.shape[1] < 1:
        print(f'{img.shape=}')
        print(f'{new_img.shape=}')
        print(f'{ul=}')
        print(f'{br=}')
        print(f'{pad=}')
        print(f'{rot=}')

        breakpoint()

    # resize image
    new_img = resize(new_img, res)  # scipy.misc.imresize(new_img, res)

    new_img = np.clip(new_img, 0, 255).astype(np.uint8)

    return new_img, trans
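Note (illustration only, not part of the committed file): the 2x3 matrix returned by gen_trans_from_patch_cv maps original-image pixel coordinates into patch coordinates, so the box centre always lands at the patch centre. With a 200x200 box centred at (320, 240) and a 256x256 output patch, no scaling or rotation:

trans = gen_trans_from_patch_cv(c_x=320, c_y=240, src_width=200, src_height=200,
                                dst_width=256, dst_height=256, scale=1.0, rot=0.0)
print(trans_point2d(np.array([320, 240]), trans))   # -> [128. 128.]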
def generate_image_patch_cv2(img: np.array, c_x: float, c_y: float,
                             bb_width: float, bb_height: float,
                             patch_width: float, patch_height: float,
                             do_flip: bool, scale: float, rot: float,
                             border_mode=cv2.BORDER_CONSTANT, border_value=0) -> Tuple[np.array, np.array]:
    """
    Crop the input image and return the crop and the corresponding transformation matrix.
    Args:
        img (np.array): Input image of shape (H, W, 3)
        c_x (float): Bounding box center x coordinate in the original image.
        c_y (float): Bounding box center y coordinate in the original image.
        bb_width (float): Bounding box width.
        bb_height (float): Bounding box height.
        patch_width (float): Output box width.
        patch_height (float): Output box height.
        do_flip (bool): Whether to flip image or not.
        scale (float): Rescaling factor for the bounding box (augmentation).
        rot (float): Random rotation applied to the box.
    Returns:
        img_patch (np.array): Cropped image patch of shape (patch_height, patch_height, 3)
        trans (np.array): Transformation matrix.
    """

    img_height, img_width, img_channels = img.shape
    if do_flip:
        img = img[:, ::-1, :]
        c_x = img_width - c_x - 1

    trans = gen_trans_from_patch_cv(c_x, c_y, bb_width, bb_height, patch_width, patch_height, scale, rot)

    img_patch = cv2.warpAffine(img, trans, (int(patch_width), int(patch_height)),
                               flags=cv2.INTER_LINEAR,
                               borderMode=border_mode,
                               borderValue=border_value,
                               )
    # Force borderValue=cv2.BORDER_CONSTANT for alpha channel
    if (img.shape[2] == 4) and (border_mode != cv2.BORDER_CONSTANT):
        img_patch[:, :, 3] = cv2.warpAffine(img[:, :, 3], trans, (int(patch_width), int(patch_height)),
                                            flags=cv2.INTER_LINEAR,
                                            borderMode=cv2.BORDER_CONSTANT,
                                            )

    is_border = np.all(img_patch[:, :, :-1] == border_value, axis=2) if img_patch.shape[2] == 4 else np.all(img_patch == 0, axis=2)
    img_border_mask = ~is_border
    return img_patch, trans, img_border_mask


def convert_cvimg_to_tensor(cvimg: np.array):
    """
    Convert image from HWC to CHW format.
    Args:
        cvimg (np.array): Image of shape (H, W, 3) as loaded by OpenCV.
    Returns:
        np.array: Output image of shape (3, H, W).
    """
    # from h,w,c(OpenCV) to c,h,w
    img = cvimg.copy()
    img = np.transpose(img, (2, 0, 1))
    # from int to float
    img = img.astype(np.float32)
    return img


def fliplr_params(smal_params: Dict, has_smal_params: Dict) -> Tuple[Dict, Dict]:
    """
    Flip SMAL parameters when flipping the image.
    Args:
        smal_params (Dict): SMAL parameter annotations.
        has_smal_params (Dict): Whether SMAL annotations are valid.
    Returns:
        Dict, Dict: Flipped SMAL parameters and valid flags.
    """
    global_orient = smal_params['global_orient'].copy()
    pose = smal_params['pose'].copy()
    betas = smal_params['betas'].copy()
    translation = smal_params['translation'].copy()
    has_global_orient = has_smal_params['global_orient'].copy()
    has_pose = has_smal_params['pose'].copy()
    has_betas = has_smal_params['betas'].copy()
    has_translation = has_smal_params['translation'].copy()

    global_orient[1::3] *= -1
    global_orient[2::3] *= -1
    pose[1::3] *= -1
    pose[2::3] *= -1
    translation[1::3] *= -1
    translation[2::3] *= -1

    smal_params = {'global_orient': global_orient.astype(np.float32),
                   'pose': pose.astype(np.float32),
                   'betas': betas.astype(np.float32),
                   'translation': translation.astype(np.float32)
                   }

    has_smal_params = {'global_orient': has_global_orient,
                       'pose': has_pose,
                       'betas': has_betas,
                       'translation': has_translation
                       }

    return smal_params, has_smal_params


def fliplr_keypoints(joints: np.array, width: float, flip_permutation: List[int]) -> np.array:
    """
    Flip 2D or 3D keypoints.
    Args:
        joints (np.array): Array of shape (N, 3) or (N, 4) containing 2D or 3D keypoint locations and confidence.
        flip_permutation (List): Permutation to apply after flipping.
    Returns:
        np.array: Flipped 2D or 3D keypoints with shape (N, 3) or (N, 4) respectively.
    """
    joints = joints.copy()
    # Flip horizontal
    joints[:, 0] = width - joints[:, 0] - 1
    joints = joints[flip_permutation, :]

    return joints


def keypoint_3d_processing(keypoints_3d: np.array, rot: float, filp: bool) -> np.array:
    """
    Process 3D keypoints (rotation/flipping).
    Args:
        keypoints_3d (np.array): Input array of shape (N, 4) containing the 3D keypoints and confidence.
        rot (float): Random rotation applied to the keypoints.
    Returns:
        np.array: Transformed 3D keypoints with shape (N, 4).
    """
    # in-plane rotation
    rot_mat = np.eye(3, dtype=np.float32)
    if not rot == 0:
        rot_rad = -rot * np.pi / 180
        sn, cs = np.sin(rot_rad), np.cos(rot_rad)
        rot_mat[0, :2] = [cs, -sn]
        rot_mat[1, :2] = [sn, cs]
    keypoints_3d[:, :-1] = np.einsum('ij,kj->ki', rot_mat, keypoints_3d[:, :-1])
    # flip the x coordinates
    if filp:
        keypoints_3d = fliplr_keypoints(keypoints_3d, list(range(len(keypoints_3d))))
    keypoints_3d = keypoints_3d.astype('float32')
    return keypoints_3d


def rot_aa(aa: np.array, rot: float) -> np.array:
    """
    Rotate axis angle parameters.
    Args:
        aa (np.array): Axis-angle vector of shape (3,).
        rot (np.array): Rotation angle in degrees.
    Returns:
        np.array: Rotated axis-angle vector.
    """
    # pose parameters
    R = np.array([[np.cos(np.deg2rad(-rot)), -np.sin(np.deg2rad(-rot)), 0],
                  [np.sin(np.deg2rad(-rot)), np.cos(np.deg2rad(-rot)), 0],
                  [0, 0, 1]])
    # find the rotation of the hand in camera frame
    per_rdg, _ = cv2.Rodrigues(aa)
    # apply the global rotation to the global orientation
    resrot, _ = cv2.Rodrigues(np.dot(R, per_rdg))
    aa = (resrot.T)[0]
    return aa.astype(np.float32)


def smal_param_processing(smal_params: Dict, has_smal_params: Dict, rot: float, do_flip: bool) -> Tuple[Dict, Dict]:
    """
    Apply random augmentations to the SMAL parameters.
    Args:
        smal_params (Dict): SMAL parameter annotations.
        has_smal_params (Dict): Whether SMAL annotations are valid.
        rot (float): Random rotation applied to the keypoints.
        do_flip (bool): Whether to flip keypoints or not.
    Returns:
        Dict, Dict: Transformed SMAL parameters and valid flags.
    """
    if do_flip:
        smal_params, has_smal_params = fliplr_params(smal_params, has_smal_params)
    smal_params['global_orient'] = rot_aa(smal_params['global_orient'], rot)
    # camera location is not change, so the translation is not change too.
    # smal_params['transl'] = np.dot(np.array([[np.cos(np.deg2rad(-rot)), -np.sin(np.deg2rad(-rot)), 0],
    #                                          [np.sin(np.deg2rad(-rot)), np.cos(np.deg2rad(-rot)), 0],
    #                                          [0, 0, 1]], dtype=np.float32), smal_params['transl'])
    return smal_params, has_smal_params
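Note (illustration only, not part of the committed file): smal_param_processing rotates only global_orient, via rot_aa. For instance, rotating a zero orientation by 90 degrees yields a pure rotation about the camera z-axis:

aa = rot_aa(np.zeros(3, dtype=np.float32), rot=90)
print(aa)   # approximately [0., 0., -1.5708]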
def get_example(img_path: Union[str, np.ndarray], center_x: float, center_y: float,
                width: float, height: float,
                keypoints_2d: np.array, keypoints_3d: np.array,
                smal_params: Dict, has_smal_params: Dict,
                patch_width: int, patch_height: int,
                mean: np.array, std: np.array,
                do_augment: bool, augm_config: CfgNode,
                is_bgr: bool = True,
                use_skimage_antialias: bool = False,
                border_mode: int = cv2.BORDER_CONSTANT,
                return_trans: bool = False,) -> Tuple:
    """
    Get an example from the dataset and (possibly) apply random augmentations.
    Args:
        img_path (str): Image filename
        center_x (float): Bounding box center x coordinate in the original image.
        center_y (float): Bounding box center y coordinate in the original image.
        width (float): Bounding box width.
        height (float): Bounding box height.
        keypoints_2d (np.array): Array with shape (N,3) containing the 2D keypoints in the original image coordinates.
        keypoints_3d (np.array): Array with shape (N,4) containing the 3D keypoints.
        smal_params (Dict): SMAL parameter annotations.
        has_smal_params (Dict): Whether SMAL annotations are valid.
        patch_width (float): Output box width.
        patch_height (float): Output box height.
        mean (np.array): Array of shape (3,) containing the mean for normalizing the input image.
        std (np.array): Array of shape (3,) containing the std for normalizing the input image.
        do_augment (bool): Whether to apply data augmentation or not.
        aug_config (CfgNode): Config containing augmentation parameters.
    Returns:
        return img_patch, keypoints_2d, keypoints_3d, smal_params, has_smal_params, img_size
        img_patch (np.array): Cropped image patch of shape (3, patch_height, patch_height)
        keypoints_2d (np.array): Array with shape (N,3) containing the transformed 2D keypoints.
        keypoints_3d (np.array): Array with shape (N,4) containing the transformed 3D keypoints.
        smal_params (Dict): Transformed SMAL parameters.
        has_smal_params (Dict): Valid flag for transformed SMAL parameters.
        img_size (np.array): Image size of the original image.
    """
    if isinstance(img_path, str):
        # 1. load image
        cvimg = cv2.imread(img_path, cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION)
        if not isinstance(cvimg, np.ndarray):
            raise IOError("Fail to read %s" % img_path)
    elif isinstance(img_path, np.ndarray):
        cvimg = img_path
    else:
        raise TypeError('img_path must be either a string or a numpy array')
    img_height, img_width, img_channels = cvimg.shape

    img_size = np.array([img_height, img_width], dtype=np.int32)

    # 2. get augmentation params
    if do_augment:
        # box rescale factor, rotation angle, flip or not flip, crop or not crop, ..., color scale, translation x, ...
        scale, rot, do_flip, do_extreme_crop, extreme_crop_lvl, color_scale, tx, ty = do_augmentation(augm_config)
    else:
        scale, rot, do_flip, do_extreme_crop, extreme_crop_lvl, color_scale, tx, ty = 1.0, 0, False, False, 0, [1.0,
                                                                                                                1.0,
                                                                                                                1.0], 0., 0.
    if width < 1 or height < 1:
        breakpoint()

    if do_extreme_crop:
        if extreme_crop_lvl == 0:
            center_x1, center_y1, width1, height1 = extreme_cropping(center_x, center_y, width, height, keypoints_2d)
        elif extreme_crop_lvl == 1:
            center_x1, center_y1, width1, height1 = extreme_cropping_aggressive(center_x, center_y, width, height,
                                                                                keypoints_2d)

        THRESH = 4
        if width1 < THRESH or height1 < THRESH:
            pass
        else:
            center_x, center_y, width, height = center_x1, center_y1, width1, height1

    center_x += width * tx
    center_y += height * ty

    # Process 3D keypoints
    keypoints_3d = keypoint_3d_processing(keypoints_3d, rot, do_flip)

    # 3. generate image patch
    if use_skimage_antialias:
        # Blur image to avoid aliasing artifacts
        downsampling_factor = (patch_width / (width * scale))
        if downsampling_factor > 1.1:
            cvimg = gaussian(cvimg, sigma=(downsampling_factor - 1) / 2, channel_axis=2, preserve_range=True,
                             truncate=3.0)
    # augmentation image, translation matrix
    img_patch_cv, trans, img_border_mask = generate_image_patch_cv2(cvimg,
                                                                    center_x, center_y,
                                                                    width, height,
                                                                    patch_width, patch_height,
                                                                    do_flip, scale, rot,
                                                                    border_mode=border_mode)

    image = img_patch_cv.copy()
    if is_bgr:
        image = image[:, :, ::-1]
    img_patch_cv = image.copy()
    img_patch = convert_cvimg_to_tensor(image)  # [h, w, 4] -> [4, h, w]

    smal_params, has_smal_params = smal_param_processing(smal_params, has_smal_params, rot, do_flip)

    # apply normalization
    for n_c in range(min(img_channels, 3)):
        img_patch[n_c, :, :] = np.clip(img_patch[n_c, :, :] * color_scale[n_c], 0, 255)
        if mean is not None and std is not None:
            img_patch[n_c, :, :] = (img_patch[n_c, :, :] - mean[n_c]) / std[n_c]

    if do_flip:
        keypoints_2d = fliplr_keypoints(keypoints_2d, img_width, list(range(len(keypoints_2d))))

    for n_jt in range(len(keypoints_2d)):
        keypoints_2d[n_jt, 0:2] = trans_point2d(keypoints_2d[n_jt, 0:2], trans)
    keypoints_2d[:, :-1] = keypoints_2d[:, :-1] / patch_width - 0.5

    if not return_trans:
        return img_patch, keypoints_2d, keypoints_3d, smal_params, has_smal_params, img_size, img_border_mask
    else:
        return img_patch, keypoints_2d, keypoints_3d, smal_params, has_smal_params, img_size, trans, img_border_mask
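Note (illustration only, not part of the committed file): a self-contained sketch of exercising get_example without augmentation; every shape and value below is a placeholder, not the repo's actual SMAL dimensions or dataset statistics.

img = np.zeros((480, 640, 3), dtype=np.uint8)            # dummy BGR frame
kpts_2d = np.zeros((26, 3), dtype=np.float32)             # (x, y, confidence), placeholder count
kpts_3d = np.zeros((26, 4), dtype=np.float32)             # (x, y, z, confidence)
smal = {'global_orient': np.zeros(3, dtype=np.float32),
        'pose': np.zeros(99, dtype=np.float32),            # placeholder size
        'betas': np.zeros(20, dtype=np.float32),           # placeholder size
        'translation': np.zeros(3, dtype=np.float32)}
has_smal = {k: np.zeros(1, dtype=np.float32) for k in smal}
patch, kpts_2d, kpts_3d, smal, has_smal, size, border_mask = get_example(
    img, 320, 240, 200, 200, kpts_2d, kpts_3d, smal, has_smal,
    256, 256, mean=None, std=None, do_augment=False, augm_config=None)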
def get_cub17_example(cvimg: np.array,
                      keypoints_2d: np.array,
                      center_x: float, center_y: float,
                      width: float, height: float,
                      patch_width: int, patch_height: int,
                      mean: np.array, std: np.array,
                      do_augment: bool, augm_config: CfgNode,
                      return_trans=True) -> Tuple:
    """
    Get an example from the dataset and (possibly) apply random augmentations.
    Args:
        cvimg (np.ndarray): Image
        keypoints_2d (np.array): Array with shape (N,3) containing the 2D keypoints in the original image coordinates.
        center_x (float): Bounding box center x coordinate in the original image.
        center_y (float): Bounding box center y coordinate in the original image.
        width (float): Bounding box width.
        height (float): Bounding box height.
        patch_width (int): Output box width.
        patch_height (int): Output box height.
        mean (np.array): Array of shape (3,) containing the mean for normalizing the input image.
        std (np.array): Array of shape (3,) containing the std for normalizing the input image.
        do_augment (bool): Whether to apply data augmentation or not.
        aug_config (CfgNode): Config containing augmentation parameters.
    Returns:
        return img_patch, keypoints_2d
        img_patch (np.array): Cropped image patch of shape (3, patch_height, patch_height)
        keypoints_2d (np.array): Array with shape (N,3) containing the transformed 2D keypoints.
    """
    img_height, img_width, img_channels = cvimg.shape

    img_size = np.array([img_height, img_width], dtype=np.int32)

    # 2. get augmentation params
    if do_augment:
        # box rescale factor, rotation angle, flip or not flip, crop or not crop, ..., color scale, translation x, ...
        scale, rot, do_flip, do_extreme_crop, extreme_crop_lvl, color_scale, tx, ty = do_augmentation(augm_config)
    else:
        scale, rot, do_flip, do_extreme_crop, extreme_crop_lvl, color_scale, tx, ty = 1.0, 0, False, False, 0, [1.0,
                                                                                                                1.0,
                                                                                                                1.0], 0., 0.
    # bounding box height and width
    center_x += width * tx
    center_y += height * ty
    # augmentation image, translation matrix
    img_patch_cv, trans, img_border_mask = generate_image_patch_cv2(cvimg,
                                                                    center_x, center_y,
                                                                    width, height,
                                                                    patch_width, patch_height,
                                                                    do_flip, scale, rot,
                                                                    border_mode=cv2.BORDER_CONSTANT)

    image = img_patch_cv.copy()
    img_patch = convert_cvimg_to_tensor(image)  # [h, w, 4] -> [4, h, w]

    # apply normalization
    for n_c in range(min(img_channels, 3)):
        img_patch[n_c, :, :] = np.clip(img_patch[n_c, :, :] * color_scale[n_c], 0, 255)
        if mean is not None and std is not None:
            img_patch[n_c, :, :] = (img_patch[n_c, :, :] - mean[n_c]) / std[n_c]

    if do_flip:
        keypoints_2d = fliplr_keypoints(keypoints_2d, img_width, list(range(len(keypoints_2d))))

    for n_jt in range(len(keypoints_2d)):
        keypoints_2d[n_jt, 0:2] = trans_point2d(keypoints_2d[n_jt, 0:2], trans)
    keypoints_2d[:, :-1] = keypoints_2d[:, :-1] / patch_width - 0.5

    if return_trans:
        return img_patch, keypoints_2d, img_size, trans, img_border_mask
    else:
        return img_patch, keypoints_2d, img_size, img_border_mask
def crop_to_hips(center_x: float, center_y: float, width: float, height: float, keypoints_2d: np.array) -> Tuple:
    """
    Extreme cropping: Crop the box up to the hip locations.
    Args:
        center_x (float): x coordinate of the bounding box center.
        center_y (float): y coordinate of the bounding box center.
        width (float): Bounding box width.
        height (float): Bounding box height.
        keypoints_2d (np.array): Array of shape (N, 3) containing 2D keypoint locations.
    Returns:
        center_x (float): x coordinate of the new bounding box center.
        center_y (float): y coordinate of the new bounding box center.
        width (float): New bounding box width.
        height (float): New bounding box height.
    """
    keypoints_2d = keypoints_2d.copy()
    lower_body_keypoints = [10, 11, 13, 14, 19, 20, 21, 22, 23, 24, 25 + 0, 25 + 1, 25 + 4, 25 + 5]
    keypoints_2d[lower_body_keypoints, :] = 0
    if keypoints_2d[:, -1].sum() > 1:
        center, scale = get_bbox(keypoints_2d)
        center_x = center[0]
        center_y = center[1]
        width = 1.1 * scale[0]
        height = 1.1 * scale[1]
    return center_x, center_y, width, height


def crop_to_shoulders(center_x: float, center_y: float, width: float, height: float, keypoints_2d: np.array):
    """
    Extreme cropping: Crop the box up to the shoulder locations.
    Args:
        center_x (float): x coordinate of the bounding box center.
        center_y (float): y coordinate of the bounding box center.
        width (float): Bounding box width.
        height (float): Bounding box height.
        keypoints_2d (np.array): Array of shape (N, 3) containing 2D keypoint locations.
    Returns:
        center_x (float): x coordinate of the new bounding box center.
        center_y (float): y coordinate of the new bounding box center.
        width (float): New bounding box width.
        height (float): New bounding box height.
    """
    keypoints_2d = keypoints_2d.copy()
    lower_body_keypoints = [3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 19, 20, 21, 22, 23, 24] + \
                           [25 + i for i in [0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 14, 15, 16]]
    keypoints_2d[lower_body_keypoints, :] = 0
    center, scale = get_bbox(keypoints_2d)
    if keypoints_2d[:, -1].sum() > 1:
        center, scale = get_bbox(keypoints_2d)
        center_x = center[0]
        center_y = center[1]
        width = 1.2 * scale[0]
        height = 1.2 * scale[1]
    return center_x, center_y, width, height


def crop_to_head(center_x: float, center_y: float, width: float, height: float, keypoints_2d: np.array):
    """
    Extreme cropping: Crop the box and keep on only the head.
    Args:
        center_x (float): x coordinate of the bounding box center.
        center_y (float): y coordinate of the bounding box center.
        width (float): Bounding box width.
        height (float): Bounding box height.
        keypoints_2d (np.array): Array of shape (N, 3) containing 2D keypoint locations.
    Returns:
        center_x (float): x coordinate of the new bounding box center.
        center_y (float): y coordinate of the new bounding box center.
        width (float): New bounding box width.
        height (float): New bounding box height.
    """
    keypoints_2d = keypoints_2d.copy()
    lower_body_keypoints = [3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 19, 20, 21, 22, 23, 24] + \
                           [25 + i for i in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, 16]]
    keypoints_2d[lower_body_keypoints, :] = 0
    if keypoints_2d[:, -1].sum() > 1:
        center, scale = get_bbox(keypoints_2d)
        center_x = center[0]
        center_y = center[1]
        width = 1.3 * scale[0]
        height = 1.3 * scale[1]
    return center_x, center_y, width, height


def crop_torso_only(center_x: float, center_y: float, width: float, height: float, keypoints_2d: np.array):
    """
    Extreme cropping: Crop the box and keep on only the torso.
    Args:
        center_x (float): x coordinate of the bounding box center.
        center_y (float): y coordinate of the bounding box center.
        width (float): Bounding box width.
        height (float): Bounding box height.
        keypoints_2d (np.array): Array of shape (N, 3) containing 2D keypoint locations.
    Returns:
        center_x (float): x coordinate of the new bounding box center.
        center_y (float): y coordinate of the new bounding box center.
        width (float): New bounding box width.
        height (float): New bounding box height.
    """
    keypoints_2d = keypoints_2d.copy()
    nontorso_body_keypoints = [0, 3, 4, 6, 7, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24] + \
                              [25 + i for i in [0, 1, 4, 5, 6, 7, 10, 11, 13, 17, 18]]
    keypoints_2d[nontorso_body_keypoints, :] = 0
    if keypoints_2d[:, -1].sum() > 1:
        center, scale = get_bbox(keypoints_2d)
        center_x = center[0]
        center_y = center[1]
        width = 1.1 * scale[0]
        height = 1.1 * scale[1]
    return center_x, center_y, width, height


def crop_rightarm_only(center_x: float, center_y: float, width: float, height: float, keypoints_2d: np.array):
    """
    Extreme cropping: Crop the box and keep on only the right arm.
    Args:
        center_x (float): x coordinate of the bounding box center.
        center_y (float): y coordinate of the bounding box center.
        width (float): Bounding box width.
        height (float): Bounding box height.
        keypoints_2d (np.array): Array of shape (N, 3) containing 2D keypoint locations.
    Returns:
        center_x (float): x coordinate of the new bounding box center.
        center_y (float): y coordinate of the new bounding box center.
        width (float): New bounding box width.
        height (float): New bounding box height.
    """
    keypoints_2d = keypoints_2d.copy()
    nonrightarm_body_keypoints = [0, 1, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24] + \
                                 [25 + i for i in [0, 1, 2, 3, 4, 5, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]]
    keypoints_2d[nonrightarm_body_keypoints, :] = 0
    if keypoints_2d[:, -1].sum() > 1:
        center, scale = get_bbox(keypoints_2d)
        center_x = center[0]
        center_y = center[1]
        width = 1.1 * scale[0]
        height = 1.1 * scale[1]
    return center_x, center_y, width, height


def crop_leftarm_only(center_x: float, center_y: float, width: float, height: float, keypoints_2d: np.array):
    """
    Extreme cropping: Crop the box and keep on only the left arm.
    Args:
        center_x (float): x coordinate of the bounding box center.
        center_y (float): y coordinate of the bounding box center.
        width (float): Bounding box width.
        height (float): Bounding box height.
        keypoints_2d (np.array): Array of shape (N, 3) containing 2D keypoint locations.
    Returns:
        center_x (float): x coordinate of the new bounding box center.
        center_y (float): y coordinate of the new bounding box center.
        width (float): New bounding box width.
        height (float): New bounding box height.
    """
    keypoints_2d = keypoints_2d.copy()
    nonleftarm_body_keypoints = [0, 1, 2, 3, 4, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24] + \
                                [25 + i for i in [0, 1, 2, 3, 4, 5, 6, 7, 8, 12, 13, 14, 15, 16, 17, 18]]
    keypoints_2d[nonleftarm_body_keypoints, :] = 0
    if keypoints_2d[:, -1].sum() > 1:
        center, scale = get_bbox(keypoints_2d)
        center_x = center[0]
        center_y = center[1]
        width = 1.1 * scale[0]
        height = 1.1 * scale[1]
    return center_x, center_y, width, height


def crop_legs_only(center_x: float, center_y: float, width: float, height: float, keypoints_2d: np.array):
    """
    Extreme cropping: Crop the box and keep on only the legs.
    Args:
        center_x (float): x coordinate of the bounding box center.
        center_y (float): y coordinate of the bounding box center.
        width (float): Bounding box width.
        height (float): Bounding box height.
        keypoints_2d (np.array): Array of shape (N, 3) containing 2D keypoint locations.
    Returns:
        center_x (float): x coordinate of the new bounding box center.
        center_y (float): y coordinate of the new bounding box center.
        width (float): New bounding box width.
        height (float): New bounding box height.
    """
    keypoints_2d = keypoints_2d.copy()
    nonlegs_body_keypoints = [0, 1, 2, 3, 4, 5, 6, 7, 15, 16, 17, 18] + \
                             [25 + i for i in [6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18]]
    keypoints_2d[nonlegs_body_keypoints, :] = 0
    if keypoints_2d[:, -1].sum() > 1:
        center, scale = get_bbox(keypoints_2d)
        center_x = center[0]
        center_y = center[1]
        width = 1.1 * scale[0]
        height = 1.1 * scale[1]
    return center_x, center_y, width, height


def crop_rightleg_only(center_x: float, center_y: float, width: float, height: float, keypoints_2d: np.array):
    """
    Extreme cropping: Crop the box and keep on only the right leg.
    Args:
        center_x (float): x coordinate of the bounding box center.
        center_y (float): y coordinate of the bounding box center.
        width (float): Bounding box width.
        height (float): Bounding box height.
        keypoints_2d (np.array): Array of shape (N, 3) containing 2D keypoint locations.
    Returns:
        center_x (float): x coordinate of the new bounding box center.
        center_y (float): y coordinate of the new bounding box center.
        width (float): New bounding box width.
        height (float): New bounding box height.
    """
    keypoints_2d = keypoints_2d.copy()
    nonrightleg_body_keypoints = [0, 1, 2, 3, 4, 5, 6, 7, 8, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21] + \
                                 [25 + i for i in [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]]
    keypoints_2d[nonrightleg_body_keypoints, :] = 0
    if keypoints_2d[:, -1].sum() > 1:
        center, scale = get_bbox(keypoints_2d)
        center_x = center[0]
        center_y = center[1]
        width = 1.1 * scale[0]
|
| 931 |
+
height = 1.1 * scale[1]
|
| 932 |
+
return center_x, center_y, width, height
|
| 933 |
+
|
| 934 |
+
|
| 935 |
+
def crop_leftleg_only(center_x: float, center_y: float, width: float, height: float, keypoints_2d: np.array):
|
| 936 |
+
"""
|
| 937 |
+
Extreme cropping: Crop the box and keep on only the left leg.
|
| 938 |
+
Args:
|
| 939 |
+
center_x (float): x coordinate of the bounding box center.
|
| 940 |
+
center_y (float): y coordinate of the bounding box center.
|
| 941 |
+
width (float): Bounding box width.
|
| 942 |
+
height (float): Bounding box height.
|
| 943 |
+
keypoints_2d (np.array): Array of shape (N, 3) containing 2D keypoint locations.
|
| 944 |
+
Returns:
|
| 945 |
+
center_x (float): x coordinate of the new bounding box center.
|
| 946 |
+
center_y (float): y coordinate of the new bounding box center.
|
| 947 |
+
width (float): New bounding box width.
|
| 948 |
+
height (float): New bounding box height.
|
| 949 |
+
"""
|
| 950 |
+
keypoints_2d = keypoints_2d.copy()
|
| 951 |
+
nonleftleg_body_keypoints = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 15, 16, 17, 18, 22, 23, 24] + [25 + i for i in
|
| 952 |
+
[0, 1, 2, 6, 7, 8,
|
| 953 |
+
9, 10, 11, 12,
|
| 954 |
+
13, 14, 15, 16,
|
| 955 |
+
17, 18]]
|
| 956 |
+
keypoints_2d[nonleftleg_body_keypoints, :] = 0
|
| 957 |
+
if keypoints_2d[:, -1].sum() > 1:
|
| 958 |
+
center, scale = get_bbox(keypoints_2d)
|
| 959 |
+
center_x = center[0]
|
| 960 |
+
center_y = center[1]
|
| 961 |
+
width = 1.1 * scale[0]
|
| 962 |
+
height = 1.1 * scale[1]
|
| 963 |
+
return center_x, center_y, width, height
|
| 964 |
+
|
| 965 |
+
|
| 966 |
+
def full_body(keypoints_2d: np.array) -> bool:
|
| 967 |
+
"""
|
| 968 |
+
Check if all main body joints are visible.
|
| 969 |
+
Args:
|
| 970 |
+
keypoints_2d (np.array): Array of shape (N, 3) containing 2D keypoint locations.
|
| 971 |
+
Returns:
|
| 972 |
+
bool: True if all main body joints are visible.
|
| 973 |
+
"""
|
| 974 |
+
|
| 975 |
+
body_keypoints_openpose = [2, 3, 4, 5, 6, 7, 10, 11, 13, 14]
|
| 976 |
+
body_keypoints = [25 + i for i in [8, 7, 6, 9, 10, 11, 1, 0, 4, 5]]
|
| 977 |
+
return (np.maximum(keypoints_2d[body_keypoints, -1], keypoints_2d[body_keypoints_openpose, -1]) > 0).sum() == len(
|
| 978 |
+
body_keypoints)
|
| 979 |
+
|
| 980 |
+
|
| 981 |
+
def upper_body(keypoints_2d: np.array):
|
| 982 |
+
"""
|
| 983 |
+
Check if all upper body joints are visible.
|
| 984 |
+
Args:
|
| 985 |
+
keypoints_2d (np.array): Array of shape (N, 3) containing 2D keypoint locations.
|
| 986 |
+
Returns:
|
| 987 |
+
bool: True if all main body joints are visible.
|
| 988 |
+
"""
|
| 989 |
+
lower_body_keypoints_openpose = [10, 11, 13, 14]
|
| 990 |
+
lower_body_keypoints = [25 + i for i in [1, 0, 4, 5]]
|
| 991 |
+
upper_body_keypoints_openpose = [0, 1, 15, 16, 17, 18]
|
| 992 |
+
upper_body_keypoints = [25 + 8, 25 + 9, 25 + 12, 25 + 13, 25 + 17, 25 + 18]
|
| 993 |
+
return ((keypoints_2d[lower_body_keypoints + lower_body_keypoints_openpose, -1] > 0).sum() == 0) \
|
| 994 |
+
and ((keypoints_2d[upper_body_keypoints + upper_body_keypoints_openpose, -1] > 0).sum() >= 2)
|
| 995 |
+
|
| 996 |
+
|
| 997 |
+
def get_bbox(keypoints_2d: np.array, rescale: float = 1.2) -> Tuple:
|
| 998 |
+
"""
|
| 999 |
+
Get center and scale for bounding box from openpose detections.
|
| 1000 |
+
Args:
|
| 1001 |
+
keypoints_2d (np.array): Array of shape (N, 3) containing 2D keypoint locations.
|
| 1002 |
+
rescale (float): Scale factor to rescale bounding boxes computed from the keypoints.
|
| 1003 |
+
Returns:
|
| 1004 |
+
center (np.array): Array of shape (2,) containing the new bounding box center.
|
| 1005 |
+
scale (float): New bounding box scale.
|
| 1006 |
+
"""
|
| 1007 |
+
valid = keypoints_2d[:, -1] > 0
|
| 1008 |
+
valid_keypoints = keypoints_2d[valid][:, :-1]
|
| 1009 |
+
center = 0.5 * (valid_keypoints.max(axis=0) + valid_keypoints.min(axis=0))
|
| 1010 |
+
bbox_size = (valid_keypoints.max(axis=0) - valid_keypoints.min(axis=0))
|
| 1011 |
+
# adjust bounding box tightness
|
| 1012 |
+
scale = bbox_size
|
| 1013 |
+
scale *= rescale
|
| 1014 |
+
return center, scale
|
| 1015 |
+
|
| 1016 |
+
|
| 1017 |
+
def extreme_cropping(center_x: float, center_y: float, width: float, height: float, keypoints_2d: np.array) -> Tuple:
|
| 1018 |
+
"""
|
| 1019 |
+
Perform extreme cropping
|
| 1020 |
+
Args:
|
| 1021 |
+
center_x (float): x coordinate of bounding box center.
|
| 1022 |
+
center_y (float): y coordinate of bounding box center.
|
| 1023 |
+
width (float): bounding box width.
|
| 1024 |
+
height (float): bounding box height.
|
| 1025 |
+
keypoints_2d (np.array): Array of shape (N, 3) containing 2D keypoint locations.
|
| 1026 |
+
rescale (float): Scale factor to rescale bounding boxes computed from the keypoints.
|
| 1027 |
+
Returns:
|
| 1028 |
+
center_x (float): x coordinate of bounding box center.
|
| 1029 |
+
center_y (float): y coordinate of bounding box center.
|
| 1030 |
+
width (float): bounding box width.
|
| 1031 |
+
height (float): bounding box height.
|
| 1032 |
+
"""
|
| 1033 |
+
p = torch.rand(1).item()
|
| 1034 |
+
if full_body(keypoints_2d):
|
| 1035 |
+
if p < 0.7:
|
| 1036 |
+
center_x, center_y, width, height = crop_to_hips(center_x, center_y, width, height, keypoints_2d)
|
| 1037 |
+
elif p < 0.9:
|
| 1038 |
+
center_x, center_y, width, height = crop_to_shoulders(center_x, center_y, width, height, keypoints_2d)
|
| 1039 |
+
else:
|
| 1040 |
+
center_x, center_y, width, height = crop_to_head(center_x, center_y, width, height, keypoints_2d)
|
| 1041 |
+
elif upper_body(keypoints_2d):
|
| 1042 |
+
if p < 0.9:
|
| 1043 |
+
center_x, center_y, width, height = crop_to_shoulders(center_x, center_y, width, height, keypoints_2d)
|
| 1044 |
+
else:
|
| 1045 |
+
center_x, center_y, width, height = crop_to_head(center_x, center_y, width, height, keypoints_2d)
|
| 1046 |
+
|
| 1047 |
+
return center_x, center_y, max(width, height), max(width, height)
|
| 1048 |
+
|
| 1049 |
+
|
| 1050 |
+
def extreme_cropping_aggressive(center_x: float, center_y: float, width: float, height: float,
|
| 1051 |
+
keypoints_2d: np.array) -> Tuple:
|
| 1052 |
+
"""
|
| 1053 |
+
Perform aggressive extreme cropping
|
| 1054 |
+
Args:
|
| 1055 |
+
center_x (float): x coordinate of bounding box center.
|
| 1056 |
+
center_y (float): y coordinate of bounding box center.
|
| 1057 |
+
width (float): bounding box width.
|
| 1058 |
+
height (float): bounding box height.
|
| 1059 |
+
keypoints_2d (np.array): Array of shape (N, 3) containing 2D keypoint locations.
|
| 1060 |
+
rescale (float): Scale factor to rescale bounding boxes computed from the keypoints.
|
| 1061 |
+
Returns:
|
| 1062 |
+
center_x (float): x coordinate of bounding box center.
|
| 1063 |
+
center_y (float): y coordinate of bounding box center.
|
| 1064 |
+
width (float): bounding box width.
|
| 1065 |
+
height (float): bounding box height.
|
| 1066 |
+
"""
|
| 1067 |
+
p = torch.rand(1).item()
|
| 1068 |
+
if full_body(keypoints_2d):
|
| 1069 |
+
if p < 0.2:
|
| 1070 |
+
center_x, center_y, width, height = crop_to_hips(center_x, center_y, width, height, keypoints_2d)
|
| 1071 |
+
elif p < 0.3:
|
| 1072 |
+
center_x, center_y, width, height = crop_to_shoulders(center_x, center_y, width, height, keypoints_2d)
|
| 1073 |
+
elif p < 0.4:
|
| 1074 |
+
center_x, center_y, width, height = crop_to_head(center_x, center_y, width, height, keypoints_2d)
|
| 1075 |
+
elif p < 0.5:
|
| 1076 |
+
center_x, center_y, width, height = crop_torso_only(center_x, center_y, width, height, keypoints_2d)
|
| 1077 |
+
elif p < 0.6:
|
| 1078 |
+
center_x, center_y, width, height = crop_rightarm_only(center_x, center_y, width, height, keypoints_2d)
|
| 1079 |
+
elif p < 0.7:
|
| 1080 |
+
center_x, center_y, width, height = crop_leftarm_only(center_x, center_y, width, height, keypoints_2d)
|
| 1081 |
+
elif p < 0.8:
|
| 1082 |
+
center_x, center_y, width, height = crop_legs_only(center_x, center_y, width, height, keypoints_2d)
|
| 1083 |
+
elif p < 0.9:
|
| 1084 |
+
center_x, center_y, width, height = crop_rightleg_only(center_x, center_y, width, height, keypoints_2d)
|
| 1085 |
+
else:
|
| 1086 |
+
center_x, center_y, width, height = crop_leftleg_only(center_x, center_y, width, height, keypoints_2d)
|
| 1087 |
+
elif upper_body(keypoints_2d):
|
| 1088 |
+
if p < 0.2:
|
| 1089 |
+
center_x, center_y, width, height = crop_to_shoulders(center_x, center_y, width, height, keypoints_2d)
|
| 1090 |
+
elif p < 0.4:
|
| 1091 |
+
center_x, center_y, width, height = crop_to_head(center_x, center_y, width, height, keypoints_2d)
|
| 1092 |
+
elif p < 0.6:
|
| 1093 |
+
center_x, center_y, width, height = crop_torso_only(center_x, center_y, width, height, keypoints_2d)
|
| 1094 |
+
elif p < 0.8:
|
| 1095 |
+
center_x, center_y, width, height = crop_rightarm_only(center_x, center_y, width, height, keypoints_2d)
|
| 1096 |
+
else:
|
| 1097 |
+
center_x, center_y, width, height = crop_leftarm_only(center_x, center_y, width, height, keypoints_2d)
|
| 1098 |
+
return center_x, center_y, max(width, height), max(width, height)
|
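All of the cropping helpers above share one recipe: zero out the confidences of every keypoint that does not belong to the target part, then let get_bbox fit a fresh box around whatever stays visible. Below is a minimal sketch of exercising get_bbox and extreme_cropping on dummy detections; the 44-row keypoint layout (25 OpenPose-style joints followed by 19 extra joints, confidence in the last column) is read off the indices used above, while the import path and the coordinate values are illustrative assumptions, not part of the commit.

import numpy as np
from amr.datasets.utils import get_bbox, extreme_cropping  # assumed import path

# Fake detections: only the 19 extra joints are marked visible.
keypoints_2d = np.zeros((44, 3), dtype=np.float32)
keypoints_2d[25:, :2] = np.random.uniform(100, 400, size=(19, 2))
keypoints_2d[25:, 2] = 1.0

# Fit a box around the visible joints, then randomly crop down to a body part.
center, scale = get_bbox(keypoints_2d)
cx, cy, w, h = extreme_cropping(center[0], center[1], scale[0], scale[1], keypoints_2d)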
amr/datasets/vitdet_dataset.py
ADDED
@@ -0,0 +1,102 @@
+from typing import Dict, List
+
+import cv2
+import numpy as np
+from skimage.filters import gaussian
+from yacs.config import CfgNode
+import torch
+
+from .utils import (convert_cvimg_to_tensor,
+                    expand_to_aspect_ratio,
+                    generate_image_patch_cv2)
+
+DEFAULT_MEAN = 255. * np.array([0.485, 0.456, 0.406])
+DEFAULT_STD = 255. * np.array([0.229, 0.224, 0.225])
+
+
+class ViTDetDataset(torch.utils.data.Dataset):
+
+    def __init__(self,
+                 cfg: CfgNode,
+                 img_cv2: np.array,
+                 boxes: np.array,
+                 category: List[int],
+                 rescale_factor=1,
+                 train: bool = False,
+                 **kwargs):
+        super().__init__()
+        self.cfg = cfg
+        self.img_cv2 = img_cv2
+        self.boxes = boxes
+        self.category = category
+        self.focal_length = []
+        for c in category:
+            if c == 6:
+                self.focal_length.append([cfg.AVES.FOCAL_LENGTH, cfg.AVES.FOCAL_LENGTH])
+            else:
+                self.focal_length.append([cfg.SMAL.FOCAL_LENGTH, cfg.SMAL.FOCAL_LENGTH])
+
+        assert train is False, "ViTDetDataset is only for inference"
+        self.train = train
+        self.img_size = cfg.MODEL.IMAGE_SIZE
+        self.mean = 255. * np.array(self.cfg.MODEL.IMAGE_MEAN)
+        self.std = 255. * np.array(self.cfg.MODEL.IMAGE_STD)
+
+        # Preprocess annotations
+        boxes = boxes.astype(np.float32)
+        self.center = (boxes[:, 2:4] + boxes[:, 0:2]) / 2.0
+        self.scale = rescale_factor * (boxes[:, 2:4] - boxes[:, 0:2]) / 200.0
+        self.animalid = np.arange(len(boxes), dtype=np.int32)
+
+    def __len__(self) -> int:
+        return len(self.animalid)
+
+    def __getitem__(self, idx: int) -> Dict[str, np.array]:
+
+        center = self.center[idx].copy()
+        center_x = center[0]
+        center_y = center[1]
+
+        scale = self.scale[idx]
+        BBOX_SHAPE = self.cfg.MODEL.get('BBOX_SHAPE', None)
+        bbox_size = expand_to_aspect_ratio(scale * 200, target_aspect_ratio=BBOX_SHAPE).max()
+
+        patch_width = patch_height = self.img_size
+
+        flip = False
+
+        # 3. generate image patch
+        # if use_skimage_antialias:
+        cvimg = self.img_cv2.copy()
+        if True:
+            # Blur image to avoid aliasing artifacts
+            downsampling_factor = ((bbox_size * 1.0) / patch_width)
+            print(f'{downsampling_factor=}')
+            downsampling_factor = downsampling_factor / 2.0
+            if downsampling_factor > 1.1:
+                cvimg = gaussian(cvimg, sigma=(downsampling_factor - 1) / 2, channel_axis=2, preserve_range=True)
+
+        img_patch_cv, trans, _ = generate_image_patch_cv2(cvimg,
+                                                          center_x, center_y,
+                                                          bbox_size, bbox_size,
+                                                          patch_width, patch_height,
+                                                          flip, 1.0, 0.0,
+                                                          border_mode=cv2.BORDER_CONSTANT)
+        img_patch_cv = img_patch_cv[:, :, ::-1]
+        img_patch = convert_cvimg_to_tensor(img_patch_cv)
+
+        # apply normalization
+        for n_c in range(min(self.img_cv2.shape[2], 3)):
+            img_patch[n_c, :, :] = (img_patch[n_c, :, :] - self.mean[n_c]) / self.std[n_c]
+
+        item = {
+            'img': img_patch,
+            'animalid': int(self.animalid[idx]),
+            'box_center': self.center[idx].copy(),
+            'box_size': bbox_size,
+            'img_size': 1.0 * np.array([cvimg.shape[1], cvimg.shape[0]]),
+            'focal_length': np.array(self.focal_length, dtype=np.float32)[idx],
+            'supercategory': self.category[idx],
+            'has_mask': np.array(0., dtype=np.float32)
+        }
+        return item
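ViTDetDataset turns one image plus its detector boxes into normalized square crops of side cfg.MODEL.IMAGE_SIZE, one per detection, so it can be fed straight to a DataLoader. A small usage sketch under stated assumptions: model_cfg is the yacs config returned by load_amr (see amr/models/__init__.py below), the image path and box coordinates are placeholders, and category id 6 selects the AVES (bird) focal length as in the constructor above.

import cv2
import numpy as np
import torch
from amr.datasets.vitdet_dataset import ViTDetDataset

img_cv2 = cv2.imread('example.jpg')              # BGR image of any resolution (placeholder path)
boxes = np.array([[50., 30., 420., 380.]])       # one (x1, y1, x2, y2) detection
category = [6]                                   # 6 -> bird / AVES, other ids -> SMAL quadrupeds

dataset = ViTDetDataset(model_cfg, img_cv2, boxes, category, rescale_factor=1.2)
loader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=False)
batch = next(iter(loader))                       # dict with 'img', 'box_center', 'box_size', 'focal_length', ...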
amr/models/__init__.py
ADDED
@@ -0,0 +1,26 @@
+from .smal_warapper import SMAL
+from .aves_warapper import AVES
+from .animerpp import AniMerPlusPlus
+
+
+def load_amr(checkpoint_path):
+    from pathlib import Path
+    from ..configs import get_config
+    model_cfg = str(Path(checkpoint_path).parent / 'config.yaml')
+    model_cfg = get_config(model_cfg, update_cachedir=True)
+
+    # Override some config values, to crop bbox correctly
+    if (model_cfg.MODEL.BACKBONE.TYPE in ['vith', 'vithmoe']) and ('BBOX_SHAPE' not in model_cfg.MODEL):
+        model_cfg.defrost()
+        assert model_cfg.MODEL.IMAGE_SIZE == 256, f"MODEL.IMAGE_SIZE ({model_cfg.MODEL.IMAGE_SIZE}) should be 256 for ViT backbone"
+        model_cfg.MODEL.BBOX_SHAPE = [192, 256]
+        model_cfg.freeze()
+
+    # Update config to be compatible with demo
+    if ('PRETRAINED_WEIGHTS' in model_cfg.MODEL.BACKBONE):
+        model_cfg.defrost()
+        model_cfg.MODEL.BACKBONE.pop('PRETRAINED_WEIGHTS')
+        model_cfg.freeze()
+
+    model = AniMerPlusPlus.load_from_checkpoint(checkpoint_path, strict=False, cfg=model_cfg, map_location="cpu")
+    return model, model_cfg
amr/models/__pycache__/__init__.cpython-310.pyc ADDED (binary file, 999 Bytes)
amr/models/__pycache__/__init__.cpython-312.pyc ADDED (binary file, 2.87 kB)
amr/models/__pycache__/__init__.cpython-39.pyc ADDED (binary file, 2.01 kB)
amr/models/__pycache__/amr.cpython-310.pyc ADDED (binary file, 14.3 kB)
amr/models/__pycache__/amr.cpython-312.pyc ADDED (binary file, 26.7 kB)
amr/models/__pycache__/amr.cpython-39.pyc ADDED (binary file, 13.8 kB)
amr/models/__pycache__/animerpp.cpython-310.pyc ADDED (binary file, 16.8 kB)
amr/models/__pycache__/animerpp.cpython-312.pyc ADDED (binary file, 39.2 kB)
amr/models/__pycache__/aves_hmr.cpython-310.pyc ADDED (binary file, 15.5 kB)
amr/models/__pycache__/aves_hmr.cpython-312.pyc ADDED (binary file, 31 kB)
amr/models/__pycache__/aves_warapper.cpython-310.pyc ADDED (binary file, 4.76 kB)
amr/models/__pycache__/aves_warapper.cpython-312.pyc ADDED (binary file, 8.42 kB)
amr/models/__pycache__/discriminator.cpython-310.pyc ADDED (binary file, 2.95 kB)
amr/models/__pycache__/discriminator.cpython-312.pyc ADDED (binary file, 7.94 kB)
amr/models/__pycache__/discriminator.cpython-39.pyc ADDED (binary file, 2.6 kB)
amr/models/__pycache__/dyamr.cpython-310.pyc ADDED (binary file, 18.2 kB)
amr/models/__pycache__/dyamr.cpython-312.pyc ADDED (binary file, 34.3 kB)
amr/models/__pycache__/losses.cpython-310.pyc ADDED (binary file, 12.9 kB)
amr/models/__pycache__/losses.cpython-312.pyc ADDED (binary file, 24.4 kB)
amr/models/__pycache__/losses.cpython-39.pyc ADDED (binary file, 13 kB)
amr/models/__pycache__/predictor.cpython-310.pyc ADDED (binary file, 7.3 kB)
amr/models/__pycache__/smal_warapper.cpython-310.pyc ADDED (binary file, 5.43 kB)
amr/models/__pycache__/smal_warapper.cpython-312.pyc ADDED (binary file, 8.66 kB)
amr/models/__pycache__/smal_warapper.cpython-39.pyc ADDED (binary file, 5.53 kB)
amr/models/__pycache__/smooth_amr.cpython-310.pyc ADDED (binary file, 7.19 kB)
amr/models/__pycache__/smooth_amr.cpython-312.pyc ADDED (binary file, 14 kB)
amr/models/__pycache__/smooth_netv2.cpython-310.pyc ADDED (binary file, 9.97 kB)
amr/models/__pycache__/stamr.cpython-310.pyc ADDED (binary file, 6.27 kB)
amr/models/__pycache__/stamr.cpython-312.pyc ADDED (binary file, 11.6 kB)
amr/models/animerpp.py
ADDED
@@ -0,0 +1,508 @@
+import torch
+import pickle
+import pytorch_lightning as pl
+from torchvision.utils import make_grid
+from typing import Dict
+from pytorch3d.transforms import matrix_to_axis_angle
+from yacs.config import CfgNode
+from ..utils import MeshRenderer
+from ..utils.geometry import aa_to_rotmat, perspective_projection
+from ..utils.pylogger import get_pylogger
+from ..utils.mesh_renderer import SilhouetteRenderer
+from .backbones import create_backbone
+from .heads.classifier_head import ClassTokenHead
+from .heads import build_aves_head, build_smal_head
+from .losses import (Keypoint3DLoss, Keypoint2DLoss, ParameterLoss, SupConLoss,
+                     PoseBonePriorLoss, SilhouetteLoss, ShapePriorLoss, PosePriorLoss)
+from .aves_warapper import AVES
+from .smal_warapper import SMAL
+
+
+log = get_pylogger(__name__)
+
+
+class AniMerPlusPlus(pl.LightningModule):
+    def __init__(self, cfg: CfgNode, init_renderer: bool = True):
+        """
+        Setup AniMer++ model
+        Args:
+            cfg (CfgNode): Config file as a yacs CfgNode
+        """
+        super().__init__()
+
+        # Save hyperparameters
+        self.save_hyperparameters(logger=False, ignore=['init_renderer'])
+
+        self.cfg = cfg
+        # Create backbone feature extractor
+        self.backbone = create_backbone(cfg)
+
+        # Create AVES head
+        self.aves_head = build_aves_head(cfg)
+
+        # Create SMAL head
+        self.smal_head = build_smal_head(cfg)
+
+        self.class_token_head = ClassTokenHead(**cfg.MODEL.get("CLASS_TOKEN_HEAD", dict()))
+
+        # Define loss functions
+        # common loss
+        self.keypoint_3d_loss = Keypoint3DLoss(loss_type='l1')
+        self.keypoint_2d_loss = Keypoint2DLoss(loss_type='l1')
+        self.supcon_loss = SupConLoss()
+        self.parameter_loss = ParameterLoss()
+        # aves loss
+        self.posebone_prior_loss = PoseBonePriorLoss(path_prior=cfg.AVES.POSE_PRIOR_PATH)
+        self.mask_loss = SilhouetteLoss()
+        # smal loss
+        self.shape_prior_loss = ShapePriorLoss(path_prior=cfg.SMAL.SHAPE_PRIOR_PATH)
+        self.pose_prior_loss = PosePriorLoss(path_prior=cfg.SMAL.POSE_PRIOR_PATH)
+        # Instantiate AVES model
+        aves_model_path = cfg.AVES.MODEL_PATH
+        aves_cfg = torch.load(aves_model_path, weights_only=True)
+        self.aves = AVES(**aves_cfg)
+
+        # Instantiate SMAL model
+        smal_model_path = cfg.SMAL.MODEL_PATH
+        with open(smal_model_path, 'rb') as f:
+            smal_cfg = pickle.load(f, encoding="latin1")
+        self.smal = SMAL(**smal_cfg)
+
+        # Buffer that shows whether we need to initialize ActNorm layers
+        self.register_buffer('initialized', torch.tensor(False))
+        # Setup renderer for visualization
+        if init_renderer:
+            self.aves_mesh_renderer = MeshRenderer(self.cfg, faces=aves_cfg['F'].numpy())
+            self.smal_mesh_renderer = MeshRenderer(self.cfg, faces=self.smal.faces.numpy())
+        else:
+            self.renderer = None
+            self.mesh_renderer = None
+
+        # Only applied for AVES training
+        self.aves_silouette_render = SilhouetteRenderer(size=self.cfg.MODEL.IMAGE_SIZE,
+                                                        focal=self.cfg.AVES.get("FOCAL_LENGTH", 2167),
+                                                        device='cuda')
+
+        self.automatic_optimization = False
+
+    def get_parameters(self):
+        all_params = list(self.aves_head.parameters())
+        all_params += list(self.backbone.parameters())
+        all_params += list(self.smal_head.parameters())
+        all_params += list(self.class_token_head.parameters())
+        return all_params
+
+    def configure_optimizers(self):
+        """
+        Setup model and discriminator optimizers
+        Returns:
+            Tuple[torch.optim.Optimizer, torch.optim.Optimizer]: Model and discriminator optimizers
+        """
+        param_groups = [{'params': filter(lambda p: p.requires_grad, self.get_parameters()), 'lr': self.cfg.TRAIN.LR}]
+
+        if "vit" in self.cfg.MODEL.BACKBONE.TYPE:
+            optimizer = torch.optim.AdamW(params=param_groups,
+                                          weight_decay=self.cfg.TRAIN.WEIGHT_DECAY)
+        else:
+            optimizer = torch.optim.Adam(params=param_groups,
+                                         weight_decay=self.cfg.TRAIN.WEIGHT_DECAY)
+        return optimizer
+
+    def forward_backbone(self, batch: Dict):
+        x = batch['img']
+        dataset_source = batch["supercategory"] < 5  # bird for index 0
+        # Compute conditioning features using the backbone
+        if self.cfg.MODEL.BACKBONE.TYPE in ["vith"]:
+            conditioning_feats, cls = self.backbone(x[:, :, :, 32:-32])  # [256, 192]
+        elif self.cfg.MODEL.BACKBONE.TYPE in ["vithmoe"]:
+            conditioning_feats, cls = self.backbone(x[:, :, :, 32:-32], dataset_source=dataset_source.type(torch.long))
+        else:
+            conditioning_feats = self.backbone(x)
+            cls = None
+        return conditioning_feats, cls
+
+    def forward_one_parametric_model(self,
+                                     focal_length: torch.tensor,
+                                     features: torch.tensor,
+                                     head: torch.nn.Module,
+                                     parametric_model: torch.nn.Module,):
+        """
+        Run a forward step of one parametric model.
+        Args:
+            focal_length (torch.Tensor): Per-sample focal lengths.
+            features (torch.Tensor): Conditioning features from the backbone.
+            head (torch.nn.Module): Regression head for this parametric model.
+            parametric_model (torch.nn.Module): AVES or SMAL wrapper.
+        Returns:
+            Dict: Dictionary containing the regression output
+        """
+        batch_size = features.shape[0]
+        pred_params, pred_cam, _ = head(features)
+        # Store useful regression outputs to the output dict
+        output = {}
+        output['pred_cam'] = pred_cam
+        output['pred_params'] = {k: v.clone() for k, v in pred_params.items()}
+
+        # Compute camera translation
+        pred_cam_t = torch.stack([pred_cam[:, 1],
+                                  pred_cam[:, 2],
+                                  2 * focal_length[:, 0] / (self.cfg.MODEL.IMAGE_SIZE * pred_cam[:, 0] + 1e-9)], dim=-1)
+        output['pred_cam_t'] = pred_cam_t
+        output['focal_length'] = focal_length
+
+        # Compute model vertices, joints and the projected joints
+        pred_params['global_orient'] = pred_params['global_orient'].reshape(batch_size, -1, 3, 3)
+        pred_params['pose'] = pred_params['pose'].reshape(batch_size, -1, 3, 3)
+        pred_params['betas'] = pred_params['betas'].reshape(batch_size, -1)
+        pred_params['bone'] = pred_params['bone'].reshape(batch_size, -1) if 'bone' in pred_params else None
+        parametric_model_output = parametric_model(**pred_params, pose2rot=False)
+
+        pred_keypoints_3d = parametric_model_output.joints
+        pred_vertices = parametric_model_output.vertices
+        output['pred_keypoints_3d'] = pred_keypoints_3d.reshape(batch_size, -1, 3)
+        output['pred_vertices'] = pred_vertices.reshape(batch_size, -1, 3)
+        pred_cam_t = pred_cam_t.reshape(-1, 3)
+        focal_length = focal_length.reshape(-1, 2)
+        pred_keypoints_2d = perspective_projection(pred_keypoints_3d,
+                                                   translation=pred_cam_t,
+                                                   focal_length=focal_length / self.cfg.MODEL.IMAGE_SIZE)
+        output['pred_keypoints_2d'] = pred_keypoints_2d.reshape(batch_size, -1, 2)
+        return output
+
+    def forward_step(self, batch: Dict, train: bool = False) -> Dict:
+        """
+        Run a forward step of the network
+        Args:
+            batch (Dict): Dictionary containing batch data
+            train (bool): Flag indicating whether it is training or validation mode
+        Returns:
+            Dict: Dictionary containing the regression output
+        """
+        # Use RGB image as input
+        x = batch['img']
+        batch_size = x.shape[0]
+        device = x.device
+        dataset_source = (batch["supercategory"] < 5)  # bird for index 0
+
+        features, cls = self.forward_backbone(batch)
+
+        output = dict()
+        output['cls_feats'] = self.class_token_head(cls) if self.cfg.MODEL.BACKBONE.get("USE_CLS", False) else None
+
+        num_aves = (batch_size - dataset_source.sum()).item()
+        if num_aves:
+            output['aves_output'] = self.forward_one_parametric_model(batch['focal_length'][~dataset_source],
+                                                                      features[~dataset_source],
+                                                                      self.aves_head,
+                                                                      self.aves)
+            # Only specific to AVES training
+            output['aves_output']['pred_mask'] = self.aves_silouette_render(output['aves_output']['pred_vertices'] + output['aves_output']['pred_cam_t'].unsqueeze(1),
+                                                                            faces=self.aves.face.unsqueeze(0).repeat(batch_size - dataset_source.sum().item(), 1, 1).to(device))
+
+        num_smal = dataset_source.sum().item()
+        if num_smal:
+            output['smal_output'] = self.forward_one_parametric_model(batch['focal_length'][dataset_source],
+                                                                      features[dataset_source],
+                                                                      self.smal_head,
+                                                                      self.smal)
+        return output
+
+    def compute_aves_loss(self, batch: Dict, output: Dict) -> torch.Tensor:
+        """
+        Compute AVES losses given the input batch and the regression output
+        Args:
+            batch (Dict): Dictionary containing batch data
+            output (Dict): Dictionary containing the regression output
+        Returns:
+            torch.Tensor : Total loss for current batch
+        """
+        dataset_source = (batch["supercategory"] > 5)
+
+        pred_params = output['pred_params']
+        pred_mask = output['pred_mask']
+        pred_keypoints_2d = output['pred_keypoints_2d']
+        pred_keypoints_3d = output['pred_keypoints_3d']
+
+        batch_size = pred_params['pose'].shape[0]
+
+        # Get annotations
+        gt_keypoints_2d = batch['keypoints_2d'][dataset_source][:, :18]
+        gt_keypoints_3d = batch['keypoints_3d'][dataset_source][:, :18]
+        gt_mask = batch['mask'][dataset_source]
+        gt_params = {k: v[dataset_source] for k, v in batch['smal_params'].items()}
+        has_params = {k: v[dataset_source] for k, v in batch['has_smal_params'].items()}
+        is_axis_angle = {k: v[dataset_source] for k, v in batch['smal_params_is_axis_angle'].items()}
+
+        # Compute 2D/3D keypoint losses and silhouette loss
+        loss_keypoints_2d = self.keypoint_2d_loss(pred_keypoints_2d, gt_keypoints_2d)
+        loss_keypoints_3d = self.keypoint_3d_loss(pred_keypoints_3d, gt_keypoints_3d, pelvis_id=0)
+        loss_mask = self.mask_loss(pred_mask, gt_mask)
+
+        # Compute loss on AVES parameters
+        loss_params = {}
+        for k, pred in pred_params.items():
+            gt = gt_params[k].view(batch_size, -1)
+            if is_axis_angle[k].all():
+                gt = aa_to_rotmat(gt.reshape(-1, 3)).view(batch_size, -1, 3, 3)
+            has_gt = has_params[k]
+            if k == "betas":
+                loss_params[k] = self.parameter_loss(pred.reshape(batch_size, -1),
+                                                     gt[:, :15].reshape(batch_size, -1),
+                                                     has_gt)
+                # v1
+                loss_params[k + "_re"] = torch.sum(pred[has_gt.bool()] ** 2) + torch.sum(pred[has_gt.bool()] ** 2) * 0.5
+                # v2
+                # loss_params[k + "_re"] = torch.sum(pred ** 2)
+            elif k == "bone":
+                loss_params[k] = self.parameter_loss(pred.reshape(batch_size, -1),
+                                                     gt.reshape(batch_size, -1),
+                                                     has_gt)
+                # v1
+                loss_params[k + "_re"] = self.posebone_prior_loss.l2_loss(pred, self.posebone_prior_loss.bone_mean, 1 - has_gt) + \
+                                         self.posebone_prior_loss.l2_loss(pred, self.posebone_prior_loss.bone_mean, has_gt) * 0.02
+                # v2
+                # loss_params[k + "_re"] = self.posebone_prior_loss.l2_loss(pred, self.posebone_prior_loss.bone_mean, torch.zeros_like(has_gt))
+            elif k == "pose":
+                loss_params[k] = self.parameter_loss(pred.reshape(batch_size, -1),
+                                                     gt[:, :24].reshape(batch_size, -1),
+                                                     has_gt)
+                pose_axis_angle = matrix_to_axis_angle(pred)
+                # v1
+                loss_params[k + "_re"] = self.posebone_prior_loss.l2_loss(pose_axis_angle.reshape(batch_size, -1), self.posebone_prior_loss.pose_mean, 1 - has_gt) + \
+                                         self.posebone_prior_loss.l2_loss(pose_axis_angle.reshape(batch_size, -1), self.posebone_prior_loss.pose_mean, has_gt) * 0.02
+                # v2
+                # loss_params[k + "_re"] = self.posebone_prior_loss.l2_loss(pose_axis_angle.reshape(batch_size, -1), self.posebone_prior_loss.pose_mean, torch.zeros_like(has_gt))
+            else:
+                loss_params[k] = self.parameter_loss(pred.reshape(batch_size, -1),
+                                                     gt.reshape(batch_size, -1),
+                                                     has_gt)
+
+        loss_config = self.cfg.LOSS_WEIGHTS.AVES
+        loss = loss_config['KEYPOINTS_3D'] * loss_keypoints_3d + \
+               loss_config['KEYPOINTS_2D'] * loss_keypoints_2d + \
+               sum([loss_params[k] * loss_config[k.upper()] for k in loss_params]) + \
+               loss_config['MASK'] * loss_mask
+
+        losses = dict(loss_aves=loss.detach(),
+                      loss_aves_keypoints_2d=loss_keypoints_2d.detach(),
+                      loss_aves_keypoints_3d=loss_keypoints_3d.detach(),
+                      loss_aves_mask=loss_mask.detach(),
+                      )
+        for k, v in loss_params.items():
+            losses['loss_aves_' + k] = v.detach()
+
+        return loss, losses
+
+    def compute_smal_loss(self, batch: Dict, output: Dict) -> torch.Tensor:
+        """
+        Compute SMAL losses given the input batch and the regression output
+        Args:
+            batch (Dict): Dictionary containing batch data
+            output (Dict): Dictionary containing the regression output
+        Returns:
+            torch.Tensor : Total loss for current batch
+        """
+        dataset_source = (batch["supercategory"] < 5)
+
+        pred_params = output['pred_params']
+        pred_keypoints_2d = output['pred_keypoints_2d']
+        pred_keypoints_3d = output['pred_keypoints_3d']
+
+        batch_size = pred_params['pose'].shape[0]
+
+        # Get annotations
+        gt_keypoints_2d = batch['keypoints_2d'][dataset_source]
+        gt_keypoints_3d = batch['keypoints_3d'][dataset_source]
+        gt_params = {k: v[dataset_source] for k, v in batch['smal_params'].items()}
+        has_params = {k: v[dataset_source] for k, v in batch['has_smal_params'].items()}
+        is_axis_angle = {k: v[dataset_source] for k, v in batch['smal_params_is_axis_angle'].items()}
+
+        # Compute 2D/3D keypoint losses
+        loss_keypoints_2d = self.keypoint_2d_loss(pred_keypoints_2d, gt_keypoints_2d)
+        loss_keypoints_3d = self.keypoint_3d_loss(pred_keypoints_3d, gt_keypoints_3d, pelvis_id=0)
+
+        # Compute loss on SMAL parameters
+        loss_smal_params = {}
+        for k, pred in pred_params.items():
+            gt = gt_params[k].view(batch_size, -1)
+            if is_axis_angle[k].all():
+                gt = aa_to_rotmat(gt.reshape(-1, 3)).view(batch_size, -1, 3, 3)
+            has_gt = has_params[k]
+            if k == "betas":
+                loss_smal_params[k] = self.parameter_loss(pred.reshape(batch_size, -1),
+                                                          gt.reshape(batch_size, -1),
+                                                          has_gt) + \
+                                      self.shape_prior_loss(pred, batch["category"][dataset_source], has_gt)
+            elif k == "bone":
+                continue
+            else:
+                loss_smal_params[k] = self.parameter_loss(pred.reshape(batch_size, -1),
+                                                          gt.reshape(batch_size, -1),
+                                                          has_gt) + \
+                                      self.pose_prior_loss(torch.cat((pred_params["global_orient"],
+                                                                      pred_params["pose"]),
+                                                                     dim=1), has_gt) / 2.
+
+        loss_config = self.cfg.LOSS_WEIGHTS.SMAL
+        loss = loss_config['KEYPOINTS_3D'] * loss_keypoints_3d + \
+               loss_config['KEYPOINTS_2D'] * loss_keypoints_2d + \
+               sum([loss_smal_params[k] * loss_config[k.upper()] for k in loss_smal_params])
+
+        losses = dict(loss_smal=loss.detach(),
+                      loss_smal_keypoints_2d=loss_keypoints_2d.detach(),
+                      loss_smal_keypoints_3d=loss_keypoints_3d.detach(),
+                      )
+        for k, v in loss_smal_params.items():
+            losses['loss_smal_' + k] = v.detach()
+
+        return loss, losses
+
+    def compute_loss(self, batch: Dict, output: Dict, train: bool = True) -> torch.Tensor:
+        """
+        Compute losses given the input batch and the regression output
+        Args:
+            batch (Dict): Dictionary containing batch data
+            output (Dict): Dictionary containing the regression output
+            train (bool): Flag indicating whether it is training or validation mode
+        Returns:
+            torch.Tensor : Total loss for current batch
+        """
+        x = batch['img']
+        device, dtype = x.device, x.dtype
+        if 'aves_output' in output:
+            loss_aves, losses_aves = self.compute_aves_loss(batch, output['aves_output'])
+        else:
+            loss_aves, losses_aves = torch.tensor(0.0, device=device, dtype=dtype), {}
+        if 'smal_output' in output:
+            loss_smal, losses_smal = self.compute_smal_loss(batch, output['smal_output'])
+        else:
+            loss_smal, losses_smal = torch.tensor(0.0, device=device, dtype=dtype), {}
+        loss_supcon = self.supcon_loss(output['cls_feats'], labels=batch['category']) if self.cfg.MODEL.BACKBONE.get("USE_CLS", False) \
+            else torch.tensor(0.0, device=device, dtype=dtype)
+        loss = loss_aves + loss_smal + loss_supcon * self.cfg.LOSS_WEIGHTS['SUPCON']
+
+        # Saving loss
+        losses = {}
+        losses['loss'] = loss.detach()
+        losses['loss_supcon'] = loss_supcon.detach()
+        for k, v in losses_aves.items():
+            losses[k] = v.detach()
+        for k, v in losses_smal.items():
+            losses[k] = v.detach()
+        output['losses'] = losses
+        return loss
+
+    # Tensorboard logging should run from first rank only
+    @pl.utilities.rank_zero.rank_zero_only
+    def tensorboard_logging(self, batch: Dict, output: Dict, step_count: int, train: bool = True,
+                            write_to_summary_writer: bool = True) -> None:
+        """
+        Log results to Tensorboard
+        Args:
+            batch (Dict): Dictionary containing batch data
+            output (Dict): Dictionary containing the regression output
+            step_count (int): Global training step count
+            train (bool): Flag indicating whether it is training or validation mode
+        """
+
+        mode = 'train' if train else 'val'
+        batch_size = batch['keypoints_2d'].shape[0]
+        images = batch['img']
+        masks = batch['mask']
+        # de-normalize: mul std then add mean
+        images = (images) * (torch.tensor([0.229, 0.224, 0.225], device=images.device).reshape(1, 3, 1, 1))
+        images = (images + torch.tensor([0.485, 0.456, 0.406], device=images.device).reshape(1, 3, 1, 1))
+        masks = masks.unsqueeze(1).repeat(1, 3, 1, 1)
+
+        gt_keypoints_2d = batch['keypoints_2d']
+        losses = output['losses']
+        if write_to_summary_writer:
+            summary_writer = self.logger.experiment
+            for loss_name, val in losses.items():
+                summary_writer.add_scalar(mode + '/' + loss_name, val.detach().item(), step_count)
+            if train is False:
+                for metric_name, val in output['metric'].items():
+                    summary_writer.add_scalar(mode + '/' + metric_name, val, step_count)
+
+        rend_imgs = []
+        num_images = min(batch_size, self.cfg.EXTRA.NUM_LOG_IMAGES)
+        dataset_source = (batch["supercategory"] < 5)[:num_images]  # bird for index 0
+
+        num_aves = (num_images - dataset_source[:num_images].sum()).item()
+        if num_aves:
+            rend_imgs_aves = self.aves_mesh_renderer.visualize_tensorboard(output['aves_output']['pred_vertices'][:num_aves].detach().cpu().numpy(),
+                                                                           output['aves_output']['pred_cam_t'][:num_aves].detach().cpu().numpy(),
+                                                                           images[:num_images][~dataset_source].cpu().numpy(),
+                                                                           self.cfg.AVES.get("FOCAL_LENGTH", 2167),
+                                                                           output['aves_output']['pred_keypoints_2d'][:num_aves].detach().cpu().numpy(),
+                                                                           gt_keypoints_2d[:num_images][~dataset_source][:, :18].cpu().numpy(),
+                                                                           )
+            rend_imgs.extend(rend_imgs_aves)
+
+        num_smal = dataset_source[:num_images].sum().item()
+        if num_smal:
+            rend_imgs_smal = self.smal_mesh_renderer.visualize_tensorboard(output['smal_output']['pred_vertices'][:num_smal].detach().cpu().numpy(),
+                                                                           output['smal_output']['pred_cam_t'][:num_smal].detach().cpu().numpy(),
+                                                                           images[:num_images][dataset_source].cpu().numpy(),
+                                                                           self.cfg.SMAL.get("FOCAL_LENGTH", 1000),
+                                                                           output['smal_output']['pred_keypoints_2d'][:num_smal].detach().cpu().numpy(),
+                                                                           gt_keypoints_2d[:num_images][dataset_source].cpu().numpy(),
+                                                                           )
+            rend_imgs.extend(rend_imgs_smal)
+
+        rend_imgs = make_grid(rend_imgs, nrow=5, padding=2)
+        if write_to_summary_writer:
+            summary_writer.add_image('%s/predictions' % mode, rend_imgs, step_count)
+
+        return rend_imgs
+
+    def forward(self, batch: Dict) -> Dict:
+        """
+        Run a forward step of the network in val mode
+        Args:
+            batch (Dict): Dictionary containing batch data
+        Returns:
+            Dict: Dictionary containing the regression output
+        """
+        return self.forward_step(batch, train=False)
+
+    def training_step(self, batch: Dict) -> Dict:
+        """
+        Run a full training step
+        Args:
+            batch (Dict): Dictionary containing {'img', 'mask', 'keypoints_2d', 'keypoints_3d', 'orig_keypoints_2d',
+                          'aves_params', 'aves_params_is_axis_angle', 'focal_length'}
+        Returns:
+            Dict: Dictionary containing regression output.
+        """
+        batch = batch['img']
+        optimizer = self.optimizers(use_pl_optimizer=True)
+
+        batch_size = batch['img'].shape[0]
+        output = self.forward_step(batch, train=True)
+        if self.cfg.get('UPDATE_GT_SPIN', False):
+            self.update_batch_gt_spin(batch, output)
+        loss = self.compute_loss(batch, output, train=True)
+
+        # Error if NaN
+        if torch.isnan(loss):
+            raise ValueError('Loss is NaN')
+
+        optimizer.zero_grad()
+        self.manual_backward(loss)
+        # Clip gradient
+        if self.cfg.TRAIN.get('GRAD_CLIP_VAL', 0) > 0:
+            gn = torch.nn.utils.clip_grad_norm_(self.get_parameters(), self.cfg.TRAIN.GRAD_CLIP_VAL,
+                                                error_if_nonfinite=True)
+            self.log('train/grad_norm', gn, on_step=True, on_epoch=True, prog_bar=True, logger=True, sync_dist=True)
+
+        optimizer.step()
+        if self.global_step > 0 and self.global_step % self.cfg.GENERAL.LOG_STEPS == 0:
+            self.tensorboard_logging(batch, output, self.global_step, train=True)
+
+        self.log('train/loss', output['losses']['loss'], on_step=True, on_epoch=True, prog_bar=True, logger=False,
+                 batch_size=batch_size, sync_dist=True)
+
+        return output
+
+    def validation_step(self, batch: Dict, batch_idx: int, dataloader_idx=0) -> Dict:
+        pass
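One detail worth calling out in forward_one_parametric_model: the head predicts a weak-perspective camera (s, tx, ty), and the full translation is recovered as t_z = 2 * f / (IMAGE_SIZE * s). Below is a self-contained restatement of that conversion with made-up focal length and camera values, so the scaling convention can be checked in isolation.

import torch

IMAGE_SIZE = 256
focal_length = torch.tensor([[1000.0, 1000.0]])   # (fx, fy), illustrative value
pred_cam = torch.tensor([[0.8, 0.05, -0.02]])     # (scale, tx, ty), illustrative value

# Same formula as in forward_one_parametric_model: depth from the weak-perspective scale.
pred_cam_t = torch.stack([pred_cam[:, 1],
                          pred_cam[:, 2],
                          2 * focal_length[:, 0] / (IMAGE_SIZE * pred_cam[:, 0] + 1e-9)], dim=-1)
print(pred_cam_t)  # tensor([[ 0.0500, -0.0200,  9.7656]])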
amr/models/aves_warapper.py
ADDED
@@ -0,0 +1,136 @@
+import torch
+import torch.nn.functional as F
+from dataclasses import dataclass
+from smplx.utils import ModelOutput
+from typing import Optional, NewType
+
+
+Tensor = NewType('Tensor', torch.Tensor)
+
+
+@dataclass
+class AVESOutput(ModelOutput):
+    betas: Optional[Tensor] = None
+    pose: Optional[Tensor] = None
+    bone: Optional[Tensor] = None
+
+
+class LBS(torch.nn.Module):
+    '''
+    Implementation of linear blend skinning, with additional bone and scale
+    Input:
+        V (BN, V, 3): vertices to pose and shape
+        pose (BN, J, 3, 3) or (BN, J, 3): pose in rot or axis-angle
+        bone (BN, K): allow for direct change of relative joint distances
+        scale (1): scale the whole kinematic tree
+    '''
+    def __init__(self, J, parents, weights):
+        super(LBS, self).__init__()
+        self.n_joints = J.shape[1]
+        self.register_buffer('h_joints', F.pad(J.unsqueeze(-1), [0, 0, 0, 1], value=0))
+        self.register_buffer('kin_tree', torch.cat([J[:, [0], :], J[:, 1:] - J[:, parents[1:]]], dim=1).unsqueeze(-1))
+
+        self.register_buffer('parents', parents)
+        self.register_buffer('weights', weights[None].float())
+
+    def __call__(self, V, pose, bone, scale, to_rotmats=False):
+        batch_size = len(V)
+        device = pose.device
+        V = F.pad(V.unsqueeze(-1), [0, 0, 0, 1], value=1)
+        kin_tree = (scale * self.kin_tree) * bone[:, :, None, None]
+        pose = pose.view([batch_size, -1, 3, 3])
+        T = torch.zeros([batch_size, self.n_joints, 4, 4]).float().to(device)
+        T[:, :, -1, -1] = 1
+        T[:, :, :3, :] = torch.cat([pose, kin_tree], dim=-1)
+        T_rel = [T[:, 0]]
+        for i in range(1, self.n_joints):
+            T_rel.append(T_rel[self.parents[i]] @ T[:, i])
+        T_rel = torch.stack(T_rel, dim=1)
+        T_rel[:, :, :, [-1]] -= T_rel.clone() @ (self.h_joints * scale)
+        T_ = self.weights @ T_rel.view(batch_size, self.n_joints, -1)
+        T_ = T_.view(batch_size, -1, 4, 4)
+        V = T_ @ V
+        return V[:, :, :3, 0]
+
+
+class AVES(torch.nn.Module):
+    def __init__(self, **kwargs):
+        super(AVES, self).__init__()
+        # kinematic tree, and map to keypoints from vertices
+        self.register_buffer('kintree_table', kwargs['kintree_table'])
+        self.register_buffer('parents', kwargs['kintree_table'][0])
+        self.register_buffer('weights', kwargs['weights'])
+        self.register_buffer('vert2kpt', kwargs['vert2kpt'])
+        self.register_buffer('face', kwargs['F'])
+
+        # mean shape and default joints
+        rot = torch.tensor([[0, 1, 0], [-1, 0, 0], [0, 0, 1]], dtype=torch.float32)
+        rot = torch.tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]], dtype=torch.float32) @ rot
+        # rot = torch.eye(3, dtype=torch.float32)
+        V = (rot @ kwargs['V'].T).T.unsqueeze(0)
+        J = (rot @ kwargs['J'].T).T.unsqueeze(0)
+        self.register_buffer('V', V)
+        self.register_buffer('J', J)
+        self.LBS = LBS(self.J, self.parents, self.weights)
+
+        # pose and bone prior
+        self.register_buffer('p_m', kwargs['pose_mean'])
+        self.register_buffer('b_m', kwargs['bone_mean'])
+        self.register_buffer('p_cov', kwargs['pose_cov'])
+        self.register_buffer('b_cov', kwargs['bone_cov'])
+
+        # standardized blend shape basis
+        B = kwargs['Beta']
+        sigma = kwargs['Beta_sigma']
+        B = B * sigma[:, None, None]
+        self.register_buffer('B', B)
+
+        # PCA coefficient that is optimized to match the original template shape
+        ### so in the __call__ function, if beta is set to self.beta_original,
+        ### it will return the template shape from ECCV2020 (marcbadger/avian-mesh).
+        self.register_buffer('beta_original', kwargs['beta_original'])
+
+    def __call__(self, global_orient, pose, bone, transl=None,
+                 scale=1, betas=None, pose2rot=False, **kwargs):
+        '''
+        Input:
+            global_pose [bn, 3] tensor for batched global_pose on root joint
+            body_pose [bn, 72] tensor for batched body pose
+            bone_length [bn, 24] tensor for bone length; the bone variable
+                captures non-rigid joint articulation in this model
+
+            beta [bn, 15] shape PCA coefficients
+                If beta is None, it will return the mean shape
+                If beta is self.beta_original, it will return the original template shape
+
+        '''
+        device = global_orient.device
+        batch_size = global_orient.shape[0]
+        V = self.V.repeat([batch_size, 1, 1]) * scale
+        J = self.J.repeat([batch_size, 1, 1]) * scale
+
+        # multi-bird shape space
+        if betas is not None:
+            V = V + torch.einsum('bk, kmn->bmn', betas, self.B)
+
+        # concatenate bone and pose
+        bone = torch.cat([torch.ones([batch_size, 1]).to(device), bone], dim=1)
+        pose = torch.cat([global_orient, pose], dim=1)
+
+        # LBS
+        verts = self.LBS(V, pose, bone, scale, to_rotmats=pose2rot)
+        if transl is not None:
+            verts = verts + transl[:, None, :]
+
+        # Calculate 3D keypoints from the posed vertices
+        keypoints = torch.einsum('bni,kn->bki', verts, self.vert2kpt)
+
+        output = AVESOutput(
+            vertices=verts,
+            joints=keypoints,
+            betas=betas,
+            global_orient=global_orient,
+            pose=pose,
+            bone=bone,
+            transl=transl,
+            full_pose=None,
+        )
+        return output
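Since AVES is a plain nn.Module, it can be probed with rest-pose parameters once its buffers have been loaded from the pretrained dictionary. A hedged sketch follows, assuming aves_cfg is the dict that AniMerPlusPlus.__init__ loads via torch.load(cfg.AVES.MODEL_PATH), and that the dimensions follow the docstring above (24 body joints plus the root, 24 bone lengths, 15 shape coefficients); the exact joint count is an assumption, not something this file states explicitly.

import torch
from amr.models.aves_warapper import AVES

aves = AVES(**aves_cfg)                                  # aves_cfg: pretrained parameter dict (assumed available)

bn = 2
global_orient = torch.eye(3).repeat(bn, 1, 1, 1)         # (bn, 1, 3, 3) identity root rotation
pose = torch.eye(3).repeat(bn, 24, 1, 1)                 # (bn, 24, 3, 3) rest pose as rotation matrices
bone = torch.ones(bn, 24)                                # unit relative bone lengths
betas = torch.zeros(bn, 15)                              # mean shape

out = aves(global_orient=global_orient, pose=pose, bone=bone, betas=betas, pose2rot=False)
print(out.vertices.shape, out.joints.shape)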
amr/models/backbones/__init__.py
ADDED
@@ -0,0 +1,9 @@
+from .vit_moe import vithmoe
+from torch import nn
+import torchvision
+
+def create_backbone(cfg):
+    if cfg.MODEL.BACKBONE.TYPE == 'vithmoe':
+        return vithmoe(cfg)
+    else:
+        raise NotImplementedError('Backbone type is not implemented')
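create_backbone is a thin factory keyed on cfg.MODEL.BACKBONE.TYPE; only the vithmoe branch is wired up in this commit. Below is a sketch of how an extra backbone type could be added without touching any caller; the resnet50 branch is purely illustrative and not part of the repository.

from torch import nn
import torchvision

def create_backbone(cfg):
    if cfg.MODEL.BACKBONE.TYPE == 'vithmoe':
        from .vit_moe import vithmoe
        return vithmoe(cfg)
    elif cfg.MODEL.BACKBONE.TYPE == 'resnet50':              # hypothetical extra branch
        resnet = torchvision.models.resnet50(weights=None)
        return nn.Sequential(*list(resnet.children())[:-2])  # keep the conv feature map, drop avgpool + fc
    else:
        raise NotImplementedError('Backbone type is not implemented')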
amr/models/backbones/__pycache__/__init__.cpython-310.pyc ADDED (binary file, 489 Bytes)
amr/models/backbones/__pycache__/__init__.cpython-312.pyc ADDED (binary file, 1.71 kB)
amr/models/backbones/__pycache__/__init__.cpython-39.pyc ADDED (binary file, 401 Bytes)
amr/models/backbones/__pycache__/rope_deit.cpython-310.pyc ADDED (binary file, 2.47 kB)
amr/models/backbones/__pycache__/vit.cpython-310.pyc ADDED (binary file, 11.9 kB)