PEAR / utils /bbox.py
BestWJH's picture
Upload 205 files
2c68f56 verified
import torch
from models.modules.ehm import EHM_v2
from models.pipeline.ehm_pipeline import Ehm_Pipeline
import os
import torch
from utils.pipeline_utils import to_tensor
from utils.graphics_utils import GS_Camera
from models.modules.renderer.body_renderer import Renderer2 as BodyRenderer
from pytorch3d.renderer import PointLights
import cv2
import os
import torch
import argparse
import lightning
import numpy as np
from models.pipeline.ehm_pipeline import Ehm_Pipeline
from utils.general_utils import (
ConfigDict, rtqdm, device_parser, add_extra_cfgs
)
import glob
import tqdm
from typing import Union, Optional, Tuple,List
def flex_resize_video(
frames : np.ndarray,
tgt_wh : Union[Tuple[int, int], None] = None,
ratio : Union[float, None] = None,
kp_mod : int = 1,
):
'''
Resize the frames to the target width and height. Set one of width and height to -1 to keep the aspect ratio.
Only one of `tgt_wh` and `ratio` can be set, if both are set, `tgt_wh` will be used.
### Args
- frames: np.ndarray, (L, H, W, 3)
- tgt_wh: Tuple[int, int], default=None
- The target width and height, set one of them to -1 to keep the aspect ratio.
- ratio: float, default=None
- The ratio to resize the frames. It will be used if `tgt_wh` is not set.
- kp_mod: int, default 1
- Keep the width and height as multiples of `kp_mod`.
- For example, if `kp_mod=16`, the width and height will be rounded to the nearest multiple of 16.
### Returns
- np.ndarray, (L, H', W', 3)
- The resized frames.
'''
assert tgt_wh is not None or ratio is not None, 'At least one of tgt_wh and ratio must be set.'
if tgt_wh is not None:
assert len(tgt_wh) == 2, 'tgt_wh must be a tuple of 2 elements.'
assert tgt_wh[0] > 0 or tgt_wh[1] > 0, 'At least one of width and height must be positive.'
if ratio is not None:
assert ratio > 0, 'ratio must be positive.'
assert len(frames.shape) == 4, 'frames must have 3 or 4 dimensions.'
def align_size(val:float):
''' It will round the value to the nearest multiple of `kp_mod`. '''
return int(round(val / kp_mod) * kp_mod)
# Calculate the target width and height.
orig_h, orig_w = frames.shape[1], frames.shape[2]
tgt_wh = (int(orig_w * ratio), int(orig_h * ratio)) if tgt_wh is None else tgt_wh # Get wh from ratio if not given. # type: ignore
tgt_w, tgt_h = tgt_wh
tgt_w = align_size(orig_w * tgt_h / orig_h) if tgt_w == -1 else align_size(tgt_w)
tgt_h = align_size(orig_h * tgt_w / orig_w) if tgt_h == -1 else align_size(tgt_h)
# Resize the frames.
resized_frames = np.stack([cv2.resize(frame, (tgt_w, tgt_h)) for frame in frames])
return resized_frames
def flex_resize_img(
img : np.ndarray,
tgt_wh : Union[Tuple[int, int], None] = None,
ratio : Union[float, None] = None,
kp_mod : int = 1,
):
'''
Resize the image to the target width and height. Set one of width and height to -1 to keep the aspect ratio.
Only one of `tgt_wh` and `ratio` can be set, if both are set, `tgt_wh` will be used.
### Args
- img: np.ndarray, (H, W, 3)
- tgt_wh: Tuple[int, int], default=None
- The target width and height, set one of them to -1 to keep the aspect ratio.
- ratio: float, default=None
- The ratio to resize the frames. It will be used if `tgt_wh` is not set.
- kp_mod: int, default 1
- Keep the width and height as multiples of `kp_mod`.
- For example, if `kp_mod=16`, the width and height will be rounded to the nearest multiple of 16.
### Returns
- np.ndarray, (H', W', 3)
- The resized iamges.
'''
assert len(img.shape) == 3, 'img must have 3 dimensions.'
return flex_resize_video(img[None], tgt_wh, ratio, kp_mod)[0]
def lurb_to_cwh(
lurb : Union[list, np.ndarray, torch.Tensor],
):
'''
Convert the left-upper-right-bottom format to the center-width-height format.
### Args
- lurb: Union[list, np.ndarray, torch.Tensor], (..., 4)
- The left-upper-right-bottom format bounding box.
### Returns
- Union[list, np.ndarray, torch.Tensor], (..., 4)
- The center-width-height format bounding box.
'''
lurb, recover_type_back = to_tensor(lurb, device=None, temporary=True)
assert lurb.shape[-1] == 4, f"Invalid shape: {lurb.shape}, should be (..., 4)"
c = (lurb[..., :2] + lurb[..., 2:]) / 2 # (..., 2)
wh = lurb[..., 2:] - lurb[..., :2] # (..., 2)
cwh = torch.cat([c, wh], dim=-1) # (..., 4)
return recover_type_back(cwh)
def cwh_to_lurb(
cwh : Union[list, np.ndarray, torch.Tensor],
):
'''
Convert the center-width-height format to the left-upper-right-bottom format.
### Args
- cwh: Union[list, np.ndarray, torch.Tensor], (..., 4)
- The center-width-height format bounding box.
### Returns
- Union[list, np.ndarray, torch.Tensor], (..., 4)
- The left-upper-right-bottom format bounding box.
'''
cwh, recover_type_back = to_tensor(cwh, device=None, temporary=True)
assert cwh.shape[-1] == 4, f"Invalid shape: {cwh.shape}, should be (..., 4)"
l = cwh[..., :2] - cwh[..., 2:] / 2 # (..., 2)
r = cwh[..., :2] + cwh[..., 2:] / 2 # (..., 2)
lurb = torch.cat([l, r], dim=-1) # (..., 4)
return recover_type_back(lurb)
def cwh_to_cs(
cwh : Union[list, np.ndarray, torch.Tensor],
reduce : Optional[str] = None,
):
'''
Convert the center-width-height format to the center-scale format.
*Only works when width and height are the same.*
### Args
- cwh: Union[list, np.ndarray, torch.Tensor], (..., 4)
- The center-width-height format bounding box.
- reduce: Optional[str], default None, valid values: None, 'max'
- Determine how to reduce the width and height to a single scale.
### Returns
- Union[list, np.ndarray, torch.Tensor], (..., 3)
- The center-scale format bounding box.
'''
cwh, recover_type_back = to_tensor(cwh, device=None, temporary=True)
assert cwh.shape[-1] == 4, f"Invalid shape: {cwh.shape}, should be (..., 4)"
if reduce is None:
if (cwh[..., 2] != cwh[..., 3]).any():
print(f"Width and height are supposed to be the same, but they're not. The larger one will be used.")
c = cwh[..., :2] # (..., 2)
s = cwh[..., 2:].max(dim=-1)[0] # (...,) 最大的那个
cs = torch.cat([c, s[..., None]], dim=-1) # (..., 3)
return recover_type_back(cs)
def cs_to_cwh(
cs : Union[list, np.ndarray, torch.Tensor],
):
'''
Convert the center-scale format to the center-width-height format.
### Args
- cs: Union[list, np.ndarray, torch.Tensor], (..., 3)
- The center-scale format bounding box.
### Returns
- Union[list, np.ndarray, torch.Tensor], (..., 4)
- The center-width-height format bounding box.
'''
cs, recover_type_back = to_tensor(cs, device=None, temporary=True)
assert cs.shape[-1] == 3, f"Invalid shape: {cs.shape}, should be (..., 3)"
c = cs[..., :2] # (..., 2)
s = cs[..., 2] # (...,)
cwh = torch.cat([c, s[..., None], s[..., None]], dim=-1) # (..., 4)
return recover_type_back(cwh)
def lurb_to_cs(
lurb : Union[list, np.ndarray, torch.Tensor],
):
'''
Convert the left-upper-right-bottom format to the center-scale format.
*Only works when width and height are the same.*
### Args
- lurb: Union[list, np.ndarray, torch.Tensor], (..., 4)
- The left-upper-right-bottom format bounding box.
### Returns
- Union[list, np.ndarray, torch.Tensor], (..., 3)
- The center-scale format bounding box.
'''
return cwh_to_cs(lurb_to_cwh(lurb), reduce='max') # 先转化为 center w h 还是一个4参数
def cs_to_lurb(
cs : Union[list, np.ndarray, torch.Tensor],
):
'''
Convert the center-scale format to the left-upper-right-bottom format.
### Args
- cs: Union[list, np.ndarray, torch.Tensor], (..., 3)
- The center-scale format bounding box.
### Returns
- Union[list, np.ndarray, torch.Tensor], (..., 4)
- The left-upper-right-bottom format bounding box.
'''
return cwh_to_lurb(cs_to_cwh(cs))
def lurb_to_luwh(
lurb : Union[list, np.ndarray, torch.Tensor],
):
'''
Convert the left-upper-right-bottom format to the left-upper-width-height format.
### Args
- lurb: Union[list, np.ndarray, torch.Tensor]
- The left-upper-right-bottom format bounding box.
### Returns
- Union[list, np.ndarray, torch.Tensor]
- The left-upper-width-height format bounding box.
'''
lurb, recover_type_back = to_tensor(lurb, device=None, temporary=True)
assert lurb.shape[-1] == 4, f"Invalid shape: {lurb.shape}, should be (..., 4)"
lu = lurb[..., :2] # (..., 2)
wh = lurb[..., 2:] - lurb[..., :2] # (..., 2)
luwh = torch.cat([lu, wh], dim=-1) # (..., 4)
return recover_type_back(luwh)
def luwh_to_lurb(
luwh : Union[list, np.ndarray, torch.Tensor],
):
'''
Convert the left-upper-width-height format to the left-upper-right-bottom format.
### Args
- luwh: Union[list, np.ndarray, torch.Tensor]
- The left-upper-width-height format bounding box.
### Returns
- Union[list, np.ndarray, torch.Tensor]
- The left-upper-right-bottom format bounding box.
'''
luwh, recover_type_back = to_tensor(luwh, device=None, temporary=True)
assert luwh.shape[-1] == 4, f"Invalid shape: {luwh.shape}, should be (..., 4)"
l = luwh[..., :2] # (..., 2)
r = luwh[..., :2] + luwh[..., 2:] # (..., 2)
lurb = torch.cat([l, r], dim=-1) # (..., 4)
return recover_type_back(lurb)
def crop_with_lurb(data, lurb, padding=0):
"""
Crop the img-like data according to the lurb bounding box.
### Args
- data: Union[np.ndarray, torch.Tensor], shape (H, W, C)
- Data like image.
- lurb: Union[list, np.ndarray, torch.Tensor], shape (4,)
- Bounding box with [left, upper, right, bottom] coordinates.
- padding: int, default 0
- Padding value for out-of-bound areas.
### Returns
- Union[np.ndarray, torch.Tensor], shape (H', W', C)
- Cropped image with padding if necessary.
"""
data, recover_type_back = to_tensor(data, device=None, temporary=True)
# Ensure lurb is in numpy array format for indexing
lurb = np.array(lurb).astype(np.int64)
l_, u_, r_, b_ = lurb
# Determine the shape of the data.
H_raw, W_raw, C_raw = data.size()
# Compute the cropped patch size.
H_patch = b_ - u_
W_patch = r_ - l_
# Create an output buffer of the crop size, initialized to padding
if isinstance(data, np.ndarray):
output = np.full((H_patch, W_patch, C_raw), padding, dtype=data.dtype)
else:
output = torch.full((H_patch, W_patch, C_raw), padding, dtype=data.dtype)
# Calculate the valid region in the original data
valid_l_ = max(0, l_)
valid_u_ = max(0, u_)
valid_r_ = min(W_raw, r_)
valid_b_ = min(H_raw, b_)
# Calculate the corresponding valid region in the output
target_l_ = valid_l_ - l_
target_u_ = valid_u_ - u_
target_r_ = target_l_ + (valid_r_ - valid_l_)
target_b_ = target_u_ + (valid_b_ - valid_u_)
# Copy the valid region into the output buffer
output[target_u_:target_b_, target_l_:target_r_, :] = data[valid_u_:valid_b_, valid_l_:valid_r_, :]
return recover_type_back(output)
def fit_bbox_to_aspect_ratio(
bbox : np.ndarray,
tgt_ratio : Optional[Tuple[int, int]] = None,
bbox_type : str = 'lurb'
):
'''
Fit a random bounding box to a target aspect ratio through enlarging the bounding box with least change.
### Args
- bbox: np.ndarray, shape is determined by `bbox_type`, e.g. for 'lurb', shape is (4,)
- The bounding box to be modified. The format is determined by `bbox_type`.
- tgt_ratio: Optional[Tuple[int, int]], default None
- The target aspect ratio to be matched.
- bbox_type: str, default 'lurb', valid values: 'lurb', 'cwh'.
### Returns
- np.ndarray, shape is determined by `bbox_type`, e.g. for 'lurb', shape is (4,)
- The modified bounding box.
'''
bbox = bbox.copy()
if bbox_type == 'lurb':
bbx_cwh = lurb_to_cwh(bbox)
bbx_wh = bbx_cwh[2:]
elif bbox_type == 'cwh':
bbx_wh = bbox[2:]
else:
raise ValueError(f"Unsupported bbox type: {bbox_type}")
new_bbx_wh = expand_wh_to_aspect_ratio(bbx_wh, tgt_ratio)
if bbox_type == 'lurb':
bbx_cwh[2:] = new_bbx_wh
new_bbox = cwh_to_lurb(bbx_cwh)
elif bbox_type == 'cwh':
new_bbox = np.concatenate([bbox[:2], new_bbx_wh])
else:
raise ValueError(f"Unsupported bbox type: {bbox_type}")
return new_bbox
def to_numpy(x, temporary:bool=False):
if isinstance(x, torch.Tensor):
if temporary:
recover_type_back = lambda x_: torch.from_numpy(x_).type_as(x).to(x.device)
return x.detach().cpu().numpy(), recover_type_back
else:
return x.detach().cpu().numpy()
if isinstance(x, np.ndarray):
if temporary:
recover_type_back = lambda x_: x_
return x.copy(), recover_type_back
else:
return x
if isinstance(x, List):
if temporary:
recover_type_back = lambda x_: x_.tolist()
return np.array(x), recover_type_back
else:
return np.array(x)
raise ValueError(f"Unsupported type: {type(x)}")
def expand_wh_to_aspect_ratio(bbx_wh:np.ndarray, tgt_aspect_ratio:Optional[Tuple[int, int]]=None):
'''
Increase the size of the bounding box to match the target shape.
Modified from https://github.com/shubham-goel/4D-Humans/blob/6ec79656a23c33237c724742ca2a0ec00b398b53/hmr2/datasets/utils.py#L14-L33
'''
if tgt_aspect_ratio is None:
return bbx_wh
try:
bbx_w , bbx_h = bbx_wh
except (ValueError, TypeError):
print(f"Invalid bbox_wh content: {bbx_wh}")
return bbx_wh
tgt_w, tgt_h = tgt_aspect_ratio
if bbx_h / bbx_w < tgt_h / tgt_w:
new_h = max(bbx_w * tgt_h / tgt_w, bbx_h)
new_w = bbx_w
else:
new_h = bbx_h
new_w = max(bbx_h * tgt_w / tgt_h, bbx_w)
assert new_h >= bbx_h and new_w >= bbx_w
return to_numpy([new_w, new_h])
def fit_bbox_to_aspect_ratio(
bbox : np.ndarray,
tgt_ratio : Optional[Tuple[int, int]] = None,
bbox_type : str = 'lurb'
):
'''
Fit a random bounding box to a target aspect ratio through enlarging the bounding box with least change.
### Args
- bbox: np.ndarray, shape is determined by `bbox_type`, e.g. for 'lurb', shape is (4,)
- The bounding box to be modified. The format is determined by `bbox_type`.
- tgt_ratio: Optional[Tuple[int, int]], default None
- The target aspect ratio to be matched.
- bbox_type: str, default 'lurb', valid values: 'lurb', 'cwh'.
### Returns
- np.ndarray, shape is determined by `bbox_type`, e.g. for 'lurb', shape is (4,)
- The modified bounding box.
'''
bbox = bbox.copy()
if bbox_type == 'lurb':
bbx_cwh = lurb_to_cwh(bbox)
bbx_wh = bbx_cwh[2:]
elif bbox_type == 'cwh':
bbx_wh = bbox[2:]
else:
raise ValueError(f"Unsupported bbox type: {bbox_type}")
new_bbx_wh = expand_wh_to_aspect_ratio(bbx_wh, tgt_ratio)
if bbox_type == 'lurb':
bbx_cwh[2:] = new_bbx_wh
new_bbox = cwh_to_lurb(bbx_cwh)
elif bbox_type == 'cwh':
new_bbox = np.concatenate([bbox[:2], new_bbx_wh])
else:
raise ValueError(f"Unsupported bbox type: {bbox_type}")
return new_bbox
def _img_det2patches(imgs, det_instances, downsample_ratio:float, max_instances:int=5):
'''
1. Filter out the trusted human detections.
2. Enlarge the bounding boxes to aspect ratio (ViT backbone only use 192*256 pixels, make sure these
pixels can capture main contents) and then to squares (to adapt the data module).
3. Crop the image with the bounding boxes and resize them to 256x256.
4. Normalize the cropped images.
'''
if det_instances is None: # no human detected
return to_numpy([]), to_numpy([])
CLASS_HUMAN_ID, DET_THRESHOLD_SCORE = 0, 0.5
# Filter out the trusted human detections.
is_human_mask = det_instances['pred_classes'] == CLASS_HUMAN_ID
reliable_mask = det_instances['scores'] > DET_THRESHOLD_SCORE
active_mask = is_human_mask & reliable_mask
# Filter out the top-k human instances.
if active_mask.sum().item() > max_instances: # 最多 5 个人体
humans_scores = det_instances['scores'] * is_human_mask.float()
_, top_idx = humans_scores.topk(max_instances)
valid_mask = torch.zeros_like(active_mask).bool()
valid_mask[top_idx] = True
else:
valid_mask = active_mask
# Process the bounding boxes and crop the images.
lurb_all = det_instances['pred_boxes'][valid_mask].numpy() / downsample_ratio # (N, 4) 左上角-右下角格式(LURB)边界框。
lurb_all = [fit_bbox_to_aspect_ratio(bbox=lurb, tgt_ratio=(192, 256)) for lurb in lurb_all] # regularize the bbox size 把 bbox 调整为特定宽高比的矩形
# 将 LURB 格式 bbox 转换成 center-scale 格式(中心坐标 + 尺度),常见于人体估计任务。
cs_all = [lurb_to_cs(lurb) for lurb in lurb_all] # convert rectangle left-up-right-bottom bbx to square center-scale bbx
#又将 center-scale 还原为标准的 [x1, y1, x2, y2] 格式。也就是正方形的格式
lurb_all = [cs_to_lurb(cs) for cs in cs_all] # convert square center-scale bbx to rectangle left-up-right-bottom bbx
cropped_imgs = [crop_with_lurb(imgs, lurb) for lurb in lurb_all] # crop_with_lurb():使用 LURB 形式的 bbox 去从原图 imgs 中裁剪出目标区域。
# flex_resize_img():将每个裁剪图像 resize 到统一的大小 (256, 256)。
patches = to_numpy([flex_resize_img(cropped_img, (256, 256)) for cropped_img in cropped_imgs]) # (N, 256, 256, 3)
return patches, cs_all
def imgs_det2patches(imgs, dets, downsample_ratios, max_instances_per_img):
''' Given the raw images and the detection results, return the image patches of human instances. '''
assert len(imgs) == len(dets), f'L_img = {len(imgs)}, L_det = {len(dets)}'
patches, n_patch_per_img, bbx_cs = [], [], []
for i in tqdm(range(len(imgs))):
patches_i, bbx_cs_i = _img_det2patches(imgs[i], dets[i], downsample_ratios[i], max_instances_per_img)
n_patch_per_img.append(len(patches_i))
if len(patches_i) > 0:
patches.append(patches_i.astype(np.float32))
bbx_cs.append(bbx_cs_i)
else:
print(f'No human detection results on image No.{i}.')
det_meta = {
'n_patch_per_img' : n_patch_per_img,
'bbx_cs' : bbx_cs,
}
return patches, det_meta # 返回人体部分的 图像,并且resize成了 256 大小
def load_img_meta(
img_path : Union[str, Path],
):
''' Read the image meta from the given path without opening image. '''
assert Path(img_path).exists(), f'Image not found: {img_path}'
H, W = imageio.v3.improps(img_path).shape[:2]
meta = {'w': W, 'h': H}
return meta
def load_img(
img_path : Union[str, Path],
mode : str = 'RGB',
):
''' Read the image from the given path. '''
assert Path(img_path).exists(), f'Image not found: {img_path}'
img = imageio.v3.imread(img_path, plugin='pillow', mode=mode)
meta = {
'w': img.shape[1],
'h': img.shape[0],
}
return img, meta
from pathlib import Path
def load_inputs(input_path, input_type, MAX_IMG_W=1920, MAX_IMG_H=1080):
# 1. Inference inputs type.
inputs_path = Path(input_path)
if input_type != 'auto': inputs_type = input_type
else: inputs_type = 'video' if Path(input_path).is_file() else 'imgs'
print(f'🚚 Loading inputs from: {inputs_path}, regarded as <{inputs_type}>.')
# 2. Load inputs.
inputs_meta = {'type': inputs_type}
if inputs_type == 'video':
inputs_meta['seq_name'] = inputs_path.stem
frames, _ = load_video(inputs_path)
if frames.shape[1] > MAX_IMG_H:
frames = flex_resize_video(frames, (MAX_IMG_H, -1), kp_mod=4)
if frames.shape[2] > MAX_IMG_W:
frames = flex_resize_video(frames, (-1, MAX_IMG_W), kp_mod=4)
raw_imgs = [frame for frame in frames]
elif inputs_type == 'imgs':
img_fns = list(inputs_path.glob('*.*'))
img_fns = [fn for fn in img_fns if fn.suffix.lower() in ['.jpg', '.jpeg', '.png', '.webp']]
inputs_meta['seq_name'] = f'{inputs_path.stem}-img_cnt={len(img_fns)}'
raw_imgs = []
for fn in img_fns:
img, _ = load_img(fn)
if img.shape[0] > MAX_IMG_H:
img = flex_resize_img(img, (MAX_IMG_H, -1), kp_mod=4)
if img.shape[1] > MAX_IMG_W:
img = flex_resize_img(img, (-1, MAX_IMG_W), kp_mod=4)
raw_imgs.append(img)
inputs_meta['img_fns'] = img_fns
else:
raise ValueError(f'Unsupported inputs type: {inputs_type}.')
print(f'📦 Totally {len(raw_imgs)} images are loaded.')
return raw_imgs, inputs_meta
import imageio
def load_video(
video_path : Union[str, Path],
):
''' Read the video from the given path. '''
if isinstance(video_path, str):
video_path = Path(video_path)
assert video_path.exists(), f'Video not found: {video_path}'
if video_path.is_dir():
print(f'Found {video_path} is a directory. It will be regarded as a image folder.')
imgs_path = sorted(glob(str(video_path / '*')))
frames = []
for img_path in tqdm(imgs_path):
frames.append(imageio.imread(img_path))
fps = 30 # default fps
else:
print(f'Found {video_path} is a file. It will be regarded as a video file.')
reader = imageio.get_reader(video_path, format='FFMPEG')
frames = []
for frame in tqdm(reader, total=reader.count_frames()):
frames.append(frame)
fps = reader.get_meta_data()['fps']
frames = np.stack(frames, axis=0) # (L, H, W, 3)
meta = {
'fps': fps,
'w' : frames.shape[2],
'h' : frames.shape[1],
'L' : frames.shape[0],
}
return frames, meta