|
|
import sys |
|
|
import os |
|
|
import torch |
|
|
import numpy as np |
|
|
import cv2 |
|
|
import argparse |
|
|
from pathlib import Path |
|
|
from tqdm import tqdm |
|
|
import gc |
|
|
import concurrent.futures |
|
|
|
|
|
|
|
|
# Resolve the repository root (two directory levels above this file) and put
# it on sys.path so that `genmo.*` / `third_party.*` absolute imports resolve
# when this script is executed directly rather than as an installed package.
REPO_ROOT = Path(__file__).resolve().parents[2]

if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))
|
|
|
|
|
from genmo.utils.pylogger import Log |
|
|
|
|
|
|
|
|
# Per-channel RGB normalization statistics used by ImageNet-pretrained
# backbones (torchvision convention); applied in process_single_image.
IMAGENET_MEAN = np.array([0.485, 0.456, 0.406], dtype=np.float32)
IMAGENET_STD = np.array([0.229, 0.224, 0.225], dtype=np.float32)
|
|
|
|
|
def _require_extractor():
    """Import and return GVHMR's ViT feature ``Extractor`` class.

    Prepends ``third_party/GVHMR`` to ``sys.path`` first (when the checkout
    exists) so the package's internal imports resolve.

    Returns:
        The ``Extractor`` class (not an instance).

    Raises:
        RuntimeError: if the import fails for any reason.
    """
    gvhmr_root = REPO_ROOT / "third_party" / "GVHMR"
    if gvhmr_root.exists():
        root_str = str(gvhmr_root)
        if root_str not in sys.path:
            sys.path.insert(0, root_str)
    try:
        from third_party.GVHMR.hmr4d.utils.preproc.vitfeat_extractor import Extractor
    except Exception as e:
        raise RuntimeError("Could not import Extractor from GVHMR.") from e
    return Extractor
|
|
|
|
|
|
|
|
def process_single_image(args):
    """Crop one frame around its square bbox and return a normalized CHW array.

    Args:
        args: tuple ``(path, cx, cy, scale, img_size)`` — image path, bbox
            center ``(cx, cy)``, bbox side length ``scale`` (pixels), and the
            square output resolution ``img_size``.

    Returns:
        ``np.ndarray`` of shape ``(3, img_size, img_size)``, float32,
        ImageNet-normalized RGB. All-zeros if the image cannot be read.

    Raises:
        RuntimeError: on malformed bbox values or degenerate crops.
    """
    path, cx, cy, scale, img_size = args

    img = cv2.imread(path)
    if img is None:
        # Unreadable frame: keep the sequence length intact with a blank crop.
        return np.zeros((3, img_size, img_size), dtype=np.float32)

    H, W = img.shape[:2]
    max_side = float(max(H, W, 1))

    # Validate bbox parameters before doing any geometry.
    try:
        cx, cy, scale = float(cx), float(cy), float(scale)
    except Exception as e:
        raise RuntimeError(f"Bad bbx_xys types for {path}: cx={cx} cy={cy} scale={scale}") from e
    if not all(map(np.isfinite, (cx, cy, scale))):
        raise RuntimeError(f"Bad bbx_xys (non-finite) for {path}: cx={cx} cy={cy} scale={scale}")
    if scale <= 1.0 or scale > max_side * 20.0:
        raise RuntimeError(f"Bad bbx_xys (scale) for {path}: (H,W)=({H},{W}) cx={cx} cy={cy} scale={scale}")

    # Square crop corners; they may fall outside the image bounds.
    half = scale / 2.0
    x0 = int(cx - half)
    y0 = int(cy - half)
    x1 = int(cx + half)
    y1 = int(cy + half)

    # Zero-padding needed on each side to bring the corners back in-bounds.
    pad_l = max(0, -x0)
    pad_t = max(0, -y0)
    pad_r = max(0, x1 - W)
    pad_b = max(0, y1 - H)

    if max(pad_l, pad_t, pad_r, pad_b) > int(max_side * 4.0):
        raise RuntimeError(
            f"Insane crop for {path}: (H,W)=({H},{W}) cx={cx:.2f} cy={cy:.2f} scale={scale:.2f} "
            f"pads(l,t,r,b)=({pad_l},{pad_t},{pad_r},{pad_b})"
        )

    if any((pad_l, pad_t, pad_r, pad_b)):
        img = cv2.copyMakeBorder(img, pad_t, pad_b, pad_l, pad_r, cv2.BORDER_CONSTANT, value=(0,0,0))
        # Shift crop coordinates into the padded image's frame.
        x0, x1 = x0 + pad_l, x1 + pad_l
        y0, y1 = y0 + pad_t, y1 + pad_t

    crop = img[y0:y1, x0:x1]
    if crop.size == 0:
        raise RuntimeError(
            f"Empty crop for {path}: (H,W)=({H},{W}) cx={cx:.2f} cy={cy:.2f} scale={scale:.2f} "
            f"xyxy=({x0},{y0},{x1},{y1})"
        )
    if crop.shape[:2] != (img_size, img_size):
        crop = cv2.resize(crop, (img_size, img_size), interpolation=cv2.INTER_LINEAR)

    # BGR -> RGB, scale to [0, 1], ImageNet-normalize, then HWC -> CHW.
    crop = crop[:, :, ::-1].astype(np.float32) / 255.0
    crop = (crop - IMAGENET_MEAN) / IMAGENET_STD
    return crop.transpose(2, 0, 1)
|
|
|
|
|
def load_images_parallel(image_paths, bbx_xys, img_size=256, workers=12):
    """Crop and normalize all frames in parallel, stacked into one tensor.

    Args:
        image_paths: sequence of image file paths, one per frame.
        bbx_xys: per-frame ``(cx, cy, scale)`` boxes; tensor or array-like.
        img_size: square output resolution of each crop.
        workers: thread-pool size (cv2 reads are I/O-bound).

    Returns:
        Float32 tensor of shape ``(F, 3, img_size, img_size)``.
    """
    if isinstance(bbx_xys, torch.Tensor):
        bbx_xys = bbx_xys.cpu().numpy()

    jobs = [
        (str(path), box[0], box[1], box[2], img_size)
        for path, box in zip(image_paths, bbx_xys)
    ]
    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as pool:
        crops = list(pool.map(process_single_image, jobs))
    return torch.from_numpy(np.stack(crops))
|
|
|
|
|
|
|
|
def fast_inference(model, tensor, batch_size=64):
    """Run the ViT extractor over ``tensor`` in CUDA mini-batches.

    Replaces the slow extractor loop: slices are moved to the GPU with
    non-blocking copies, run under CUDA autocast inside inference mode, and
    the resulting features are collected back on the CPU.

    Args:
        model: feature extractor, invoked as ``model({"img": batch})``.
        tensor: ``(F, 3, H, W)`` float tensor of preprocessed crops.
        batch_size: number of frames per forward pass.

    Returns:
        CPU tensor of features concatenated along dim 0 (length F).
    """
    model.eval()
    tensor = tensor.contiguous()
    total = tensor.shape[0]

    chunks = []
    with torch.inference_mode():
        for start in range(0, total, batch_size):
            batch = tensor[start : start + batch_size].cuda(non_blocking=True)
            with torch.amp.autocast("cuda"):
                out = model({"img": batch})
            chunks.append(out.detach().cpu())

    return torch.cat(chunks, dim=0)
|
|
|
|
|
def main():
    """Backfill ``f_imgseq`` ViT features for every ``.pt`` sequence file.

    For each sequence in ``<dataset_root>/genmo_features``: load its frame
    list and bounding boxes, crop/normalize the frames in parallel, run the
    GVHMR ViT extractor in large CUDA batches, and write the features back
    into the same ``.pt`` file under the ``"f_imgseq"`` key. Failures on one
    sequence are logged and do not abort the run.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset_root", required=True)
    parser.add_argument("--batch_size", type=int, default=256, help="Increase this if VRAM allows")
    parser.add_argument("--workers", type=int, default=4)
    parser.add_argument("--overwrite", action="store_true")
    args = parser.parse_args()

    dataset_root = Path(args.dataset_root)
    feat_dir = dataset_root / "genmo_features"
    images_root = dataset_root

    if not feat_dir.exists():
        Log.error("Feature dir not found")
        return

    Log.info("Initializing ViT Model...")
    ExtractorClass = _require_extractor()
    extractor_wrapper = ExtractorClass(tqdm_leave=False)

    # Drive the underlying network directly; the wrapper's own loop is slow.
    model = extractor_wrapper.extractor

    pt_files = sorted(feat_dir.glob("*.pt"))
    Log.info(f"Processing {len(pt_files)} sequences. Batch Size: {args.batch_size}")

    for pt_file in tqdm(pt_files, desc="Dataset Progress"):
        try:
            # weights_only=False: these are trusted, locally produced files.
            data = torch.load(pt_file, map_location="cpu", weights_only=False)

            # Skip sequences that already carry a plausible feature matrix.
            if not args.overwrite and "f_imgseq" in data:
                f = data["f_imgseq"]
                if isinstance(f, torch.Tensor) and f.ndim == 2 and f.shape[1] > 0:
                    continue

            img_rel_paths = data["imgname"]
            bbx_xys = data["bbx_xys"]

            # Guard: an empty frame list would otherwise raise IndexError
            # below and be swallowed by the broad except with a vague message.
            if len(img_rel_paths) == 0:
                Log.error(f"Empty imgname list in {pt_file.stem}; skipping")
                continue

            abs_img_paths = [images_root / p for p in img_rel_paths]

            # Cheap existence probe on the first frame; missing data -> skip.
            if not abs_img_paths[0].exists():
                continue

            input_tensor = load_images_parallel(abs_img_paths, bbx_xys, workers=args.workers)
            vit_features = fast_inference(model, input_tensor, batch_size=args.batch_size)

            data["f_imgseq"] = vit_features.float()
            torch.save(data, pt_file)

            # Sequences can be large; drop buffers and collect before the
            # next iteration to keep host memory bounded.
            del data, input_tensor, vit_features
            gc.collect()

        except Exception as e:
            # Keep the batch job alive: report and move to the next sequence.
            Log.error(f"Error {pt_file.stem}: {e}")
            continue
|
|
|
|
|
if __name__ == "__main__":

    # Reduce CUDA allocator fragmentation for the long extraction run.
    # NOTE(review): torch is imported at module top; this presumably still
    # takes effect because the allocator reads the env var lazily at first
    # CUDA allocation — confirm no CUDA context is created at import time.
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

    main()
|
|
|