"""Recompute per-frame ViT image features for preprocessed sequence files.

For every ``*.pt`` file under ``<dataset_root>/genmo_features`` the script
loads the stored dict, crops each frame listed in ``data["imgname"]`` using the
matching row of ``data["bbx_xys"]``, runs the GVHMR extractor's inner model on
the crops and writes the result back into the same file under ``"f_imgseq"``.
"""
import sys
import os
import torch
import numpy as np
import cv2
import argparse
from pathlib import Path
from tqdm import tqdm
import gc
import concurrent.futures

# Ensure repo root is on sys.path
REPO_ROOT = Path(__file__).resolve().parents[2]
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))

from genmo.utils.pylogger import Log

# Standard ImageNet Normalization
IMAGENET_MEAN = np.array([0.485, 0.456, 0.406], dtype=np.float32)
IMAGENET_STD = np.array([0.229, 0.224, 0.225], dtype=np.float32)


def _require_extractor():
    """Import and return the GVHMR ``Extractor`` class.

    The GVHMR checkout under ``third_party/GVHMR`` is put on ``sys.path``
    first (when it exists) so that its own absolute imports can resolve.

    Raises:
        RuntimeError: if the import fails for any reason; the original
            exception is chained as the cause.
    """
    gvhmr_root = REPO_ROOT / "third_party" / "GVHMR"
    if gvhmr_root.exists() and str(gvhmr_root) not in sys.path:
        sys.path.insert(0, str(gvhmr_root))
    try:
        from third_party.GVHMR.hmr4d.utils.preproc.vitfeat_extractor import Extractor
    except Exception as e:
        raise RuntimeError("Could not import Extractor from GVHMR.") from e
    return Extractor


# --- FAST IMAGE LOADER ---
def process_single_image(args):
    """Load one image, crop a square around a bounding box and normalize it.

    Args:
        args: tuple ``(path, cx, cy, scale, img_size)``. ``path`` is the image
            file path as a string, ``(cx, cy)`` is the crop center in pixels,
            ``scale`` is the side length of the square crop in pixels and
            ``img_size`` is the output resolution.

    Returns:
        ``float32`` array of shape ``(3, img_size, img_size)``: channels
        reversed from OpenCV's order, scaled to ``[0, 1]`` and normalized with
        the ImageNet mean/std. If the image cannot be read, an all-zero array
        of that shape is returned instead (best-effort; no error is raised).

    Raises:
        RuntimeError: if the box values are non-numeric, non-finite, have an
            implausible scale, would need excessive padding, or yield an empty
            crop.
    """
    path, cx, cy, scale, img_size = args
    img = cv2.imread(path)
    if img is None:
        # Unreadable frame: keep the sequence length intact with a placeholder.
        return np.zeros((3, img_size, img_size), dtype=np.float32)

    H, W = img.shape[:2]
    max_side = float(max(H, W, 1))

    try:
        cx = float(cx)
        cy = float(cy)
        scale = float(scale)
    except Exception as e:
        raise RuntimeError(f"Bad bbx_xys types for {path}: cx={cx} cy={cy} scale={scale}") from e

    if not (np.isfinite(cx) and np.isfinite(cy) and np.isfinite(scale)):
        raise RuntimeError(f"Bad bbx_xys (non-finite) for {path}: cx={cx} cy={cy} scale={scale}")
    if scale <= 1.0 or scale > max_side * 20.0:
        raise RuntimeError(f"Bad bbx_xys (scale) for {path}: (H,W)=({H},{W}) cx={cx} cy={cy} scale={scale}")

    half = scale / 2.0
    x0, y0 = int(cx - half), int(cy - half)
    x1, y1 = int(cx + half), int(cy + half)

    # Amount by which the crop window sticks out of the image on each side.
    pad_l, pad_t = max(0, -x0), max(0, -y0)
    pad_r, pad_b = max(0, x1 - W), max(0, y1 - H)

    # Fail loudly instead of letting OpenCV try to allocate absurdly large padded images.
    if max(pad_l, pad_t, pad_r, pad_b) > int(max_side * 4.0):
        raise RuntimeError(
            f"Insane crop for {path}: (H,W)=({H},{W}) cx={cx:.2f} cy={cy:.2f} scale={scale:.2f} "
            f"pads(l,t,r,b)=({pad_l},{pad_t},{pad_r},{pad_b})"
        )

    if pad_l or pad_t or pad_r or pad_b:
        img = cv2.copyMakeBorder(img, pad_t, pad_b, pad_l, pad_r, cv2.BORDER_CONSTANT, value=(0, 0, 0))
        # Shift the window into the coordinate frame of the padded image.
        x0 += pad_l
        y0 += pad_t
        x1 += pad_l
        y1 += pad_t

    crop = img[y0:y1, x0:x1]
    if crop.size == 0:
        raise RuntimeError(
            f"Empty crop for {path}: (H,W)=({H},{W}) cx={cx:.2f} cy={cy:.2f} scale={scale:.2f} "
            f"xyxy=({x0},{y0},{x1},{y1})"
        )

    if crop.shape[0] != img_size or crop.shape[1] != img_size:
        crop = cv2.resize(crop, (img_size, img_size), interpolation=cv2.INTER_LINEAR)

    # Reverse the channel axis (cv2.imread yields BGR) and scale to [0, 1].
    crop = crop[:, :, ::-1].astype(np.float32) / 255.0
    crop = (crop - IMAGENET_MEAN) / IMAGENET_STD
    # HWC -> CHW
    return crop.transpose(2, 0, 1)


def load_images_parallel(image_paths, bbx_xys, img_size=256, workers=12):
    """Crop and normalize a sequence of frames using a thread pool.

    Args:
        image_paths: iterable of image paths, one per frame.
        bbx_xys: per-frame boxes; row ``i`` supplies ``(cx, cy, scale)`` for
            ``image_paths[i]`` in its first three entries. May be a
            ``torch.Tensor`` (moved to CPU/NumPy first) or any sequence.
        img_size: output side length passed to ``process_single_image``.
        workers: maximum number of worker threads.

    Returns:
        ``torch.Tensor`` stacking the per-frame results along a new first
        axis, in input order.

    Raises:
        ValueError: if there are no frames, or if the number of paths and
            boxes differ (previously the longer input was silently truncated).
        RuntimeError: propagated from ``process_single_image``.
    """
    if isinstance(bbx_xys, torch.Tensor):
        bbx_xys = bbx_xys.cpu().numpy()

    paths = list(image_paths)
    boxes = list(bbx_xys)
    if len(paths) != len(boxes):
        raise ValueError(
            f"image_paths and bbx_xys length mismatch: {len(paths)} paths vs {len(boxes)} boxes"
        )
    if not paths:
        raise ValueError("No images to load: image_paths is empty")

    tasks = [(str(p), b[0], b[1], b[2], img_size) for p, b in zip(paths, boxes)]

    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
        # executor.map preserves input order, so results stay frame-aligned.
        results = list(executor.map(process_single_image, tasks))

    return torch.from_numpy(np.stack(results))


# --- OPTIMIZED INFERENCE LOOP ---
def fast_inference(model, tensor, batch_size=64):
    """Run ``model`` over ``tensor`` in batches on the GPU.

    Replaces the slow extractor loop.

    Args:
        model: torch module called as ``model({"img": batch})``. It is put in
            eval mode. NOTE(review): assumed to already live on the CUDA
            device; this function does not move it.
        tensor: CPU tensor whose first axis indexes frames.
        batch_size: number of frames sent to the GPU per forward pass.

    Returns:
        CPU tensor formed by concatenating the per-batch outputs along dim 0.
        The forward pass runs under CUDA autocast, so the output dtype is
        presumably reduced precision; callers convert as needed.
    """
    model.eval()
    num_frames = tensor.shape[0]
    features = []

    # Make the source contiguous so each batch slice is a plain memory range.
    # NOTE: this does not pin the memory; with a pageable source the
    # non_blocking flag below does not make the copy asynchronous.
    tensor = tensor.contiguous()

    with torch.inference_mode():
        for start in range(0, num_frames, batch_size):
            batch = tensor[start : start + batch_size].cuda(non_blocking=True)

            # AMP (Automatic Mixed Precision)
            with torch.amp.autocast("cuda"):
                # HMR2 expects dictionary input
                feat = model({"img": batch})

            features.append(feat.detach().cpu())

    return torch.cat(features, dim=0)


def _atomic_torch_save(obj, path):
    """Save ``obj`` to ``path`` via a temp file so a crash cannot truncate ``path``."""
    # The ".tmp" suffix keeps partial files out of the "*.pt" glob in main().
    tmp_path = path.with_name(path.name + ".tmp")
    try:
        torch.save(obj, tmp_path)
        os.replace(tmp_path, path)
    finally:
        # Only still present if saving or replacing failed.
        if tmp_path.exists():
            tmp_path.unlink()


def main():
    """Command-line entry point: fill in ``f_imgseq`` for every feature file.

    Sequences that already hold a non-empty 2-D ``f_imgseq`` tensor are
    skipped unless ``--overwrite`` is given. Sequences whose first image does
    not exist on disk are skipped silently. Any other per-sequence failure is
    logged and processing continues with the next file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset_root", required=True)
    parser.add_argument("--batch_size", type=int, default=256, help="Increase this if VRAM allows")
    parser.add_argument("--workers", type=int, default=4)
    parser.add_argument("--overwrite", action="store_true")
    args = parser.parse_args()

    dataset_root = Path(args.dataset_root)
    feat_dir = dataset_root / "genmo_features"
    images_root = dataset_root

    if not feat_dir.exists():
        Log.error("Feature dir not found")
        return

    Log.info("Initializing ViT Model...")
    ExtractorClass = _require_extractor()
    extractor_wrapper = ExtractorClass(tqdm_leave=False)
    # Get the inner torch module (HMR2)
    model = extractor_wrapper.extractor

    pt_files = sorted(feat_dir.glob("*.pt"))
    Log.info(f"Processing {len(pt_files)} sequences. Batch Size: {args.batch_size}")

    for pt_file in tqdm(pt_files, desc="Dataset Progress"):
        try:
            # SECURITY NOTE: weights_only=False unpickles arbitrary objects;
            # only run this on feature files from a trusted source.
            data = torch.load(pt_file, map_location="cpu", weights_only=False)

            if not args.overwrite and "f_imgseq" in data:
                f = data["f_imgseq"]
                if isinstance(f, torch.Tensor) and f.ndim == 2 and f.shape[1] > 0:
                    continue

            # Load Images
            img_rel_paths = data["imgname"]
            bbx_xys = data["bbx_xys"]
            abs_img_paths = [images_root / p for p in img_rel_paths]

            if not abs_img_paths:
                Log.error(f"Error {pt_file.stem}: empty imgname list")
                continue

            # Cheap probe: if the first frame is missing, assume the images
            # for this sequence are not available and skip it.
            if not abs_img_paths[0].exists():
                continue

            # 1. Load & Process (CPU Parallel)
            input_tensor = load_images_parallel(abs_img_paths, bbx_xys, workers=args.workers)

            # 2. Fast Inference (GPU FP16)
            vit_features = fast_inference(model, input_tensor, batch_size=args.batch_size)

            # 3. Save
            data["f_imgseq"] = vit_features.float()  # Save as float32 for compatibility
            _atomic_torch_save(data, pt_file)

        except Exception as e:
            Log.error(f"Error {pt_file.stem}: {e}")
            continue


if __name__ == "__main__":
    # Optimize CUDA allocator
    # NOTE(review): set here, after `import torch` but before any CUDA work in
    # main(); confirm the allocator reads this lazily on the torch version in use.
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
    main()