#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (c) Xuangeng Chu (xg.chu@outlook.com)
# Modified based on code from Orest Kupyn (University of Oxford).
# @Organization : Tongyi Lab, Alibaba
# @Author : Lingteng Qiu
# @Email : 220019047@link.cuhk.edu.cn
# @Time : 2025-08-31 10:02:15
# @Function : Face detection and bbox NMS
import sys

import numpy as np
import torch
import torchvision

sys.path.append("./")


def expand_bbox(bbox, scale=1.1):
    # Grow the box about its center into a square whose side length is
    # sqrt(box area) * scale.
    xmin, ymin, xmax, ymax = bbox.unbind(dim=-1)
    cenx, ceny = (xmin + xmax) / 2, (ymin + ymax) / 2
    # ceny = ceny - (ymax - ymin) * 0.05
    extend_size = torch.sqrt((ymax - ymin) * (xmax - xmin)) * scale
    xmine, xmaxe = cenx - extend_size / 2, cenx + extend_size / 2
    ymine, ymaxe = ceny - extend_size / 2, ceny + extend_size / 2
    expanded_bbox = torch.stack([xmine, ymine, xmaxe, ymaxe], dim=-1)
    return expanded_bbox


def nms(
    boxes_xyxy,
    scores,
    flame_params,
    confidence_threshold: float = 0.5,
    iou_threshold: float = 0.5,
    top_k: int = 1000,
    keep_top_k: int = 100,
):
    # Note: only the last batch element's results are returned. With the
    # batch-of-one input built by VGGHeadDetector.forward, the loop runs once.
    for pred_bboxes_xyxy, pred_bboxes_conf, pred_flame_params in zip(
        boxes_xyxy.detach().float(),
        scores.detach().float(),
        flame_params.detach().float(),
    ):
        pred_bboxes_conf = pred_bboxes_conf.squeeze(-1)  # [Anchors]
        conf_mask = pred_bboxes_conf >= confidence_threshold

        pred_bboxes_conf = pred_bboxes_conf[conf_mask]
        pred_bboxes_xyxy = pred_bboxes_xyxy[conf_mask]
        pred_flame_params = pred_flame_params[conf_mask]

        # Keep at most top_k candidates before running NMS
        if pred_bboxes_conf.size(0) > top_k:
            topk_candidates = torch.topk(
                pred_bboxes_conf, k=top_k, largest=True, sorted=True
            )
            pred_bboxes_conf = pred_bboxes_conf[topk_candidates.indices]
            pred_bboxes_xyxy = pred_bboxes_xyxy[topk_candidates.indices]
            pred_flame_params = pred_flame_params[topk_candidates.indices]

        # NMS
        idx_to_keep = torchvision.ops.boxes.nms(
            boxes=pred_bboxes_xyxy,
            scores=pred_bboxes_conf,
            iou_threshold=iou_threshold,
        )

        final_bboxes = pred_bboxes_xyxy[idx_to_keep][:keep_top_k]  # [Instances, 4]
        final_scores = pred_bboxes_conf[idx_to_keep][:keep_top_k]  # [Instances]
        final_params = pred_flame_params[idx_to_keep][
            :keep_top_k
        ]  # [Instances, Flame Params]

    return final_bboxes, final_scores, final_params
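
# Worked example (a sketch, not used by the pipeline): expand_bbox turns a box
# into a centered square with side sqrt(box area) * scale, so the crop size
# tracks face area rather than either edge. For a 100x50 box at the origin and
# the default scale of 1.1, the side is sqrt(5000) * 1.1 ~= 77.78:
#
#     box = torch.tensor([0.0, 0.0, 100.0, 50.0])
#     expand_bbox(box)
#     # -> tensor([ 11.11, -13.89,  88.89,  63.89])  (values rounded)
#
# The expanded box can leave the image (negative ymin here), which is why
# detect_face below clamps the result before cropping.
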
class VGGHeadDetector(torch.nn.Module):
    def __init__(self, model_path, device):
        super().__init__()
        self.image_size = 640
        self._device = device
        self.model_path = model_path
        self._init_models()

    def _init_models(self):
        self.model = torch.jit.load(self.model_path, map_location="cpu")
        self.model.to(self._device).eval()

    def forward(self, image_tensor, conf_threshold=0.5):
        if not hasattr(self, "model"):
            self._init_models()
        image_tensor = image_tensor.to(self._device).float()
        image, padding, scale = self._preprocess(image_tensor)
        bbox, scores, flame_params = self.model(image)
        bbox, vgg_results = self._postprocess(
            bbox, scores, flame_params, conf_threshold
        )
        if bbox is None:
            print("VGGHeadDetector: no face detected!")
            return None, None
        vgg_results["normalize"] = {"padding": padding, "scale": scale}
        # Map the bbox from the padded 640x640 detector frame back to the
        # original image coordinates.
        bbox = bbox.clip(0, self.image_size)
        bbox[[0, 2]] -= padding[0]
        bbox[[1, 3]] -= padding[1]
        bbox /= scale
        bbox = bbox.clip(0, self.image_size / scale)
        return vgg_results, bbox

    @torch.no_grad()
    def detect_face(self, image_tensor):
        # image_tensor: [3, H, W]
        _, bbox = self.forward(image_tensor=image_tensor)
        if bbox is None:
            return None
        bbox = expand_bbox(bbox, scale=1.65).long()
        # Clamp so the expanded box never leaves the image; negative indices
        # would otherwise silently flip the crop.
        h, w = image_tensor.shape[-2:]
        bbox[[0, 2]] = bbox[[0, 2]].clamp(0, w)
        bbox[[1, 3]] = bbox[[1, 3]].clamp(0, h)
        return bbox

    def _preprocess(self, image):
        # Resize the long side to image_size, then pad to a square.
        _, h, w = image.shape
        if h > w:
            new_h, new_w = self.image_size, int(w * self.image_size / h)
        else:
            new_h, new_w = int(h * self.image_size / w), self.image_size
        scale = self.image_size / max(h, w)
        image = torchvision.transforms.functional.resize(
            image, (new_h, new_w), antialias=True
        )
        pad_w = self.image_size - image.shape[2]
        pad_h = self.image_size - image.shape[1]
        image = torchvision.transforms.functional.pad(
            image,
            (pad_w // 2, pad_h // 2, pad_w - pad_w // 2, pad_h - pad_h // 2),
            fill=127,
        )
        image = image.unsqueeze(0).float() / 255.0
        return image, np.array([pad_w // 2, pad_h // 2]), scale

    def _postprocess(self, bbox, scores, flame_params, conf_threshold):
        # FLAME parameter layout: shape [0:300], expression [300:400],
        # jaw pose [400:403], rotation (6D) [403:409], translation [409:412],
        # scale [412:413].
        bbox, scores, flame_params = nms(
            bbox, scores, flame_params, confidence_threshold=conf_threshold
        )
        if bbox.shape[0] == 0:
            return None, None
        # Keep the largest detected face.
        max_idx = (
            ((bbox[:, 3] - bbox[:, 1]) * (bbox[:, 2] - bbox[:, 0])).argmax().long()
        )
        bbox, flame_params = bbox[max_idx], flame_params[max_idx]
        # Reject boxes that cover essentially the whole 640x640 frame.
        if bbox[0] < 5 and bbox[1] < 5 and bbox[2] > 635 and bbox[3] > 635:
            return None, None
        # Pose code: zero global rotation concatenated with the jaw pose.
        posecode = torch.cat([flame_params.new_zeros(3), flame_params[400:403]])
        vgg_results = {
            "rotation_6d": flame_params[403:409],
            "translation": flame_params[409:412],
            "scale": flame_params[412:],
            "shapecode": flame_params[:300],
            "expcode": flame_params[300:400],
            "posecode": posecode,
        }
        return bbox, vgg_results


class FaceDetector:
    def __init__(self, model_path, device):
        self.model = VGGHeadDetector(model_path=model_path, device=device)

    @torch.no_grad()
    def __call__(self, image_tensor):
        return self.model.detect_face(image_tensor)

    def __repr__(self):
        return f"Model: {self.model}"


if __name__ == "__main__":
    from PIL import Image

    device = "cuda"
    model_path = "./pretrained_models/gagatracker/vgghead/vgg_heads_l.trcd"
    easy_head_detect = FaceDetector(model_path=model_path, device=device)

    rgb_path = "./man_1.png"
    # Convert to RGB so RGBA or grayscale inputs do not break the detector's
    # 3-channel assumption.
    rgb = np.array(Image.open(rgb_path).convert("RGB"))
    rgb = torch.from_numpy(rgb).permute(2, 0, 1)
    bbox = easy_head_detect(rgb)

    head_rgb = rgb[:, int(bbox[1]) : int(bbox[3]), int(bbox[0]) : int(bbox[2])]
    head_rgb = head_rgb.permute(1, 2, 0)
    head_rgb = head_rgb.cpu().numpy()
    Image.fromarray(head_rgb).save("head_rgb.png")
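    # Optional visualization (a sketch, assuming the detection above succeeded
    # and "head_bbox_vis.png" is a free output path): draw the expanded box on
    # the full frame. torchvision.utils.draw_bounding_boxes takes a uint8
    # [3, H, W] image and an [N, 4] xyxy box tensor.
    overlay = torchvision.utils.draw_bounding_boxes(
        rgb, bbox.cpu().unsqueeze(0), colors="red", width=3
    )
    Image.fromarray(overlay.permute(1, 2, 0).numpy()).save("head_bbox_vis.png")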