# Face-crop extraction script: detects 2D facial landmarks with the
# `face_alignment` package and warps each detected face into an aligned
# square crop saved as PNG.
| import argparse | |
| import math | |
| import os | |
| from enum import IntEnum | |
| from pathlib import Path | |
| import cv2 | |
| import face_alignment | |
| import numpy as np | |
| import numpy.linalg as npla | |
| from PIL import Image | |
| from diffusers.utils import load_image | |
# Canonical 2D reference landmark positions in normalized [0..1] aligned-crop
# space. Each row's trailing comment is the 68-point landmark index it
# corresponds to (17-48 plus 54) — the same indices get_transform_mat slices
# out of the detected landmarks before calling umeyama.
landmarks_2D_new = np.array(
    [
        [0.000213256, 0.106454],  # 17
        [0.0752622, 0.038915],  # 18
        [0.18113, 0.0187482],  # 19
        [0.29077, 0.0344891],  # 20
        [0.393397, 0.0773906],  # 21
        [0.586856, 0.0773906],  # 22
        [0.689483, 0.0344891],  # 23
        [0.799124, 0.0187482],  # 24
        [0.904991, 0.038915],  # 25
        [0.98004, 0.106454],  # 26
        [0.490127, 0.203352],  # 27
        [0.490127, 0.307009],  # 28
        [0.490127, 0.409805],  # 29
        [0.490127, 0.515625],  # 30
        [0.36688, 0.587326],  # 31
        [0.426036, 0.609345],  # 32
        [0.490127, 0.628106],  # 33
        [0.554217, 0.609345],  # 34
        [0.613373, 0.587326],  # 35
        [0.121737, 0.216423],  # 36
        [0.187122, 0.178758],  # 37
        [0.265825, 0.179852],  # 38
        [0.334606, 0.231733],  # 39
        [0.260918, 0.245099],  # 40
        [0.182743, 0.244077],  # 41
        [0.645647, 0.231733],  # 42
        [0.714428, 0.179852],  # 43
        [0.793132, 0.178758],  # 44
        [0.858516, 0.216423],  # 45
        [0.79751, 0.244077],  # 46
        [0.719335, 0.245099],  # 47
        [0.254149, 0.780233],  # 48
        [0.726104, 0.780233],  # 54
    ],
    dtype=np.float32,
)
class FaceType(IntEnum):
    """Crop/alignment modes, enumerated in order "next contains prev"."""

    HALF = 0
    MID_FULL = 1
    FULL = 2
    FULL_NO_ALIGN = 3
    WHOLE_FACE = 4
    WHOLE_FACE_NO_ALIGN = 5
    HEAD = 10
    HEAD_NO_ALIGN = 20
    # no align at all, just embedded faceinfo
    # BUGFIX: was `(100,)` — a stray trailing comma made this a tuple.
    # IntEnum happened to unpack it to 100, but the plain int is the intent.
    MARK_ONLY = 100

    @staticmethod
    def fromString(s):
        """Return the FaceType whose canonical name matches `s` (case-insensitive).

        Raises
        ------
        Exception
            If `s` is not a known face-type name.
        """
        r = from_string_dict.get(s.lower())
        if r is None:
            raise Exception("FaceType.fromString value error")
        return r

    @staticmethod
    def toString(face_type):
        """Return the canonical string name for a FaceType member."""
        return to_string_dict[face_type]
# Canonical FaceType -> string-name mapping (used by FaceType.toString and
# by the CLI help text below).
to_string_dict = {
    FaceType.HALF: "half_face",
    FaceType.MID_FULL: "midfull_face",
    FaceType.FULL: "full_face",
    FaceType.FULL_NO_ALIGN: "full_face_no_align",
    FaceType.WHOLE_FACE: "whole_face",
    FaceType.WHOLE_FACE_NO_ALIGN: "whole_face_no_align",
    FaceType.HEAD: "head",
    FaceType.HEAD_NO_ALIGN: "head_no_align",
    FaceType.MARK_ONLY: "mark_only",
}

# Inverse mapping: string name -> FaceType (used by FaceType.fromString).
from_string_dict = {to_string_dict[x]: x for x in to_string_dict.keys()}

# FaceType -> (padding fraction, remove_align flag), consumed by
# get_transform_mat. NOTE(review): MARK_ONLY has no entry here, so lookups
# for it fall through to the .get() default in get_transform_mat.
FaceType_to_padding_remove_align = {
    FaceType.HALF: (0.0, False),
    FaceType.MID_FULL: (0.0675, False),
    FaceType.FULL: (0.2109375, False),
    FaceType.FULL_NO_ALIGN: (0.2109375, True),
    FaceType.WHOLE_FACE: (0.40, False),
    FaceType.WHOLE_FACE_NO_ALIGN: (0.40, True),
    FaceType.HEAD: (0.70, False),
    FaceType.HEAD_NO_ALIGN: (0.70, True),
}
def umeyama(src, dst, estimate_scale):
    """Estimate an N-D similarity transform mapping `src` onto `dst`.

    Parameters
    ----------
    src : (M, N) array
        Source coordinates.
    dst : (M, N) array
        Destination coordinates.
    estimate_scale : bool
        Whether to estimate a scaling factor (otherwise scale is 1).

    Returns
    -------
    T : (N + 1, N + 1) ndarray
        The homogeneous similarity transformation matrix. The matrix contains
        NaN values only if the problem is not well-conditioned.

    References
    ----------
    .. [1] "Least-squares estimation of transformation parameters between two
        point patterns", Shinji Umeyama, PAMI 1991, DOI: 10.1109/34.88573
    """
    n_pts, n_dim = src.shape

    # Center both point sets on their centroids.
    mu_src = src.mean(axis=0)
    mu_dst = dst.mean(axis=0)
    src_c = src - mu_src
    dst_c = dst - mu_dst

    # Cross-covariance matrix, Eq. (38).
    cov = dst_c.T @ src_c / n_pts

    # Reflection-correction sign vector, Eq. (39).
    d = np.ones((n_dim,), dtype=np.double)
    if np.linalg.det(cov) < 0:
        d[n_dim - 1] = -1

    hom = np.eye(n_dim + 1, dtype=np.double)
    U, S, V = np.linalg.svd(cov)

    # Rotation part, Eq. (40) and (43).
    rank = np.linalg.matrix_rank(cov)
    if rank == 0:
        # Degenerate input: signal failure with an all-NaN matrix.
        return np.nan * hom
    if rank == n_dim - 1:
        if np.linalg.det(U) * np.linalg.det(V) > 0:
            hom[:n_dim, :n_dim] = U @ V
        else:
            saved = d[n_dim - 1]
            d[n_dim - 1] = -1
            hom[:n_dim, :n_dim] = U @ np.diag(d) @ V
            d[n_dim - 1] = saved
    else:
        hom[:n_dim, :n_dim] = U @ np.diag(d) @ V

    # Scale factor, Eq. (41) and (42).
    if estimate_scale:
        scale = 1.0 / src_c.var(axis=0).sum() * (S @ d)
    else:
        scale = 1.0

    # Translation column, then scale the rotation block in place.
    hom[:n_dim, n_dim] = mu_dst - scale * (hom[:n_dim, :n_dim] @ mu_src.T)
    hom[:n_dim, :n_dim] *= scale
    return hom
def transform_points(points, mat, invert=False):
    """Apply a 2x3 affine transform `mat` to an (N, 2) array of points.

    When `invert` is True the affine transform is inverted first, mapping
    points from destination space back to source space.
    """
    if invert:
        mat = cv2.invertAffineTransform(mat)
    # cv2.transform expects (N, 1, 2); expand then squeeze back.
    points = np.expand_dims(points, axis=1)
    # NOTE(review): the third positional argument of cv2.transform is the
    # optional `dst` output array — passing points.shape here is unusual;
    # confirm the installed OpenCV binding accepts/ignores it.
    points = cv2.transform(points, mat, points.shape)
    points = np.squeeze(points)
    return points
def estimate_averaged_yaw(landmarks):
    """Estimate head yaw from the horizontal asymmetry of 2D landmarks.

    Averages the x-distance from three jaw points to three nose-bridge
    points on each side of the face and returns right minus left.
    Works much better than solvePnP if landmarks come from "3DFAN".
    """
    pts = landmarks if isinstance(landmarks, np.ndarray) else np.array(landmarks)
    # Left side: nose bridge (27..29) relative to jaw points 0..2.
    left_span = sum(pts[27 + k][0] - pts[k][0] for k in range(3)) / 3.0
    # Right side: jaw points 16..14 relative to nose bridge (27..29).
    right_span = sum(pts[16 - k][0] - pts[27 + k][0] for k in range(3)) / 3.0
    return float(right_span - left_span)
def polygon_area(x, y):
    """Return the area of a simple polygon via the shoelace formula.

    `x` and `y` are the vertex coordinates in order (either winding).
    """
    forward = np.dot(x, np.roll(y, 1))
    backward = np.dot(y, np.roll(x, 1))
    return 0.5 * np.abs(forward - backward)
def get_transform_mat(image_landmarks, output_size, face_type, scale=1.0):
    """Build a 2x3 affine matrix that maps the source image onto an aligned
    square face crop of side `output_size`.

    Parameters
    ----------
    image_landmarks : (68, 2) array-like
        Detected 2D facial landmarks in image (global) coordinates.
    output_size : int
        Side length in pixels of the output crop.
    face_type : FaceType
        Crop mode; selects padding and whether rotation alignment is removed.
    scale : float
        Extra zoom factor (>1 zooms in on the face).

    Returns
    -------
    mat : (2, 3) ndarray
        Affine transform suitable for cv2.warpAffine.
    """
    if not isinstance(image_landmarks, np.ndarray):
        image_landmarks = np.array(image_landmarks)

    # estimate landmarks transform from global space to local aligned space
    # with bounds [0..1]; uses landmark indices 17..48 plus 54, matching
    # the rows of landmarks_2D_new
    mat = umeyama(
        np.concatenate([image_landmarks[17:49], image_landmarks[54:55]]),
        landmarks_2D_new,
        True,
    )[0:2]

    # get corner points (and center) of the unit crop in global space
    g_p = transform_points(
        np.float32([(0, 0), (1, 0), (1, 1), (0, 1), (0.5, 0.5)]), mat, True
    )
    g_c = g_p[4]

    # calc unit diagonal vectors between corners in global space
    tb_diag_vec = (g_p[2] - g_p[0]).astype(np.float32)
    tb_diag_vec /= npla.norm(tb_diag_vec)
    bt_diag_vec = (g_p[1] - g_p[3]).astype(np.float32)
    bt_diag_vec /= npla.norm(bt_diag_vec)

    # calc modifier of diagonal vectors for scale and padding value.
    # BUGFIX: the fallback default must be a (padding, remove_align) pair —
    # the previous bare 0.0 could not be tuple-unpacked and raised TypeError
    # for any face type missing from the table (e.g. MARK_ONLY).
    padding, remove_align = FaceType_to_padding_remove_align.get(
        face_type, (0.0, False)
    )
    mod = (1.0 / scale) * (npla.norm(g_p[0] - g_p[2]) * (padding * np.sqrt(2.0) + 0.5))

    if face_type == FaceType.WHOLE_FACE:
        # adjust vertical offset for WHOLE_FACE, 7% below in order to cover
        # more forehead
        vec = (g_p[0] - g_p[3]).astype(np.float32)
        vec_len = npla.norm(vec)
        vec /= vec_len
        g_c += vec * vec_len * 0.07
    elif face_type == FaceType.HEAD:
        # assuming image_landmarks are 3D_Landmarks extracted for HEAD,
        # adjust horizontal offset according to estimated yaw
        yaw = estimate_averaged_yaw(transform_points(image_landmarks, mat, False))
        hvec = (g_p[0] - g_p[1]).astype(np.float32)
        hvec_len = npla.norm(hvec)
        hvec /= hvec_len
        yaw *= np.abs(math.tanh(yaw * 2))  # Damp near zero
        g_c -= hvec * (yaw * hvec_len / 2.0)

        # adjust vertical offset for HEAD, 50% below
        vvec = (g_p[0] - g_p[3]).astype(np.float32)
        vvec_len = npla.norm(vvec)
        vvec /= vvec_len
        g_c += vvec * vvec_len * 0.50

    # calc 3 points in global space to estimate 2d affine transform
    if not remove_align:
        l_t = np.array(
            [g_c - tb_diag_vec * mod, g_c + bt_diag_vec * mod, g_c + tb_diag_vec * mod]
        )
    else:
        # remove_align - face will be centered in the frame but not aligned
        l_t = np.array(
            [
                g_c - tb_diag_vec * mod,
                g_c + bt_diag_vec * mod,
                g_c + tb_diag_vec * mod,
                g_c - bt_diag_vec * mod,
            ]
        )
        # get area of face square in global space
        area = polygon_area(l_t[:, 0], l_t[:, 1])
        # calc side of square
        side = np.float32(math.sqrt(area) / 2)
        # calc 3 points with unrotated square centered on g_c
        l_t = np.array(
            [g_c + [-side, -side], g_c + [side, -side], g_c + [side, side]]
        )

    # calc affine transform from 3 global space points to 3 local space
    # points size of 'output_size'
    pts2 = np.float32(((0, 0), (output_size, 0), (output_size, output_size)))
    l_t = l_t.astype(np.float32)
    mat = cv2.getAffineTransform(l_t, pts2)
    return mat
def extract_faces(model, image, face_image_size, face_type=FaceType.WHOLE_FACE):
    """Detect faces in `image` and return aligned square crops.

    Parameters
    ----------
    model :
        A face_alignment.FaceAlignment instance (anything exposing
        get_landmarks(ndarray) -> list of landmark arrays or None).
    image :
        PIL image (or anything np.array() can consume as HxWxC).
    face_image_size : int
        Side length in pixels of each output crop.
    face_type : FaceType
        Crop mode forwarded to get_transform_mat.

    Returns
    -------
    (face_images, image_to_face_matrices) : tuple of lists
        PIL crops and the 2x3 affine matrices used to produce them.
        Both lists are empty when no face is detected.
    """
    # take the first three channels (R, G, B)
    array = np.array(image)[:, :, :3]
    preds = model.get_landmarks(array)

    face_images = []
    image_to_face_matrices = []
    # BUGFIX: get_landmarks returns None when no face is found — iterate an
    # empty list instead of crashing.
    for face_landmarks in preds or []:
        image_to_face_mat = get_transform_mat(
            face_landmarks, face_image_size, face_type
        )
        # BUGFIX: the interpolation flag was passed positionally, which put
        # it into warpAffine's `dst` slot; it must be the `flags` keyword.
        face_array = cv2.warpAffine(
            array,
            image_to_face_mat,
            (face_image_size, face_image_size),
            flags=cv2.INTER_LANCZOS4,
            borderValue=(255, 255, 255),
        )
        image_to_face_matrices.append(image_to_face_mat)
        face_images.append(Image.fromarray(face_array))
    return face_images, image_to_face_matrices
def parse_args():
    """Parse command-line arguments.

    Returns
    -------
    argparse.Namespace
        With `face_type` already converted to a FaceType member.

    Raises
    ------
    ValueError
        If --face_type is not a recognized face-type name.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--image_path",
        type=str,
        default="",
        help="Path to input image",
    )
    parser.add_argument(
        "--image_size",
        type=int,
        default=256,
        help="Output image size",
    )
    parser.add_argument(
        "--output_folder",
        type=str,
        default="./face_images",
        help="Path to output folder",
    )
    parser.add_argument(
        "--face_type",
        type=str,
        default="whole_face",
        help=(
            "Face type to extract (e.g., half_face, midfull_face, full_face, "
            "full_face_no_align, whole_face, whole_face_no_align, head, "
            "head_no_align, mark_only.)"
        ),
    )
    args = parser.parse_args()

    # Convert face_type string to FaceType enum
    try:
        args.face_type = FaceType.fromString(args.face_type)
    except Exception as exc:
        # BUGFIX: chain the original exception so the root cause is not lost.
        raise ValueError(f"Invalid face_type: {args.face_type}") from exc
    return args
if __name__ == "__main__":
    args = parse_args()

    # sfd for SFD, dlib for Dlib and folder for existing bounding boxes.
    fa = face_alignment.FaceAlignment(
        face_alignment.LandmarksType.TWO_D, face_detector="sfd"
    )

    # NOTE(review): diffusers' load_image presumably accepts a local path or
    # URL and returns a PIL image — confirm against the installed version.
    pil_image = load_image(args.image_path)
    face_images, image_to_face_matrices = extract_faces(
        fa, pil_image, args.image_size, args.face_type
    )

    # Make sure the output folder exists
    os.makedirs(args.output_folder, exist_ok=True)

    # Save each crop as <input-stem>_<index>.png in the output folder.
    input_filename = Path(args.image_path).stem
    for i, face_image in enumerate(face_images):
        face_image.save(Path(args.output_folder, f"{input_filename}_{i:02}.png"))