Spaces:
Sleeping
Sleeping
| import cv2 | |
| import numpy as np | |
| import math | |
| import torch | |
| import torchvision.transforms as transforms | |
| from PIL import Image | |
| try: | |
| from facenet_pytorch import MTCNN | |
| except ImportError: | |
| MTCNN = None | |
class GeometricAligner:
    """Face preprocessing: landmark alignment, square crop, resize, normalise.

    Module 1: Preprocessing & Facial Landmark Alignment Pipeline

    1. Detects a face and its 5 landmarks with MTCNN (facenet_pytorch).
    2. Rotates the image about the landmark centroid so that the line
       through the two eye landmarks becomes horizontal.
    3. Cuts a square window centred on the face bounding box, enlarged by
       an outer margin (10% per side by default).
    4. Resizes the window to ``output_size`` x ``output_size`` (512 by
       default) with bicubic interpolation and, when a tensor is requested,
       applies ImageNet mean/std normalisation.
    """

    def __init__(self, device='cpu', output_size=512, margin=0.10):
        """Create the detector and the normalisation transform.

        Args:
            device: Device handed to MTCNN (e.g. ``'cpu'`` or ``'cuda'``).
            output_size: Edge length, in pixels, of the square output image.
            margin: Fraction of the face box added on every side before
                cropping (``0.10`` means 10%).

        Raises:
            ImportError: If ``facenet_pytorch`` could not be imported.
            ValueError: If ``output_size`` is not positive or ``margin`` is
                negative.
        """
        if MTCNN is None:
            raise ImportError("facenet_pytorch is required. Install via 'pip install facenet-pytorch'")
        if output_size <= 0:
            raise ValueError(f"output_size must be positive, got {output_size!r}")
        if margin < 0:
            raise ValueError(f"margin must be non-negative, got {margin!r}")

        self.device = device
        self.output_size = int(output_size)
        self.margin = float(margin)
        # MTCNN is used for both face boxes and landmark detection.
        self.mtcnn = MTCNN(keep_all=False, device=self.device, margin=0, post_process=False)
        self.normalize = transforms.Compose([
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

    @staticmethod
    def _rotate_bbox(bbox, matrix):
        """Return the axis-aligned box enclosing ``bbox`` after a 2x3 affine map."""
        x1, y1, x2, y2 = (float(v) for v in bbox[:4])
        corners = np.array(
            [
                [x1, y1, 1.0],
                [x2, y1, 1.0],
                [x2, y2, 1.0],
                [x1, y2, 1.0],
            ]
        )
        moved = corners @ np.asarray(matrix, dtype=np.float64).T  # shape (4, 2)
        return [
            float(moved[:, 0].min()),
            float(moved[:, 1].min()),
            float(moved[:, 0].max()),
            float(moved[:, 1].max()),
        ]

    @staticmethod
    def _square_window(bbox, margin):
        """Return ``(x1, y1, side)`` of the padded square window centred on ``bbox``."""
        bx1, by1, bx2, by2 = (float(v) for v in bbox[:4])
        box_w = bx2 - bx1
        box_h = by2 - by1
        # Same margin on both axes, so the longer edge decides the side length.
        side = int(round(max(box_w, box_h) * (1.0 + 2.0 * margin)))
        centre_x = bx1 + box_w / 2
        centre_y = by1 + box_h / 2
        return int(round(centre_x - side / 2)), int(round(centre_y - side / 2)), side

    @staticmethod
    def _crop_with_padding(image, x1, y1, side):
        """Cut a ``side`` x ``side`` window, zero-filling whatever lies outside ``image``.

        Returns None when the window is degenerate or does not overlap the
        image at all.
        """
        if side <= 0:
            return None
        img_h, img_w = image.shape[:2]
        src_x1, src_y1 = max(x1, 0), max(y1, 0)
        src_x2, src_y2 = min(x1 + side, img_w), min(y1 + side, img_h)
        if src_x2 <= src_x1 or src_y2 <= src_y1:
            return None
        # Padding (instead of clamping) keeps the window square, so the later
        # resize never changes the face's aspect ratio.
        canvas = np.zeros((side, side) + tuple(image.shape[2:]), dtype=image.dtype)
        canvas[src_y1 - y1:src_y2 - y1, src_x1 - x1:src_x2 - x1] = image[src_y1:src_y2, src_x1:src_x2]
        return canvas

    def align_and_crop(self, image_bgr: np.ndarray, return_tensor=True) -> "torch.Tensor | np.ndarray | None":
        """Align, crop and resize the face found in a BGR image.

        Args:
            image_bgr: BGR numpy image (e.g. from ``cv2.imread``).
            return_tensor: When True, return a normalised torch tensor;
                otherwise return the RGB numpy image.

        Returns:
            * ``return_tensor=True``: float tensor of shape
              ``[3, output_size, output_size]``, scaled to [0, 1] and then
              normalised with the ImageNet mean/std.
            * ``return_tensor=False``: RGB numpy array of shape
              ``[output_size, output_size, 3]``.
            * ``None`` if no face is detected, or if the crop window is
              degenerate / lies entirely outside the image.

            Parts of the crop window that fall outside the image are filled
            with zeros so that the crop stays square.
        """
        # MTCNN is fed an RGB PIL image.
        image_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
        boxes, _, landmarks = self.mtcnn.detect(Image.fromarray(image_rgb), landmarks=True)
        if boxes is None or len(boxes) == 0:
            return None

        # detect() returns arrays; the first entry is used.
        # NOTE(review): which face ends up at index 0 is decided by MTCNN's
        # own ordering — confirm it is the one you want for multi-face images.
        bbox = boxes[0]
        pts = landmarks[0]  # Shape: (5, 2)

        # 1. Landmarks: [left_eye, right_eye, nose, left_mouth, right_mouth]
        left_eye, right_eye = pts[0], pts[1]

        # 2. Angle of the eye line relative to the horizontal axis.
        d_y = right_eye[1] - left_eye[1]
        d_x = right_eye[0] - left_eye[0]
        angle = float(np.degrees(np.arctan2(d_y, d_x)))

        # Rotate about the landmark centroid.
        centroid = np.mean(pts, axis=0)
        rotation = cv2.getRotationMatrix2D((int(centroid[0]), int(centroid[1])), angle, 1.0)
        h, w = image_bgr.shape[:2]
        # Warping the RGB image directly avoids a second colour conversion;
        # the warp treats channels independently.
        rotated_rgb = cv2.warpAffine(image_rgb, rotation, (w, h), flags=cv2.INTER_CUBIC)

        # Re-detect on the rotated image to get a box that fits the upright face.
        r_boxes, _, _ = self.mtcnn.detect(Image.fromarray(rotated_rgb), landmarks=True)
        if r_boxes is None or len(r_boxes) == 0:
            # Fallback: push the original box through the same rotation.
            rotated_bbox = self._rotate_bbox(bbox, rotation)
        else:
            rotated_bbox = r_boxes[0]

        # 3. Square crop window with the configured outer margin.
        x1, y1, side = self._square_window(rotated_bbox, self.margin)
        crop_rgb = self._crop_with_padding(rotated_rgb, x1, y1, side)
        if crop_rgb is None:
            return None

        # 4. Bi-cubic resizing to the output resolution.
        resized_rgb = cv2.resize(
            crop_rgb, (self.output_size, self.output_size), interpolation=cv2.INTER_CUBIC
        )
        if not return_tensor:
            return resized_rgb

        # HWC uint8 -> CHW float in [0, 1], then ImageNet normalisation.
        tensor_img = torch.from_numpy(resized_rgb).permute(2, 0, 1).float() / 255.0
        return self.normalize(tensor_img)