""" Facial Comparison — HuggingFace Space ====================================== Detection : RetinaFace (public, well-known detector) Alignment : 5-point similarity transform → 112×112 canonical crop Embedding : facial_comparison.pt (private TorchScript model via HF secrets) Similarity : Augmented cosine (3×3 pairs) + sigmoid confidence """ import os import io import base64 import logging import numpy as np import torch import torch.nn.functional as F import cv2 from PIL import Image, ImageOps from typing import List, Tuple, Optional import gradio as gr # ── Logging ─────────────────────────────────────────────────────────────────── logging.basicConfig(level=logging.INFO, format="%(levelname)s | %(name)s | %(message)s") logger = logging.getLogger("facial-comparison") # ── Config ──────────────────────────────────────────────────────────────────── MODEL_PATH = os.getenv("MODEL_PATH", "models/facial_comparison.pt") HF_MODEL_REPO = os.getenv("HF_MODEL_REPO", "") HF_MODEL_FILE = os.getenv("HF_MODEL_FILE", "facial_comparison.pt") THRESHOLD = float(os.getenv("THRESHOLD", 0.38)) FACE_RATIO_THRESH = float(os.getenv("FACE_RATIO_THRESHOLD", 0.15)) STEEPNESS = float(os.getenv("STEEPNESS", 12.0)) MAX_CONFIDENCE = float(os.getenv("MAX_CONFIDENCE", 99.9)) # ── Canonical 5-point template (112×112, ArcFace standard) ─────────────────── # Used to align detected landmarks to a fixed pose before embedding ARCFACE_DST = np.array([ [38.2946, 51.6963], [73.5318, 51.5014], [56.0252, 71.7366], [41.5493, 92.3655], [70.7299, 92.2041], ], dtype=np.float32) # ── Model loading ───────────────────────────────────────────────────────────── def _resolve_model_path() -> str: if os.path.exists(MODEL_PATH): return MODEL_PATH if HF_MODEL_REPO: from huggingface_hub import hf_hub_download logger.info(f"Pulling weights from Hub: {HF_MODEL_REPO}") return hf_hub_download( repo_id=HF_MODEL_REPO, filename=HF_MODEL_FILE, token=os.getenv("HF_TOKEN"), ) raise FileNotFoundError( f"Weights not found at '{MODEL_PATH}'. " "Set HF_MODEL_REPO + HF_MODEL_FILE + HF_TOKEN in Space secrets." ) def _load_models(): device = torch.device("cpu") logger.info("Loading TorchScript embedding model...") path = _resolve_model_path() model = torch.jit.load(path, map_location=device) model.eval() logger.info(f"Embedding model loaded from: {path}") # RetinaFace is imported here — lightweight, no ONNX runtime dependency logger.info("RetinaFace detector ready (loaded on first call)") return model _COMPARISON_MODEL = _load_models() # ───────────────────────────────────────────────────────────────────────────── # Image utilities # ───────────────────────────────────────────────────────────────────────────── def _fix_orientation(img: Image.Image) -> Image.Image: """EXIF-aware rotation — handles iPhone / Android captures.""" try: img = ImageOps.exif_transpose(img) img.info.pop("exif", None) except Exception: pass return img.convert("RGB") def _decode_to_rgb(img_input) -> Tuple[bool, Optional[np.ndarray]]: """ Accepts Gradio numpy (RGB), PIL Image, raw bytes, or base64 string. Returns (success, RGB uint8 ndarray). RetinaFace expects RGB; we keep everything in RGB throughout. """ try: if isinstance(img_input, np.ndarray): if img_input.ndim == 2: # grayscale → RGB img_input = cv2.cvtColor(img_input, cv2.COLOR_GRAY2RGB) elif img_input.shape[2] == 4: # RGBA → RGB img_input = cv2.cvtColor(img_input, cv2.COLOR_RGBA2RGB) return True, img_input.astype(np.uint8) if isinstance(img_input, Image.Image): return True, np.array(_fix_orientation(img_input), dtype=np.uint8) if isinstance(img_input, bytes): pil = Image.open(io.BytesIO(img_input)) return True, np.array(_fix_orientation(pil), dtype=np.uint8) if isinstance(img_input, str): return _decode_to_rgb(base64.b64decode(img_input)) except Exception as e: logger.error(f"Decode failed: {e}") return False, None # ───────────────────────────────────────────────────────────────────────────── # Face alignment — similarity transform to ArcFace canonical crop # ───────────────────────────────────────────────────────────────────────────── def _estimate_norm(lmk: np.ndarray, image_size: int = 112) -> np.ndarray: """ Estimate the similarity transform (rotation + scale + translation) that maps detected 5-point landmarks onto the ArcFace canonical template. Returns a 2×3 affine matrix. """ assert lmk.shape == (5, 2) dst = ARCFACE_DST * (image_size / 112.0) # Use OpenCV estimateAffinePartial2D (similarity: no shear) M, _ = cv2.estimateAffinePartial2D(lmk, dst, method=cv2.LMEDS) if M is None: # Fallback: least-squares full affine M, _ = cv2.estimateAffinePartial2D(lmk, dst, method=cv2.RANSAC) return M def _align_face(img_rgb: np.ndarray, landmarks: np.ndarray, image_size: int = 112) -> Optional[np.ndarray]: """Warp face to 112×112 canonical crop. Returns RGB uint8 or None.""" try: M = _estimate_norm(landmarks, image_size) if M is None: return None warped = cv2.warpAffine(img_rgb, M, (image_size, image_size), borderValue=0) return warped except Exception as e: logger.error(f"Alignment failed: {e}") return None # ───────────────────────────────────────────────────────────────────────────── # RetinaFace detection # ───────────────────────────────────────────────────────────────────────────── def _retinaface_detect(img_rgb: np.ndarray) -> list: """ Run RetinaFace on an RGB image. Returns list of dicts: {bbox, landmarks, score} landmarks shape: (5, 2) — [left_eye, right_eye, nose, left_mouth, right_mouth] """ from retinaface import RetinaFace # RetinaFace.detect_faces returns dict keyed by "face_1", "face_2", ... # Each value: {"facial_area": [x1,y1,x2,y2], "landmarks": {...}, "score": float} detections = RetinaFace.detect_faces(img_rgb) if not isinstance(detections, dict): return [] faces = [] for key, val in detections.items(): try: x1, y1, x2, y2 = val["facial_area"] score = float(val.get("score", 1.0)) lm = val["landmarks"] # RetinaFace landmark keys pts = np.array([ lm["left_eye"], lm["right_eye"], lm["nose"], lm["mouth_left"], lm["mouth_right"], ], dtype=np.float32) faces.append({ "bbox": (x1, y1, x2, y2), "landmarks": pts, "score": score, "area": (x2 - x1) * (y2 - y1), }) except (KeyError, TypeError): continue # Sort by area descending (largest face first) faces.sort(key=lambda f: f["area"], reverse=True) return faces def _detect_and_align(img_rgb: np.ndarray, image_idx: int) -> Tuple[Optional[dict], str]: """ Detect faces in one image with rotation retry. Returns (face_result_dict | None, feedback_message). face_result_dict keys: image_tensor (numpy), detection_confidence """ faces = _retinaface_detect(img_rgb) # Rotation retry if nothing found if not faces: for angle, code in [(90, cv2.ROTATE_90_CLOCKWISE), (180, cv2.ROTATE_180), (270, cv2.ROTATE_90_COUNTERCLOCKWISE)]: rotated = cv2.rotate(img_rgb, code) faces = _retinaface_detect(rotated) if faces: img_rgb = rotated logger.info(f"Image {image_idx}: detected after {angle}° rotation") break if not faces: return None, (f"No face detected in image {image_idx}. " "Ensure the face is clearly visible, well-lit, and unobstructed.") # Two-face handling: keep largest if the second is tiny (background/watermark) if len(faces) >= 2: ratio = faces[1]["area"] / faces[0]["area"] if ratio >= FACE_RATIO_THRESH: return None, (f"Two comparable faces found in image {image_idx} " f"(size ratio {ratio:.2f}). Please upload an image " "with a single dominant face.") # else: silently drop the smaller face face = faces[0] crop = _align_face(img_rgb, face["landmarks"]) if crop is None: return None, f"Face alignment failed for image {image_idx}." # → float32 tensor [1, 3, 112, 112] in [0, 1] tensor = (torch.from_numpy(crop.astype(np.float32)) .permute(2, 0, 1) .unsqueeze(0) / 255.0) return { "image_tensor": tensor.numpy(), "detection_confidence": round(face["score"], 3), }, "OK" # ───────────────────────────────────────────────────────────────────────────── # Embedding + similarity # ───────────────────────────────────────────────────────────────────────────── def _augmented_embeddings(tensor: torch.Tensor) -> List[torch.Tensor]: """Original + horizontal flip + brightened → 3 embeddings.""" flip = torch.flip(tensor, dims=[3]) bright = torch.clamp(tensor * 1.5, 0, 1) with torch.no_grad(): return [_COMPARISON_MODEL(t).squeeze() for t in [tensor, flip, bright]] def _avg_cosine(embs1: List[torch.Tensor], embs2: List[torch.Tensor]) -> float: sims = [F.cosine_similarity(e1.unsqueeze(0), e2.unsqueeze(0)).item() for e1 in embs1 for e2 in embs2] return sum(sims) / len(sims) def _cosine_to_confidence(score: float) -> float: conf = 1.0 / (1.0 + np.exp(-STEEPNESS * (score - THRESHOLD))) return round(min(conf * 100.0, MAX_CONFIDENCE), 2) # ───────────────────────────────────────────────────────────────────────────── # Full pipeline # ───────────────────────────────────────────────────────────────────────────── def _compare(img1, img2) -> dict: ok1, rgb1 = _decode_to_rgb(img1) ok2, rgb2 = _decode_to_rgb(img2) if not ok1 or not ok2: return {"success": False, "message": "Image decoding failed.", "score": 0.0, "confidence": 0.0, "match": False, "det1": 0.0, "det2": 0.0} face1, msg1 = _detect_and_align(rgb1, 1) if face1 is None: return {"success": False, "message": msg1, "score": 0.0, "confidence": 0.0, "match": False, "det1": 0.0, "det2": 0.0} face2, msg2 = _detect_and_align(rgb2, 2) if face2 is None: return {"success": False, "message": msg2, "score": 0.0, "confidence": 0.0, "match": False, "det1": face1["detection_confidence"], "det2": 0.0} t1 = torch.tensor(face1["image_tensor"], dtype=torch.float32) t2 = torch.tensor(face2["image_tensor"], dtype=torch.float32) score = _avg_cosine(_augmented_embeddings(t1), _augmented_embeddings(t2)) confidence = _cosine_to_confidence(score) match = score >= THRESHOLD return { "success": True, "match": match, "score": round(score, 4), "confidence": confidence, "message": "Faces matched" if match else "Faces do not match", "det1": face1["detection_confidence"], "det2": face2["detection_confidence"], } # ───────────────────────────────────────────────────────────────────────────── # Gradio inference wrapper # ───────────────────────────────────────────────────────────────────────────── def run_comparison(img1: np.ndarray, img2: np.ndarray): if img1 is None or img2 is None: err = _verdict_html(False, None, "Upload both images to run comparison.") return err, "—", "—", "" r = _compare(img1, img2) if not r["success"]: return _verdict_html(False, None, r["message"]), "—", "—", _details_html(r) return ( _verdict_html(True, r["match"], r["message"]), f"{r['score']:.4f}", f"{r['confidence']}%", _details_html(r), ) def _verdict_html(success: bool, match: Optional[bool], message: str) -> str: if not success: color, icon, label = "#c0392b", "✕", "Error" elif match: color, icon, label = "#16a085", "✓", "Match" else: color, icon, label = "#c0392b", "✕", "No Match" bg = "#eafaf7" if (success and match) else "#fdf2f2" return f"""