neuralninja10's picture
Update app.py
6fc0a40 verified
"""
Facial Comparison — HuggingFace Space
======================================
Detection : RetinaFace (public, well-known detector)
Alignment : 5-point similarity transform → 112×112 canonical crop
Embedding : facial_comparison.pt (private TorchScript model via HF secrets)
Similarity : Augmented cosine (3×3 pairs) + sigmoid confidence
"""
import os
import io
import base64
import logging
import numpy as np
import torch
import torch.nn.functional as F
import cv2
from PIL import Image, ImageOps
from typing import List, Tuple, Optional
import gradio as gr
# ── Logging ───────────────────────────────────────────────────────────────────
logging.basicConfig(
    format="%(levelname)s | %(name)s | %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger("facial-comparison")

# ── Config ────────────────────────────────────────────────────────────────────
# Every setting is read once at import time and can be overridden through the
# environment variable named in its lookup.
MODEL_PATH = os.environ.get("MODEL_PATH", "models/facial_comparison.pt")
HF_MODEL_REPO = os.environ.get("HF_MODEL_REPO", "")
HF_MODEL_FILE = os.environ.get("HF_MODEL_FILE", "facial_comparison.pt")
# Cosine score at or above which two faces count as a match; also the
# midpoint of the confidence sigmoid.
THRESHOLD = float(os.environ.get("THRESHOLD", 0.38))
# Second-largest / largest face-area ratio at or above which an image is
# rejected as containing two comparable faces.
FACE_RATIO_THRESH = float(os.environ.get("FACE_RATIO_THRESHOLD", 0.15))
# Slope of the confidence sigmoid.
STEEPNESS = float(os.environ.get("STEEPNESS", 12.0))
# Upper cap (in percent) on the reported confidence.
MAX_CONFIDENCE = float(os.environ.get("MAX_CONFIDENCE", 99.9))

# ── Canonical 5-point template (112×112, ArcFace standard) ───────────────────
# Target (x, y) positions that detected landmarks are aligned onto before
# embedding. Row order: eye, eye, nose, mouth corner, mouth corner — within
# each pair the point with the smaller x comes first.
ARCFACE_DST = np.asarray(
    (
        (38.2946, 51.6963),
        (73.5318, 51.5014),
        (56.0252, 71.7366),
        (41.5493, 92.3655),
        (70.7299, 92.2041),
    ),
    dtype=np.float32,
)
# ── Model loading ─────────────────────────────────────────────────────────────
def _resolve_model_path() -> str:
    """
    Locate the embedding-model weights.

    A file already present at MODEL_PATH wins. Otherwise, when HF_MODEL_REPO
    is set, HF_MODEL_FILE is downloaded from that Hub repo using the HF_TOKEN
    environment variable as the access token.

    Returns:
        Path to the weights file.

    Raises:
        FileNotFoundError: no local file exists and no Hub repo is configured.
    """
    if os.path.exists(MODEL_PATH):
        return MODEL_PATH

    if not HF_MODEL_REPO:
        raise FileNotFoundError(
            f"Weights not found at '{MODEL_PATH}'. "
            "Set HF_MODEL_REPO + HF_MODEL_FILE + HF_TOKEN in Space secrets."
        )

    # Imported lazily so the Hub client is only needed when a download happens.
    from huggingface_hub import hf_hub_download

    logger.info(f"Pulling weights from Hub: {HF_MODEL_REPO}")
    hub_token = os.getenv("HF_TOKEN")
    return hf_hub_download(
        repo_id=HF_MODEL_REPO,
        filename=HF_MODEL_FILE,
        token=hub_token,
    )
def _load_models():
    """
    Load the TorchScript embedding model onto the CPU and put it in eval mode.

    Only the embedding model is loaded here; the RetinaFace detector is not
    touched by this function (it is imported inside ``_retinaface_detect``).

    Returns:
        The loaded TorchScript module.
    """
    cpu = torch.device("cpu")
    logger.info("Loading TorchScript embedding model...")
    weights_path = _resolve_model_path()
    embedder = torch.jit.load(weights_path, map_location=cpu)
    embedder.eval()
    logger.info(f"Embedding model loaded from: {weights_path}")
    # Informational only — nothing detector-related is initialised at this point.
    logger.info("RetinaFace detector ready (loaded on first call)")
    return embedder
# Loaded once at import time and reused by every comparison. Importing this
# module therefore raises if the weights cannot be resolved or loaded.
_COMPARISON_MODEL = _load_models()
# ─────────────────────────────────────────────────────────────────────────────
# Image utilities
# ─────────────────────────────────────────────────────────────────────────────
def _fix_orientation(img: Image.Image) -> Image.Image:
    """
    Apply the EXIF orientation tag (if any) and return the image as RGB.

    Orientation handling is best-effort: if it fails, the failure is logged
    and the image is converted as-is rather than aborting the decode.
    """
    try:
        img = ImageOps.exif_transpose(img)
        # Drop the raw EXIF blob so the orientation cannot be applied twice.
        img.info.pop("exif", None)
    except Exception as exc:
        logger.warning("EXIF orientation fix skipped: %s", exc)
    return img.convert("RGB")
def _decode_to_rgb(img_input) -> Tuple[bool, Optional[np.ndarray]]:
    """
    Normalise a supported image input to an RGB uint8 array of shape (H, W, 3).

    Accepted inputs:
      * numpy array — (H, W) or (H, W, 1) grayscale, (H, W, 3) taken as RGB,
        or (H, W, 4) taken as RGBA (alpha is dropped);
      * PIL Image;
      * raw encoded image bytes;
      * base64 string, with or without a ``data:...;base64,`` prefix.

    Returns:
        (True, array) on success, (False, None) on any failure. Failures are
        logged, never raised.
    """
    try:
        if isinstance(img_input, np.ndarray):
            if img_input.ndim == 2:
                img_input = img_input[:, :, np.newaxis]
            if img_input.ndim != 3 or img_input.shape[2] not in (1, 3, 4):
                logger.error(
                    f"Decode failed: unsupported array shape {img_input.shape}"
                )
                return False, None
            if img_input.shape[2] == 1:      # grayscale → RGB
                img_input = np.repeat(img_input, 3, axis=2)
            elif img_input.shape[2] == 4:    # RGBA → RGB
                img_input = img_input[:, :, :3]
            return True, img_input.astype(np.uint8)

        if isinstance(img_input, Image.Image):
            return True, np.array(_fix_orientation(img_input), dtype=np.uint8)

        if isinstance(img_input, (bytes, bytearray)):
            with Image.open(io.BytesIO(img_input)) as pil:
                return True, np.array(_fix_orientation(pil), dtype=np.uint8)

        if isinstance(img_input, str):
            payload = img_input
            # Browsers and many APIs send "data:image/png;base64,<payload>";
            # only the part after the comma is base64.
            if payload.startswith("data:") and "," in payload:
                payload = payload.split(",", 1)[1]
            return _decode_to_rgb(base64.b64decode(payload))

        logger.error(
            f"Decode failed: unsupported input type {type(img_input).__name__}"
        )
    except Exception as e:
        logger.error(f"Decode failed: {e}")
    return False, None
# ─────────────────────────────────────────────────────────────────────────────
# Face alignment — similarity transform to ArcFace canonical crop
# ─────────────────────────────────────────────────────────────────────────────
def _estimate_norm(lmk: np.ndarray, image_size: int = 112) -> Optional[np.ndarray]:
    """
    Estimate the similarity transform (rotation + uniform scale + translation)
    that maps five detected landmarks onto the ARCFACE_DST template.

    Args:
        lmk: landmark coordinates, shape (5, 2), in the same row order as
            ARCFACE_DST.
        image_size: side length of the target crop; the 112-pixel template is
            scaled proportionally.

    Returns:
        A 2×3 affine matrix, or None if OpenCV could not fit a transform.

    Raises:
        ValueError: ``lmk`` does not have shape (5, 2).
    """
    lmk = np.asarray(lmk, dtype=np.float32)
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if lmk.shape != (5, 2):
        raise ValueError(f"expected landmarks of shape (5, 2), got {lmk.shape}")

    dst = ARCFACE_DST * (image_size / 112.0)
    # estimateAffinePartial2D fits a similarity (no shear, no reflection).
    M, _ = cv2.estimateAffinePartial2D(lmk, dst, method=cv2.LMEDS)
    if M is None:
        # Retry the same similarity fit with the RANSAC estimator.
        M, _ = cv2.estimateAffinePartial2D(lmk, dst, method=cv2.RANSAC)
    return M
def _align_face(img_rgb: np.ndarray, landmarks: np.ndarray,
                image_size: int = 112) -> Optional[np.ndarray]:
    """
    Warp the face described by ``landmarks`` into a square canonical crop.

    Returns the ``image_size`` × ``image_size`` warped image, or None when no
    transform could be estimated or the warp raised (the error is logged).
    """
    crop_size = (image_size, image_size)
    try:
        matrix = _estimate_norm(landmarks, image_size)
        if matrix is None:
            aligned = None
        else:
            # Pixels that fall outside the source image are filled with 0.
            aligned = cv2.warpAffine(img_rgb, matrix, crop_size, borderValue=0)
    except Exception as exc:
        logger.error(f"Alignment failed: {exc}")
        return None
    return aligned
# ─────────────────────────────────────────────────────────────────────────────
# RetinaFace detection
# ─────────────────────────────────────────────────────────────────────────────
def _template_ordered_landmarks(lm: dict) -> np.ndarray:
    """
    Return the five landmarks as a (5, 2) float32 array in ArcFace template
    order: [eye, eye, nose, mouth corner, mouth corner], where the first point
    of each pair is the one that lands on the template's smaller-x position.

    The pair order is derived from geometry rather than from the detector's
    "left"/"right" key names, so it is correct whether those names are
    relative to the subject or to the viewer, and it is unaffected by in-plane
    rotation of the face.
    """
    eye_a = np.asarray(lm["left_eye"], dtype=np.float32)
    eye_b = np.asarray(lm["right_eye"], dtype=np.float32)
    nose = np.asarray(lm["nose"], dtype=np.float32)
    mouth_a = np.asarray(lm["mouth_left"], dtype=np.float32)
    mouth_b = np.asarray(lm["mouth_right"], dtype=np.float32)

    mouth_mid = (mouth_a + mouth_b) / 2.0
    eye_vec = eye_b - eye_a
    to_mouth = mouth_mid - eye_a
    # In image coordinates (y down) the template satisfies
    # cross(second_eye - first_eye, mouth_centre - first_eye) > 0, and the sign
    # of a cross product survives rotation and scaling. A negative value
    # therefore means the two eyes are in the opposite order.
    if eye_vec[0] * to_mouth[1] - eye_vec[1] * to_mouth[0] < 0:
        eye_a, eye_b = eye_b, eye_a
        eye_vec = -eye_vec
    # The mouth corners must run in the same direction as the eyes.
    if float(np.dot(mouth_b - mouth_a, eye_vec)) < 0:
        mouth_a, mouth_b = mouth_b, mouth_a

    return np.stack([eye_a, eye_b, nose, mouth_a, mouth_b]).astype(np.float32)


def _retinaface_detect(img_rgb: np.ndarray) -> list:
    """
    Run RetinaFace on an RGB image.

    Returns a list of dicts, largest face first, each with:
      * ``bbox``      — (x1, y1, x2, y2)
      * ``landmarks`` — (5, 2) float32 array in ARCFACE_DST row order
      * ``score``     — detector score (1.0 when the detector reports none)
      * ``area``      — bounding-box area

    Detections that are malformed or have a non-positive area are skipped.
    An empty list is returned when nothing is detected.
    """
    from retinaface import RetinaFace

    # NOTE(review): retina-face appears to treat ndarray input like
    # cv2.imread output (BGR), so convert before detection — confirm against
    # the installed package version.
    img_bgr = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2BGR)

    # detect_faces returns a dict keyed by "face_1", "face_2", ... whose values
    # hold "facial_area", "landmarks" and "score"; anything else means no faces.
    detections = RetinaFace.detect_faces(img_bgr)
    if not isinstance(detections, dict):
        return []

    faces = []
    for val in detections.values():
        try:
            x1, y1, x2, y2 = val["facial_area"]
            area = (x2 - x1) * (y2 - y1)
            if area <= 0:
                # A degenerate box cannot be ranked by size or aligned.
                continue
            faces.append({
                "bbox": (x1, y1, x2, y2),
                "landmarks": _template_ordered_landmarks(val["landmarks"]),
                "score": float(val.get("score", 1.0)),
                "area": area,
            })
        except (KeyError, TypeError, ValueError):
            # Missing keys, wrong container types, or a box that does not
            # unpack into four numbers: skip this detection.
            continue

    # Sort by area descending (largest face first).
    faces.sort(key=lambda f: f["area"], reverse=True)
    return faces
def _detect_and_align(img_rgb: np.ndarray,
                      image_idx: int) -> Tuple[Optional[dict], str]:
    """
    Detect the dominant face in one image and produce its aligned crop.

    If no face is found, detection is retried on the image rotated by 90°,
    180° and 270°; the first rotation that yields a face is used.

    Args:
        img_rgb: RGB image array.
        image_idx: image number used in log and feedback messages.

    Returns:
        (result, message). On success ``result`` is a dict with
        ``image_tensor`` (float32 numpy array of shape [1, 3, 112, 112] in
        [0, 1]) and ``detection_confidence`` (score rounded to 3 decimals),
        and ``message`` is "OK". On failure ``result`` is None and ``message``
        explains why.
    """
    faces = _retinaface_detect(img_rgb)

    # Rotation retry if nothing found.
    if not faces:
        rotations = (
            (90, cv2.ROTATE_90_CLOCKWISE),
            (180, cv2.ROTATE_180),
            (270, cv2.ROTATE_90_COUNTERCLOCKWISE),
        )
        for angle, code in rotations:
            rotated = cv2.rotate(img_rgb, code)
            faces = _retinaface_detect(rotated)
            if faces:
                img_rgb = rotated
                logger.info(f"Image {image_idx}: detected after {angle}° rotation")
                break

    if not faces:
        return None, (f"No face detected in image {image_idx}. "
                      "Ensure the face is clearly visible, well-lit, and unobstructed.")

    # Two-face handling: keep the largest if the second is tiny
    # (background / watermark); otherwise the image is ambiguous.
    if len(faces) >= 2:
        largest_area = faces[0]["area"]
        # A zero-area largest box cannot be compared by ratio; treat the two
        # faces as equally sized instead of dividing by zero.
        ratio = faces[1]["area"] / largest_area if largest_area > 0 else 1.0
        if ratio >= FACE_RATIO_THRESH:
            return None, (f"Two comparable faces found in image {image_idx} "
                          f"(size ratio {ratio:.2f}). Please upload an image "
                          "with a single dominant face.")
        # else: silently drop the smaller face

    face = faces[0]
    crop = _align_face(img_rgb, face["landmarks"])
    if crop is None:
        return None, f"Face alignment failed for image {image_idx}."

    # HWC uint8 crop → float32 tensor [1, 3, 112, 112] in [0, 1]
    tensor = (torch.from_numpy(crop.astype(np.float32))
              .permute(2, 0, 1)
              .unsqueeze(0) / 255.0)
    return {
        "image_tensor": tensor.numpy(),
        "detection_confidence": round(face["score"], 3),
    }, "OK"
# ─────────────────────────────────────────────────────────────────────────────
# Embedding + similarity
# ─────────────────────────────────────────────────────────────────────────────
def _augmented_embeddings(tensor: torch.Tensor) -> List[torch.Tensor]:
    """
    Embed three views of one face: the input, its flip along dim 3, and a
    copy brightened by 1.5× and clamped to [0, 1].

    Returns the three squeezed model outputs in that order, computed without
    gradient tracking.
    """
    mirrored = torch.flip(tensor, dims=[3])
    brightened = torch.clamp(tensor * 1.5, 0, 1)

    embeddings = []
    with torch.no_grad():
        for view in (tensor, mirrored, brightened):
            embeddings.append(_COMPARISON_MODEL(view).squeeze())
    return embeddings
def _avg_cosine(embs1: List[torch.Tensor],
                embs2: List[torch.Tensor]) -> float:
    """
    Mean cosine similarity over every (embs1, embs2) pair of embeddings.
    """
    pair_scores = []
    for emb_a in embs1:
        row_a = emb_a.unsqueeze(0)
        for emb_b in embs2:
            similarity = F.cosine_similarity(row_a, emb_b.unsqueeze(0))
            pair_scores.append(similarity.item())
    return sum(pair_scores) / len(pair_scores)
def _cosine_to_confidence(score: float) -> float:
    """
    Map a cosine score to a confidence percentage.

    The score goes through a sigmoid centred on THRESHOLD with slope
    STEEPNESS; the result is scaled to percent, capped at MAX_CONFIDENCE and
    rounded to two decimals.
    """
    offset = score - THRESHOLD
    probability = 1.0 / (1.0 + np.exp(-STEEPNESS * offset))
    percent = probability * 100.0
    return round(min(percent, MAX_CONFIDENCE), 2)
# ─────────────────────────────────────────────────────────────────────────────
# Full pipeline
# ─────────────────────────────────────────────────────────────────────────────
def _compare(img1, img2) -> dict:
    """
    Run the full pipeline on two images: decode, detect + align, embed, score.

    Returns a dict with keys ``success``, ``message``, ``score``,
    ``confidence``, ``match``, ``det1`` and ``det2``. On failure ``success``
    is False, the numeric fields are 0.0 (except ``det1`` when only the second
    image failed detection) and ``message`` explains what went wrong.
    """
    def _failure(message: str, det1: float = 0.0) -> dict:
        # Single definition of the failure payload shape.
        return {"success": False, "message": message,
                "score": 0.0, "confidence": 0.0, "match": False,
                "det1": det1, "det2": 0.0}

    ok1, rgb1 = _decode_to_rgb(img1)
    ok2, rgb2 = _decode_to_rgb(img2)
    if not (ok1 and ok2):
        return _failure("Image decoding failed.")

    face1, msg1 = _detect_and_align(rgb1, 1)
    if face1 is None:
        return _failure(msg1)

    face2, msg2 = _detect_and_align(rgb2, 2)
    if face2 is None:
        return _failure(msg2, det1=face1["detection_confidence"])

    t1 = torch.tensor(face1["image_tensor"], dtype=torch.float32)
    t2 = torch.tensor(face2["image_tensor"], dtype=torch.float32)
    embeddings1 = _augmented_embeddings(t1)
    embeddings2 = _augmented_embeddings(t2)

    score = _avg_cosine(embeddings1, embeddings2)
    confidence = _cosine_to_confidence(score)
    match = score >= THRESHOLD
    verdict = "Faces matched" if match else "Faces do not match"
    return {
        "success": True,
        "match": match,
        "score": round(score, 4),
        "confidence": confidence,
        "message": verdict,
        "det1": face1["detection_confidence"],
        "det2": face2["detection_confidence"],
    }
# ─────────────────────────────────────────────────────────────────────────────
# Gradio inference wrapper
# ─────────────────────────────────────────────────────────────────────────────
def run_comparison(img1: np.ndarray, img2: np.ndarray):
    """
    Gradio callback: compare two uploaded images.

    Returns a 4-tuple for the UI outputs:
    (verdict HTML, similarity-score text, confidence text, details HTML).
    When an image is missing or the pipeline fails, the two text fields hold
    an em-dash placeholder.
    """
    placeholder = "—"

    if img1 is None or img2 is None:
        err = _verdict_html(False, None, "Upload both images to run comparison.")
        return err, placeholder, placeholder, ""

    r = _compare(img1, img2)
    if not r["success"]:
        return (
            _verdict_html(False, None, r["message"]),
            placeholder,
            placeholder,
            _details_html(r),
        )

    return (
        _verdict_html(True, r["match"], r["message"]),
        f"{r['score']:.4f}",
        f"{r['confidence']}%",
        _details_html(r),
    )
def _verdict_html(success: bool, match: Optional[bool], message: str) -> str:
    """
    Render the verdict banner.

    Args:
        success: False renders an "Error" banner regardless of ``match``.
        match: True renders "Match", anything else "No Match".
        message: text shown under the label; inserted into the markup as-is.

    Returns:
        An HTML fragment.
    """
    if not success:
        color, icon, label = "#c0392b", "✕", "Error"
    elif match:
        color, icon, label = "#16a085", "✓", "Match"
    else:
        color, icon, label = "#c0392b", "✕", "No Match"
    # Green tint only for a successful match; every other state is red-tinted.
    bg = "#eafaf7" if (success and match) else "#fdf2f2"
    return f"""
    <div style="display:flex; align-items:center; gap:14px; padding:18px 24px;
    border-radius:10px; border-left:4px solid {color}; background:{bg};
    font-family:'DM Sans',sans-serif;">
      <span style="font-size:28px; color:{color}; font-weight:700;">{icon}</span>
      <div>
        <div style="font-size:19px; font-weight:700; color:{color};">{label}</div>
        <div style="font-size:13px; color:#555; margin-top:2px;">{message}</div>
      </div>
    </div>"""
def _details_html(r: dict) -> str:
    """
    Render the details panel for a comparison result.

    Args:
        r: result dict as returned by ``_compare``. A failed result only needs
           ``message``; a successful one needs ``score``, ``confidence``,
           ``match``, ``det1`` and ``det2``.

    Returns:
        An HTML fragment: the bare message for a failure, otherwise the score,
        threshold, method, confidence bar and per-image detection confidence.
    """
    if not r.get("success"):
        return (f'<div style="font-family:monospace; font-size:12px; color:#888;'
                f'padding:10px 14px; background:#f9f9f9; border-radius:6px;">'
                f'{r["message"]}</div>')

    # Bar width is a whole-number percentage, never wider than the track.
    bar_pct = min(int(r["confidence"]), 100)
    bar_color = "#16a085" if r["match"] else "#c0392b"
    return f"""
    <div style="font-family:'DM Sans',sans-serif; font-size:13px; color:#333;">
      <div style="display:flex; gap:32px; margin-bottom:14px;">
        <div>
          <div style="font-size:11px; color:#888; text-transform:uppercase; letter-spacing:.06em;">Similarity score</div>
          <div style="font-size:22px; font-weight:700; color:#111;">{r['score']}</div>
        </div>
        <div>
          <div style="font-size:11px; color:#888; text-transform:uppercase; letter-spacing:.06em;">Threshold</div>
          <div style="font-size:22px; font-weight:700; color:#111;">{THRESHOLD}</div>
        </div>
        <div>
          <div style="font-size:11px; color:#888; text-transform:uppercase; letter-spacing:.06em;">Method</div>
          <div style="font-size:13px; font-weight:500; color:#555; padding-top:5px;">Augmented cosine<br>(3×3 pairs)</div>
        </div>
      </div>
      <div style="margin-bottom:6px;">
        <div style="display:flex; justify-content:space-between; margin-bottom:4px;">
          <span style="font-size:11px; color:#888; text-transform:uppercase; letter-spacing:.06em;">Confidence</span>
          <span style="font-size:13px; font-weight:700; color:{bar_color};">{r['confidence']}%</span>
        </div>
        <div style="height:6px; background:#e8e8e8; border-radius:4px; overflow:hidden;">
          <div style="height:100%; width:{bar_pct}%; background:{bar_color};
          border-radius:4px; transition:width .4s;"></div>
        </div>
      </div>
      <div style="display:flex; gap:24px; margin-top:14px; padding-top:14px;
      border-top:1px solid #ececec;">
        <div style="font-size:11px; color:#888;">
          Detection confidence — image 1: <strong style="color:#333;">{r['det1']}</strong>
        </div>
        <div style="font-size:11px; color:#888;">
          Detection confidence — image 2: <strong style="color:#333;">{r['det2']}</strong>
        </div>
      </div>
    </div>"""
# ─────────────────────────────────────────────────────────────────────────────
# CSS
# ─────────────────────────────────────────────────────────────────────────────
# Stylesheet handed to gr.Blocks(css=...). It loads the DM Sans / DM Mono web
# fonts, styles the custom classes used by the HTML fragments and
# elem_classes in build_ui (.top-bar, .badge, .upload-panel, .upload-label,
# .results-label, .run-btn, .clear-btn), and hides the footer and number inputs.
# NOTE(review): `.svelte-1gfkn6j` looks like a build-generated class name and
# may stop matching after a Gradio upgrade — verify.
CSS = """
@import url('https://fonts.googleapis.com/css2?family=DM+Sans:wght@400;500;700&family=DM+Mono:wght@400;500&display=swap');
body, .gradio-container {
font-family: 'DM Sans', sans-serif !important;
background: #f7f7f5 !important;
}
.top-bar {
background: #0d0d0d; color: #fff;
padding: 18px 28px 14px; border-radius: 12px; margin-bottom: 4px;
}
.top-bar h1 { font-size: 22px; font-weight: 700; margin: 0 0 4px; letter-spacing: -0.02em; }
.top-bar .badges { display: flex; gap: 8px; margin-top: 10px; flex-wrap: wrap; }
.top-bar .badge {
font-family: 'DM Mono', monospace; font-size: 10px;
padding: 3px 9px; border: 1px solid #333; border-radius: 20px; color: #aaa;
}
.upload-panel { background: #fff; border: 1px solid #e5e5e5; border-radius: 12px; overflow: hidden; }
.upload-label {
font-size: 11px; font-weight: 700; letter-spacing: .08em;
text-transform: uppercase; color: #888; padding: 10px 14px 0;
font-family: 'DM Mono', monospace;
}
.results-label {
font-size: 11px; font-weight: 700; text-transform: uppercase;
letter-spacing: .08em; color: #bbb; margin-bottom: 10px;
font-family: 'DM Mono', monospace;
}
.run-btn {
background: #0d0d0d !important; color: #fff !important;
border: none !important; border-radius: 8px !important;
font-family: 'DM Sans', sans-serif !important; font-weight: 700 !important;
font-size: 14px !important; padding: 12px 0 !important;
width: 100% !important; cursor: pointer !important; letter-spacing: 0.01em !important;
}
.run-btn:hover { background: #1a1a1a !important; }
.clear-btn {
background: transparent !important; color: #888 !important;
border: 1px solid #ddd !important; border-radius: 8px !important;
font-family: 'DM Mono', monospace !important; font-size: 12px !important;
}
footer { display: none !important; }
.svelte-1gfkn6j { display: none !important; }
input[type=number] { display: none; }
label span { font-family: 'DM Mono', monospace; font-size: 11px !important; color: #888 !important; }
"""
# ─────────────────────────────────────────────────────────────────────────────
# Gradio UI
# ─────────────────────────────────────────────────────────────────────────────
def build_ui():
    """
    Assemble the Gradio Blocks app: header, two image inputs with Clear and
    Compare buttons, the result panel, and the "How it works" accordion.

    Returns:
        The ``gr.Blocks`` instance; launching it is left to the caller.
    """
    # Empty-state markup, shared by the initial render and the Clear button so
    # the two cannot drift apart.
    verdict_placeholder = (
        '<div style="height:72px; background:#f7f7f5; border-radius:10px;'
        'border:1px dashed #ddd; display:flex; align-items:center;'
        'justify-content:center; color:#ccc; font-size:13px;'
        'font-family:DM Mono,monospace;">awaiting input</div>'
    )
    details_placeholder = (
        '<div style="height:80px; background:#f7f7f5; border-radius:8px;'
        'border:1px dashed #ddd;"></div>'
    )
    label_placeholder = "—"

    def _reset():
        # Values for (img1, img2, verdict, score, confidence, details).
        return (
            None, None,
            verdict_placeholder,
            label_placeholder, label_placeholder,
            details_placeholder,
        )

    with gr.Blocks(css=CSS, title="Facial Comparison") as demo:
        gr.HTML("""
        <div class="top-bar">
          <h1>Facial Comparison</h1>
          <p style="color:#ccc; font-size:13px; font-family:'DM Sans',sans-serif;
          font-weight:400; margin-bottom:10px; line-height:1.6;">
            Verify whether two faces belong to the same person — works on portraits, selfies,
            and identity documents (CNIC, passport). The system automatically extracts the face
            from an ID card and compares it against a live photo.
            Deployed across <strong style="color:#fff;">40+ financial institutions</strong> for
            customer onboarding and fraud prevention.
          </p>
          <div class="badges">
            <span class="badge">face matching</span>
            <span class="badge">ID card face extraction</span>
            <span class="badge">liveness-aware</span>
            <span class="badge">occlusion handling</span>
            <span class="badge">production-grade</span>
          </div>
          <div style="margin-top:12px; display:flex; align-items:center; gap:6px;
          font-family:'DM Mono',monospace; font-size:10px; color:#555;">
            <span style="display:inline-block; width:7px; height:7px; border-radius:50%;
            background:#22c55e; flex-shrink:0;"></span>
            No images are stored, logged, or transmitted beyond this session.
            Your data never leaves inference memory.
          </div>
        </div>
        """)

        with gr.Row(equal_height=True):
            # Left column: inputs and actions.
            with gr.Column(scale=5):
                with gr.Row(equal_height=True):
                    with gr.Column():
                        gr.HTML('<div class="upload-label">Image 1</div>')
                        img1 = gr.Image(label="", type="numpy",
                                        sources=["upload", "clipboard"],
                                        height=260, elem_classes=["upload-panel"])
                    with gr.Column():
                        gr.HTML('<div class="upload-label">Image 2</div>')
                        img2 = gr.Image(label="", type="numpy",
                                        sources=["upload", "clipboard"],
                                        height=260, elem_classes=["upload-panel"])
                with gr.Row():
                    clear_btn = gr.Button("Clear", elem_classes=["clear-btn"])
                    run_btn = gr.Button("Compare →", elem_classes=["run-btn"])
                gr.HTML("""
                <div style="margin-top:10px; padding:12px 16px; background:#fff;
                border:1px solid #ececec; border-radius:10px;
                font-family:'DM Mono',monospace; font-size:11px;
                color:#aaa; line-height:1.8;">
                  <strong style="color:#555;">Supported inputs</strong><br>
                  Portrait photo · Selfie · ID card (face auto-extracted) · Passport photo page<br><br>
                  <strong style="color:#555;">How to use</strong><br>
                  Upload any two images — the system locates and extracts the face from each,
                  then computes a match score and confidence percentage.
                </div>
                """)

            # Right column: results.
            with gr.Column(scale=4):
                gr.HTML('<div class="results-label" style="margin-bottom:6px;">Result</div>')
                verdict_html = gr.HTML(value=verdict_placeholder)
                with gr.Row():
                    score_out = gr.Label(label="Similarity score")
                    conf_out = gr.Label(label="Confidence")
                gr.HTML('<div class="results-label" style="margin:10px 0 6px;">Details</div>')
                details_html = gr.HTML(value=details_placeholder)

        # NOTE(review): the nesting of this accordion (full width below both
        # columns vs. inside the results column) was reconstructed — confirm
        # against the intended layout.
        with gr.Accordion("How it works", open=False):
            gr.HTML("""
            <div style="font-family:'DM Sans',sans-serif; font-size:13px;
            color:#555; line-height:1.8; padding:4px 0;">
              <strong>1. Face extraction</strong> — The system automatically locates
              every face in the uploaded image, including faces embedded in identity
              documents like CNICs and passports. No manual cropping required.<br><br>
              <strong>2. Alignment</strong> — Each detected face is geometrically
              normalised to a canonical frontal pose using facial landmark positions,
              making the comparison robust to head tilt, lighting, and image angle.<br><br>
              <strong>3. Feature encoding</strong> — The aligned face is passed through
              a deep neural network (custom-trained) that compresses it into a compact
              numerical representation capturing unique facial geometry.<br><br>
              <strong>4. Robust matching</strong> — Multiple augmented versions of each
              face are compared, and the results are averaged to produce a stable
              similarity score resilient to minor image quality variations.<br><br>
              <strong>5. Confidence scoring</strong> — The similarity score is converted
              into an intuitive 0–99.9% confidence value along with a clear
              Match / No Match verdict.<br><br>
              <strong style="color:#16a085;">Privacy</strong> — All processing happens
              entirely within the inference session. No image, face crop, score, or
              metadata is written to disk, logged, or sent to any external service.
              Once your session ends, nothing is retained.
            </div>
            """)

        run_btn.click(
            fn=run_comparison,
            inputs=[img1, img2],
            outputs=[verdict_html, score_out, conf_out, details_html],
        )
        clear_btn.click(
            fn=_reset,
            outputs=[img1, img2, verdict_html, score_out, conf_out, details_html],
        )
    return demo
# Script entry point: build the interface and serve it on all network
# interfaces (0.0.0.0) at port 7860.
if __name__ == "__main__":
    ui = build_ui()
    ui.launch(server_name="0.0.0.0", server_port=7860, show_error=True)