Spaces:

neuralninja10
/

facialComparison

Sleeping

App Files Files Community

facialComparison / app.py

neuralninja10

Update app.py

6fc0a40 verified about 1 month ago

raw

history blame contribute delete

28.4 kB

	"""
	Facial Comparison — HuggingFace Space
	======================================
	Detection : RetinaFace (public, well-known detector)
	Alignment : 5-point similarity transform → 112×112 canonical crop
	Embedding : facial_comparison.pt (private TorchScript model via HF secrets)
	Similarity : Augmented cosine (3×3 pairs) + sigmoid confidence
	"""

	import os
	import io
	import base64
	import logging
	import numpy as np
	import torch
	import torch.nn.functional as F
	import cv2
	from PIL import Image, ImageOps
	from typing import List, Tuple, Optional
	import gradio as gr

	# ── Logging ───────────────────────────────────────────────────────────────────
	# Root logging configuration: INFO and above, rendered as
	# "LEVEL | logger-name | message". The separators are plain pipes — a
	# backslash before "|" is not a valid Python escape and would be printed
	# literally in every log line.
	logging.basicConfig(level=logging.INFO, format="%(levelname)s | %(name)s | %(message)s")
	# Logger used by every function in this module.
	logger = logging.getLogger("facial-comparison")

	# ── Config ────────────────────────────────────────────────────────────────────
	# Each setting is read from an environment variable of the same purpose; the
	# second argument is the fallback used when the variable is unset.

	# Weights: local path first, otherwise the Hub repo/file to download from.
	MODEL_PATH = os.environ.get("MODEL_PATH", "models/facial_comparison.pt")
	HF_MODEL_REPO = os.environ.get("HF_MODEL_REPO", "")
	HF_MODEL_FILE = os.environ.get("HF_MODEL_FILE", "facial_comparison.pt")

	# Averaged cosine score at or above which two faces are reported as a match.
	THRESHOLD = float(os.environ.get("THRESHOLD", "0.38"))
	# Second-largest/largest face area ratio at or above which an image is
	# rejected for containing two comparable faces.
	FACE_RATIO_THRESH = float(os.environ.get("FACE_RATIO_THRESHOLD", "0.15"))
	# Slope of the sigmoid that turns a cosine score into a confidence value.
	STEEPNESS = float(os.environ.get("STEEPNESS", "12.0"))
	# Upper cap (in percent) applied to the reported confidence.
	MAX_CONFIDENCE = float(os.environ.get("MAX_CONFIDENCE", "99.9"))

	# ── Canonical 5-point template (112×112, ArcFace standard) ───────────────────
	# Destination (x, y) coordinates, in pixels of a 112×112 crop, that detected
	# landmarks are warped onto before embedding. Rows are paired by index with
	# the landmark array assembled in _retinaface_detect.
	ARCFACE_DST = np.asarray(
	    (
	        (38.2946, 51.6963),  # paired with lm["left_eye"]
	        (73.5318, 51.5014),  # paired with lm["right_eye"]
	        (56.0252, 71.7366),  # paired with lm["nose"]
	        (41.5493, 92.3655),  # paired with lm["mouth_left"]
	        (70.7299, 92.2041),  # paired with lm["mouth_right"]
	    ),
	    dtype=np.float32,
	)


	# ── Model loading ─────────────────────────────────────────────────────────────
	def _resolve_model_path() -> str:
	    """
	    Return a filesystem path to the embedding-model weights.

	    Resolution order:
	      1. MODEL_PATH, when something exists at that path.
	      2. Otherwise, when HF_MODEL_REPO is set, HF_MODEL_FILE is downloaded
	         from that Hub repo (passing the HF_TOKEN environment variable as
	         the token) and the path reported by hf_hub_download is returned.

	    Raises:
	        FileNotFoundError: neither a local file nor a Hub repo is available.
	    """
	    local_copy_present = os.path.exists(MODEL_PATH)
	    if local_copy_present:
	        return MODEL_PATH

	    if not HF_MODEL_REPO:
	        raise FileNotFoundError(
	            f"Weights not found at '{MODEL_PATH}'. "
	            "Set HF_MODEL_REPO + HF_MODEL_FILE + HF_TOKEN in Space secrets."
	        )

	    # Imported lazily: the Hub client is only needed when a download happens.
	    from huggingface_hub import hf_hub_download

	    logger.info(f"Pulling weights from Hub: {HF_MODEL_REPO}")
	    hub_token = os.getenv("HF_TOKEN")
	    return hf_hub_download(
	        repo_id=HF_MODEL_REPO,
	        filename=HF_MODEL_FILE,
	        token=hub_token,
	    )


	def _load_models():
	    """
	    Load the TorchScript embedding model on the CPU and switch it to eval mode.

	    The weights file is located through _resolve_model_path(). Only the
	    embedding model is loaded here; the face detector is not touched.

	    Returns the loaded TorchScript module.
	    """
	    cpu = torch.device("cpu")

	    logger.info("Loading TorchScript embedding model...")
	    weights_path = _resolve_model_path()
	    embedder = torch.jit.load(weights_path, map_location=cpu)
	    embedder.eval()
	    logger.info(f"Embedding model loaded from: {weights_path}")

	    # RetinaFace is not imported in this function — the import happens inside
	    # _retinaface_detect the first time detection runs.
	    logger.info("RetinaFace detector ready (loaded on first call)")

	    return embedder


	# Loaded once at import time; _augmented_embeddings runs every crop through it.
	_COMPARISON_MODEL = _load_models()


	# ─────────────────────────────────────────────────────────────────────────────
	# Image utilities
	# ─────────────────────────────────────────────────────────────────────────────

	def _fix_orientation(img: Image.Image) -> Image.Image:
	    """
	    Apply the image's EXIF orientation (if any) and return it converted to RGB.

	    The orientation step is best-effort: when it fails, the image is used as
	    it was passed in and the failure is logged instead of being silently
	    discarded. The final RGB conversion is not guarded and may raise.
	    """
	    try:
	        transposed = ImageOps.exif_transpose(img)
	        # Keep the original image if no transposed copy was handed back.
	        if transposed is not None:
	            img = transposed
	        # Drop the raw EXIF payload from the image's info dict, if present.
	        img.info.pop("exif", None)
	    except Exception as e:
	        logger.warning(f"EXIF orientation fix skipped: {e}")
	    return img.convert("RGB")


	def _decode_to_rgb(img_input) -> Tuple[bool, Optional[np.ndarray]]:
	"""
	Accepts Gradio numpy (RGB), PIL Image, raw bytes, or base64 string.
	Returns (success, RGB uint8 ndarray).
	RetinaFace expects RGB; we keep everything in RGB throughout.
	"""
	try:
	if isinstance(img_input, np.ndarray):
	if img_input.ndim == 2: # grayscale → RGB
	img_input = cv2.cvtColor(img_input, cv2.COLOR_GRAY2RGB)
	elif img_input.shape[2] == 4: # RGBA → RGB
	img_input = cv2.cvtColor(img_input, cv2.COLOR_RGBA2RGB)
	return True, img_input.astype(np.uint8)

	if isinstance(img_input, Image.Image):
	return True, np.array(_fix_orientation(img_input), dtype=np.uint8)

	if isinstance(img_input, bytes):
	pil = Image.open(io.BytesIO(img_input))
	return True, np.array(_fix_orientation(pil), dtype=np.uint8)

	if isinstance(img_input, str):
	return _decode_to_rgb(base64.b64decode(img_input))

	except Exception as e:
	logger.error(f"Decode failed: {e}")

	return False, None


	# ─────────────────────────────────────────────────────────────────────────────
	# Face alignment — similarity transform to ArcFace canonical crop
	# ─────────────────────────────────────────────────────────────────────────────

	def _estimate_norm(lmk: np.ndarray, image_size: int = 112) -> np.ndarray:
	"""
	Estimate the similarity transform (rotation + scale + translation) that
	maps detected 5-point landmarks onto the ArcFace canonical template.
	Returns a 2×3 affine matrix.
	"""
	assert lmk.shape == (5, 2)
	dst = ARCFACE_DST * (image_size / 112.0)

	# Use OpenCV estimateAffinePartial2D (similarity: no shear)
	M, _ = cv2.estimateAffinePartial2D(lmk, dst, method=cv2.LMEDS)
	if M is None:
	# Fallback: least-squares full affine
	M, _ = cv2.estimateAffinePartial2D(lmk, dst, method=cv2.RANSAC)
	return M


	def _align_face(img_rgb: np.ndarray, landmarks: np.ndarray,
	                image_size: int = 112) -> Optional[np.ndarray]:
	    """
	    Warp a face onto the canonical image_size × image_size crop.

	    The transform comes from _estimate_norm(landmarks, image_size); pixels
	    falling outside the source image are filled with 0.

	    Returns the warped image, or None when no transform could be estimated
	    or any step raised (the error is logged).
	    """
	    try:
	        transform = _estimate_norm(landmarks, image_size)
	        if transform is not None:
	            out_size = (image_size, image_size)
	            return cv2.warpAffine(img_rgb, transform, out_size,
	                                  borderValue=0)
	    except Exception as e:
	        logger.error(f"Alignment failed: {e}")
	    return None


	# ─────────────────────────────────────────────────────────────────────────────
	# RetinaFace detection
	# ─────────────────────────────────────────────────────────────────────────────

	def _retinaface_detect(img_rgb: np.ndarray) -> list:
	    """
	    Run RetinaFace on an image and return the detected faces, largest first.

	    Each returned dict has:
	      * "bbox":      (x1, y1, x2, y2) taken from the detection's "facial_area"
	      * "landmarks": float32 array of shape (5, 2), ordered
	                     [left_eye, right_eye, nose, mouth_left, mouth_right]
	      * "score":     detector score as float (1.0 when the detector gives none)
	      * "area":      (x2 - x1) * (y2 - y1)

	    A detector result that is not a dict is treated as "no faces". Malformed
	    individual detections are skipped with a warning instead of aborting the
	    whole call.
	    """
	    # Imported on first use rather than at module import.
	    from retinaface import RetinaFace

	    # NOTE(review): this pipeline passes an RGB array — confirm that matches
	    # the channel order the installed detector expects for ndarray input.
	    detections = RetinaFace.detect_faces(img_rgb)

	    if not isinstance(detections, dict):
	        return []

	    faces = []
	    for face_id, val in detections.items():
	        try:
	            x1, y1, x2, y2 = val["facial_area"]
	            score = float(val.get("score", 1.0))
	            lm = val["landmarks"]

	            # Order must match the rows of ARCFACE_DST.
	            pts = np.array([
	                lm["left_eye"],
	                lm["right_eye"],
	                lm["nose"],
	                lm["mouth_left"],
	                lm["mouth_right"],
	            ], dtype=np.float32)
	        except (KeyError, TypeError, ValueError, AttributeError) as e:
	            # ValueError covers a facial_area with the wrong number of
	            # entries and ragged landmark points.
	            logger.warning(f"Skipping malformed detection {face_id!r}: {e!r}")
	            continue

	        if pts.shape != (5, 2):
	            logger.warning(
	                f"Skipping detection {face_id!r}: landmark shape {pts.shape}"
	            )
	            continue

	        faces.append({
	            "bbox": (x1, y1, x2, y2),
	            "landmarks": pts,
	            "score": score,
	            "area": (x2 - x1) * (y2 - y1),
	        })

	    # Sort by area descending (largest face first)
	    faces.sort(key=lambda f: f["area"], reverse=True)
	    return faces


	def _detect_and_align(img_rgb: np.ndarray,
	                      image_idx: int) -> Tuple[Optional[dict], str]:
	    """
	    Detect the dominant face in one image and produce its aligned crop.

	    Detection runs on the image as given; when nothing is found it is retried
	    on the image rotated by 90°, 180° and 270°, keeping the first rotation
	    that yields a face.

	    Args:
	        img_rgb: the image to search.
	        image_idx: image number used in log and feedback messages.

	    Returns:
	        (face_result_dict | None, feedback_message)
	        On success the dict holds "image_tensor" (float32 numpy array,
	        [1, 3, 112, 112] for a 3-channel input, values in [0, 1]) and
	        "detection_confidence" (detector score rounded to 3 decimals), and
	        the message is "OK". On failure the dict is None and the message
	        says why: no face, two comparable faces, or alignment failure.
	    """
	    faces = _retinaface_detect(img_rgb)

	    # Rotation retry if nothing found
	    if not faces:
	        for angle, code in [(90, cv2.ROTATE_90_CLOCKWISE),
	                            (180, cv2.ROTATE_180),
	                            (270, cv2.ROTATE_90_COUNTERCLOCKWISE)]:
	            rotated = cv2.rotate(img_rgb, code)
	            faces = _retinaface_detect(rotated)
	            if faces:
	                # Landmarks refer to the rotated image, so align against it.
	                img_rgb = rotated
	                logger.info(f"Image {image_idx}: detected after {angle}° rotation")
	                break

	    if not faces:
	        return None, (f"No face detected in image {image_idx}. "
	                      "Ensure the face is clearly visible, well-lit, and unobstructed.")

	    # Two-face handling: keep largest if the second is tiny (background/watermark)
	    if len(faces) >= 2:
	        largest_area = faces[0]["area"]
	        # faces is sorted by area, so a non-positive largest area means no
	        # box can be ranked by size; treat the pair as comparable rather than
	        # dividing by zero.
	        ratio = faces[1]["area"] / largest_area if largest_area > 0 else 1.0
	        if ratio >= FACE_RATIO_THRESH:
	            return None, (f"Two comparable faces found in image {image_idx} "
	                          f"(size ratio {ratio:.2f}). Please upload an image "
	                          "with a single dominant face.")
	        # else: silently drop the smaller face

	    face = faces[0]
	    crop = _align_face(img_rgb, face["landmarks"])

	    if crop is None:
	        return None, f"Face alignment failed for image {image_idx}."

	    # HWC uint8 crop → float32 tensor [1, C, H, W] scaled to [0, 1]
	    tensor = (torch.from_numpy(crop.astype(np.float32))
	              .permute(2, 0, 1)
	              .unsqueeze(0) / 255.0)

	    return {
	        "image_tensor": tensor.numpy(),
	        "detection_confidence": round(face["score"], 3),
	    }, "OK"


	# ─────────────────────────────────────────────────────────────────────────────
	# Embedding + similarity
	# ─────────────────────────────────────────────────────────────────────────────

	def _augmented_embeddings(tensor: torch.Tensor) -> List[torch.Tensor]:
	    """
	    Embed three views of one face tensor and return the three embeddings.

	    Views, in order: the input itself, the input flipped along dim 3, and
	    the input multiplied by 1.5 then clamped to [0, 1]. Each view is passed
	    through _COMPARISON_MODEL without gradient tracking and the output is
	    squeezed.
	    """
	    views = (
	        tensor,
	        torch.flip(tensor, dims=[3]),
	        torch.clamp(tensor * 1.5, 0, 1),
	    )
	    embeddings = []
	    with torch.no_grad():
	        for view in views:
	            embeddings.append(_COMPARISON_MODEL(view).squeeze())
	    return embeddings


	def _avg_cosine(embs1: List[torch.Tensor],
	embs2: List[torch.Tensor]) -> float:
	sims = [F.cosine_similarity(e1.unsqueeze(0), e2.unsqueeze(0)).item()
	for e1 in embs1 for e2 in embs2]
	return sum(sims) / len(sims)


	def _cosine_to_confidence(score: float) -> float:
	    """
	    Map a cosine score to a confidence percentage.

	    A sigmoid centred on THRESHOLD with slope STEEPNESS is scaled to 0–100,
	    capped at MAX_CONFIDENCE and rounded to two decimals.
	    """
	    offset = STEEPNESS * (score - THRESHOLD)
	    probability = 1.0 / (1.0 + np.exp(-offset))
	    percent = probability * 100.0
	    return round(min(percent, MAX_CONFIDENCE), 2)


	# ─────────────────────────────────────────────────────────────────────────────
	# Full pipeline
	# ─────────────────────────────────────────────────────────────────────────────

	def _compare(img1, img2) -> dict:
	    """
	    Run the full comparison pipeline on two image inputs.

	    Steps: decode both inputs, detect and align the face in each, embed three
	    augmented views per face, average the pairwise cosine similarities, and
	    turn the score into a confidence and a match decision
	    (score >= THRESHOLD).

	    Always returns a dict with the keys "success", "message", "score",
	    "confidence", "match", "det1" and "det2". On failure "success" is False,
	    the numeric fields are 0.0 (except "det1", which carries the first
	    image's detection confidence when only the second image failed) and
	    "message" explains the failure.
	    """
	    def failed(message: str, det1: float = 0.0) -> dict:
	        # Shared shape of every unsuccessful result.
	        return {"success": False, "message": message,
	                "score": 0.0, "confidence": 0.0, "match": False,
	                "det1": det1, "det2": 0.0}

	    ok1, rgb1 = _decode_to_rgb(img1)
	    ok2, rgb2 = _decode_to_rgb(img2)
	    if not (ok1 and ok2):
	        return failed("Image decoding failed.")

	    face1, msg1 = _detect_and_align(rgb1, 1)
	    if face1 is None:
	        return failed(msg1)

	    face2, msg2 = _detect_and_align(rgb2, 2)
	    if face2 is None:
	        return failed(msg2, det1=face1["detection_confidence"])

	    t1 = torch.tensor(face1["image_tensor"], dtype=torch.float32)
	    t2 = torch.tensor(face2["image_tensor"], dtype=torch.float32)

	    embs1 = _augmented_embeddings(t1)
	    embs2 = _augmented_embeddings(t2)
	    score = _avg_cosine(embs1, embs2)
	    confidence = _cosine_to_confidence(score)
	    match = score >= THRESHOLD

	    if match:
	        verdict = "Faces matched"
	    else:
	        verdict = "Faces do not match"

	    return {
	        "success": True,
	        "match": match,
	        "score": round(score, 4),
	        "confidence": confidence,
	        "message": verdict,
	        "det1": face1["detection_confidence"],
	        "det2": face2["detection_confidence"],
	    }


	# ─────────────────────────────────────────────────────────────────────────────
	# Gradio inference wrapper
	# ─────────────────────────────────────────────────────────────────────────────

	def run_comparison(img1: np.ndarray, img2: np.ndarray):
	    """
	    Gradio handler for the compare button.

	    Returns a 4-tuple for the UI outputs: verdict HTML, similarity-score
	    text, confidence text, details HTML. When an image is missing or the
	    comparison fails, the two text fields are "—"; the details HTML is empty
	    for a missing image and carries the failure message otherwise.
	    """
	    dash = "—"

	    if img1 is None or img2 is None:
	        prompt = _verdict_html(False, None, "Upload both images to run comparison.")
	        return prompt, dash, dash, ""

	    result = _compare(img1, img2)

	    if result["success"]:
	        return (
	            _verdict_html(True, result["match"], result["message"]),
	            f"{result['score']:.4f}",
	            f"{result['confidence']}%",
	            _details_html(result),
	        )

	    return (
	        _verdict_html(False, None, result["message"]),
	        dash,
	        dash,
	        _details_html(result),
	    )


	def _verdict_html(success: bool, match: Optional[bool], message: str) -> str:
	if not success:
	color, icon, label = "#c0392b", "✕", "Error"
	elif match:
	color, icon, label = "#16a085", "✓", "Match"
	else:
	color, icon, label = "#c0392b", "✕", "No Match"

	bg = "#eafaf7" if (success and match) else "#fdf2f2"
	return f"""
	<div style="display:flex; align-items:center; gap:14px; padding:18px 24px;
	border-radius:10px; border-left:4px solid {color}; background:{bg};
	font-family:'DM Sans',sans-serif;">
	<span style="font-size:28px; color:{color}; font-weight:700;">{icon}</span>
	<div>
	<div style="font-size:19px; font-weight:700; color:{color};">{label}</div>
	<div style="font-size:13px; color:#555; margin-top:2px;">{message}</div>
	</div>
	</div>"""


	def _details_html(r: dict) -> str:
	if not r.get("success"):
	return (f'<div style="font-family:monospace; font-size:12px; color:#888;'
	f'padding:10px 14px; background:#f9f9f9; border-radius:6px;">'
	f'{r["message"]}</div>')

	bar_pct = min(int(r["confidence"]), 100)
	bar_color = "#16a085" if r["match"] else "#c0392b"

	return f"""
	<div style="font-family:'DM Sans',sans-serif; font-size:13px; color:#333;">
	<div style="display:flex; gap:32px; margin-bottom:14px;">
	<div>
	<div style="font-size:11px; color:#888; text-transform:uppercase; letter-spacing:.06em;">Similarity score</div>
	<div style="font-size:22px; font-weight:700; color:#111;">{r['score']}</div>
	</div>
	<div>
	<div style="font-size:11px; color:#888; text-transform:uppercase; letter-spacing:.06em;">Threshold</div>
	<div style="font-size:22px; font-weight:700; color:#111;">{THRESHOLD}</div>
	</div>
	<div>
	<div style="font-size:11px; color:#888; text-transform:uppercase; letter-spacing:.06em;">Method</div>
	<div style="font-size:13px; font-weight:500; color:#555; padding-top:5px;">Augmented cosine<br>(3×3 pairs)</div>
	</div>
	</div>
	<div style="margin-bottom:6px;">
	<div style="display:flex; justify-content:space-between; margin-bottom:4px;">
	<span style="font-size:11px; color:#888; text-transform:uppercase; letter-spacing:.06em;">Confidence</span>
	<span style="font-size:13px; font-weight:700; color:{bar_color};">{r['confidence']}%</span>
	</div>
	<div style="height:6px; background:#e8e8e8; border-radius:4px; overflow:hidden;">
	<div style="height:100%; width:{bar_pct}%; background:{bar_color};
	border-radius:4px; transition:width .4s;"></div>
	</div>
	</div>
	<div style="display:flex; gap:24px; margin-top:14px; padding-top:14px;
	border-top:1px solid #ececec;">
	<div style="font-size:11px; color:#888;">
	Detection confidence — image 1: <strong style="color:#333;">{r['det1']}</strong>
	</div>
	<div style="font-size:11px; color:#888;">
	Detection confidence — image 2: <strong style="color:#333;">{r['det2']}</strong>
	</div>
	</div>
	</div>"""


	# ─────────────────────────────────────────────────────────────────────────────
	# CSS
	# ─────────────────────────────────────────────────────────────────────────────
	# Stylesheet passed to gr.Blocks(css=...) in build_ui. It:
	#   * imports the DM Sans / DM Mono web fonts from Google Fonts
	#   * styles the custom classes used by the layout (.top-bar, .badge,
	#     .upload-panel, .upload-label, .results-label, .run-btn, .clear-btn)
	#   * hides the page footer, one element addressed by a generated
	#     ".svelte-…" class name, and every <input type=number>
	# NOTE(review): the ".svelte-1gfkn6j" selector presumably targets a
	# Gradio-internal element and may stop matching after an upgrade — confirm.
	CSS = """
	@import url('https://fonts.googleapis.com/css2?family=DM+Sans:wght@400;500;700&family=DM+Mono:wght@400;500&display=swap');

	body, .gradio-container {
	font-family: 'DM Sans', sans-serif !important;
	background: #f7f7f5 !important;
	}
	.top-bar {
	background: #0d0d0d; color: #fff;
	padding: 18px 28px 14px; border-radius: 12px; margin-bottom: 4px;
	}
	.top-bar h1 { font-size: 22px; font-weight: 700; margin: 0 0 4px; letter-spacing: -0.02em; }
	.top-bar .badges { display: flex; gap: 8px; margin-top: 10px; flex-wrap: wrap; }
	.top-bar .badge {
	font-family: 'DM Mono', monospace; font-size: 10px;
	padding: 3px 9px; border: 1px solid #333; border-radius: 20px; color: #aaa;
	}
	.upload-panel { background: #fff; border: 1px solid #e5e5e5; border-radius: 12px; overflow: hidden; }
	.upload-label {
	font-size: 11px; font-weight: 700; letter-spacing: .08em;
	text-transform: uppercase; color: #888; padding: 10px 14px 0;
	font-family: 'DM Mono', monospace;
	}
	.results-label {
	font-size: 11px; font-weight: 700; text-transform: uppercase;
	letter-spacing: .08em; color: #bbb; margin-bottom: 10px;
	font-family: 'DM Mono', monospace;
	}
	.run-btn {
	background: #0d0d0d !important; color: #fff !important;
	border: none !important; border-radius: 8px !important;
	font-family: 'DM Sans', sans-serif !important; font-weight: 700 !important;
	font-size: 14px !important; padding: 12px 0 !important;
	width: 100% !important; cursor: pointer !important; letter-spacing: 0.01em !important;
	}
	.run-btn:hover { background: #1a1a1a !important; }
	.clear-btn {
	background: transparent !important; color: #888 !important;
	border: 1px solid #ddd !important; border-radius: 8px !important;
	font-family: 'DM Mono', monospace !important; font-size: 12px !important;
	}
	footer { display: none !important; }
	.svelte-1gfkn6j { display: none !important; }
	input[type=number] { display: none; }
	label span { font-family: 'DM Mono', monospace; font-size: 11px !important; color: #888 !important; }
	"""


	# ─────────────────────────────────────────────────────────────────────────────
	# Gradio UI
	# ─────────────────────────────────────────────────────────────────────────────

	def build_ui():
	    """
	    Assemble the Gradio Blocks interface and return it (not launched).

	    Layout: a header banner; a row with an input column (two image uploads,
	    Clear / Compare buttons, a usage note) and a results column (verdict,
	    score and confidence labels, details panel); and a collapsed
	    "How it works" accordion. The Compare button calls run_comparison on the
	    two images; the Clear button empties both images and restores the
	    placeholder outputs.

	    NOTE(review): the nesting of the layout containers below was
	    reconstructed from a copy of this file that had lost its indentation —
	    confirm in particular that the accordion belongs below the two-column
	    row rather than inside the results column.
	    """
	    # Placeholder markup shown before the first run and restored by "Clear".
	    verdict_placeholder = (
	        '<div style="height:72px; background:#f7f7f5; border-radius:10px;'
	        'border:1px dashed #ddd; display:flex; align-items:center;'
	        'justify-content:center; color:#ccc; font-size:13px;'
	        'font-family:DM Mono,monospace;">awaiting input</div>'
	    )
	    details_placeholder = (
	        '<div style="height:80px; background:#f7f7f5; border-radius:8px;'
	        'border:1px dashed #ddd;"></div>'
	    )

	    def reset_outputs():
	        # One value per component in the Clear button's outputs list.
	        return (
	            None, None,
	            verdict_placeholder,
	            "—", "—",
	            details_placeholder,
	        )

	    with gr.Blocks(css=CSS, title="Facial Comparison") as demo:

	        # Header banner
	        gr.HTML("""
	<div class="top-bar">
	<h1>Facial Comparison</h1>
	<p style="color:#ccc; font-size:13px; font-family:'DM Sans',sans-serif;
	font-weight:400; margin-bottom:10px; line-height:1.6;">
	Verify whether two faces belong to the same person — works on portraits, selfies,
	and identity documents (CNIC, passport). The system automatically extracts the face
	from an ID card and compares it against a live photo.
	Deployed across <strong style="color:#fff;">40+ financial institutions</strong> for
	customer onboarding and fraud prevention.
	</p>
	<div class="badges">
	<span class="badge">face matching</span>
	<span class="badge">ID card face extraction</span>
	<span class="badge">liveness-aware</span>
	<span class="badge">occlusion handling</span>
	<span class="badge">production-grade</span>
	</div>
	<div style="margin-top:12px; display:flex; align-items:center; gap:6px;
	font-family:'DM Mono',monospace; font-size:10px; color:#555;">
	<span style="display:inline-block; width:7px; height:7px; border-radius:50%;
	background:#22c55e; flex-shrink:0;"></span>
	No images are stored, logged, or transmitted beyond this session.
	Your data never leaves inference memory.
	</div>
	</div>
	""")

	        with gr.Row(equal_height=True):

	            # Input column: two uploads, action buttons, usage note
	            with gr.Column(scale=5):
	                with gr.Row(equal_height=True):
	                    with gr.Column():
	                        gr.HTML('<div class="upload-label">Image 1</div>')
	                        img1 = gr.Image(label="", type="numpy",
	                                        sources=["upload", "clipboard"],
	                                        height=260, elem_classes=["upload-panel"])
	                    with gr.Column():
	                        gr.HTML('<div class="upload-label">Image 2</div>')
	                        img2 = gr.Image(label="", type="numpy",
	                                        sources=["upload", "clipboard"],
	                                        height=260, elem_classes=["upload-panel"])

	                with gr.Row():
	                    clear_btn = gr.Button("Clear", elem_classes=["clear-btn"])
	                    run_btn = gr.Button("Compare →", elem_classes=["run-btn"])

	                gr.HTML("""
	<div style="margin-top:10px; padding:12px 16px; background:#fff;
	border:1px solid #ececec; border-radius:10px;
	font-family:'DM Mono',monospace; font-size:11px;
	color:#aaa; line-height:1.8;">
	<strong style="color:#555;">Supported inputs</strong><br>
	Portrait photo · Selfie · ID card (face auto-extracted) · Passport photo page<br><br>
	<strong style="color:#555;">How to use</strong><br>
	Upload any two images — the system locates and extracts the face from each,
	then computes a match score and confidence percentage.
	</div>
	""")

	            # Results column: verdict, score/confidence, details
	            with gr.Column(scale=4):
	                gr.HTML('<div class="results-label" style="margin-bottom:6px;">Result</div>')

	                verdict_html = gr.HTML(value=verdict_placeholder)

	                with gr.Row():
	                    score_out = gr.Label(label="Similarity score")
	                    conf_out = gr.Label(label="Confidence")

	                gr.HTML('<div class="results-label" style="margin:10px 0 6px;">Details</div>')
	                details_html = gr.HTML(value=details_placeholder)

	        with gr.Accordion("How it works", open=False):
	            gr.HTML("""
	<div style="font-family:'DM Sans',sans-serif; font-size:13px;
	color:#555; line-height:1.8; padding:4px 0;">
	<strong>1. Face extraction</strong> — The system automatically locates
	every face in the uploaded image, including faces embedded in identity
	documents like CNICs and passports. No manual cropping required.<br><br>
	<strong>2. Alignment</strong> — Each detected face is geometrically
	normalised to a canonical frontal pose using facial landmark positions,
	making the comparison robust to head tilt, lighting, and image angle.<br><br>
	<strong>3. Feature encoding</strong> — The aligned face is passed through
	a deep neural network (custom-trained) that compresses it into a compact
	numerical representation capturing unique facial geometry.<br><br>
	<strong>4. Robust matching</strong> — Multiple augmented versions of each
	face are compared, and the results are averaged to produce a stable
	similarity score resilient to minor image quality variations.<br><br>
	<strong>5. Confidence scoring</strong> — The similarity score is converted
	into an intuitive 0–99.9% confidence value along with a clear
	Match / No Match verdict.<br><br>
	<strong style="color:#16a085;">Privacy</strong> — All processing happens
	entirely within the inference session. No image, face crop, score, or
	metadata is written to disk, logged, or sent to any external service.
	Once your session ends, nothing is retained.
	</div>
	""")

	        # Event wiring
	        run_btn.click(
	            fn=run_comparison,
	            inputs=[img1, img2],
	            outputs=[verdict_html, score_out, conf_out, details_html],
	        )
	        clear_btn.click(
	            fn=reset_outputs,
	            outputs=[img1, img2, verdict_html, score_out, conf_out, details_html],
	        )

	    return demo


	# Script entry point: build the interface and serve it on all network
	# interfaces (0.0.0.0) at port 7860, passing show_error=True to launch().
	if __name__ == "__main__":
	    ui = build_ui()
	    ui.launch(server_name="0.0.0.0", server_port=7860, show_error=True)