Spaces:
Running
Fix face recognition: align faces and enroll multiple frames
Browse files
Two root causes for recognition failures:
1. MobileFaceNet expects faces aligned to a canonical eye position
(InsightFace 112×112 standard). Plain resize produced embeddings
that varied too much across frames for the same person. Added
_align() which uses the OpenCV eye cascade to compute a similarity
transform mapping detected eye centres to the standard positions,
with plain resize as fallback when eyes aren't detectable.
2. Only one frame was enrolled, so any change in lighting or angle
on the next encounter could push similarity below threshold.
ENROLLING state now collects up to 5 frames while the user types
their name; all frames with a detectable face are enrolled.
Also lowered the matching threshold from 0.35 → 0.25 to account for
residual variation when the eye cascade falls back to plain resize.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
- recognizer/face_db.py +33 -2
- recognizer/main.py +18 -9
|
@@ -3,6 +3,8 @@
|
|
| 3 |
Detection : OpenCV Haar cascade (built into opencv, no download).
|
| 4 |
Embedding : InsightFace MobileFaceNet (w600k_mbf.onnx, ~17 MB, downloaded
|
| 5 |
once on first run from the InsightFace GitHub release).
|
|
|
|
|
|
|
| 6 |
Matching : cosine similarity on L2-normalised 512-D embeddings.
|
| 7 |
Storage : recognizer/face_db.json (gitignored).
|
| 8 |
|
|
@@ -34,8 +36,14 @@ _REC_ENTRY = "w600k_mbf.onnx" # path inside the zip (root-level since buffalo_
|
|
| 34 |
_CASCADE = cv2.CascadeClassifier(
|
| 35 |
cv2.data.haarcascades + "haarcascade_frontalface_default.xml"
|
| 36 |
)
|
|
|
|
|
|
|
|
|
|
| 37 |
_session: Optional[ort.InferenceSession] = None
|
| 38 |
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
class NoFaceDetected(Exception):
|
| 41 |
"""Raised when no face is found in the provided image."""
|
|
@@ -77,8 +85,31 @@ def _detect(frame_bgr: np.ndarray) -> list[tuple[int, int, int, int]]:
|
|
| 77 |
return [tuple(b) for b in boxes] if len(boxes) > 0 else []
|
| 78 |
|
| 79 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
def _embed(face_bgr: np.ndarray) -> np.ndarray:
|
| 81 |
-
img = cv2.resize(face_bgr, (112, 112)).astype(np.float32)
|
| 82 |
img = (img - 127.5) / 127.5
|
| 83 |
inp = np.transpose(img, (2, 0, 1))[np.newaxis] # NCHW
|
| 84 |
sess = _get_session()
|
|
@@ -105,7 +136,7 @@ def save(db: dict[str, list[list[float]]]) -> None:
|
|
| 105 |
def find_match(
|
| 106 |
frame_bgr: np.ndarray,
|
| 107 |
db: dict[str, list[list[float]]],
|
| 108 |
-
threshold: float = 0.35,
|
| 109 |
) -> Optional[str]:
|
| 110 |
"""Return matched name if recognised, None if face present but unknown.
|
| 111 |
|
|
|
|
| 3 |
Detection : OpenCV Haar cascade (built into opencv, no download).
|
| 4 |
Embedding : InsightFace MobileFaceNet (w600k_mbf.onnx, ~17 MB, downloaded
|
| 5 |
once on first run from the InsightFace GitHub release).
|
| 6 |
+
Alignment : eye-centre similarity transform to the InsightFace 112×112
|
| 7 |
+
canonical frame before embedding (plain resize fallback).
|
| 8 |
Matching : cosine similarity on L2-normalised 512-D embeddings.
|
| 9 |
Storage : recognizer/face_db.json (gitignored).
|
| 10 |
|
|
|
|
| 36 |
_CASCADE = cv2.CascadeClassifier(
|
| 37 |
cv2.data.haarcascades + "haarcascade_frontalface_default.xml"
|
| 38 |
)
|
| 39 |
+
_EYE_CASCADE = cv2.CascadeClassifier(
|
| 40 |
+
cv2.data.haarcascades + "haarcascade_eye.xml"
|
| 41 |
+
)
|
| 42 |
_session: Optional[ort.InferenceSession] = None
|
| 43 |
|
| 44 |
+
# Standard InsightFace eye positions in the 112×112 aligned face
|
| 45 |
+
_ALIGN_DST = np.float32([[38.29, 51.70], [73.53, 51.50]])
|
| 46 |
+
|
| 47 |
|
| 48 |
class NoFaceDetected(Exception):
|
| 49 |
"""Raised when no face is found in the provided image."""
|
|
|
|
| 85 |
return [tuple(b) for b in boxes] if len(boxes) > 0 else []
|
| 86 |
|
| 87 |
|
| 88 |
+
def _align(face_bgr: np.ndarray) -> np.ndarray:
    """Return a 112×112 crop aligned on eye centres; plain resize as fallback.

    MobileFaceNet is trained on faces warped to a canonical eye position.
    Without this step, embeddings from different frames of the same person
    can be too dissimilar for reliable matching.

    The two largest eye-cascade detections are treated as the eye pair, but
    only when their geometry is plausible. The cascade can return nested
    boxes on the same eye or hits on other features; such pairs sit almost
    on top of each other or are stacked vertically, and would yield a
    degenerate similarity transform (extreme scale or rotation). In that
    case the plain resize is used instead.

    Args:
        face_bgr: BGR face crop to align.

    Returns:
        A 112×112 image: the similarity-warped crop when a plausible eye
        pair was found and a transform could be estimated, otherwise the
        crop resized to 112×112.
    """
    gray = cv2.cvtColor(face_bgr, cv2.COLOR_BGR2GRAY)
    eyes = _EYE_CASCADE.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=3)
    if len(eyes) >= 2:
        # Pick the two largest detections and order them left-to-right
        largest = sorted(eyes, key=lambda e: e[2] * e[3], reverse=True)[:2]
        left, right = sorted(largest, key=lambda e: e[0])
        src = np.float32([
            [left[0] + left[2] / 2, left[1] + left[3] / 2],
            [right[0] + right[2] / 2, right[1] + right[3] / 2],
        ])
        dx, dy = src[1] - src[0]
        # Heuristic sanity check: the centres must be clearly separated
        # horizontally (at least 20% of the crop width) and roughly level
        # (tilt below ~27°). Anything else is most likely a false positive.
        min_dx = 0.2 * face_bgr.shape[1]
        if dx >= min_dx and abs(dy) <= 0.5 * dx:
            M, _ = cv2.estimateAffinePartial2D(src, _ALIGN_DST)
            if M is not None:
                return cv2.warpAffine(face_bgr, M, (112, 112))
    return cv2.resize(face_bgr, (112, 112))
|
| 109 |
+
|
| 110 |
+
|
| 111 |
def _embed(face_bgr: np.ndarray) -> np.ndarray:
|
| 112 |
+
img = _align(face_bgr).astype(np.float32)
|
| 113 |
img = (img - 127.5) / 127.5
|
| 114 |
inp = np.transpose(img, (2, 0, 1))[np.newaxis] # NCHW
|
| 115 |
sess = _get_session()
|
|
|
|
| 136 |
def find_match(
|
| 137 |
frame_bgr: np.ndarray,
|
| 138 |
db: dict[str, list[list[float]]],
|
| 139 |
+
threshold: float = 0.25,
|
| 140 |
) -> Optional[str]:
|
| 141 |
"""Return matched name if recognised, None if face present but unknown.
|
| 142 |
|
|
@@ -12,8 +12,6 @@ import math
|
|
| 12 |
import threading
|
| 13 |
import time
|
| 14 |
from enum import Enum, auto
|
| 15 |
-
from typing import Optional
|
| 16 |
-
|
| 17 |
import numpy as np
|
| 18 |
from pydantic import BaseModel
|
| 19 |
from reachy_mini import ReachyMini, ReachyMiniApp
|
|
@@ -72,7 +70,7 @@ class Recognizer(ReachyMiniApp):
|
|
| 72 |
speech_count = 0
|
| 73 |
active_start = 0.0
|
| 74 |
last_face_check = 0.0
|
| 75 |
-
|
| 76 |
scan_t0 = 0.0 # reference time for head-scan idle animation
|
| 77 |
|
| 78 |
reachy_mini.goto_sleep()
|
|
@@ -113,7 +111,7 @@ class Recognizer(ReachyMiniApp):
|
|
| 113 |
active_start = time.time()
|
| 114 |
scan_t0 = active_start
|
| 115 |
last_face_check = 0.0
|
| 116 |
-
|
| 117 |
state = State.ACTIVE
|
| 118 |
|
| 119 |
# ---------- ACTIVE ----------
|
|
@@ -145,7 +143,7 @@ class Recognizer(ReachyMiniApp):
|
|
| 145 |
"Please enter your name on the control panel.",
|
| 146 |
reachy_mini,
|
| 147 |
)
|
| 148 |
-
|
| 149 |
with _lock:
|
| 150 |
_shared["pending_name"] = None
|
| 151 |
state = State.ENROLLING
|
|
@@ -165,14 +163,25 @@ class Recognizer(ReachyMiniApp):
|
|
| 165 |
_shared["state"] = "enrolling"
|
| 166 |
name = _shared.get("pending_name")
|
| 167 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
if name:
|
| 169 |
with _lock:
|
| 170 |
_shared["pending_name"] = None
|
| 171 |
-
|
|
|
|
| 172 |
try:
|
| 173 |
-
add_face(name,
|
| 174 |
-
|
| 175 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
speak(f"Nice to meet you, {name}!", reachy_mini)
|
| 177 |
reachy_mini.goto_sleep()
|
| 178 |
state = State.SLEEPING
|
|
|
|
| 12 |
import threading
|
| 13 |
import time
|
| 14 |
from enum import Enum, auto
|
|
|
|
|
|
|
| 15 |
import numpy as np
|
| 16 |
from pydantic import BaseModel
|
| 17 |
from reachy_mini import ReachyMini, ReachyMiniApp
|
|
|
|
| 70 |
speech_count = 0
|
| 71 |
active_start = 0.0
|
| 72 |
last_face_check = 0.0
|
| 73 |
+
enrollment_frames: list[np.ndarray] = []
|
| 74 |
scan_t0 = 0.0 # reference time for head-scan idle animation
|
| 75 |
|
| 76 |
reachy_mini.goto_sleep()
|
|
|
|
| 111 |
active_start = time.time()
|
| 112 |
scan_t0 = active_start
|
| 113 |
last_face_check = 0.0
|
| 114 |
+
enrollment_frames.clear()
|
| 115 |
state = State.ACTIVE
|
| 116 |
|
| 117 |
# ---------- ACTIVE ----------
|
|
|
|
| 143 |
"Please enter your name on the control panel.",
|
| 144 |
reachy_mini,
|
| 145 |
)
|
| 146 |
+
enrollment_frames = [frame]
|
| 147 |
with _lock:
|
| 148 |
_shared["pending_name"] = None
|
| 149 |
state = State.ENROLLING
|
|
|
|
| 163 |
_shared["state"] = "enrolling"
|
| 164 |
name = _shared.get("pending_name")
|
| 165 |
|
| 166 |
+
# Gather more frames while waiting; add_face filters out blanks
|
| 167 |
+
if len(enrollment_frames) < 5:
|
| 168 |
+
frame = reachy_mini.media.get_frame()
|
| 169 |
+
if frame is not None:
|
| 170 |
+
enrollment_frames.append(frame)
|
| 171 |
+
|
| 172 |
if name:
|
| 173 |
with _lock:
|
| 174 |
_shared["pending_name"] = None
|
| 175 |
+
enrolled = 0
|
| 176 |
+
for ef in enrollment_frames:
|
| 177 |
try:
|
| 178 |
+
add_face(name, ef, face_db)
|
| 179 |
+
enrolled += 1
|
| 180 |
+
except ValueError:
|
| 181 |
+
pass
|
| 182 |
+
if enrolled == 0:
|
| 183 |
+
logger.warning("Enrollment failed: no face detected in captured frames")
|
| 184 |
+
enrollment_frames.clear()
|
| 185 |
speak(f"Nice to meet you, {name}!", reachy_mini)
|
| 186 |
reachy_mini.goto_sleep()
|
| 187 |
state = State.SLEEPING
|