Oliver Nitsche Claude Sonnet 4.6 committed on
Commit
6026dde
·
1 Parent(s): 8ed4512

Fix face recognition: align faces and enroll multiple frames

Browse files

Two root causes for recognition failures:

1. MobileFaceNet expects faces aligned to a canonical eye position
(InsightFace 112×112 standard). Plain resize produced embeddings
that varied too much across frames for the same person. Added
_align() which uses the OpenCV eye cascade to compute a similarity
transform mapping detected eye centres to the standard positions,
with plain resize as fallback when eyes aren't detectable.

2. Only one frame was enrolled, so any change in lighting or angle
on the next encounter could push similarity below threshold.
ENROLLING state now collects up to 5 frames while the user types
their name; all frames with a detectable face are enrolled.

Also lowered the matching threshold from 0.35 → 0.25 to account for
residual variation when eye detection fails and alignment falls back to plain resize.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (2) hide show
  1. recognizer/face_db.py +33 -2
  2. recognizer/main.py +18 -9
recognizer/face_db.py CHANGED
@@ -3,6 +3,8 @@
3
  Detection : OpenCV Haar cascade (built into opencv, no download).
4
  Embedding : InsightFace MobileFaceNet (w600k_mbf.onnx, ~17 MB, downloaded
5
  once on first run from the InsightFace GitHub release).
 
 
6
  Matching : cosine similarity on L2-normalised 512-D embeddings.
7
  Storage : recognizer/face_db.json (gitignored).
8
 
@@ -34,8 +36,14 @@ _REC_ENTRY = "w600k_mbf.onnx" # path inside the zip (root-level since buffalo_
34
  _CASCADE = cv2.CascadeClassifier(
35
  cv2.data.haarcascades + "haarcascade_frontalface_default.xml"
36
  )
 
 
 
37
  _session: Optional[ort.InferenceSession] = None
38
 
 
 
 
39
 
40
  class NoFaceDetected(Exception):
41
  """Raised when no face is found in the provided image."""
@@ -77,8 +85,31 @@ def _detect(frame_bgr: np.ndarray) -> list[tuple[int, int, int, int]]:
77
  return [tuple(b) for b in boxes] if len(boxes) > 0 else []
78
 
79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  def _embed(face_bgr: np.ndarray) -> np.ndarray:
81
- img = cv2.resize(face_bgr, (112, 112)).astype(np.float32)
82
  img = (img - 127.5) / 127.5
83
  inp = np.transpose(img, (2, 0, 1))[np.newaxis] # NCHW
84
  sess = _get_session()
@@ -105,7 +136,7 @@ def save(db: dict[str, list[list[float]]]) -> None:
105
  def find_match(
106
  frame_bgr: np.ndarray,
107
  db: dict[str, list[list[float]]],
108
- threshold: float = 0.35,
109
  ) -> Optional[str]:
110
  """Return matched name if recognised, None if face present but unknown.
111
 
 
3
  Detection : OpenCV Haar cascade (built into opencv, no download).
4
  Embedding : InsightFace MobileFaceNet (w600k_mbf.onnx, ~17 MB, downloaded
5
  once on first run from the InsightFace GitHub release).
6
+ Alignment : eye-centre similarity transform to the InsightFace 112×112
7
+ canonical frame before embedding (plain resize fallback).
8
  Matching : cosine similarity on L2-normalised 512-D embeddings.
9
  Storage : recognizer/face_db.json (gitignored).
10
 
 
36
  _CASCADE = cv2.CascadeClassifier(
37
  cv2.data.haarcascades + "haarcascade_frontalface_default.xml"
38
  )
39
+ _EYE_CASCADE = cv2.CascadeClassifier(
40
+ cv2.data.haarcascades + "haarcascade_eye.xml"
41
+ )
42
  _session: Optional[ort.InferenceSession] = None
43
 
44
+ # Standard InsightFace eye positions in the 112×112 aligned face
45
+ _ALIGN_DST = np.float32([[38.29, 51.70], [73.53, 51.50]])
46
+
47
 
48
  class NoFaceDetected(Exception):
49
  """Raised when no face is found in the provided image."""
 
85
  return [tuple(b) for b in boxes] if len(boxes) > 0 else []
86
 
87
 
88
def _align(face_bgr: np.ndarray) -> np.ndarray:
    """Return a 112×112 crop aligned on eye centres; plain resize as fallback.

    MobileFaceNet is trained on faces warped to a canonical eye position.
    Without this step, embeddings from different frames of the same person
    can be too dissimilar for reliable matching.

    The eye cascade can also fire on non-eye regions or report the same eye
    twice, so the chosen pair is sanity-checked before it is trusted. An
    implausible pair would yield a badly scaled or rotated warp, which is
    worse than no alignment at all; in that case the plain resize is used.

    Args:
        face_bgr: BGR face crop (as passed to ``cv2.cvtColor`` with
            ``COLOR_BGR2GRAY``).

    Returns:
        A 112×112 image: the similarity-warped crop when two plausible eyes
        are found, otherwise ``cv2.resize(face_bgr, (112, 112))``.
    """
    # Heuristic plausibility limits, expressed relative to the crop size.
    max_eye_centre_y = 0.6   # eye centres must lie in the upper 60% of the crop
    min_eye_spacing = 0.2    # eye centres must be at least 20% of the width apart

    gray = cv2.cvtColor(face_bgr, cv2.COLOR_BGR2GRAY)
    height, width = gray.shape[:2]
    detections = _EYE_CASCADE.detectMultiScale(
        gray, scaleFactor=1.1, minNeighbors=3
    )

    # Discard hits whose centre is in the lower part of the face crop.
    candidates = [
        e for e in detections if e[1] + e[3] / 2 <= max_eye_centre_y * height
    ]

    if len(candidates) >= 2:
        # Pick the two largest detections and sort left-to-right
        largest = sorted(candidates, key=lambda e: e[2] * e[3], reverse=True)[:2]
        left, right = sorted(largest, key=lambda e: e[0])
        src = np.float32([
            [x + w / 2, y + h / 2] for x, y, w, h in (left, right)
        ])
        dx, dy = src[1] - src[0]
        # Reject pairs that are nearly coincident (same eye detected twice)
        # or tilted by more than 45 degrees.
        if dx >= min_eye_spacing * width and abs(dy) <= dx:
            M, _ = cv2.estimateAffinePartial2D(src, _ALIGN_DST)
            if M is not None:
                return cv2.warpAffine(face_bgr, M, (112, 112))

    return cv2.resize(face_bgr, (112, 112))
109
+
110
+
111
  def _embed(face_bgr: np.ndarray) -> np.ndarray:
112
+ img = _align(face_bgr).astype(np.float32)
113
  img = (img - 127.5) / 127.5
114
  inp = np.transpose(img, (2, 0, 1))[np.newaxis] # NCHW
115
  sess = _get_session()
 
136
  def find_match(
137
  frame_bgr: np.ndarray,
138
  db: dict[str, list[list[float]]],
139
+ threshold: float = 0.25,
140
  ) -> Optional[str]:
141
  """Return matched name if recognised, None if face present but unknown.
142
 
recognizer/main.py CHANGED
@@ -12,8 +12,6 @@ import math
12
  import threading
13
  import time
14
  from enum import Enum, auto
15
- from typing import Optional
16
-
17
  import numpy as np
18
  from pydantic import BaseModel
19
  from reachy_mini import ReachyMini, ReachyMiniApp
@@ -72,7 +70,7 @@ class Recognizer(ReachyMiniApp):
72
  speech_count = 0
73
  active_start = 0.0
74
  last_face_check = 0.0
75
- pending_frame: Optional[np.ndarray] = None
76
  scan_t0 = 0.0 # reference time for head-scan idle animation
77
 
78
  reachy_mini.goto_sleep()
@@ -113,7 +111,7 @@ class Recognizer(ReachyMiniApp):
113
  active_start = time.time()
114
  scan_t0 = active_start
115
  last_face_check = 0.0
116
- pending_frame = None
117
  state = State.ACTIVE
118
 
119
  # ---------- ACTIVE ----------
@@ -145,7 +143,7 @@ class Recognizer(ReachyMiniApp):
145
  "Please enter your name on the control panel.",
146
  reachy_mini,
147
  )
148
- pending_frame = frame
149
  with _lock:
150
  _shared["pending_name"] = None
151
  state = State.ENROLLING
@@ -165,14 +163,25 @@ class Recognizer(ReachyMiniApp):
165
  _shared["state"] = "enrolling"
166
  name = _shared.get("pending_name")
167
 
 
 
 
 
 
 
168
  if name:
169
  with _lock:
170
  _shared["pending_name"] = None
171
- if pending_frame is not None:
 
172
  try:
173
- add_face(name, pending_frame, face_db)
174
- except ValueError as exc:
175
- logger.warning("Enrollment failed: %s", exc)
 
 
 
 
176
  speak(f"Nice to meet you, {name}!", reachy_mini)
177
  reachy_mini.goto_sleep()
178
  state = State.SLEEPING
 
12
  import threading
13
  import time
14
  from enum import Enum, auto
 
 
15
  import numpy as np
16
  from pydantic import BaseModel
17
  from reachy_mini import ReachyMini, ReachyMiniApp
 
70
  speech_count = 0
71
  active_start = 0.0
72
  last_face_check = 0.0
73
+ enrollment_frames: list[np.ndarray] = []
74
  scan_t0 = 0.0 # reference time for head-scan idle animation
75
 
76
  reachy_mini.goto_sleep()
 
111
  active_start = time.time()
112
  scan_t0 = active_start
113
  last_face_check = 0.0
114
+ enrollment_frames.clear()
115
  state = State.ACTIVE
116
 
117
  # ---------- ACTIVE ----------
 
143
  "Please enter your name on the control panel.",
144
  reachy_mini,
145
  )
146
+ enrollment_frames = [frame]
147
  with _lock:
148
  _shared["pending_name"] = None
149
  state = State.ENROLLING
 
163
  _shared["state"] = "enrolling"
164
  name = _shared.get("pending_name")
165
 
166
+ # Gather more frames while waiting; add_face filters out blanks
167
+ if len(enrollment_frames) < 5:
168
+ frame = reachy_mini.media.get_frame()
169
+ if frame is not None:
170
+ enrollment_frames.append(frame)
171
+
172
  if name:
173
  with _lock:
174
  _shared["pending_name"] = None
175
+ enrolled = 0
176
+ for ef in enrollment_frames:
177
  try:
178
+ add_face(name, ef, face_db)
179
+ enrolled += 1
180
+ except ValueError:
181
+ pass
182
+ if enrolled == 0:
183
+ logger.warning("Enrollment failed: no face detected in captured frames")
184
+ enrollment_frames.clear()
185
  speak(f"Nice to meet you, {name}!", reachy_mini)
186
  reachy_mini.goto_sleep()
187
  state = State.SLEEPING