Spaces:

onitsche
/

recognizer

Running

Oliver Nitsche Claude Sonnet 4.6 committed 28 days ago

Commit

6026dde

1 Parent(s): 8ed4512

Fix face recognition: align faces and enroll multiple frames

Two root causes for recognition failures:

1. MobileFaceNet expects faces aligned to a canonical eye position
(InsightFace 112×112 standard). Plain resize produced embeddings
that varied too much across frames for the same person. Added
_align() which uses the OpenCV eye cascade to compute a similarity
transform mapping detected eye centres to the standard positions,
with plain resize as fallback when eyes aren't detectable.

2. Only one frame was enrolled, so any change in lighting or angle
on the next encounter could push similarity below threshold.
ENROLLING state now collects up to 5 frames while the user types
their name; all frames with a detectable face are enrolled.

Also lowered the matching threshold from 0.35 to 0.25 to account for
residual variation when the eye cascade falls back to plain resize.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (2) hide show

recognizer/face_db.py +33 -2
recognizer/main.py +18 -9

recognizer/face_db.py CHANGED Viewed

@@ -3,6 +3,8 @@
 Detection : OpenCV Haar cascade (built into opencv, no download).
 Embedding : InsightFace MobileFaceNet (w600k_mbf.onnx, ~17 MB, downloaded
             once on first run from the InsightFace GitHub release).
 Matching  : cosine similarity on L2-normalised 512-D embeddings.
 Storage   : recognizer/face_db.json (gitignored).
@@ -34,8 +36,14 @@ _REC_ENTRY = "w600k_mbf.onnx"   # path inside the zip (root-level since buffalo_
 _CASCADE = cv2.CascadeClassifier(
     cv2.data.haarcascades + "haarcascade_frontalface_default.xml"
 )
 _session: Optional[ort.InferenceSession] = None
 class NoFaceDetected(Exception):
     """Raised when no face is found in the provided image."""
@@ -77,8 +85,31 @@ def _detect(frame_bgr: np.ndarray) -> list[tuple[int, int, int, int]]:
     return [tuple(b) for b in boxes] if len(boxes) > 0 else []
 def _embed(face_bgr: np.ndarray) -> np.ndarray:
-    img = cv2.resize(face_bgr, (112, 112)).astype(np.float32)
     img = (img - 127.5) / 127.5
     inp = np.transpose(img, (2, 0, 1))[np.newaxis]          # NCHW
     sess = _get_session()
@@ -105,7 +136,7 @@ def save(db: dict[str, list[list[float]]]) -> None:
 def find_match(
     frame_bgr: np.ndarray,
     db: dict[str, list[list[float]]],
-    threshold: float = 0.35,
 ) -> Optional[str]:
     """Return matched name if recognised, None if face present but unknown.

 Detection : OpenCV Haar cascade (built into opencv, no download).
 Embedding : InsightFace MobileFaceNet (w600k_mbf.onnx, ~17 MB, downloaded
             once on first run from the InsightFace GitHub release).
+Alignment : eye-centre similarity transform to the InsightFace 112×112
+            canonical frame before embedding (plain resize fallback).
 Matching  : cosine similarity on L2-normalised 512-D embeddings.
 Storage   : recognizer/face_db.json (gitignored).
 _CASCADE = cv2.CascadeClassifier(
     cv2.data.haarcascades + "haarcascade_frontalface_default.xml"
 )
+_EYE_CASCADE = cv2.CascadeClassifier(
+    cv2.data.haarcascades + "haarcascade_eye.xml"
+)
 _session: Optional[ort.InferenceSession] = None
+# Standard InsightFace eye positions in the 112×112 aligned face
+_ALIGN_DST = np.float32([[38.29, 51.70], [73.53, 51.50]])
 class NoFaceDetected(Exception):
     """Raised when no face is found in the provided image."""
     return [tuple(b) for b in boxes] if len(boxes) > 0 else []
+def _align(face_bgr: np.ndarray) -> np.ndarray:
+    """Return a 112×112 crop aligned on eye centres; plain resize as fallback.
+    MobileFaceNet is trained on faces warped to a canonical eye position.
+    Without this step, embeddings from different frames of the same person
+    can be too dissimilar for reliable matching.
+    """
+    gray = cv2.cvtColor(face_bgr, cv2.COLOR_BGR2GRAY)
+    eyes = _EYE_CASCADE.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=3)
+    if len(eyes) >= 2:
+        # Pick the two largest detections and sort left-to-right
+        eyes = sorted(eyes, key=lambda e: e[2] * e[3], reverse=True)[:2]
+        eyes = sorted(eyes, key=lambda e: e[0])
+        src = np.float32([
+            [eyes[0][0] + eyes[0][2] / 2, eyes[0][1] + eyes[0][3] / 2],
+            [eyes[1][0] + eyes[1][2] / 2, eyes[1][1] + eyes[1][3] / 2],
+        ])
+        M, _ = cv2.estimateAffinePartial2D(src, _ALIGN_DST)
+        if M is not None:
+            return cv2.warpAffine(face_bgr, M, (112, 112))
+    return cv2.resize(face_bgr, (112, 112))
 def _embed(face_bgr: np.ndarray) -> np.ndarray:
+    img = _align(face_bgr).astype(np.float32)
     img = (img - 127.5) / 127.5
     inp = np.transpose(img, (2, 0, 1))[np.newaxis]          # NCHW
     sess = _get_session()
 def find_match(
     frame_bgr: np.ndarray,
     db: dict[str, list[list[float]]],
+    threshold: float = 0.25,
 ) -> Optional[str]:
     """Return matched name if recognised, None if face present but unknown.

recognizer/main.py CHANGED Viewed

@@ -12,8 +12,6 @@ import math
 import threading
 import time
 from enum import Enum, auto
-from typing import Optional
 import numpy as np
 from pydantic import BaseModel
 from reachy_mini import ReachyMini, ReachyMiniApp
@@ -72,7 +70,7 @@ class Recognizer(ReachyMiniApp):
         speech_count = 0
         active_start = 0.0
         last_face_check = 0.0
-        pending_frame: Optional[np.ndarray] = None
         scan_t0 = 0.0             # reference time for head-scan idle animation
         reachy_mini.goto_sleep()
@@ -113,7 +111,7 @@ class Recognizer(ReachyMiniApp):
                 active_start = time.time()
                 scan_t0 = active_start
                 last_face_check = 0.0
-                pending_frame = None
                 state = State.ACTIVE
             # ---------- ACTIVE ----------
@@ -145,7 +143,7 @@ class Recognizer(ReachyMiniApp):
                                     "Please enter your name on the control panel.",
                                     reachy_mini,
                                 )
-                                pending_frame = frame
                                 with _lock:
                                     _shared["pending_name"] = None
                                 state = State.ENROLLING
@@ -165,14 +163,25 @@ class Recognizer(ReachyMiniApp):
                     _shared["state"] = "enrolling"
                     name = _shared.get("pending_name")
                 if name:
                     with _lock:
                         _shared["pending_name"] = None
-                    if pending_frame is not None:
                         try:
-                            add_face(name, pending_frame, face_db)
-                        except ValueError as exc:
-                            logger.warning("Enrollment failed: %s", exc)
                     speak(f"Nice to meet you, {name}!", reachy_mini)
                     reachy_mini.goto_sleep()
                     state = State.SLEEPING

 import threading
 import time
 from enum import Enum, auto
 import numpy as np
 from pydantic import BaseModel
 from reachy_mini import ReachyMini, ReachyMiniApp
         speech_count = 0
         active_start = 0.0
         last_face_check = 0.0
+        enrollment_frames: list[np.ndarray] = []
         scan_t0 = 0.0             # reference time for head-scan idle animation
         reachy_mini.goto_sleep()
                 active_start = time.time()
                 scan_t0 = active_start
                 last_face_check = 0.0
+                enrollment_frames.clear()
                 state = State.ACTIVE
             # ---------- ACTIVE ----------
                                     "Please enter your name on the control panel.",
                                     reachy_mini,
                                 )
+                                enrollment_frames = [frame]
                                 with _lock:
                                     _shared["pending_name"] = None
                                 state = State.ENROLLING
                     _shared["state"] = "enrolling"
                     name = _shared.get("pending_name")
+                # Gather more frames while waiting; add_face filters out blanks
+                if len(enrollment_frames) < 5:
+                    frame = reachy_mini.media.get_frame()
+                    if frame is not None:
+                        enrollment_frames.append(frame)
                 if name:
                     with _lock:
                         _shared["pending_name"] = None
+                    enrolled = 0
+                    for ef in enrollment_frames:
                         try:
+                            add_face(name, ef, face_db)
+                            enrolled += 1
+                        except ValueError:
+                            pass
+                    if enrolled == 0:
+                        logger.warning("Enrollment failed: no face detected in captured frames")
+                    enrollment_frames.clear()
                     speak(f"Nice to meet you, {name}!", reachy_mini)
                     reachy_mini.goto_sleep()
                     state = State.SLEEPING