Oliver Nitsche Claude Sonnet 4.6 committed on
Commit
12fca32
·
1 Parent(s): dbdff5f

Fix critical alignment bug: pass full frame to warpAffine, not crop

Browse files

YuNet returns landmark coordinates in the FULL FRAME pixel space
(e.g. right eye at x=320, y=140 in a 640×480 image).

The previous code cropped the face bounding box first:
_embed(frame_bgr[y:y+h, x:x+w], lm)

...then passed those full-frame landmarks to _align, which applied
warpAffine to the CROP using coordinates from the full frame.
The transform therefore mapped to completely wrong pixels in the crop,
producing a scrambled 112×112 tile that bore no relation to the face.
Enrollment and recognition each produced different scrambled tiles,
making cosine similarity effectively random — hence "always same person"
(random embeddings cluster near a fixed enrolled vector).

Fix: pass the full frame to _embed/_align so the landmark coordinates
and the image coordinate space match:
_embed(frame_bgr, lm) # full frame, full-frame landmarks

Also improved the _align fallback (M=None) to derive a bbox from the
landmark extents rather than plain-resizing the full frame.

SCHEMA_VERSION bumped to 5 → stale DB entries are auto-cleared and
the user will be prompted to re-enroll with correct embeddings.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. recognizer/face_db.py +34 -16
recognizer/face_db.py CHANGED
@@ -34,7 +34,7 @@ import onnxruntime as ort
34
  logger = logging.getLogger(__name__)
35
 
36
  # Bump whenever the embedding pipeline changes (detector, alignment, model, …)
37
- SCHEMA_VERSION = 4 # 3=aligned-RGB-Haar 4=YuNet+5pt-landmarks
38
 
39
  DB_PATH = Path(__file__).parent / "face_db.json"
40
  MODEL_DIR = Path(__file__).parent / "models"
@@ -151,19 +151,35 @@ def _detect(frame_bgr: np.ndarray) -> list[tuple[tuple, np.ndarray]]:
151
  return results
152
 
153
 
154
- def _align(face_bgr: np.ndarray, landmarks: np.ndarray) -> np.ndarray:
155
- """Warp face to InsightFace 112×112 canonical frame using 5-pt landmarks."""
 
 
 
 
156
  M, _ = cv2.estimateAffinePartial2D(landmarks, _CANONICAL_LM)
157
  if M is not None:
158
- return cv2.warpAffine(face_bgr, M, (112, 112))
159
- # Fallback: plain resize (should rarely happen)
160
- logger.debug("Landmark alignment failed, falling back to plain resize")
161
- return cv2.resize(face_bgr, (112, 112))
162
-
163
-
164
- def _embed(face_bgr: np.ndarray, landmarks: np.ndarray) -> np.ndarray:
165
- """Return an L2-normalised 512-D embedding for a detected face."""
166
- face_112 = _align(face_bgr, landmarks)
 
 
 
 
 
 
 
 
 
 
 
 
167
  face_rgb = cv2.cvtColor(face_112, cv2.COLOR_BGR2RGB)
168
  img = face_rgb.astype(np.float32)
169
  img = (img - 127.5) / 127.5 # normalise to [-1, 1]
@@ -225,8 +241,9 @@ def find_match(
225
  raise NoFaceDetected()
226
 
227
  # Use the first (highest-confidence) detection
228
- (x, y, w, h), lm = detections[0]
229
- emb = _embed(frame_bgr[y : y + h, x : x + w], lm)
 
230
 
231
  best_name, best_sim = None, -1.0
232
  for name, enc_list in db.items():
@@ -255,8 +272,9 @@ def add_face(
255
  if not detections:
256
  raise ValueError("No face detected in enrollment image")
257
 
258
- (x, y, w, h), lm = detections[0]
259
- emb = _embed(frame_bgr[y : y + h, x : x + w], lm)
 
260
 
261
  db.setdefault(name, [])
262
  if len(db[name]) < max_per_person:
 
34
  logger = logging.getLogger(__name__)
35
 
36
  # Bump whenever the embedding pipeline changes (detector, alignment, model, …)
37
+ SCHEMA_VERSION = 5 # 3=aligned-RGB-Haar 4=YuNet+5pt-landmarks 5=full-frame warp fix
38
 
39
  DB_PATH = Path(__file__).parent / "face_db.json"
40
  MODEL_DIR = Path(__file__).parent / "models"
 
151
  return results
152
 
153
 
154
+ def _align(frame_bgr: np.ndarray, landmarks: np.ndarray) -> np.ndarray:
155
+ """Warp frame_bgr to InsightFace 112×112 canonical frame.
156
+
157
+ landmarks must be in FULL-FRAME pixel coordinates (as returned by YuNet).
158
+ warpAffine is applied to the full frame so the coordinate spaces match.
159
+ """
160
  M, _ = cv2.estimateAffinePartial2D(landmarks, _CANONICAL_LM)
161
  if M is not None:
162
+ return cv2.warpAffine(frame_bgr, M, (112, 112))
163
+ # Fallback: derive a bbox from the landmark extents and resize
164
+ logger.debug("Landmark alignment failed, falling back to bbox crop")
165
+ x1, y1 = landmarks.min(axis=0).astype(int)
166
+ x2, y2 = landmarks.max(axis=0).astype(int)
167
+ pad = max(x2 - x1, y2 - y1) // 2
168
+ h_f, w_f = frame_bgr.shape[:2]
169
+ crop = frame_bgr[
170
+ max(0, y1 - pad): min(h_f, y2 + pad),
171
+ max(0, x1 - pad): min(w_f, x2 + pad),
172
+ ]
173
+ return cv2.resize(crop, (112, 112)) if crop.size > 0 else np.zeros((112, 112, 3), np.uint8)
174
+
175
+
176
+ def _embed(frame_bgr: np.ndarray, landmarks: np.ndarray) -> np.ndarray:
177
+ """Return an L2-normalised 512-D embedding.
178
+
179
+ frame_bgr must be the FULL camera frame; landmarks are in full-frame
180
+ coordinates so the alignment warp is applied correctly.
181
+ """
182
+ face_112 = _align(frame_bgr, landmarks)
183
  face_rgb = cv2.cvtColor(face_112, cv2.COLOR_BGR2RGB)
184
  img = face_rgb.astype(np.float32)
185
  img = (img - 127.5) / 127.5 # normalise to [-1, 1]
 
241
  raise NoFaceDetected()
242
 
243
  # Use the first (highest-confidence) detection
244
+ # Pass the full frame — landmarks are in full-frame coordinates.
245
+ (_x, _y, _w, _h), lm = detections[0]
246
+ emb = _embed(frame_bgr, lm)
247
 
248
  best_name, best_sim = None, -1.0
249
  for name, enc_list in db.items():
 
272
  if not detections:
273
  raise ValueError("No face detected in enrollment image")
274
 
275
+ # Pass the full frame — landmarks are in full-frame coordinates.
276
+ (_x, _y, _w, _h), lm = detections[0]
277
+ emb = _embed(frame_bgr, lm)
278
 
279
  db.setdefault(name, [])
280
  if len(db[name]) < max_per_person: