Spaces:

onitsche
/

recognizer

Running

Oliver Nitsche Claude Sonnet 4.6 committed 26 days ago

Commit

12fca32

1 Parent(s): dbdff5f

Fix critical alignment bug: pass full frame to warpAffine, not crop

YuNet returns landmark coordinates in the FULL FRAME pixel space
(e.g. right eye at x=320, y=140 in a 640×480 image).

The previous code cropped the face bounding box first:
_embed(frame_bgr[y:y+h, x:x+w], lm)

...then passed those full-frame landmarks to _align, which applied
warpAffine to the CROP using coordinates from the full frame.
The transform therefore mapped to completely wrong pixels in the crop,
producing a scrambled 112×112 tile that bore no relation to the face.
Enrollment and recognition each produced different scrambled tiles,
making cosine similarity effectively random — hence "always same person"
(random embeddings cluster near a fixed enrolled vector).

Fix: pass the full frame to _embed/_align so the landmark coordinates
and the image coordinate space match:
_embed(frame_bgr, lm) # full frame, full-frame landmarks

Also improved the _align fallback (M=None) to derive a bbox from the
landmark extents rather than plain-resizing the full frame.

SCHEMA_VERSION bumped to 5 → stale DB entries are auto-cleared and
the user will be prompted to re-enroll with correct embeddings.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (1) hide show

recognizer/face_db.py +34 -16

recognizer/face_db.py CHANGED Viewed

@@ -34,7 +34,7 @@ import onnxruntime as ort
 logger = logging.getLogger(__name__)
 # Bump whenever the embedding pipeline changes (detector, alignment, model, …)
-SCHEMA_VERSION = 4   # 3=aligned-RGB-Haar  4=YuNet+5pt-landmarks
 DB_PATH    = Path(__file__).parent / "face_db.json"
 MODEL_DIR  = Path(__file__).parent / "models"
@@ -151,19 +151,35 @@ def _detect(frame_bgr: np.ndarray) -> list[tuple[tuple, np.ndarray]]:
     return results
-def _align(face_bgr: np.ndarray, landmarks: np.ndarray) -> np.ndarray:
-    """Warp face to InsightFace 112×112 canonical frame using 5-pt landmarks."""
     M, _ = cv2.estimateAffinePartial2D(landmarks, _CANONICAL_LM)
     if M is not None:
-        return cv2.warpAffine(face_bgr, M, (112, 112))
-    # Fallback: plain resize (should rarely happen)
-    logger.debug("Landmark alignment failed, falling back to plain resize")
-    return cv2.resize(face_bgr, (112, 112))
-def _embed(face_bgr: np.ndarray, landmarks: np.ndarray) -> np.ndarray:
-    """Return an L2-normalised 512-D embedding for a detected face."""
-    face_112  = _align(face_bgr, landmarks)
     face_rgb  = cv2.cvtColor(face_112, cv2.COLOR_BGR2RGB)
     img       = face_rgb.astype(np.float32)
     img       = (img - 127.5) / 127.5          # normalise to [-1, 1]
@@ -225,8 +241,9 @@ def find_match(
         raise NoFaceDetected()
     # Use the first (highest-confidence) detection
-    (x, y, w, h), lm = detections[0]
-    emb = _embed(frame_bgr[y : y + h, x : x + w], lm)
     best_name, best_sim = None, -1.0
     for name, enc_list in db.items():
@@ -255,8 +272,9 @@ def add_face(
     if not detections:
         raise ValueError("No face detected in enrollment image")
-    (x, y, w, h), lm = detections[0]
-    emb = _embed(frame_bgr[y : y + h, x : x + w], lm)
     db.setdefault(name, [])
     if len(db[name]) < max_per_person:

 logger = logging.getLogger(__name__)
 # Bump whenever the embedding pipeline changes (detector, alignment, model, …)
+SCHEMA_VERSION = 5   # 3=aligned-RGB-Haar  4=YuNet+5pt-landmarks  5=full-frame warp fix
 DB_PATH    = Path(__file__).parent / "face_db.json"
 MODEL_DIR  = Path(__file__).parent / "models"
     return results
+def _align(frame_bgr: np.ndarray, landmarks: np.ndarray) -> np.ndarray:
+    """Warp frame_bgr to InsightFace 112×112 canonical frame.
+    landmarks must be in FULL-FRAME pixel coordinates (as returned by YuNet).
+    warpAffine is applied to the full frame so the coordinate spaces match.
+    """
     M, _ = cv2.estimateAffinePartial2D(landmarks, _CANONICAL_LM)
     if M is not None:
+        return cv2.warpAffine(frame_bgr, M, (112, 112))
+    # Fallback: derive a bbox from the landmark extents and resize
+    logger.debug("Landmark alignment failed, falling back to bbox crop")
+    x1, y1 = landmarks.min(axis=0).astype(int)
+    x2, y2 = landmarks.max(axis=0).astype(int)
+    pad = max(x2 - x1, y2 - y1) // 2
+    h_f, w_f = frame_bgr.shape[:2]
+    crop = frame_bgr[
+        max(0, y1 - pad): min(h_f, y2 + pad),
+        max(0, x1 - pad): min(w_f, x2 + pad),
+    ]
+    return cv2.resize(crop, (112, 112)) if crop.size > 0 else np.zeros((112, 112, 3), np.uint8)
+def _embed(frame_bgr: np.ndarray, landmarks: np.ndarray) -> np.ndarray:
+    """Return an L2-normalised 512-D embedding.
+    frame_bgr must be the FULL camera frame; landmarks are in full-frame
+    coordinates so the alignment warp is applied correctly.
+    """
+    face_112  = _align(frame_bgr, landmarks)
     face_rgb  = cv2.cvtColor(face_112, cv2.COLOR_BGR2RGB)
     img       = face_rgb.astype(np.float32)
     img       = (img - 127.5) / 127.5          # normalise to [-1, 1]
         raise NoFaceDetected()
     # Use the first (highest-confidence) detection
+    # Pass the full frame — landmarks are in full-frame coordinates.
+    (_x, _y, _w, _h), lm = detections[0]
+    emb = _embed(frame_bgr, lm)
     best_name, best_sim = None, -1.0
     for name, enc_list in db.items():
     if not detections:
         raise ValueError("No face detected in enrollment image")
+    # Pass the full frame — landmarks are in full-frame coordinates.
+    (_x, _y, _w, _h), lm = detections[0]
+    emb = _embed(frame_bgr, lm)
     db.setdefault(name, [])
     if len(db[name]) < max_per_person: