Spaces:
Running
Fix critical alignment bug: pass full frame to warpAffine, not crop
Browse files
YuNet returns landmark coordinates in the FULL FRAME pixel space
(e.g. right eye at x=320, y=140 in a 640×480 image).
The previous code cropped the face bounding box first:
_embed(frame_bgr[y:y+h, x:x+w], lm)
...then passed those full-frame landmarks to _align, which applied
warpAffine to the CROP using coordinates from the full frame.
The transform therefore mapped to completely wrong pixels in the crop,
producing a scrambled 112×112 tile that bore no relation to the face.
Enrollment and recognition each produced different scrambled tiles,
making cosine similarity effectively random — hence the "always same person"
symptom (random embeddings cluster near a fixed enrolled vector).
Fix: pass the full frame to _embed/_align so the landmark coordinates
and the image coordinate space match:
_embed(frame_bgr, lm) # full frame, full-frame landmarks
Also improved the _align fallback (M=None) to derive a bbox from the
landmark extents rather than plain-resizing the full frame.
SCHEMA_VERSION bumped to 5 → stale DB entries are auto-cleared and
the user will be prompted to re-enroll with correct embeddings.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
- recognizer/face_db.py +34 -16
|
@@ -34,7 +34,7 @@ import onnxruntime as ort
|
|
| 34 |
logger = logging.getLogger(__name__)
|
| 35 |
|
| 36 |
# Bump whenever the embedding pipeline changes (detector, alignment, model, …)
|
| 37 |
-
SCHEMA_VERSION =
|
| 38 |
|
| 39 |
DB_PATH = Path(__file__).parent / "face_db.json"
|
| 40 |
MODEL_DIR = Path(__file__).parent / "models"
|
|
@@ -151,19 +151,35 @@ def _detect(frame_bgr: np.ndarray) -> list[tuple[tuple, np.ndarray]]:
|
|
| 151 |
return results
|
| 152 |
|
| 153 |
|
| 154 |
-
def _align(
|
| 155 |
-
"""Warp
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
M, _ = cv2.estimateAffinePartial2D(landmarks, _CANONICAL_LM)
|
| 157 |
if M is not None:
|
| 158 |
-
return cv2.warpAffine(
|
| 159 |
-
# Fallback:
|
| 160 |
-
logger.debug("Landmark alignment failed, falling back to
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
face_rgb = cv2.cvtColor(face_112, cv2.COLOR_BGR2RGB)
|
| 168 |
img = face_rgb.astype(np.float32)
|
| 169 |
img = (img - 127.5) / 127.5 # normalise to [-1, 1]
|
|
@@ -225,8 +241,9 @@ def find_match(
|
|
| 225 |
raise NoFaceDetected()
|
| 226 |
|
| 227 |
# Use the first (highest-confidence) detection
|
| 228 |
-
|
| 229 |
-
|
|
|
|
| 230 |
|
| 231 |
best_name, best_sim = None, -1.0
|
| 232 |
for name, enc_list in db.items():
|
|
@@ -255,8 +272,9 @@ def add_face(
|
|
| 255 |
if not detections:
|
| 256 |
raise ValueError("No face detected in enrollment image")
|
| 257 |
|
| 258 |
-
|
| 259 |
-
|
|
|
|
| 260 |
|
| 261 |
db.setdefault(name, [])
|
| 262 |
if len(db[name]) < max_per_person:
|
|
|
|
| 34 |
logger = logging.getLogger(__name__)
|
| 35 |
|
| 36 |
# Bump whenever the embedding pipeline changes (detector, alignment, model, …)
|
| 37 |
+
SCHEMA_VERSION = 5 # 3=aligned-RGB-Haar 4=YuNet+5pt-landmarks 5=full-frame warp fix
|
| 38 |
|
| 39 |
DB_PATH = Path(__file__).parent / "face_db.json"
|
| 40 |
MODEL_DIR = Path(__file__).parent / "models"
|
|
|
|
| 151 |
return results
|
| 152 |
|
| 153 |
|
| 154 |
+
def _align(frame_bgr: np.ndarray, landmarks: np.ndarray) -> np.ndarray:
|
| 155 |
+
"""Warp frame_bgr to InsightFace 112×112 canonical frame.
|
| 156 |
+
|
| 157 |
+
landmarks must be in FULL-FRAME pixel coordinates (as returned by YuNet).
|
| 158 |
+
warpAffine is applied to the full frame so the coordinate spaces match.
|
| 159 |
+
"""
|
| 160 |
M, _ = cv2.estimateAffinePartial2D(landmarks, _CANONICAL_LM)
|
| 161 |
if M is not None:
|
| 162 |
+
return cv2.warpAffine(frame_bgr, M, (112, 112))
|
| 163 |
+
# Fallback: derive a bbox from the landmark extents and resize
|
| 164 |
+
logger.debug("Landmark alignment failed, falling back to bbox crop")
|
| 165 |
+
x1, y1 = landmarks.min(axis=0).astype(int)
|
| 166 |
+
x2, y2 = landmarks.max(axis=0).astype(int)
|
| 167 |
+
pad = max(x2 - x1, y2 - y1) // 2
|
| 168 |
+
h_f, w_f = frame_bgr.shape[:2]
|
| 169 |
+
crop = frame_bgr[
|
| 170 |
+
max(0, y1 - pad): min(h_f, y2 + pad),
|
| 171 |
+
max(0, x1 - pad): min(w_f, x2 + pad),
|
| 172 |
+
]
|
| 173 |
+
return cv2.resize(crop, (112, 112)) if crop.size > 0 else np.zeros((112, 112, 3), np.uint8)
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
def _embed(frame_bgr: np.ndarray, landmarks: np.ndarray) -> np.ndarray:
|
| 177 |
+
"""Return an L2-normalised 512-D embedding.
|
| 178 |
+
|
| 179 |
+
frame_bgr must be the FULL camera frame; landmarks are in full-frame
|
| 180 |
+
coordinates so the alignment warp is applied correctly.
|
| 181 |
+
"""
|
| 182 |
+
face_112 = _align(frame_bgr, landmarks)
|
| 183 |
face_rgb = cv2.cvtColor(face_112, cv2.COLOR_BGR2RGB)
|
| 184 |
img = face_rgb.astype(np.float32)
|
| 185 |
img = (img - 127.5) / 127.5 # normalise to [-1, 1]
|
|
|
|
| 241 |
raise NoFaceDetected()
|
| 242 |
|
| 243 |
# Use the first (highest-confidence) detection
|
| 244 |
+
# Pass the full frame — landmarks are in full-frame coordinates.
|
| 245 |
+
(_x, _y, _w, _h), lm = detections[0]
|
| 246 |
+
emb = _embed(frame_bgr, lm)
|
| 247 |
|
| 248 |
best_name, best_sim = None, -1.0
|
| 249 |
for name, enc_list in db.items():
|
|
|
|
| 272 |
if not detections:
|
| 273 |
raise ValueError("No face detected in enrollment image")
|
| 274 |
|
| 275 |
+
# Pass the full frame — landmarks are in full-frame coordinates.
|
| 276 |
+
(_x, _y, _w, _h), lm = detections[0]
|
| 277 |
+
emb = _embed(frame_bgr, lm)
|
| 278 |
|
| 279 |
db.setdefault(name, [])
|
| 280 |
if len(db[name]) < max_per_person:
|