Oliver Nitsche Claude Sonnet 4.6 committed on
Commit
f7c60d9
·
1 Parent(s): f4e577f

Replace Haar cascade with YuNet deep-learning face detector

Browse files

The Haar cascade produced frequent false positives (walls, patterns)
and had no confidence score to filter them. Replaced with YuNet
(cv2.FaceDetectorYN), a small deep-learning detector built into
OpenCV >= 4.8:

- Confidence threshold (0.75) eliminates non-face detections
- Returns 5-point facial landmarks (eyes, nose, mouth corners) directly,
so the separate haarcascade_eye step is gone
- 5-point similarity transform gives much more accurate alignment than
the previous two-eye-centre approach
- Model is tiny (~337 KB) and downloaded once from opencv_zoo

Canonical 5-point InsightFace landmarks used for the warpAffine so
MobileFaceNet embeddings are as consistent as possible across frames.

SCHEMA_VERSION bumped to 4 → stale DB entries from the Haar pipeline
are auto-discarded and the user is prompted to re-enroll.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. recognizer/face_db.py +146 -97
recognizer/face_db.py CHANGED
@@ -1,10 +1,18 @@
1
  """Face database: local face recognition via ONNX + OpenCV.
2
 
3
- Detection : OpenCV Haar cascade (built into opencv, no download).
 
 
 
 
 
4
  Embedding : InsightFace MobileFaceNet (w600k_mbf.onnx, ~17 MB, downloaded
5
  once on first run from the InsightFace GitHub release).
6
- Alignment : eye-centre similarity transform to the InsightFace 112×112
7
- canonical frame before embedding (plain resize fallback).
 
 
 
8
  Matching : cosine similarity on L2-normalised 512-D embeddings.
9
  Storage : recognizer/face_db.json (gitignored).
10
 
@@ -25,28 +33,40 @@ import onnxruntime as ort
25
 
26
  logger = logging.getLogger(__name__)
27
 
28
- # Bump this whenever the embedding pipeline changes (alignment, colour space,
29
- # model weights, normalisation, …). Mismatched DBs are auto-cleared on load.
30
- SCHEMA_VERSION = 3 # 1=plain-resize BGR 2=aligned BGR 3=aligned RGB
31
 
32
  DB_PATH = Path(__file__).parent / "face_db.json"
33
  MODEL_DIR = Path(__file__).parent / "models"
34
- MODEL_FILE = MODEL_DIR / "w600k_mbf.onnx"
35
- MODEL_URL = (
 
 
36
  "https://github.com/deepinsight/insightface"
37
  "/releases/download/v0.7/buffalo_sc.zip"
38
  )
39
 
40
- _CASCADE = cv2.CascadeClassifier(
41
- cv2.data.haarcascades + "haarcascade_frontalface_default.xml"
42
- )
43
- _EYE_CASCADE = cv2.CascadeClassifier(
44
- cv2.data.haarcascades + "haarcascade_eye.xml"
45
  )
46
- _session: Optional[ort.InferenceSession] = None
 
 
 
 
 
 
 
 
 
 
47
 
48
- # Standard InsightFace eye positions in the 112×112 aligned face
49
- _ALIGN_DST = np.float32([[38.29, 51.70], [73.53, 51.50]])
 
50
 
51
 
52
  class NoFaceDetected(Exception):
@@ -57,73 +77,100 @@ class NoFaceDetected(Exception):
57
  # Internal helpers
58
  # ---------------------------------------------------------------------------
59
 
60
- def _ensure_model() -> None:
61
- if MODEL_FILE.exists():
62
  return
63
  MODEL_DIR.mkdir(exist_ok=True)
64
  zip_path = MODEL_DIR / "buffalo_sc.zip"
65
- logger.info("Downloading face recognition model (~17 MB) — one-time setup…")
66
- urllib.request.urlretrieve(MODEL_URL, zip_path)
67
  with zipfile.ZipFile(zip_path) as zf:
68
- # The file may live at root or inside a named subdirectory (e.g. buffalo_sc/).
69
  matches = [n for n in zf.namelist() if n.endswith("w600k_mbf.onnx")]
70
  if not matches:
71
  raise RuntimeError(
72
  f"w600k_mbf.onnx not found in downloaded zip. "
73
- f"Available entries: {zf.namelist()}"
74
  )
75
- with zf.open(matches[0]) as src, open(MODEL_FILE, "wb") as dst:
76
  dst.write(src.read())
77
  zip_path.unlink()
78
- logger.info("Model saved to %s", MODEL_FILE)
79
 
80
 
81
- def _get_session() -> ort.InferenceSession:
82
- global _session
83
- if _session is None:
84
- _ensure_model()
85
- _session = ort.InferenceSession(
86
- str(MODEL_FILE), providers=["CPUExecutionProvider"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  )
88
- return _session
89
-
90
-
91
- def _detect(frame_bgr: np.ndarray) -> list[tuple[int, int, int, int]]:
92
- gray = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2GRAY)
93
- boxes = _CASCADE.detectMultiScale(
94
- gray, scaleFactor=1.1, minNeighbors=3, minSize=(40, 40)
95
- )
96
- return [tuple(b) for b in boxes] if len(boxes) > 0 else []
97
-
98
-
99
- def _align(face_bgr: np.ndarray) -> np.ndarray:
100
- """Return a 112×112 crop aligned on eye centres; plain resize as fallback."""
101
- gray = cv2.cvtColor(face_bgr, cv2.COLOR_BGR2GRAY)
102
- eyes = _EYE_CASCADE.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=3)
103
- if len(eyes) >= 2:
104
- eyes = sorted(eyes, key=lambda e: e[2] * e[3], reverse=True)[:2]
105
- eyes = sorted(eyes, key=lambda e: e[0])
106
- src = np.float32([
107
- [eyes[0][0] + eyes[0][2] / 2, eyes[0][1] + eyes[0][3] / 2],
108
- [eyes[1][0] + eyes[1][2] / 2, eyes[1][1] + eyes[1][3] / 2],
109
- ])
110
- M, _ = cv2.estimateAffinePartial2D(src, _ALIGN_DST)
111
- if M is not None:
112
- return cv2.warpAffine(face_bgr, M, (112, 112))
 
 
 
 
 
 
113
  return cv2.resize(face_bgr, (112, 112))
114
 
115
 
116
- def _embed(face_bgr: np.ndarray) -> np.ndarray:
117
- """Return an L2-normalised 512-D embedding for face_bgr."""
118
- face_112 = _align(face_bgr)
119
- # MobileFaceNet (InsightFace) is trained on RGB — convert from OpenCV BGR.
120
- face_rgb = cv2.cvtColor(face_112, cv2.COLOR_BGR2RGB)
121
- img = face_rgb.astype(np.float32)
122
- img = (img - 127.5) / 127.5 # normalise to [-1, 1]
123
- inp = np.transpose(img, (2, 0, 1))[np.newaxis] # NCHW
124
- sess = _get_session()
125
- emb = sess.run(None, {sess.get_inputs()[0].name: inp})[0][0]
126
- return emb / np.linalg.norm(emb) # L2-normalise
127
 
128
 
129
  # ---------------------------------------------------------------------------
@@ -131,12 +178,13 @@ def _embed(face_bgr: np.ndarray) -> np.ndarray:
131
  # ---------------------------------------------------------------------------
132
 
133
  def load() -> dict[str, list[list[float]]]:
134
- """Load face DB from disk and warm up the ONNX session.
135
 
136
- Returns an empty dict if the DB is missing or was produced by an older
137
- embedding pipeline (schema mismatch auto-clear).
138
  """
139
- _get_session()
 
140
  if not DB_PATH.exists():
141
  return {}
142
  raw = json.loads(DB_PATH.read_text())
@@ -170,14 +218,15 @@ def find_match(
170
  ) -> Optional[str]:
171
  """Return matched name if recognised, None if face present but unknown.
172
 
173
- Raises NoFaceDetected if no face is detected in the frame at all.
174
  """
175
- boxes = _detect(frame_bgr)
176
- if not boxes:
177
  raise NoFaceDetected()
178
 
179
- x, y, w, h = boxes[0]
180
- emb = _embed(frame_bgr[y : y + h, x : x + w])
 
181
 
182
  best_name, best_sim = None, -1.0
183
  for name, enc_list in db.items():
@@ -195,24 +244,6 @@ def find_match(
195
  return None
196
 
197
 
198
- def get_face_jpeg(frame_bgr: np.ndarray, padding: float = 0.4) -> Optional[bytes]:
199
- """Return a JPEG-encoded crop of the largest detected face, or None."""
200
- boxes = _detect(frame_bgr)
201
- if not boxes:
202
- return None
203
- x, y, w, h = boxes[0]
204
- pad_x = int(w * padding)
205
- pad_y = int(h * padding)
206
- h_img, w_img = frame_bgr.shape[:2]
207
- x1 = max(0, x - pad_x)
208
- y1 = max(0, y - pad_y)
209
- x2 = min(w_img, x + w + pad_x)
210
- y2 = min(h_img, y + h + pad_y)
211
- crop = frame_bgr[y1:y2, x1:x2]
212
- ok, buf = cv2.imencode(".jpg", crop, [cv2.IMWRITE_JPEG_QUALITY, 85])
213
- return bytes(buf) if ok else None
214
-
215
-
216
  def add_face(
217
  name: str,
218
  frame_bgr: np.ndarray,
@@ -220,14 +251,32 @@ def add_face(
220
  max_per_person: int = 5,
221
  ) -> None:
222
  """Embed and store the face from frame_bgr under name."""
223
- boxes = _detect(frame_bgr)
224
- if not boxes:
225
  raise ValueError("No face detected in enrollment image")
226
 
227
- x, y, w, h = boxes[0]
228
- emb = _embed(frame_bgr[y : y + h, x : x + w])
229
 
230
  db.setdefault(name, [])
231
  if len(db[name]) < max_per_person:
232
  db[name].append(emb.tolist())
233
  save(db)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  """Face database: local face recognition via ONNX + OpenCV.
2
 
3
+ Detection : YuNet (cv2.FaceDetectorYN) deep-learning detector built into
4
+ OpenCV >= 4.8. Returns a confidence score (false-positive walls
5
+ are eliminated) and 5-point facial landmarks used directly for
6
+ alignment. Model: face_detection_yunet_2023mar.onnx (~337 KB,
7
+ downloaded once).
8
+
9
  Embedding : InsightFace MobileFaceNet (w600k_mbf.onnx, ~17 MB, downloaded
10
  once on first run from the InsightFace GitHub release).
11
+
12
+ Alignment : 5-point similarity transform to the InsightFace 112×112
13
+ canonical frame (right eye, left eye, nose, mouth corners).
14
+ Much more accurate than a separate eye cascade.
15
+
16
  Matching : cosine similarity on L2-normalised 512-D embeddings.
17
  Storage : recognizer/face_db.json (gitignored).
18
 
 
33
 
34
  logger = logging.getLogger(__name__)
35
 
36
+ # Bump whenever the embedding pipeline changes (detector, alignment, model, …)
37
+ SCHEMA_VERSION = 4 # 3=aligned-RGB-Haar 4=YuNet+5pt-landmarks
 
38
 
39
  DB_PATH = Path(__file__).parent / "face_db.json"
40
  MODEL_DIR = Path(__file__).parent / "models"
41
+
42
+ # --- Recognition model (MobileFaceNet) ---
43
+ REC_FILE = MODEL_DIR / "w600k_mbf.onnx"
44
+ REC_URL = (
45
  "https://github.com/deepinsight/insightface"
46
  "/releases/download/v0.7/buffalo_sc.zip"
47
  )
48
 
49
+ # --- Detection model (YuNet) ---
50
+ DET_FILE = MODEL_DIR / "face_detection_yunet_2023mar.onnx"
51
+ DET_URL = (
52
+ "https://github.com/opencv/opencv_zoo/raw/main/models/"
53
+ "face_detection_yunet/face_detection_yunet_2023mar.onnx"
54
  )
55
+ DET_CONFIDENCE = 0.75 # discard detections below this score
56
+
57
+ # InsightFace canonical 5-point landmarks in 112×112 space
58
+ # order: right_eye, left_eye, nose_tip, right_mouth, left_mouth
59
+ _CANONICAL_LM = np.float32([
60
+ [38.2946, 51.6963],
61
+ [73.5318, 51.5014],
62
+ [56.0252, 71.7366],
63
+ [41.5493, 92.3655],
64
+ [70.7299, 92.2041],
65
+ ])
66
 
67
+ _rec_session: Optional[ort.InferenceSession] = None
68
+ _detector: Optional[cv2.FaceDetectorYN] = None
69
+ _det_size: tuple[int, int] = (0, 0)
70
 
71
 
72
  class NoFaceDetected(Exception):
 
77
  # Internal helpers
78
  # ---------------------------------------------------------------------------
79
 
80
def _ensure_rec_model() -> None:
    """Download and unpack the MobileFaceNet recognition model if absent.

    Fetches the archive at ``REC_URL`` into ``MODEL_DIR``, extracts the
    ``w600k_mbf.onnx`` member to ``REC_FILE`` and removes the archive.
    Returns immediately when ``REC_FILE`` already exists.

    Raises:
        RuntimeError: if the archive contains no ``w600k_mbf.onnx`` entry.
    """
    import shutil  # local import: only needed on the one-time download path

    if REC_FILE.exists():
        return
    MODEL_DIR.mkdir(exist_ok=True)
    zip_path = MODEL_DIR / "buffalo_sc.zip"
    # Extract to a sibling temp file and rename it at the very end, so an
    # interrupted run never leaves a truncated REC_FILE behind that would
    # satisfy the exists() check above on every later start.
    tmp_path = REC_FILE.with_name(REC_FILE.name + ".part")
    logger.info("Downloading MobileFaceNet recognition model (~17 MB)…")
    try:
        urllib.request.urlretrieve(REC_URL, zip_path)
        with zipfile.ZipFile(zip_path) as zf:
            # endswith() also matches a member nested inside a subdirectory.
            matches = [n for n in zf.namelist() if n.endswith("w600k_mbf.onnx")]
            if not matches:
                raise RuntimeError(
                    f"w600k_mbf.onnx not found in downloaded zip. "
                    f"Entries: {zf.namelist()}"
                )
            with zf.open(matches[0]) as src, open(tmp_path, "wb") as dst:
                shutil.copyfileobj(src, dst)
        tmp_path.replace(REC_FILE)
    finally:
        # Runs on success and on failure: never leave the archive or a
        # half-written model lying around.
        zip_path.unlink(missing_ok=True)
        tmp_path.unlink(missing_ok=True)
    logger.info("Recognition model saved to %s", REC_FILE)
98
 
99
 
100
def _ensure_det_model() -> None:
    """Download the YuNet detector model to ``DET_FILE`` if it is absent.

    Returns immediately when ``DET_FILE`` already exists. Any error raised
    by the download propagates to the caller.
    """
    if DET_FILE.exists():
        return
    MODEL_DIR.mkdir(exist_ok=True)
    # Download to a temp name and rename on success, so an interrupted
    # transfer cannot leave a truncated DET_FILE that passes exists().
    tmp_path = DET_FILE.with_name(DET_FILE.name + ".part")
    logger.info("Downloading YuNet face detector (~337 KB)…")
    try:
        urllib.request.urlretrieve(DET_URL, tmp_path)
        tmp_path.replace(DET_FILE)
    finally:
        tmp_path.unlink(missing_ok=True)
    logger.info("Detector saved to %s", DET_FILE)
107
+
108
+
109
def _get_rec_session() -> ort.InferenceSession:
    """Return the module-wide recognition session, building it on first use.

    The first call makes sure the model file is present (downloading it if
    needed) and opens it with the CPU execution provider; later calls
    return the cached session.
    """
    global _rec_session
    if _rec_session is not None:
        return _rec_session
    _ensure_rec_model()
    _rec_session = ort.InferenceSession(
        str(REC_FILE), providers=["CPUExecutionProvider"]
    )
    return _rec_session
117
+
118
+
119
def _get_detector(width: int, height: int) -> cv2.FaceDetectorYN:
    """Return the shared YuNet detector configured for a width×height frame.

    The detector is created once (downloading the model first if needed).
    When a later call asks for a different frame size, the existing
    detector is resized in place instead of reloading the network.
    """
    global _detector, _det_size
    size = (width, height)
    if _detector is None:
        # Only touch the filesystem / network when a detector must be built,
        # not on every frame.
        _ensure_det_model()
        _detector = cv2.FaceDetectorYN.create(
            str(DET_FILE), "",
            size,
            score_threshold=DET_CONFIDENCE,
            nms_threshold=0.3,
            top_k=10,
        )
        _det_size = size
    elif _det_size != size:
        _detector.setInputSize(size)
        _det_size = size
    return _detector
132
+
133
+
134
def _detect(frame_bgr: np.ndarray) -> list[tuple[tuple, np.ndarray]]:
    """Return [(bbox, landmarks)] for each face found, best score first.

    bbox      = (x, y, w, h) in frame pixel coordinates, clipped to the
                frame so ``frame_bgr[y : y + h, x : x + w]`` is always a
                valid, non-empty crop
    landmarks = float32 array shape (5, 2) right_eye, left_eye, nose,
                right_mouth, left_mouth, expressed relative to the bbox
                origin (x, y), i.e. in the coordinate system of that crop

    Landmarks are crop-relative because callers crop the frame by bbox
    before alignment; a transform estimated from full-frame landmarks would
    be offset by (x, y) when applied to the crop.
    """
    frame_h, frame_w = frame_bgr.shape[:2]
    det = _get_detector(frame_w, frame_h)
    _, faces = det.detect(frame_bgr)
    if faces is None:
        return []
    results = []
    # The last column of each row is the detection score. Callers treat
    # element 0 as the best face, so make that ordering explicit.
    for f in sorted(faces, key=lambda row: float(row[-1]), reverse=True):
        x, y, w, h = int(f[0]), int(f[1]), int(f[2]), int(f[3])
        # The detector may report boxes that extend past the frame edges;
        # a negative start index would make the caller's slice wrap around.
        x0, y0 = max(x, 0), max(y, 0)
        x1, y1 = min(x + w, frame_w), min(y + h, frame_h)
        if x1 <= x0 or y1 <= y0:
            continue  # box has no visible area inside the frame
        lm = f[4:14].reshape(5, 2).astype(np.float32)
        lm = lm - np.float32([x0, y0])
        results.append(((x0, y0, x1 - x0, y1 - y0), lm))
    return results
152
+
153
+
154
def _align(face_bgr: np.ndarray, landmarks: np.ndarray) -> np.ndarray:
    """Produce a 112×112 face image warped onto the canonical landmarks.

    A partial-affine transform is estimated from ``landmarks`` to
    ``_CANONICAL_LM`` and applied to ``face_bgr``, so the landmarks are
    interpreted as pixel positions within ``face_bgr``. If no transform can
    be estimated the image is simply resized to 112×112.
    """
    transform, _inliers = cv2.estimateAffinePartial2D(landmarks, _CANONICAL_LM)
    if transform is None:
        # Fallback: plain resize (should rarely happen)
        logger.debug("Landmark alignment failed, falling back to plain resize")
        return cv2.resize(face_bgr, (112, 112))
    return cv2.warpAffine(face_bgr, transform, (112, 112))
162
 
163
 
164
def _embed(face_bgr: np.ndarray, landmarks: np.ndarray) -> np.ndarray:
    """Compute the L2-normalised recognition embedding of a detected face.

    The face is aligned to 112×112, converted from BGR to RGB, scaled to
    [-1, 1], laid out as a single-image NCHW batch and run through the
    recognition session. The module docstring describes the output as
    512-D.
    """
    aligned_rgb = cv2.cvtColor(_align(face_bgr, landmarks), cv2.COLOR_BGR2RGB)
    scaled = (aligned_rgb.astype(np.float32) - 127.5) / 127.5  # -> [-1, 1]
    batch = np.transpose(scaled, (2, 0, 1))[np.newaxis]  # HWC -> NCHW
    session = _get_rec_session()
    input_name = session.get_inputs()[0].name
    embedding = session.run(None, {input_name: batch})[0][0]
    return embedding / np.linalg.norm(embedding)
 
174
 
175
 
176
  # ---------------------------------------------------------------------------
 
178
  # ---------------------------------------------------------------------------
179
 
180
  def load() -> dict[str, list[list[float]]]:
181
+ """Load face DB from disk and warm up both ONNX models.
182
 
183
+ Returns an empty dict and removes the DB file if the schema version
184
+ doesn't match (stale embeddings from an older pipeline).
185
  """
186
+ _get_rec_session()
187
+ _ensure_det_model()
188
  if not DB_PATH.exists():
189
  return {}
190
  raw = json.loads(DB_PATH.read_text())
 
218
  ) -> Optional[str]:
219
  """Return matched name if recognised, None if face present but unknown.
220
 
221
+ Raises NoFaceDetected if the detector finds no face in the frame.
222
  """
223
+ detections = _detect(frame_bgr)
224
+ if not detections:
225
  raise NoFaceDetected()
226
 
227
+ # Use the first (highest-confidence) detection
228
+ (x, y, w, h), lm = detections[0]
229
+ emb = _embed(frame_bgr[y : y + h, x : x + w], lm)
230
 
231
  best_name, best_sim = None, -1.0
232
  for name, enc_list in db.items():
 
244
  return None
245
 
246
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
247
  def add_face(
248
  name: str,
249
  frame_bgr: np.ndarray,
 
251
  max_per_person: int = 5,
252
  ) -> None:
253
  """Embed and store the face from frame_bgr under name."""
254
+ detections = _detect(frame_bgr)
255
+ if not detections:
256
  raise ValueError("No face detected in enrollment image")
257
 
258
+ (x, y, w, h), lm = detections[0]
259
+ emb = _embed(frame_bgr[y : y + h, x : x + w], lm)
260
 
261
  db.setdefault(name, [])
262
  if len(db[name]) < max_per_person:
263
  db[name].append(emb.tolist())
264
  save(db)
265
+
266
+
267
def get_face_jpeg(frame_bgr: np.ndarray, padding: float = 0.4) -> Optional[bytes]:
    """Encode a padded crop around the first detected face as JPEG.

    The face box is grown by ``padding`` times its width/height on each
    side and clamped to the frame before cropping. Returns the JPEG bytes
    (quality 85), or None when no face is detected or encoding fails.
    """
    detections = _detect(frame_bgr)
    if not detections:
        return None
    left, top, box_w, box_h = detections[0][0]
    margin_x = int(box_w * padding)
    margin_y = int(box_h * padding)
    frame_h, frame_w = frame_bgr.shape[:2]
    rows = slice(max(0, top - margin_y), min(frame_h, top + box_h + margin_y))
    cols = slice(max(0, left - margin_x), min(frame_w, left + box_w + margin_x))
    ok, buf = cv2.imencode(
        ".jpg", frame_bgr[rows, cols], [cv2.IMWRITE_JPEG_QUALITY, 85]
    )
    if not ok:
        return None
    return bytes(buf)