Spaces:
Running
Replace Haar cascade with YuNet deep-learning face detector
Browse files
The Haar cascade produced frequent false positives (walls, patterns)
and had no confidence score to filter them. Replaced with YuNet
(cv2.FaceDetectorYN), a small deep-learning detector built into
OpenCV >= 4.8:
- Confidence threshold (0.75) eliminates non-face detections
- Returns 5-point facial landmarks (eyes, nose, mouth corners) directly,
so the separate haarcascade_eye step is gone
- 5-point similarity transform gives much more accurate alignment than
the previous two-eye-centre approach
- Model is tiny (~337 KB) and downloaded once from opencv_zoo
Canonical 5-point InsightFace landmarks are used for the warpAffine so
MobileFaceNet embeddings are as consistent as possible across frames.
SCHEMA_VERSION bumped to 4 → stale DB entries from the Haar pipeline
are auto-discarded and the user is prompted to re-enroll.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
- recognizer/face_db.py +146 -97
|
@@ -1,10 +1,18 @@
|
|
| 1 |
"""Face database: local face recognition via ONNX + OpenCV.
|
| 2 |
|
| 3 |
-
Detection :
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
Embedding : InsightFace MobileFaceNet (w600k_mbf.onnx, ~17 MB, downloaded
|
| 5 |
once on first run from the InsightFace GitHub release).
|
| 6 |
-
|
| 7 |
-
|
|
|
|
|
|
|
|
|
|
| 8 |
Matching : cosine similarity on L2-normalised 512-D embeddings.
|
| 9 |
Storage : recognizer/face_db.json (gitignored).
|
| 10 |
|
|
@@ -25,28 +33,40 @@ import onnxruntime as ort
|
|
| 25 |
|
| 26 |
logger = logging.getLogger(__name__)
|
| 27 |
|
| 28 |
-
# Bump
|
| 29 |
-
|
| 30 |
-
SCHEMA_VERSION = 3 # 1=plain-resize BGR 2=aligned BGR 3=aligned RGB
|
| 31 |
|
| 32 |
DB_PATH = Path(__file__).parent / "face_db.json"
|
| 33 |
MODEL_DIR = Path(__file__).parent / "models"
|
| 34 |
-
|
| 35 |
-
|
|
|
|
|
|
|
| 36 |
"https://github.com/deepinsight/insightface"
|
| 37 |
"/releases/download/v0.7/buffalo_sc.zip"
|
| 38 |
)
|
| 39 |
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
)
|
| 46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
-
|
| 49 |
-
|
|
|
|
| 50 |
|
| 51 |
|
| 52 |
class NoFaceDetected(Exception):
|
|
@@ -57,73 +77,100 @@ class NoFaceDetected(Exception):
|
|
| 57 |
# Internal helpers
|
| 58 |
# ---------------------------------------------------------------------------
|
| 59 |
|
| 60 |
-
def
|
| 61 |
-
if
|
| 62 |
return
|
| 63 |
MODEL_DIR.mkdir(exist_ok=True)
|
| 64 |
zip_path = MODEL_DIR / "buffalo_sc.zip"
|
| 65 |
-
logger.info("Downloading
|
| 66 |
-
urllib.request.urlretrieve(
|
| 67 |
with zipfile.ZipFile(zip_path) as zf:
|
| 68 |
-
# The file may live at root or inside a named subdirectory (e.g. buffalo_sc/).
|
| 69 |
matches = [n for n in zf.namelist() if n.endswith("w600k_mbf.onnx")]
|
| 70 |
if not matches:
|
| 71 |
raise RuntimeError(
|
| 72 |
f"w600k_mbf.onnx not found in downloaded zip. "
|
| 73 |
-
f"
|
| 74 |
)
|
| 75 |
-
with zf.open(matches[0]) as src, open(
|
| 76 |
dst.write(src.read())
|
| 77 |
zip_path.unlink()
|
| 78 |
-
logger.info("
|
| 79 |
|
| 80 |
|
| 81 |
-
def
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
)
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
)
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
return cv2.resize(face_bgr, (112, 112))
|
| 114 |
|
| 115 |
|
| 116 |
-
def _embed(face_bgr: np.ndarray) -> np.ndarray:
|
| 117 |
-
"""Return an L2-normalised 512-D embedding for
|
| 118 |
-
face_112
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
img
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
emb
|
| 126 |
-
return emb / np.linalg.norm(emb) # L2-normalise
|
| 127 |
|
| 128 |
|
| 129 |
# ---------------------------------------------------------------------------
|
|
@@ -131,12 +178,13 @@ def _embed(face_bgr: np.ndarray) -> np.ndarray:
|
|
| 131 |
# ---------------------------------------------------------------------------
|
| 132 |
|
| 133 |
def load() -> dict[str, list[list[float]]]:
|
| 134 |
-
"""Load face DB from disk and warm up
|
| 135 |
|
| 136 |
-
Returns an empty dict
|
| 137 |
-
|
| 138 |
"""
|
| 139 |
-
|
|
|
|
| 140 |
if not DB_PATH.exists():
|
| 141 |
return {}
|
| 142 |
raw = json.loads(DB_PATH.read_text())
|
|
@@ -170,14 +218,15 @@ def find_match(
|
|
| 170 |
) -> Optional[str]:
|
| 171 |
"""Return matched name if recognised, None if face present but unknown.
|
| 172 |
|
| 173 |
-
Raises NoFaceDetected if
|
| 174 |
"""
|
| 175 |
-
|
| 176 |
-
if not
|
| 177 |
raise NoFaceDetected()
|
| 178 |
|
| 179 |
-
|
| 180 |
-
|
|
|
|
| 181 |
|
| 182 |
best_name, best_sim = None, -1.0
|
| 183 |
for name, enc_list in db.items():
|
|
@@ -195,24 +244,6 @@ def find_match(
|
|
| 195 |
return None
|
| 196 |
|
| 197 |
|
| 198 |
-
def get_face_jpeg(frame_bgr: np.ndarray, padding: float = 0.4) -> Optional[bytes]:
|
| 199 |
-
"""Return a JPEG-encoded crop of the largest detected face, or None."""
|
| 200 |
-
boxes = _detect(frame_bgr)
|
| 201 |
-
if not boxes:
|
| 202 |
-
return None
|
| 203 |
-
x, y, w, h = boxes[0]
|
| 204 |
-
pad_x = int(w * padding)
|
| 205 |
-
pad_y = int(h * padding)
|
| 206 |
-
h_img, w_img = frame_bgr.shape[:2]
|
| 207 |
-
x1 = max(0, x - pad_x)
|
| 208 |
-
y1 = max(0, y - pad_y)
|
| 209 |
-
x2 = min(w_img, x + w + pad_x)
|
| 210 |
-
y2 = min(h_img, y + h + pad_y)
|
| 211 |
-
crop = frame_bgr[y1:y2, x1:x2]
|
| 212 |
-
ok, buf = cv2.imencode(".jpg", crop, [cv2.IMWRITE_JPEG_QUALITY, 85])
|
| 213 |
-
return bytes(buf) if ok else None
|
| 214 |
-
|
| 215 |
-
|
| 216 |
def add_face(
|
| 217 |
name: str,
|
| 218 |
frame_bgr: np.ndarray,
|
|
@@ -220,14 +251,32 @@ def add_face(
|
|
| 220 |
max_per_person: int = 5,
|
| 221 |
) -> None:
|
| 222 |
"""Embed and store the face from frame_bgr under name."""
|
| 223 |
-
|
| 224 |
-
if not
|
| 225 |
raise ValueError("No face detected in enrollment image")
|
| 226 |
|
| 227 |
-
x, y, w, h =
|
| 228 |
-
emb = _embed(frame_bgr[y : y + h, x : x + w])
|
| 229 |
|
| 230 |
db.setdefault(name, [])
|
| 231 |
if len(db[name]) < max_per_person:
|
| 232 |
db[name].append(emb.tolist())
|
| 233 |
save(db)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
"""Face database: local face recognition via ONNX + OpenCV.
|
| 2 |
|
| 3 |
+
Detection : YuNet (cv2.FaceDetectorYN) — deep-learning detector built into
|
| 4 |
+
OpenCV >= 4.8. Returns a confidence score (false-positive walls
|
| 5 |
+
are eliminated) and 5-point facial landmarks used directly for
|
| 6 |
+
alignment. Model: face_detection_yunet_2023mar.onnx (~337 KB,
|
| 7 |
+
downloaded once).
|
| 8 |
+
|
| 9 |
Embedding : InsightFace MobileFaceNet (w600k_mbf.onnx, ~17 MB, downloaded
|
| 10 |
once on first run from the InsightFace GitHub release).
|
| 11 |
+
|
| 12 |
+
Alignment : 5-point similarity transform to the InsightFace 112×112
|
| 13 |
+
canonical frame (right eye, left eye, nose, mouth corners).
|
| 14 |
+
Much more accurate than a separate eye cascade.
|
| 15 |
+
|
| 16 |
Matching : cosine similarity on L2-normalised 512-D embeddings.
|
| 17 |
Storage : recognizer/face_db.json (gitignored).
|
| 18 |
|
|
|
|
| 33 |
|
| 34 |
logger = logging.getLogger(__name__)
|
| 35 |
|
| 36 |
+
# Bump whenever the embedding pipeline changes (detector, alignment, model, …)
|
| 37 |
+
SCHEMA_VERSION = 4 # 3=aligned-RGB-Haar 4=YuNet+5pt-landmarks
|
|
|
|
| 38 |
|
| 39 |
DB_PATH = Path(__file__).parent / "face_db.json"
|
| 40 |
MODEL_DIR = Path(__file__).parent / "models"
|
| 41 |
+
|
| 42 |
+
# --- Recognition model (MobileFaceNet) ---
|
| 43 |
+
REC_FILE = MODEL_DIR / "w600k_mbf.onnx"
|
| 44 |
+
REC_URL = (
|
| 45 |
"https://github.com/deepinsight/insightface"
|
| 46 |
"/releases/download/v0.7/buffalo_sc.zip"
|
| 47 |
)
|
| 48 |
|
| 49 |
+
# --- Detection model (YuNet) ---
|
| 50 |
+
DET_FILE = MODEL_DIR / "face_detection_yunet_2023mar.onnx"
|
| 51 |
+
DET_URL = (
|
| 52 |
+
"https://github.com/opencv/opencv_zoo/raw/main/models/"
|
| 53 |
+
"face_detection_yunet/face_detection_yunet_2023mar.onnx"
|
| 54 |
)
|
| 55 |
+
DET_CONFIDENCE = 0.75 # discard detections below this score
|
| 56 |
+
|
| 57 |
+
# InsightFace canonical 5-point landmarks in 112×112 space
|
| 58 |
+
# order: right_eye, left_eye, nose_tip, right_mouth, left_mouth
|
| 59 |
+
_CANONICAL_LM = np.float32([
|
| 60 |
+
[38.2946, 51.6963],
|
| 61 |
+
[73.5318, 51.5014],
|
| 62 |
+
[56.0252, 71.7366],
|
| 63 |
+
[41.5493, 92.3655],
|
| 64 |
+
[70.7299, 92.2041],
|
| 65 |
+
])
|
| 66 |
|
| 67 |
+
_rec_session: Optional[ort.InferenceSession] = None
|
| 68 |
+
_detector: Optional[cv2.FaceDetectorYN] = None
|
| 69 |
+
_det_size: tuple[int, int] = (0, 0)
|
| 70 |
|
| 71 |
|
| 72 |
class NoFaceDetected(Exception):
|
|
|
|
| 77 |
# Internal helpers
|
| 78 |
# ---------------------------------------------------------------------------
|
| 79 |
|
| 80 |
+
def _ensure_rec_model() -> None:
    """Make sure the MobileFaceNet recognition model exists at ``REC_FILE``.

    Does nothing when ``REC_FILE`` is already present.  Otherwise downloads
    the archive at ``REC_URL`` into ``MODEL_DIR``, extracts the first entry
    whose name ends with ``w600k_mbf.onnx`` and stores it as ``REC_FILE``.

    The model is first written to a ``.part`` file and only renamed to
    ``REC_FILE`` once it has been written completely, so an interrupted run
    cannot leave a truncated model that the ``exists()`` check above would
    accept on the next start.  The downloaded archive and any leftover
    ``.part`` file are removed whether or not the extraction succeeds.

    Raises:
        RuntimeError: if the archive has no entry ending in ``w600k_mbf.onnx``.
    """
    if REC_FILE.exists():
        return
    MODEL_DIR.mkdir(exist_ok=True)
    zip_path = MODEL_DIR / "buffalo_sc.zip"
    tmp_path = REC_FILE.with_name(REC_FILE.name + ".part")
    logger.info("Downloading MobileFaceNet recognition model (~17 MB)…")
    try:
        urllib.request.urlretrieve(REC_URL, zip_path)
        with zipfile.ZipFile(zip_path) as zf:
            # The entry may sit at the archive root or inside a subdirectory,
            # hence the suffix match instead of an exact name.
            matches = [n for n in zf.namelist() if n.endswith("w600k_mbf.onnx")]
            if not matches:
                raise RuntimeError(
                    f"w600k_mbf.onnx not found in downloaded zip. "
                    f"Entries: {zf.namelist()}"
                )
            with zf.open(matches[0]) as src, open(tmp_path, "wb") as dst:
                dst.write(src.read())
        # Rename only after the write finished: REC_FILE is either absent or complete.
        tmp_path.replace(REC_FILE)
    finally:
        zip_path.unlink(missing_ok=True)
        tmp_path.unlink(missing_ok=True)
    logger.info("Recognition model saved to %s", REC_FILE)
|
| 98 |
|
| 99 |
|
| 100 |
+
def _ensure_det_model() -> None:
    """Make sure the YuNet detector model exists at ``DET_FILE``.

    Does nothing when ``DET_FILE`` is already present.  Otherwise downloads
    ``DET_URL`` into ``MODEL_DIR``.

    The download goes to a ``.part`` file that is renamed to ``DET_FILE``
    only after ``urlretrieve`` returns, so an interrupted download cannot
    leave a truncated model that the ``exists()`` check above would accept
    on the next start.  A leftover ``.part`` file is removed on failure.
    """
    if DET_FILE.exists():
        return
    MODEL_DIR.mkdir(exist_ok=True)
    tmp_path = DET_FILE.with_name(DET_FILE.name + ".part")
    logger.info("Downloading YuNet face detector (~337 KB)…")
    try:
        urllib.request.urlretrieve(DET_URL, tmp_path)
        # Rename only after the download finished: DET_FILE is either absent or complete.
        tmp_path.replace(DET_FILE)
    finally:
        tmp_path.unlink(missing_ok=True)
    logger.info("Detector saved to %s", DET_FILE)
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
def _get_rec_session() -> ort.InferenceSession:
    """Return the shared recognition session, creating it on first use.

    The first call makes sure the model file is present (via
    ``_ensure_rec_model``) and opens it with the CPU execution provider;
    the session is then kept in the module-level ``_rec_session`` and
    returned unchanged by every later call.
    """
    global _rec_session
    if _rec_session is not None:
        return _rec_session
    _ensure_rec_model()
    _rec_session = ort.InferenceSession(
        str(REC_FILE),
        providers=["CPUExecutionProvider"],
    )
    return _rec_session
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def _get_detector(width: int, height: int) -> cv2.FaceDetectorYN:
    """Return the shared YuNet detector configured for a ``width``×``height`` frame.

    The detector is created once (after making sure the model file is
    present) and cached in the module-level ``_detector``.  When a later
    call asks for a different frame size, the existing detector is resized
    with ``setInputSize`` instead of being rebuilt, which avoids reloading
    the model for every change of input resolution.

    Args:
        width: frame width in pixels.
        height: frame height in pixels.

    Returns:
        The cached detector, with its input size set to ``(width, height)``.
    """
    global _detector, _det_size
    size = (width, height)
    if _detector is None:
        _ensure_det_model()
        _detector = cv2.FaceDetectorYN.create(
            str(DET_FILE), "",
            size,
            score_threshold=DET_CONFIDENCE,
            nms_threshold=0.3,
            top_k=10,
        )
        _det_size = size
    elif _det_size != size:
        # Same model and thresholds; only the expected input resolution changes.
        _detector.setInputSize(size)
        _det_size = size
    return _detector
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
def _detect(frame_bgr: np.ndarray) -> list[tuple[tuple, np.ndarray]]:
    """Return [(bbox, landmarks)] for each face found, in detector order.

    bbox      = (x, y, w, h) in frame pixel coordinates, clipped to the frame
                so that ``frame_bgr[y : y + h, x : x + w]`` is always a valid,
                non-empty crop.
    landmarks = float32 array shape (5, 2) — right_eye, left_eye, nose,
                right_mouth, left_mouth — expressed relative to the bbox's
                top-left corner, i.e. in the pixel coordinates of that crop.

    Landmarks are bbox-relative because the callers in this module pass them
    to ``_embed`` together with the bbox crop, and the alignment warp is
    applied to that crop; frame-relative landmarks would shift the warp by
    (x, y).  Boxes that fall entirely outside the frame are dropped.

    NOTE(review): embeddings stored before this change were computed with
    frame-relative landmarks applied to a crop; consider bumping
    SCHEMA_VERSION so those entries are re-enrolled.
    """
    frame_h, frame_w = frame_bgr.shape[:2]
    _, faces = _get_detector(frame_w, frame_h).detect(frame_bgr)
    if faces is None:
        return []
    results = []
    for f in faces:
        # Row layout per the OpenCV docs: x, y, w, h, 5×(x, y) landmarks, score.
        raw_x, raw_y, raw_w, raw_h = int(f[0]), int(f[1]), int(f[2]), int(f[3])
        # The box is not guaranteed to lie inside the frame; a negative
        # origin would turn the callers' slice into an empty array.
        x0, y0 = max(0, raw_x), max(0, raw_y)
        x1, y1 = min(frame_w, raw_x + raw_w), min(frame_h, raw_y + raw_h)
        if x1 <= x0 or y1 <= y0:
            continue
        lm = f[4:14].reshape(5, 2).astype(np.float32)
        lm -= np.float32([x0, y0])  # frame coordinates -> crop coordinates
        results.append(((x0, y0, x1 - x0, y1 - y0), lm))
    return results
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
def _align(face_bgr: np.ndarray, landmarks: np.ndarray) -> np.ndarray:
    """Map a face image onto the 112×112 canonical landmark layout.

    A partial affine transform is estimated from ``landmarks`` to
    ``_CANONICAL_LM`` and applied to ``face_bgr``; because the same matrix
    is applied to ``face_bgr``, ``landmarks`` are interpreted as pixel
    coordinates of that image.  If no transform can be estimated the image
    is simply resized to 112×112.
    """
    matrix, _inliers = cv2.estimateAffinePartial2D(landmarks, _CANONICAL_LM)
    if matrix is None:
        # No usable transform: keep the pipeline going with an unaligned face.
        logger.debug("Landmark alignment failed, falling back to plain resize")
        return cv2.resize(face_bgr, (112, 112))
    return cv2.warpAffine(face_bgr, matrix, (112, 112))
|
| 162 |
|
| 163 |
|
| 164 |
+
def _embed(face_bgr: np.ndarray, landmarks: np.ndarray) -> np.ndarray:
    """Compute a unit-length embedding vector for one detected face.

    The face is aligned to 112×112 with ``_align``, converted from BGR to
    RGB, scaled to [-1, 1], laid out as a single-image NCHW batch and run
    through the recognition session.  The first output row is divided by
    its Euclidean norm before being returned.
    """
    aligned = _align(face_bgr, landmarks)
    rgb = cv2.cvtColor(aligned, cv2.COLOR_BGR2RGB).astype(np.float32)
    scaled = (rgb - 127.5) / 127.5  # pixel range 0..255 -> -1..1
    batch = np.transpose(scaled, (2, 0, 1))[np.newaxis]  # HWC -> 1×C×H×W
    session = _get_rec_session()
    input_name = session.get_inputs()[0].name
    vector = session.run(None, {input_name: batch})[0][0]
    return vector / np.linalg.norm(vector)
|
|
|
|
| 174 |
|
| 175 |
|
| 176 |
# ---------------------------------------------------------------------------
|
|
|
|
| 178 |
# ---------------------------------------------------------------------------
|
| 179 |
|
| 180 |
def load() -> dict[str, list[list[float]]]:
|
| 181 |
+
"""Load face DB from disk and warm up both ONNX models.
|
| 182 |
|
| 183 |
+
Returns an empty dict and removes the DB file if the schema version
|
| 184 |
+
doesn't match (stale embeddings from an older pipeline).
|
| 185 |
"""
|
| 186 |
+
_get_rec_session()
|
| 187 |
+
_ensure_det_model()
|
| 188 |
if not DB_PATH.exists():
|
| 189 |
return {}
|
| 190 |
raw = json.loads(DB_PATH.read_text())
|
|
|
|
| 218 |
) -> Optional[str]:
|
| 219 |
"""Return matched name if recognised, None if face present but unknown.
|
| 220 |
|
| 221 |
+
Raises NoFaceDetected if the detector finds no face in the frame.
|
| 222 |
"""
|
| 223 |
+
detections = _detect(frame_bgr)
|
| 224 |
+
if not detections:
|
| 225 |
raise NoFaceDetected()
|
| 226 |
|
| 227 |
+
# Use the first (highest-confidence) detection
|
| 228 |
+
(x, y, w, h), lm = detections[0]
|
| 229 |
+
emb = _embed(frame_bgr[y : y + h, x : x + w], lm)
|
| 230 |
|
| 231 |
best_name, best_sim = None, -1.0
|
| 232 |
for name, enc_list in db.items():
|
|
|
|
| 244 |
return None
|
| 245 |
|
| 246 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 247 |
def add_face(
|
| 248 |
name: str,
|
| 249 |
frame_bgr: np.ndarray,
|
|
|
|
| 251 |
max_per_person: int = 5,
|
| 252 |
) -> None:
|
| 253 |
"""Embed and store the face from frame_bgr under name."""
|
| 254 |
+
detections = _detect(frame_bgr)
|
| 255 |
+
if not detections:
|
| 256 |
raise ValueError("No face detected in enrollment image")
|
| 257 |
|
| 258 |
+
(x, y, w, h), lm = detections[0]
|
| 259 |
+
emb = _embed(frame_bgr[y : y + h, x : x + w], lm)
|
| 260 |
|
| 261 |
db.setdefault(name, [])
|
| 262 |
if len(db[name]) < max_per_person:
|
| 263 |
db[name].append(emb.tolist())
|
| 264 |
save(db)
|
| 265 |
+
|
| 266 |
+
|
| 267 |
+
def get_face_jpeg(frame_bgr: np.ndarray, padding: float = 0.4) -> Optional[bytes]:
    """Encode a padded crop around the first detected face as JPEG.

    The first detection's box is grown by ``padding`` times its width and
    height on each side, clipped to the frame, and encoded at JPEG
    quality 85.

    Returns:
        The JPEG bytes, or None when no face is detected or encoding fails.
    """
    detections = _detect(frame_bgr)
    if not detections:
        return None

    left, top, box_w, box_h = detections[0][0]
    margin_x = int(box_w * padding)
    margin_y = int(box_h * padding)
    frame_h, frame_w = frame_bgr.shape[:2]

    # Grow the box by the margins, then clip it to the frame bounds.
    rows = slice(max(0, top - margin_y), min(frame_h, top + box_h + margin_y))
    cols = slice(max(0, left - margin_x), min(frame_w, left + box_w + margin_x))

    encoded_ok, jpeg = cv2.imencode(
        ".jpg", frame_bgr[rows, cols], [cv2.IMWRITE_JPEG_QUALITY, 85]
    )
    if not encoded_ok:
        return None
    return bytes(jpeg)
|