Spaces:

onitsche
/

recognizer

Running

Oliver Nitsche, Claude Sonnet 4.6 committed 28 days ago

Commit

f7c60d9

1 Parent(s): f4e577f

Replace Haar cascade with YuNet deep-learning face detector

The Haar cascade produced frequent false positives (walls, patterns)
and had no confidence score to filter them. Replaced with YuNet
(cv2.FaceDetectorYN), a small deep-learning detector built into
OpenCV >= 4.8:

- Confidence threshold (0.75) filters out low-scoring, non-face detections
- Returns 5-point facial landmarks (eyes, nose, mouth corners) directly,
  so the separate haarcascade_eye step is gone
- 5-point similarity transform gives much more accurate alignment than
  the previous two-eye-centre approach
- Model is tiny (~337 KB) and downloaded once from opencv_zoo

The canonical 5-point InsightFace landmarks are used as the warpAffine
target so that MobileFaceNet embeddings are as consistent as possible
across frames.

SCHEMA_VERSION is bumped to 4, so stale DB entries from the Haar pipeline
are auto-discarded and the user is prompted to re-enroll.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (1) hide show

recognizer/face_db.py +146 -97

recognizer/face_db.py CHANGED Viewed

@@ -1,10 +1,18 @@
 """Face database: local face recognition via ONNX + OpenCV.
-Detection : OpenCV Haar cascade (built into opencv, no download).
 Embedding : InsightFace MobileFaceNet (w600k_mbf.onnx, ~17 MB, downloaded
             once on first run from the InsightFace GitHub release).
-Alignment : eye-centre similarity transform to the InsightFace 112×112
-            canonical frame before embedding (plain resize fallback).
 Matching  : cosine similarity on L2-normalised 512-D embeddings.
 Storage   : recognizer/face_db.json (gitignored).
@@ -25,28 +33,40 @@ import onnxruntime as ort
 logger = logging.getLogger(__name__)
-# Bump this whenever the embedding pipeline changes (alignment, colour space,
-# model weights, normalisation, …). Mismatched DBs are auto-cleared on load.
-SCHEMA_VERSION = 3   # 1=plain-resize BGR  2=aligned BGR  3=aligned RGB
 DB_PATH    = Path(__file__).parent / "face_db.json"
 MODEL_DIR  = Path(__file__).parent / "models"
-MODEL_FILE = MODEL_DIR / "w600k_mbf.onnx"
-MODEL_URL  = (
     "https://github.com/deepinsight/insightface"
     "/releases/download/v0.7/buffalo_sc.zip"
 )
-_CASCADE = cv2.CascadeClassifier(
-    cv2.data.haarcascades + "haarcascade_frontalface_default.xml"
-)
-_EYE_CASCADE = cv2.CascadeClassifier(
-    cv2.data.haarcascades + "haarcascade_eye.xml"
 )
-_session: Optional[ort.InferenceSession] = None
-# Standard InsightFace eye positions in the 112×112 aligned face
-_ALIGN_DST = np.float32([[38.29, 51.70], [73.53, 51.50]])
 class NoFaceDetected(Exception):
@@ -57,73 +77,100 @@ class NoFaceDetected(Exception):
 # Internal helpers
 # ---------------------------------------------------------------------------
-def _ensure_model() -> None:
-    if MODEL_FILE.exists():
         return
     MODEL_DIR.mkdir(exist_ok=True)
     zip_path = MODEL_DIR / "buffalo_sc.zip"
-    logger.info("Downloading face recognition model (~17 MB) — one-time setup…")
-    urllib.request.urlretrieve(MODEL_URL, zip_path)
     with zipfile.ZipFile(zip_path) as zf:
-        # The file may live at root or inside a named subdirectory (e.g. buffalo_sc/).
         matches = [n for n in zf.namelist() if n.endswith("w600k_mbf.onnx")]
         if not matches:
             raise RuntimeError(
                 f"w600k_mbf.onnx not found in downloaded zip. "
-                f"Available entries: {zf.namelist()}"
             )
-        with zf.open(matches[0]) as src, open(MODEL_FILE, "wb") as dst:
             dst.write(src.read())
     zip_path.unlink()
-    logger.info("Model saved to %s", MODEL_FILE)
-def _get_session() -> ort.InferenceSession:
-    global _session
-    if _session is None:
-        _ensure_model()
-        _session = ort.InferenceSession(
-            str(MODEL_FILE), providers=["CPUExecutionProvider"]
         )
-    return _session
-def _detect(frame_bgr: np.ndarray) -> list[tuple[int, int, int, int]]:
-    gray = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2GRAY)
-    boxes = _CASCADE.detectMultiScale(
-        gray, scaleFactor=1.1, minNeighbors=3, minSize=(40, 40)
-    )
-    return [tuple(b) for b in boxes] if len(boxes) > 0 else []
-def _align(face_bgr: np.ndarray) -> np.ndarray:
-    """Return a 112×112 crop aligned on eye centres; plain resize as fallback."""
-    gray = cv2.cvtColor(face_bgr, cv2.COLOR_BGR2GRAY)
-    eyes = _EYE_CASCADE.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=3)
-    if len(eyes) >= 2:
-        eyes = sorted(eyes, key=lambda e: e[2] * e[3], reverse=True)[:2]
-        eyes = sorted(eyes, key=lambda e: e[0])
-        src = np.float32([
-            [eyes[0][0] + eyes[0][2] / 2, eyes[0][1] + eyes[0][3] / 2],
-            [eyes[1][0] + eyes[1][2] / 2, eyes[1][1] + eyes[1][3] / 2],
-        ])
-        M, _ = cv2.estimateAffinePartial2D(src, _ALIGN_DST)
-        if M is not None:
-            return cv2.warpAffine(face_bgr, M, (112, 112))
     return cv2.resize(face_bgr, (112, 112))
-def _embed(face_bgr: np.ndarray) -> np.ndarray:
-    """Return an L2-normalised 512-D embedding for face_bgr."""
-    face_112 = _align(face_bgr)
-    # MobileFaceNet (InsightFace) is trained on RGB — convert from OpenCV BGR.
-    face_rgb = cv2.cvtColor(face_112, cv2.COLOR_BGR2RGB)
-    img = face_rgb.astype(np.float32)
-    img = (img - 127.5) / 127.5          # normalise to [-1, 1]
-    inp = np.transpose(img, (2, 0, 1))[np.newaxis]   # NCHW
-    sess = _get_session()
-    emb = sess.run(None, {sess.get_inputs()[0].name: inp})[0][0]
-    return emb / np.linalg.norm(emb)     # L2-normalise
 # ---------------------------------------------------------------------------
@@ -131,12 +178,13 @@ def _embed(face_bgr: np.ndarray) -> np.ndarray:
 # ---------------------------------------------------------------------------
 def load() -> dict[str, list[list[float]]]:
-    """Load face DB from disk and warm up the ONNX session.
-    Returns an empty dict if the DB is missing or was produced by an older
-    embedding pipeline (schema mismatch → auto-clear).
     """
-    _get_session()
     if not DB_PATH.exists():
         return {}
     raw = json.loads(DB_PATH.read_text())
@@ -170,14 +218,15 @@ def find_match(
 ) -> Optional[str]:
     """Return matched name if recognised, None if face present but unknown.
-    Raises NoFaceDetected if no face is detected in the frame at all.
     """
-    boxes = _detect(frame_bgr)
-    if not boxes:
         raise NoFaceDetected()
-    x, y, w, h = boxes[0]
-    emb = _embed(frame_bgr[y : y + h, x : x + w])
     best_name, best_sim = None, -1.0
     for name, enc_list in db.items():
@@ -195,24 +244,6 @@ def find_match(
     return None
-def get_face_jpeg(frame_bgr: np.ndarray, padding: float = 0.4) -> Optional[bytes]:
-    """Return a JPEG-encoded crop of the largest detected face, or None."""
-    boxes = _detect(frame_bgr)
-    if not boxes:
-        return None
-    x, y, w, h = boxes[0]
-    pad_x = int(w * padding)
-    pad_y = int(h * padding)
-    h_img, w_img = frame_bgr.shape[:2]
-    x1 = max(0, x - pad_x)
-    y1 = max(0, y - pad_y)
-    x2 = min(w_img, x + w + pad_x)
-    y2 = min(h_img, y + h + pad_y)
-    crop = frame_bgr[y1:y2, x1:x2]
-    ok, buf = cv2.imencode(".jpg", crop, [cv2.IMWRITE_JPEG_QUALITY, 85])
-    return bytes(buf) if ok else None
 def add_face(
     name: str,
     frame_bgr: np.ndarray,
@@ -220,14 +251,32 @@ def add_face(
     max_per_person: int = 5,
 ) -> None:
     """Embed and store the face from frame_bgr under name."""
-    boxes = _detect(frame_bgr)
-    if not boxes:
         raise ValueError("No face detected in enrollment image")
-    x, y, w, h = boxes[0]
-    emb = _embed(frame_bgr[y : y + h, x : x + w])
     db.setdefault(name, [])
     if len(db[name]) < max_per_person:
         db[name].append(emb.tolist())
     save(db)

 """Face database: local face recognition via ONNX + OpenCV.
+Detection : YuNet (cv2.FaceDetectorYN) — deep-learning detector built into
+            OpenCV >= 4.8.  Returns a confidence score (false-positive walls
+            are eliminated) and 5-point facial landmarks used directly for
+            alignment.  Model: face_detection_yunet_2023mar.onnx (~337 KB,
+            downloaded once).
 Embedding : InsightFace MobileFaceNet (w600k_mbf.onnx, ~17 MB, downloaded
             once on first run from the InsightFace GitHub release).
+Alignment : 5-point similarity transform to the InsightFace 112×112
+            canonical frame (right eye, left eye, nose, mouth corners).
+            Much more accurate than a separate eye cascade.
 Matching  : cosine similarity on L2-normalised 512-D embeddings.
 Storage   : recognizer/face_db.json (gitignored).
 logger = logging.getLogger(__name__)
+# Bump whenever the embedding pipeline changes (detector, alignment, model, …)
+SCHEMA_VERSION = 4   # 3=aligned-RGB-Haar  4=YuNet+5pt-landmarks
 DB_PATH    = Path(__file__).parent / "face_db.json"
 MODEL_DIR  = Path(__file__).parent / "models"
+# --- Recognition model (MobileFaceNet) ---
+REC_FILE = MODEL_DIR / "w600k_mbf.onnx"
+REC_URL  = (
     "https://github.com/deepinsight/insightface"
     "/releases/download/v0.7/buffalo_sc.zip"
 )
+# --- Detection model (YuNet) ---
+DET_FILE = MODEL_DIR / "face_detection_yunet_2023mar.onnx"
+DET_URL  = (
+    "https://github.com/opencv/opencv_zoo/raw/main/models/"
+    "face_detection_yunet/face_detection_yunet_2023mar.onnx"
 )
+DET_CONFIDENCE = 0.75   # discard detections below this score
+# InsightFace canonical 5-point landmarks in 112×112 space
+# order: right_eye, left_eye, nose_tip, right_mouth, left_mouth
+_CANONICAL_LM = np.float32([
+    [38.2946, 51.6963],
+    [73.5318, 51.5014],
+    [56.0252, 71.7366],
+    [41.5493, 92.3655],
+    [70.7299, 92.2041],
+])
+_rec_session: Optional[ort.InferenceSession] = None
+_detector:    Optional[cv2.FaceDetectorYN]   = None
+_det_size:    tuple[int, int]                = (0, 0)
 class NoFaceDetected(Exception):
 # Internal helpers
 # ---------------------------------------------------------------------------
+def _ensure_rec_model() -> None:
+    if REC_FILE.exists():
         return
     MODEL_DIR.mkdir(exist_ok=True)
     zip_path = MODEL_DIR / "buffalo_sc.zip"
+    logger.info("Downloading MobileFaceNet recognition model (~17 MB)…")
+    urllib.request.urlretrieve(REC_URL, zip_path)
     with zipfile.ZipFile(zip_path) as zf:
         matches = [n for n in zf.namelist() if n.endswith("w600k_mbf.onnx")]
         if not matches:
             raise RuntimeError(
                 f"w600k_mbf.onnx not found in downloaded zip. "
+                f"Entries: {zf.namelist()}"
             )
+        with zf.open(matches[0]) as src, open(REC_FILE, "wb") as dst:
             dst.write(src.read())
     zip_path.unlink()
+    logger.info("Recognition model saved to %s", REC_FILE)
+def _ensure_det_model() -> None:
+    if DET_FILE.exists():
+        return
+    MODEL_DIR.mkdir(exist_ok=True)
+    logger.info("Downloading YuNet face detector (~337 KB)…")
+    urllib.request.urlretrieve(DET_URL, DET_FILE)
+    logger.info("Detector saved to %s", DET_FILE)
+def _get_rec_session() -> ort.InferenceSession:
+    global _rec_session
+    if _rec_session is None:
+        _ensure_rec_model()
+        _rec_session = ort.InferenceSession(
+            str(REC_FILE), providers=["CPUExecutionProvider"]
+        )
+    return _rec_session
+def _get_detector(width: int, height: int) -> cv2.FaceDetectorYN:
+    global _detector, _det_size
+    _ensure_det_model()
+    if _detector is None or _det_size != (width, height):
+        _detector = cv2.FaceDetectorYN.create(
+            str(DET_FILE), "",
+            (width, height),
+            score_threshold=DET_CONFIDENCE,
+            nms_threshold=0.3,
+            top_k=10,
         )
+        _det_size = (width, height)
+    return _detector
+def _detect(frame_bgr: np.ndarray) -> list[tuple[tuple, np.ndarray]]:
+    """Return [(bbox, landmarks)] for each face found.
+    bbox     = (x, y, w, h)  in pixel coordinates
+    landmarks = float32 array shape (5, 2) — right_eye, left_eye, nose,
+                right_mouth, left_mouth
+    """
+    h, w = frame_bgr.shape[:2]
+    det = _get_detector(w, h)
+    _, faces = det.detect(frame_bgr)
+    if faces is None:
+        return []
+    results = []
+    for f in faces:
+        bbox = (int(f[0]), int(f[1]), int(f[2]), int(f[3]))
+        lm   = f[4:14].reshape(5, 2).astype(np.float32)
+        results.append((bbox, lm))
+    return results
+def _align(face_bgr: np.ndarray, landmarks: np.ndarray) -> np.ndarray:
+    """Warp face to InsightFace 112×112 canonical frame using 5-pt landmarks."""
+    M, _ = cv2.estimateAffinePartial2D(landmarks, _CANONICAL_LM)
+    if M is not None:
+        return cv2.warpAffine(face_bgr, M, (112, 112))
+    # Fallback: plain resize (should rarely happen)
+    logger.debug("Landmark alignment failed, falling back to plain resize")
     return cv2.resize(face_bgr, (112, 112))
+def _embed(face_bgr: np.ndarray, landmarks: np.ndarray) -> np.ndarray:
+    """Return an L2-normalised 512-D embedding for a detected face."""
+    face_112  = _align(face_bgr, landmarks)
+    face_rgb  = cv2.cvtColor(face_112, cv2.COLOR_BGR2RGB)
+    img       = face_rgb.astype(np.float32)
+    img       = (img - 127.5) / 127.5          # normalise to [-1, 1]
+    inp       = np.transpose(img, (2, 0, 1))[np.newaxis]   # NCHW
+    sess      = _get_rec_session()
+    emb       = sess.run(None, {sess.get_inputs()[0].name: inp})[0][0]
+    return emb / np.linalg.norm(emb)
 # ---------------------------------------------------------------------------
 # ---------------------------------------------------------------------------
 def load() -> dict[str, list[list[float]]]:
+    """Load face DB from disk and warm up both ONNX models.
+    Returns an empty dict and removes the DB file if the schema version
+    doesn't match (stale embeddings from an older pipeline).
     """
+    _get_rec_session()
+    _ensure_det_model()
     if not DB_PATH.exists():
         return {}
     raw = json.loads(DB_PATH.read_text())
 ) -> Optional[str]:
     """Return matched name if recognised, None if face present but unknown.
+    Raises NoFaceDetected if the detector finds no face in the frame.
     """
+    detections = _detect(frame_bgr)
+    if not detections:
         raise NoFaceDetected()
+    # Use the first (highest-confidence) detection
+    (x, y, w, h), lm = detections[0]
+    emb = _embed(frame_bgr[y : y + h, x : x + w], lm)
     best_name, best_sim = None, -1.0
     for name, enc_list in db.items():
     return None
 def add_face(
     name: str,
     frame_bgr: np.ndarray,
     max_per_person: int = 5,
 ) -> None:
     """Embed and store the face from frame_bgr under name."""
+    detections = _detect(frame_bgr)
+    if not detections:
         raise ValueError("No face detected in enrollment image")
+    (x, y, w, h), lm = detections[0]
+    emb = _embed(frame_bgr[y : y + h, x : x + w], lm)
     db.setdefault(name, [])
     if len(db[name]) < max_per_person:
         db[name].append(emb.tolist())
     save(db)
+def get_face_jpeg(frame_bgr: np.ndarray, padding: float = 0.4) -> Optional[bytes]:
+    """Return a JPEG-encoded crop of the best detected face, or None."""
+    detections = _detect(frame_bgr)
+    if not detections:
+        return None
+    x, y, w, h = detections[0][0]
+    pad_x = int(w * padding)
+    pad_y = int(h * padding)
+    h_img, w_img = frame_bgr.shape[:2]
+    x1 = max(0, x - pad_x)
+    y1 = max(0, y - pad_y)
+    x2 = min(w_img, x + w + pad_x)
+    y2 = min(h_img, y + h + pad_y)
+    crop = frame_bgr[y1:y2, x1:x2]
+    ok, buf = cv2.imencode(".jpg", crop, [cv2.IMWRITE_JPEG_QUALITY, 85])
+    return bytes(buf) if ok else None