"""
Pickle-free face embedding storage for Hugging Face Hub (avoids HF Picklescan blocks on .npy).

Format (.f32emb):
    magic   8 bytes                b'CEPF32E1'
    n_vec   uint32                 number of embedding vectors
    dim     uint32                 floats per vector (typically 512)
    data    float32[n_vec * dim]   little-endian
"""
from __future__ import annotations

import logging
import os
import struct
from typing import Iterable, Iterator

import numpy as np
|
|
# Module-level logger; load/export helpers report recoverable failures here.
logger = logging.getLogger(__name__)

# 8-byte signature that opens every .f32emb file.
MAGIC = b"CEPF32E1"
# Fixed 16-byte header: the 8-byte magic followed by two uint32 counts
# (n_vec, dim). "<" selects little-endian with no alignment padding.
HEADER = struct.Struct("<8sII")
# File extension used for the raw float32 embedding format.
EMB_SUFFIX = ".f32emb"
|
|
|
|
def save_f32emb(path: str, embeddings: np.ndarray) -> None:
    """Write embeddings to ``path`` as raw float32 (Hub-safe, no pickle).

    Args:
        path: Destination file. Missing parent directories are created.
        embeddings: A single vector (1D) or a stack of vectors (2D); any
            input that ``np.asarray`` can convert to float32 is accepted.

    Raises:
        ValueError: If the converted array is neither 1D nor 2D.
    """
    # "<f4" pins the payload to little-endian float32, as the file format
    # requires; a bare np.float32 would follow the host byte order instead.
    arr = np.asarray(embeddings, dtype="<f4")
    if arr.ndim == 1:
        # A lone vector is stored as a one-row matrix.
        arr = arr.reshape(1, -1)
    elif arr.ndim != 2:
        raise ValueError(f"Expected 1D or 2D embedding array, got shape {arr.shape}")
    n_vec, dim = arr.shape
    # Pack the header before touching the filesystem so that a count that
    # does not fit in uint32 fails without leaving an empty file behind.
    header = HEADER.pack(MAGIC, n_vec, dim)
    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
    with open(path, "wb") as fh:
        fh.write(header)
        fh.write(arr.tobytes(order="C"))
|
|
|
|
def load_f32emb(path: str) -> np.ndarray:
    """Read an ``.f32emb`` file and return its embeddings.

    Args:
        path: File to read.

    Returns:
        A writable, native-byte-order float32 array. Its shape is ``(dim,)``
        when the file holds exactly one vector and ``(n_vec, dim)`` otherwise.

    Raises:
        ValueError: If the header or payload is truncated, or if the magic
            bytes do not match.
    """
    with open(path, "rb") as fh:
        header = fh.read(HEADER.size)
        if len(header) != HEADER.size:
            raise ValueError(f"Truncated embedding file: {path}")
        magic, n_vec, dim = HEADER.unpack(header)
        if magic != MAGIC:
            raise ValueError(f"Bad magic in {path}")
        expected = n_vec * dim * 4
        # Compare against the real file size before reading, so a corrupt
        # header cannot request an enormous read buffer.
        available = os.fstat(fh.fileno()).st_size - HEADER.size
        if available < expected:
            raise ValueError(
                f"Truncated embedding file: {path} "
                f"(expected {expected} payload bytes, found {available})"
            )
        raw = fh.read(expected)
    # Decode as little-endian per the format, then copy into a native float32
    # array; the copy is writable, unlike the frombuffer view over bytes.
    arr = np.frombuffer(raw, dtype="<f4").reshape(n_vec, dim).astype(np.float32)
    return arr[0] if n_vec == 1 else arr
|
|
|
|
def load_embedding(path_base: str) -> np.ndarray | None:
    """Load the embedding stored at ``path_base`` (a path without extension).

    ``<path_base>.f32emb`` is tried first; ``<path_base>.npy`` is the fallback
    when the former is missing or fails to load. Load failures are logged as
    warnings instead of being raised.

    Returns:
        The loaded array, or ``None`` when neither file could be read.
    """
    f32 = f"{path_base}{EMB_SUFFIX}"
    if os.path.isfile(f32):
        try:
            return load_f32emb(f32)
        except Exception as exc:  # best-effort: fall through to the .npy copy
            logger.warning("Failed loading %s: %s", f32, exc)

    npy = f"{path_base}.npy"
    if os.path.isfile(npy):
        try:
            # Refuse pickled payloads explicitly instead of relying on the
            # installed NumPy's default.
            return np.load(npy, allow_pickle=False)
        except Exception as exc:  # best-effort: report and return None
            logger.warning("Failed loading %s: %s", npy, exc)
    return None
|
|
|
|
def save_embeddings(name: str, emb_root: str, embeddings: np.ndarray) -> None:
    """Store ``embeddings`` for ``name`` under ``emb_root`` in both formats.

    Two sibling files are written: ``<name>.f32emb`` (Hub-safe) and
    ``<name>.npy`` (local dev compatibility). ``emb_root`` is created when
    it does not exist yet.
    """
    os.makedirs(emb_root, exist_ok=True)
    stem = os.path.join(emb_root, name)
    f32_path = f"{stem}{EMB_SUFFIX}"
    npy_path = f"{stem}.npy"
    # The raw float32 file goes first, then the NumPy copy of the input.
    save_f32emb(f32_path, embeddings)
    np.save(npy_path, np.asarray(embeddings))
|
|
|
|
def iter_embedding_files(folder: str) -> Iterator[tuple[str, str]]:
    """Yield ``(person_name, full_path)`` for each embedding in ``folder``.

    Files ending in ``.f32emb`` or ``.npy`` are considered; the name is the
    file name with that suffix removed. Each name is yielded once. When both
    formats exist for a name, the ``.f32emb`` path is the one reported.
    Directory entries are sorted, so the yield order is deterministic.
    Nothing is yielded when ``folder`` is not a directory.
    """
    if not os.path.isdir(folder):
        return
    chosen: dict[str, str] = {}
    for fname in sorted(os.listdir(folder)):
        if fname.endswith(EMB_SUFFIX):
            # .f32emb always wins, even if a .npy was recorded earlier.
            chosen[fname[: -len(EMB_SUFFIX)]] = fname
        elif fname.endswith(".npy"):
            # .npy is only used when no .f32emb has claimed the name.
            chosen.setdefault(fname[:-4], fname)
    for name, fname in chosen.items():
        yield name, os.path.join(folder, fname)
|
|
|
|
def export_npy_tree_to_f32emb(root: str) -> int:
    """Convert every ``.npy`` file under ``root`` to a sibling ``.f32emb`` file.

    The tree is walked recursively. A file that cannot be loaded or written
    is logged as a warning and skipped, so one bad file does not stop the
    export.

    Returns:
        The number of files converted; ``0`` when ``root`` is not a directory.
    """
    if not os.path.isdir(root):
        return 0
    converted = 0
    for dirpath, _, files in os.walk(root):
        for fname in files:
            if not fname.endswith(".npy"):
                continue
            npy_path = os.path.join(dirpath, fname)
            f32_path = npy_path[:-4] + EMB_SUFFIX
            try:
                # Refuse pickled payloads explicitly instead of relying on
                # the installed NumPy's default.
                emb = np.load(npy_path, allow_pickle=False)
                save_f32emb(f32_path, emb)
            except Exception as exc:  # best-effort: skip this file, keep going
                logger.warning("Could not export %s: %s", npy_path, exc)
            else:
                converted += 1
                logger.info("Exported %s -> %s", npy_path, f32_path)
    return converted
|
|