""" Pickle-free face embedding storage for Hugging Face Hub (avoids HF Picklescan blocks on .npy). Format (.f32emb): magic 8 bytes b'CEPF32E1' n_vec uint32 number of embedding vectors dim uint32 floats per vector (typically 512) data float32[n_vec * dim] little-endian """ from __future__ import annotations import logging import os import struct from typing import Iterable import numpy as np logger = logging.getLogger(__name__) MAGIC = b"CEPF32E1" HEADER = struct.Struct("<8sII") # magic, n_vec, dim EMB_SUFFIX = ".f32emb" def save_f32emb(path: str, embeddings: np.ndarray) -> None: """Write embeddings as raw float32 (Hub-safe, no pickle).""" arr = np.asarray(embeddings, dtype=np.float32) if arr.ndim == 1: arr = arr.reshape(1, -1) elif arr.ndim != 2: raise ValueError(f"Expected 1D or 2D embedding array, got shape {arr.shape}") n_vec, dim = arr.shape os.makedirs(os.path.dirname(path) or ".", exist_ok=True) with open(path, "wb") as fh: fh.write(HEADER.pack(MAGIC, n_vec, dim)) fh.write(arr.tobytes(order="C")) def load_f32emb(path: str) -> np.ndarray: with open(path, "rb") as fh: header = fh.read(HEADER.size) if len(header) != HEADER.size: raise ValueError(f"Truncated embedding file: {path}") magic, n_vec, dim = HEADER.unpack(header) if magic != MAGIC: raise ValueError(f"Bad magic in {path}") raw = fh.read(n_vec * dim * 4) arr = np.frombuffer(raw, dtype=np.float32).reshape(n_vec, dim) return arr[0] if n_vec == 1 else arr def load_embedding(path_base: str) -> np.ndarray | None: """Load from path without extension — prefers .f32emb then .npy.""" f32 = f"{path_base}{EMB_SUFFIX}" npy = f"{path_base}.npy" if os.path.isfile(f32): try: return load_f32emb(f32) except Exception as exc: logger.warning("Failed loading %s: %s", f32, exc) if os.path.isfile(npy): try: return np.load(npy) except Exception as exc: logger.warning("Failed loading %s: %s", npy, exc) return None def save_embeddings(name: str, emb_root: str, embeddings: np.ndarray) -> None: """Persist to .f32emb (Hub-safe) and .npy (local dev compatibility).""" os.makedirs(emb_root, exist_ok=True) base = os.path.join(emb_root, name) save_f32emb(f"{base}{EMB_SUFFIX}", embeddings) np.save(f"{base}.npy", np.asarray(embeddings)) def iter_embedding_files(folder: str) -> Iterable[tuple[str, str]]: """Yield (person_name, full_path) for .f32emb and .npy in folder.""" if not os.path.isdir(folder): return seen: set[str] = set() for fname in os.listdir(folder): if fname.endswith(EMB_SUFFIX): name = fname[: -len(EMB_SUFFIX)] elif fname.endswith(".npy"): name = fname[:-4] else: continue if name in seen: continue seen.add(name) yield name, os.path.join(folder, fname) def export_npy_tree_to_f32emb(root: str) -> int: """Convert all .npy under root to sibling .f32emb files. Returns count converted.""" if not os.path.isdir(root): return 0 converted = 0 for dirpath, _, files in os.walk(root): for fname in files: if not fname.endswith(".npy"): continue npy_path = os.path.join(dirpath, fname) f32_path = npy_path[:-4] + EMB_SUFFIX try: emb = np.load(npy_path) save_f32emb(f32_path, emb) converted += 1 logger.info("Exported %s -> %s", npy_path, f32_path) except Exception as exc: logger.warning("Could not export %s: %s", npy_path, exc) return converted