# github-actions
# Deploy to Hugging Face
# c794b6b
# Raw
# History Blame Contribute Delete
# 3.84 kB
"""
Pickle-free face embedding storage for Hugging Face Hub (avoids HF Picklescan blocks on .npy).

Format (.f32emb):
    magic   8 bytes                b'CEPF32E1'
    n_vec   uint32                 number of embedding vectors
    dim     uint32                 floats per vector (typically 512)
    data    float32[n_vec * dim]   little-endian
"""
from __future__ import annotations
import logging
import os
import struct
from typing import Iterable
import numpy as np
# Module-level logger; the load/export helpers report failures through it.
logger = logging.getLogger(__name__)
# Signature written at the start of every .f32emb file and checked on load.
MAGIC = b"CEPF32E1"
# Little-endian header layout: 8-byte magic followed by two uint32 fields (16 bytes total).
HEADER = struct.Struct("<8sII")  # magic, n_vec, dim
# File extension of the raw float32 embedding format.
EMB_SUFFIX = ".f32emb"
def save_f32emb(path: str, embeddings: np.ndarray) -> None:
    """Write embeddings as raw little-endian float32 (Hub-safe, no pickle).

    Args:
        path: Destination file. Missing parent directories are created.
        embeddings: One vector (1D) or a stack of vectors (2D); anything
            convertible to a float32 array. A 1D input is stored as a
            single vector.

    Raises:
        ValueError: If ``embeddings`` is neither 1D nor 2D.
        struct.error: If the vector count or dimension does not fit in uint32.
    """
    # "<f4" pins the payload to little-endian on every host, matching the
    # "<" byte order of the header and the documented file format.
    arr = np.asarray(embeddings, dtype="<f4")
    if arr.ndim == 1:
        arr = arr.reshape(1, -1)
    elif arr.ndim != 2:
        raise ValueError(f"Expected 1D or 2D embedding array, got shape {arr.shape}")
    n_vec, dim = arr.shape
    # Pack the header before touching the file: if the sizes are out of range
    # this fails here, instead of after "wb" has already truncated a good file.
    header = HEADER.pack(MAGIC, n_vec, dim)
    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
    with open(path, "wb") as fh:
        fh.write(header)
        fh.write(arr.tobytes(order="C"))
def load_f32emb(path: str) -> np.ndarray:
    """Read a ``.f32emb`` file into a writable float32 array.

    Args:
        path: File containing the header followed by the float32 payload.

    Returns:
        A 1D array of length ``dim`` when the file stores exactly one vector,
        otherwise a 2D array of shape ``(n_vec, dim)``.

    Raises:
        ValueError: If the header is incomplete, the magic does not match, or
            the payload is shorter than the header announces.
    """
    with open(path, "rb") as fh:
        header = fh.read(HEADER.size)
        if len(header) != HEADER.size:
            raise ValueError(f"Truncated embedding file: {path}")
        magic, n_vec, dim = HEADER.unpack(header)
        if magic != MAGIC:
            raise ValueError(f"Bad magic in {path}")
        # Read what is actually on disk instead of sizing the read from the
        # header, so a corrupt header cannot request an enormous allocation.
        raw = fh.read()
    n_values = n_vec * dim
    expected = n_values * 4
    if len(raw) < expected:
        raise ValueError(
            f"Truncated embedding file: {path} "
            f"(expected {expected} payload bytes, found {len(raw)})"
        )
    # The payload is little-endian on disk; astype() copies it into a native,
    # writable float32 array (frombuffer alone yields a read-only view).
    # Bytes past the announced payload are ignored.
    flat = np.frombuffer(raw, dtype="<f4", count=n_values).astype(np.float32)
    arr = flat.reshape(n_vec, dim)
    return arr[0] if n_vec == 1 else arr
def load_embedding(path_base: str) -> np.ndarray | None:
"""Load from path without extension — prefers .f32emb then .npy."""
f32 = f"{path_base}{EMB_SUFFIX}"
npy = f"{path_base}.npy"
if os.path.isfile(f32):
try:
return load_f32emb(f32)
except Exception as exc:
logger.warning("Failed loading %s: %s", f32, exc)
if os.path.isfile(npy):
try:
return np.load(npy)
except Exception as exc:
logger.warning("Failed loading %s: %s", npy, exc)
return None
def save_embeddings(name: str, emb_root: str, embeddings: np.ndarray) -> None:
    """Store ``embeddings`` for ``name`` under ``emb_root`` in both formats.

    Writes ``<name>.f32emb`` (Hub-safe) first and then ``<name>.npy`` (local
    dev compatibility); ``emb_root`` is created if it does not exist.
    """
    os.makedirs(emb_root, exist_ok=True)
    stem = os.path.join(emb_root, name)
    f32_target = stem + EMB_SUFFIX
    npy_target = stem + ".npy"
    save_f32emb(f32_target, embeddings)
    np.save(npy_target, np.asarray(embeddings))
def iter_embedding_files(folder: str) -> Iterable[tuple[str, str]]:
    """Yield (person_name, full_path) for .f32emb and .npy in folder.

    Each name is yielded once. When both ``<name>.f32emb`` and ``<name>.npy``
    exist, the ``.f32emb`` path is the one reported, regardless of the order
    in which the directory lists them. Entries are produced in sorted
    filename order so results are reproducible across runs and platforms.
    Yields nothing when ``folder`` is not a directory.
    """
    if not os.path.isdir(folder):
        return
    # name -> chosen filename; dict insertion order fixes the yield order.
    chosen: dict[str, str] = {}
    for fname in sorted(os.listdir(folder)):
        if fname.endswith(EMB_SUFFIX):
            # .f32emb always wins, even if a .npy was recorded first.
            chosen[fname[: -len(EMB_SUFFIX)]] = fname
        elif fname.endswith(".npy"):
            # .npy is only a fallback when no .f32emb claimed the name.
            chosen.setdefault(fname[:-4], fname)
    for name, fname in chosen.items():
        yield name, os.path.join(folder, fname)
def export_npy_tree_to_f32emb(root: str) -> int:
    """Write a sibling ``.f32emb`` next to every ``.npy`` file found under ``root``.

    The tree is walked recursively. A file that cannot be loaded or written
    is logged as a warning and skipped; it does not stop the walk.

    Returns:
        The number of files converted (0 when ``root`` is not a directory).
    """
    if not os.path.isdir(root):
        return 0
    npy_ext = ".npy"
    # Lazily enumerate every .npy path in the tree, in os.walk order.
    npy_sources = (
        os.path.join(current_dir, filename)
        for current_dir, _subdirs, filenames in os.walk(root)
        for filename in filenames
        if filename.endswith(npy_ext)
    )
    exported = 0
    for source in npy_sources:
        target = source[: -len(npy_ext)] + EMB_SUFFIX
        try:
            loaded = np.load(source)
            save_f32emb(target, loaded)
            exported += 1
            logger.info("Exported %s -> %s", source, target)
        except Exception as exc:
            # Best effort: one bad file must not abort the whole export.
            logger.warning("Could not export %s: %s", source, exc)
    return exported