Spaces:

Shoraky
/

Reunite

Sleeping

App Files Files Community

Reunite / core.py

Shoraky

Complete backend folder

7e710af about 2 months ago

Raw

History Blame Contribute Delete

16.4 kB

	"""
	core.py — Deep Learning Engine for REFIND
	Pipeline: InsightFace (buffalo_l / ArcFace) ➜ 512-d embedding ➜ FAISS IndexFlatIP
	No database. All state lives in /storage (CSV + .npy + photo.jpg per person).
	"""

	from __future__ import annotations

	import json
	import io
	import os
	import shutil
	import uuid
	import warnings
	from datetime import datetime
	from pathlib import Path
	import time
	import logging
	import contextlib
	from typing import Optional

	import cv2
	import faiss
	import numpy as np
	import pandas as pd

	# ─────────────────────────────────────────────────────────────────────────────
	# Storage Layout
	# Storage/
	# persons/{ID}/photo.jpg — original upload
	# persons/{ID}/embedding.npy — 512-d L2-norm ArcFace vector
	# registry.csv — metadata for all persons
	# embeddings_map.json — {id: abs_path_to_embedding}
	# Weights/ — InsightFace / ArcFace model cache
	# ─────────────────────────────────────────────────────────────────────────────
	BASE_DIR = Path(__file__).parent
	STORAGE_DIR = BASE_DIR / "Storage"
	PERSONS_DIR = STORAGE_DIR / "persons"
	WEIGHTS_DIR = BASE_DIR / "Weights"
	REGISTRY_CSV = STORAGE_DIR / "registry.csv"
	EMB_MAP_PATH = STORAGE_DIR / "embeddings_map.json"

	EMBEDDING_DIM = 512 # ArcFace output dimensionality

	# Cosine-similarity thresholds (inner product of L2-normalized vectors)
	THRESH_VERY_HIGH = 0.68
	THRESH_HIGH = 0.52
	THRESH_MEDIUM = 0.38 # default search cut-off

	CSV_COLUMNS = [
	"id", "name", "age", "gender",
	"last_seen_date", "last_seen_location",
	"phone_contact", "address", "national_id",
	"description", "registered_at", "status",
	]


	# ─────────────────────────────────────────────────────────────────────────────
	# Directory bootstrap
	# ─────────────────────────────────────────────────────────────────────────────
	def ensure_dirs() -> None:
	for d in [STORAGE_DIR, PERSONS_DIR, WEIGHTS_DIR]:
	d.mkdir(parents=True, exist_ok=True)


	# ─────────────────────────────────────────────────────────────────────────────
	# InsightFace / ArcFace — lazy singleton
	# ─────────────────────────────────────────────────────────────────────────────
	_face_app = None


	def get_face_app():
	"""
	Lazy-load InsightFace FaceAnalysis with buffalo_l (ArcFace R100).
	Falls back to CPU if CUDA is unavailable.
	Model weights are cached under Weights/ on first call.
	"""
	global _face_app
	if _face_app is None:
	os.environ.setdefault("ORT_LOG_SEVERITY_LEVEL", "3")
	os.environ.setdefault("INSIGHTFACE_LOG_LEVEL", "ERROR")
	logging.getLogger("onnxruntime").setLevel(logging.ERROR)
	logging.getLogger("insightface").setLevel(logging.ERROR)

	warnings.filterwarnings(
	"ignore",
	message=r"`rcond` parameter will change to the default.*",
	category=FutureWarning,
	)

	sink = io.StringIO()
	with contextlib.redirect_stdout(sink), contextlib.redirect_stderr(sink):
	from insightface.app import FaceAnalysis

	_face_app = FaceAnalysis(
	name="buffalo_l",
	root=str(WEIGHTS_DIR),
	providers=["CPUExecutionProvider"],
	)
	# ctx_id=-1 forces CPU mode
	_face_app.prepare(ctx_id=-1, det_size=(640, 640))
	return _face_app


	# ─────────────────────────────────────────────────────────────────────────────
	# Embedding Extraction
	# ─────────────────────────────────────────────────────────────────────────────
	def extract_embedding(image_bytes: bytes) -> Optional[np.ndarray]:
	"""
	Decode image bytes → detect all faces → return the 512-d L2-normalized
	ArcFace embedding of the largest face in the frame.
	Returns None if no face is detected or image is corrupt.
	"""
	arr = np.frombuffer(image_bytes, np.uint8)
	img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
	if img is None:
	return None

	faces = get_face_app().get(img)
	if not faces:
	return None

	# Select the spatially largest face (highest confidence in most scenarios)
	best = max(
	faces,
	key=lambda f: (f.bbox[2] - f.bbox[0]) * (f.bbox[3] - f.bbox[1]),
	)
	# normed_embedding is already L2-normalized by InsightFace
	return best.normed_embedding.astype(np.float32)


	# ─────────────────────────────────────────────────────────────────────────────
	# CSV Registry helpers
	# ─────────────────────────────────────────────────────────────────────────────
	def load_registry() -> pd.DataFrame:
	if not REGISTRY_CSV.exists():
	df = pd.DataFrame(columns=CSV_COLUMNS)
	df.to_csv(REGISTRY_CSV, index=False)
	return df
	return pd.read_csv(REGISTRY_CSV, dtype=str).fillna("")


	def save_registry(df: pd.DataFrame) -> None:
	df.to_csv(REGISTRY_CSV, index=False)


	# ─────────────────────────────────────────────────────────────────────────────
	# Embeddings map helpers {person_id: abs_path_to_npy}
	# ─────────────────────────────────────────────────────────────────────────────
	def load_emb_map() -> dict:
	if not EMB_MAP_PATH.exists():
	return {}
	with open(EMB_MAP_PATH) as fh:
	return json.load(fh)


	def save_emb_map(mapping: dict) -> None:
	with open(EMB_MAP_PATH, "w") as fh:
	json.dump(mapping, fh)


	# ─────────────────────────────────────────────────────────────────────────────
	# FAISS index — rebuilt on each search call
	# For datasets < ~50 k persons this is near-instant (<10 ms).
	# For larger datasets swap IndexFlatIP for IndexIVFFlat + periodic training.
	# ─────────────────────────────────────────────────────────────────────────────
	def build_faiss_index() -> tuple[faiss.Index, list[str]]:
	"""
	Load all stored embeddings and add them to a fresh FAISS IndexFlatIP.
	Inner-product on L2-normalized vectors == cosine similarity.
	Returns (index, ordered_ids) where ordered_ids[i] maps to index row i.
	"""
	emb_map = load_emb_map()
	ids: list[str] = []
	vecs: list[np.ndarray] = []

	for pid, emb_path in emb_map.items():
	p = Path(emb_path)
	if p.exists():
	vecs.append(np.load(str(p)))
	ids.append(pid)

	index = faiss.IndexFlatIP(EMBEDDING_DIM)
	if vecs:
	matrix = np.stack(vecs).astype(np.float32)
	index.add(matrix)

	return index, ids


	# ─────────────────────────────────────────────────────────────────────────────
	# Confidence label helper
	# ─────────────────────────────────────────────────────────────────────────────
	def confidence_label(sim: float) -> str:
	if sim >= THRESH_VERY_HIGH:
	return "Very High"
	if sim >= THRESH_HIGH:
	return "High"
	return "Medium"


	# ─────────────────────────────────────────────────────────────────────────────
	# Public API
	# ─────────────────────────────────────────────────────────────────────────────

	def register_missing_person(image_bytes: bytes, details: dict) -> dict:
	"""
	Full registration pipeline
	──────────────────────────
	1. Decode + detect face → extract ArcFace embedding
	2. Generate 8-char uppercase UUID
	3. Write photo.jpg to storage/persons/{ID}/
	4. Write embedding.npy to storage/persons/{ID}/
	5. Append row to registry.csv
	6. Update embeddings_map.json (used by build_faiss_index)

	Returns {"success": True, "id": <ID>} or {"success": False, "error": ...}
	"""
	ensure_dirs()

	t0 = time.perf_counter()
	t_emb0 = time.perf_counter()
	emb = extract_embedding(image_bytes)
	t_emb1 = time.perf_counter()
	if emb is None:
	return {
	"success": False,
	"error": (
	"No face detected. Please upload a clear, well-lit photo "
	"showing the person's face without occlusion."
	),
	}

	person_id = uuid.uuid4().hex[:8].upper()
	person_dir = PERSONS_DIR / person_id
	person_dir.mkdir(parents=True, exist_ok=True)

	# ── Photo ──────────────────────────────────────────────────────────────
	t_io0 = time.perf_counter()
	arr = np.frombuffer(image_bytes, np.uint8)
	img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
	cv2.imwrite(str(person_dir / "photo.jpg"), img, [cv2.IMWRITE_JPEG_QUALITY, 92])

	# ── Embedding ──────────────────────────────────────────────────────────
	emb_path = str(person_dir / "embedding.npy")
	np.save(emb_path, emb)

	# ── Registry CSV ───────────────────────────────────────────────────────
	df = load_registry()
	row = {
	"id": person_id,
	"name": details.get("name", "Unknown").strip(),
	"age": details.get("age", ""),
	"gender": details.get("gender", ""),
	"last_seen_date": details.get("last_seen_date", ""),
	"last_seen_location": details.get("last_seen_location", ""),
	"phone_contact": details.get("phone_contact", ""),
	"address": details.get("address", ""),
	"national_id": details.get("national_id", ""),
	"description": details.get("description", ""),
	"registered_at": datetime.now().strftime("%Y-%m-%d %H:%M"),
	"status": "missing",
	}
	df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
	save_registry(df)

	# ── Embeddings map ─────────────────────────────────────────────────────
	emb_map = load_emb_map()
	emb_map[person_id] = emb_path
	save_emb_map(emb_map)

	t_io1 = time.perf_counter()
	t1 = time.perf_counter()

	timing_ms = {
	"embedding_ms": round((t_emb1 - t_emb0) * 1000, 1),
	"io_ms": round((t_io1 - t_io0) * 1000, 1),
	"total_ms": round((t1 - t0) * 1000, 1),
	}

	return {"success": True, "id": person_id, "timing_ms": timing_ms}


	def search_person(
	image_bytes: bytes,
	top_k: int = 5,
	threshold: float = THRESH_MEDIUM,
	) -> dict:
	"""
	Search pipeline
	───────────────
	1. Extract ArcFace embedding from query image
	2. Build FAISS IndexFlatIP from all stored embeddings
	3. k-NN inner-product search (cosine similarity)
	4. Filter results below threshold, rank descending, enrich with metadata

	Returns {"success": True, "matches": [...]}
	Each match: id, similarity (0-100), confidence label, full metadata.
	"""
	ensure_dirs()

	q_emb = extract_embedding(image_bytes)
	if q_emb is None:
	return {"success": False, "error": "No face detected in the search image."}

	index, ordered_ids = build_faiss_index()
	if index.ntotal == 0:
	return {"success": True, "matches": [], "message": "Registry is currently empty."}

	k = min(top_k, index.ntotal)
	scores, indices = index.search(q_emb.reshape(1, -1), k)

	df = load_registry()
	matches = []

	for score, idx in zip(scores[0], indices[0]):
	if idx < 0:
	continue
	sim = float(score)
	if sim < threshold:
	continue

	pid = ordered_ids[idx]
	rows = df[df["id"] == pid]
	if rows.empty:
	continue
	p = rows.iloc[0]

	matches.append({
	"id": pid,
	"similarity": round(sim * 100, 1), # as percentage
	"confidence": confidence_label(sim),
	"name": p["name"],
	"age": p["age"],
	"gender": p["gender"],
	"last_seen_date": p["last_seen_date"],
	"last_seen_location": p["last_seen_location"],
	"phone_contact": p["phone_contact"],
	"address": p["address"],
	"description": p["description"],
	"registered_at": p["registered_at"],
	"status": p.get("status", "missing"),
	})

	matches.sort(key=lambda x: x["similarity"], reverse=True)
	return {"success": True, "matches": matches}


	def get_all_persons() -> list[dict]:
	ensure_dirs()
	return load_registry().to_dict(orient="records")


	def delete_person(person_id: str) -> dict:
	ensure_dirs()
	df = load_registry()

	if person_id not in df["id"].values:
	return {"success": False, "error": "Person not found in registry."}

	df = df[df["id"] != person_id]
	save_registry(df)

	emb_map = load_emb_map()
	emb_map.pop(person_id, None)
	save_emb_map(emb_map)

	person_dir = PERSONS_DIR / person_id
	if person_dir.exists():
	shutil.rmtree(str(person_dir))

	return {"success": True, "message": f"Person {person_id} removed from registry."}


	def update_person_status(person_id: str, status: str) -> dict:
	"""Update status field: 'missing' \| 'found'"""
	df = load_registry()
	if person_id not in df["id"].values:
	return {"success": False, "error": "Person not found."}
	df.loc[df["id"] == person_id, "status"] = status
	save_registry(df)
	return {"success": True}