"""Core embedding-based detector. Loads the DETree KNN database and exposes ``detect_embedding``, which accepts a single pre-computed, L2-normalised embedding vector and returns a prediction. All modality-specific logic (text, image) lives in separate embedder modules: - text_embedder.py → str → np.ndarray - image_embedder.py → PIL.Image → np.ndarray Usage:: from Apps.detector import detect_embedding from Apps.text_embedder import get_text_embedding from Apps.image_embedder import get_image_embedding emb = get_text_embedding("Some text here") result = detect_embedding(emb) # {"predicted_class": "Human"|"Ai", "confidence": 0.95} emb = get_image_embedding(pil_image) result = detect_embedding(emb, mode="image") # {"predicted_class": "Real"|"AI", "confidence": 0.88} """ from __future__ import annotations import logging import os import sys from typing import Optional import numpy as np import torch from huggingface_hub import hf_hub_download log = logging.getLogger("detector") logging.basicConfig(level=logging.INFO, format="%(levelname)s [%(name)s] %(message)s") # --------------------------------------------------------------------------- # Make the local 'detree' package importable # --------------------------------------------------------------------------- _current_dir = os.path.dirname(os.path.abspath(__file__)) if _current_dir not in sys.path: sys.path.append(_current_dir) try: from detree.utils.index import Indexer log.info("Indexer imported successfully.") except ImportError as _e: log.error(f"Could not import detree Indexer: {_e} — detection will return fallback responses.") Indexer = None # --------------------------------------------------------------------------- # Paths # --------------------------------------------------------------------------- REPO_ID = "MAS-AI-0000/Authentica" _DB_PATH = hf_hub_download( repo_id=REPO_ID, filename="Lib/Models/MultiModal/priori1_center10k.pt", ) log.info(f"[paths] _DB_PATH = {_DB_PATH!r} 
exists={os.path.exists(_DB_PATH)}") # --------------------------------------------------------------------------- # Hyperparameters (match values used during database construction) # --------------------------------------------------------------------------- TOP_K = 10 THRESHOLD = 0.97 # --------------------------------------------------------------------------- # Internal helpers # --------------------------------------------------------------------------- def _load_database(path: str): log.info(f"_load_database: loading from {path!r} ...") data = torch.load(path, map_location="cpu") embeddings = data["embeddings"] labels = data["labels"] ids = data["ids"] classes = data["classes"] log.info(f"_load_database: classes={list(classes)} " f"embedding keys={list(embeddings.keys()) if isinstance(embeddings, dict) else type(embeddings).__name__}") if not isinstance(embeddings, dict): raise ValueError("Expected embeddings to be a dict keyed by layer index.") return embeddings, labels, ids, classes def _to_numpy(value) -> np.ndarray: if isinstance(value, np.ndarray): return value if torch.is_tensor(value): return value.detach().cpu().numpy() return np.asarray(value) # --------------------------------------------------------------------------- # Module-level initialisation # --------------------------------------------------------------------------- _index: Optional[object] = None _human_index: Optional[int] = None _classes: list = [] _embedding_dim: int = 0 _active_layer: Optional[int] = None def _init() -> None: global _index, _human_index, _classes, _embedding_dim, _active_layer log.info("_init: starting Detector initialisation.") if Indexer is None: log.error("_init: Indexer is None — check import error above. 
Detection disabled.") return if not os.path.exists(_DB_PATH): log.error(f"_init: database not found at {_DB_PATH!r} — detection disabled.") return try: embeddings, labels, ids, classes = _load_database(_DB_PATH) _classes = list(classes) log.info(f"_init: available classes={_classes}") if "human" not in _classes: raise ValueError("Database must include a 'human' class entry.") _human_index = _classes.index("human") log.info(f"_init: human_index={_human_index}") # Layer embeddings keyed by int layer index layer_embeddings = {int(k): v.float() for k, v in embeddings.items()} available_layers = sorted(layer_embeddings.keys()) active_layer = available_layers[-1] # last layer by default _active_layer = active_layer log.info(f"_init: available layers={available_layers} using active_layer={active_layer}") # Resolve per-layer or shared label / id tensors if isinstance(labels, dict): layer_labels = _to_numpy(labels[active_layer]).astype(np.int64) else: layer_labels = _to_numpy(labels).astype(np.int64) if isinstance(ids, dict): layer_ids = _to_numpy(ids[active_layer]).astype(np.int64) else: layer_ids = _to_numpy(ids).astype(np.int64) train_embs = _to_numpy(layer_embeddings[active_layer]).astype(np.float32) _embedding_dim = train_embs.shape[-1] log.info(f"_init: train_embs shape={train_embs.shape} embedding_dim={_embedding_dim}") log.info(f"_init: label distribution — " f"human={int((layer_labels == _human_index).sum())} " f"ai={int((layer_labels != _human_index).sum())}") label_dict = { int(idx): (1 if int(lbl) == int(_human_index) else 0) for idx, lbl in zip(layer_ids.tolist(), layer_labels.tolist()) } _index = Indexer(_embedding_dim) _index.label_dict = label_dict _index.index_data(layer_ids.tolist(), train_embs) log.info(f"_init: Indexer built — layer={active_layer} dim={_embedding_dim} " f"entries={len(layer_ids)}") except Exception as exc: log.exception(f"_init: error initialising database: {exc}") _init() # 
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def detect_embedding(
    embedding: np.ndarray,
    *,
    top_k: int = TOP_K,
    threshold: float = THRESHOLD,
) -> dict:
    """Classify a single pre-computed, L2-normalised embedding via KNN.

    The ``top_k`` nearest neighbours are retrieved from the module-level
    index, their similarity scores are turned into weights with a softmax,
    and the weighted share of human-labelled neighbours (1 = human, 0 = AI)
    is taken as the human probability. The more probable class wins.

    Args:
        embedding: 1-D or (1, dim) array-like, converted to float32 and
            reshaped to a single row before the search.
        top_k: Number of nearest neighbours to consider (must be >= 1).
        threshold: Accepted for backward compatibility. It is only logged;
            the decision compares the two class probabilities directly.

    Returns:
        ``{"predicted_class": int, "confidence": float}`` where class ``0``
        means Human/Real and ``1`` means AI, and ``confidence`` is the
        probability of the predicted class. When the index is unavailable,
        the search fails or no usable neighbours are returned, the fallback
        ``{"predicted_class": 0, "confidence": 0.0}`` is returned.
    """
    fallback_class = 0

    if _index is None:
        log.error("detect_embedding: _index is None — returning fallback. Check _init logs.")
        return {"predicted_class": fallback_class, "confidence": 0.0}

    emb = np.asarray(embedding, dtype=np.float32).reshape(1, -1)
    log.info(
        "detect_embedding: query embedding shape=%s norm=%.4f top_k=%s threshold=%s",
        emb.shape, float(np.linalg.norm(emb)), top_k, threshold,
    )

    try:
        if top_k < 1:
            log.error("detect_embedding: top_k must be >= 1 (got %s) — returning fallback.", top_k)
            return {"predicted_class": fallback_class, "confidence": 0.0}
        if _embedding_dim and emb.shape[1] != _embedding_dim:
            log.error(
                "detect_embedding: embedding dim %s does not match index dim %s — returning fallback.",
                emb.shape[1], _embedding_dim,
            )
            return {"predicted_class": fallback_class, "confidence": 0.0}

        results = _index.search_knn(
            emb,
            top_k,
            index_batch_size=max(1, min(top_k, 128)),
        )
        _ids, scores, labels_knn = results[0]

        # Force a common dtype: torch.dot rejects mixed float32/float64 inputs.
        score_arr = np.asarray(scores, dtype=np.float32).reshape(-1)
        label_arr = np.asarray(labels_knn, dtype=np.float32).reshape(-1)

        if log.isEnabledFor(logging.INFO):
            log.info("detect_embedding: neighbour ids=%s", _ids)
            log.info("detect_embedding: neighbour scores=%s",
                     [round(float(s), 4) for s in score_arr])
            log.info("detect_embedding: neighbour labels=%s (1=human, 0=ai)", labels_knn)

        # With no neighbours the weighted vote is 0, which would otherwise be
        # reported as "AI" with confidence 1.0.
        if score_arr.size == 0 or score_arr.size != label_arr.size:
            log.error(
                "detect_embedding: unusable neighbour set (scores=%s labels=%s) — returning fallback.",
                score_arr.size, label_arr.size,
            )
            return {"predicted_class": fallback_class, "confidence": 0.0}

        weights = torch.softmax(torch.tensor(score_arr), dim=0)
        prob_human = float(torch.dot(weights, torch.tensor(label_arr)).item())

        # A NaN here (e.g. from non-finite scores) would be clamped to 1.0 by
        # the min/max below and reported as a fully confident "Human".
        if not np.isfinite(prob_human):
            log.error("detect_embedding: non-finite probability — returning fallback.")
            return {"predicted_class": fallback_class, "confidence": 0.0}

        prob_human = max(0.0, min(1.0, prob_human))
        prob_ai = 1.0 - prob_human

        # 0 = Real, 1 = AI
        predicted_class = 1 if prob_ai > prob_human else 0
        confidence = prob_human if predicted_class == 0 else prob_ai
        log.info(
            "detect_embedding: prob_human=%.4f prob_ai=%.4f predicted_class=%s confidence=%.4f",
            prob_human, prob_ai, predicted_class, confidence,
        )
    except Exception as exc:
        log.exception(f"detect_embedding: failed during KNN search: {exc}")
        return {"predicted_class": fallback_class, "confidence": 0.0}

    return {"predicted_class": predicted_class, "confidence": confidence}