Spaces:

MAS-AI-0000
/

Authentica

Sleeping

App Files Files Community

MAS-AI-0000 committed on Feb 25

Commit

cae2130

verified ·

1 Parent(s): 025084b

Update text_embedder.py

Browse files

Files changed (1) hide show

text_embedder.py +196 -179

text_embedder.py CHANGED Viewed

@@ -1,179 +1,196 @@
-"""Text → detection-ready embedding.
-Loads the DETree ``TextEmbeddingModel`` and exposes ``get_text_embedding``,
-which tokenises a string, runs it through the model, and returns a single
-L2-normalised embedding vector ready to be passed to ``detect_embedding``.
-The layer extracted defaults to -1 (the last hidden layer), matching the
-default used in ``detector.py`` when building the KNN index.  Override
-``layer`` if your database was built with a different layer.
-Usage::
-    from Apps.text_embedder import get_text_embedding
-    from Apps.detector import detect_embedding
-    emb    = get_text_embedding("Was this written by a human?")
-    result = detect_embedding(emb)
-    # {"predicted_class": "Human"|"Ai", "confidence": 0.93}
-"""
-from __future__ import annotations
-import os
-import sys
-from typing import Optional
-import numpy as np
-import torch
-import torch.nn.functional as F
-# ---------------------------------------------------------------------------
-# Make the local 'detree' package importable
-# ---------------------------------------------------------------------------
-_current_dir = os.path.dirname(os.path.abspath(__file__))
-if _current_dir not in sys.path:
-    sys.path.append(_current_dir)
-try:
-    from detree.model.text_embedding import TextEmbeddingModel
-except ImportError as _e:
-    print(f"Warning: could not import TextEmbeddingModel ({_e}). Text embedding will return zeros.")
-    TextEmbeddingModel = None
-# ---------------------------------------------------------------------------
-# Paths
-# ---------------------------------------------------------------------------
-_BASE_DIR  = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-_TEXT_DIR  = os.path.join(_BASE_DIR, "Lib", "Models", "Text")
-# ---------------------------------------------------------------------------
-# Configuration
-# ---------------------------------------------------------------------------
-MAX_LENGTH = 512
-POOLING    = "max"    # must match what was used during database construction
-DEVICE     = "cuda" if torch.cuda.is_available() else "cpu"
-# ---------------------------------------------------------------------------
-# Module-level initialisation
-# ---------------------------------------------------------------------------
-_model:     Optional[object] = None
-_tokenizer: Optional[object] = None
-def _init() -> None:
-    global _model, _tokenizer
-    if TextEmbeddingModel is None:
-        print("TextEmbedder: TextEmbeddingModel unavailable — embedding disabled.")
-        return
-    if not os.path.exists(_TEXT_DIR):
-        print(f"TextEmbedder: model directory not found at {_TEXT_DIR!r} — embedding disabled.")
-        return
-    try:
-        _model = TextEmbeddingModel(
-            _TEXT_DIR,
-            output_hidden_states=True,
-            infer=True,
-            use_pooling=POOLING,
-        ).to(DEVICE)
-        _model.eval()
-        _tokenizer = _model.tokenizer
-        print(f"TextEmbedder: model loaded from {_TEXT_DIR!r}")
-    except Exception as exc:
-        print(f"TextEmbedder: error loading model: {exc}")
-_init()
-# ---------------------------------------------------------------------------
-# Public API
-# ---------------------------------------------------------------------------
-@torch.no_grad()
-def get_text_embedding(
-    text: str,
-    *,
-    layer:      int = -1,           # which hidden-state layer to use (-1 = last)
-    max_length: int = MAX_LENGTH,
-) -> np.ndarray:
-    """Return a (1, embedding_dim) float32 numpy array for the given text.
-    The embedding is L2-normalised and projected into the same space as the
-    DETree database so it can be passed directly to ``detect_embedding``.
-    Args:
-        text:       The input string to embed.
-        layer:      Hidden-state layer index.  -1 selects the last layer,
-                    matching the default used when building the database.
-        max_length: Tokenisation truncation length.
-    Returns:
-        ``np.ndarray`` of shape ``(1, embedding_dim)`` and dtype float32.
-    """
-    if _model is None or _tokenizer is None:
-        return np.zeros((1, 1), dtype=np.float32)
-    encoded = _tokenizer.batch_encode_plus(
-        [text],
-        return_tensors="pt",
-        max_length=max_length,
-        padding="max_length",
-        truncation=True,
-    )
-    encoded = {k: v.to(DEVICE) for k, v in encoded.items()}
-    # Shape returned by model with hidden_states=True: (batch, num_layers, dim)
-    embeddings = _model(encoded, hidden_states=True)
-    embeddings = F.normalize(embeddings, dim=-1)         # normalise feature dim
-    # embeddings: (1, num_layers, dim)  →  select layer  →  (1, dim)
-    selected = embeddings[:, layer, :]                   # supports negative indexing
-    return selected.cpu().numpy().astype(np.float32)
-@torch.no_grad()
-def get_text_embeddings_batch(
-    texts: list[str],
-    *,
-    layer:      int = -1,
-    max_length: int = MAX_LENGTH,
-    batch_size: int = 8,
-) -> np.ndarray:
-    """Return an (N, embedding_dim) float32 array for a list of strings.
-    Args:
-        texts:      List of input strings.
-        layer:      Hidden-state layer index (-1 = last).
-        max_length: Tokenisation truncation length.
-        batch_size: Number of strings to encode per forward pass.
-    Returns:
-        ``np.ndarray`` of shape ``(N, embedding_dim)`` and dtype float32.
-    """
-    if _model is None or _tokenizer is None:
-        return np.zeros((len(texts), 1), dtype=np.float32)
-    all_embeddings: list[np.ndarray] = []
-    for i in range(0, len(texts), batch_size):
-        batch = [str(t) for t in texts[i : i + batch_size]]
-        encoded = _tokenizer.batch_encode_plus(
-            batch,
-            return_tensors="pt",
-            max_length=max_length,
-            padding="max_length",
-            truncation=True,
-        )
-        encoded = {k: v.to(DEVICE) for k, v in encoded.items()}
-        embeddings = _model(encoded, hidden_states=True)
-        embeddings = F.normalize(embeddings, dim=-1)         # (B, num_layers, dim)
-        selected   = embeddings[:, layer, :]                 # (B, dim)
-        all_embeddings.append(selected.cpu().numpy().astype(np.float32))
-    return np.concatenate(all_embeddings, axis=0) if all_embeddings else np.zeros((0, 1), dtype=np.float32)

+"""Text → detection-ready embedding.
+Loads the DETree ``TextEmbeddingModel`` and exposes ``get_text_embedding``,
+which tokenises a string, runs it through the model, and returns a single
+L2-normalised embedding vector ready to be passed to ``detect_embedding``.
+The layer extracted defaults to -1 (the last hidden layer), matching the
+default used in ``detector.py`` when building the KNN index.  Override
+``layer`` if your database was built with a different layer.
+Usage::
+    from Apps.text_embedder import get_text_embedding
+    from Apps.detector import detect_embedding
+    emb    = get_text_embedding("Was this written by a human?")
+    result = detect_embedding(emb)
+    # {"predicted_class": "Human"|"Ai", "confidence": 0.93}
+"""
+from __future__ import annotations
+import os
+import sys
+from typing import Optional
+import numpy as np
+import torch
+import torch.nn.functional as F
+from pathlib import Path
+from huggingface_hub import snapshot_download
+# ---------------------------------------------------------------------------
+# Make the local 'detree' package importable
+# ---------------------------------------------------------------------------
+_current_dir = os.path.dirname(os.path.abspath(__file__))
+if _current_dir not in sys.path:
+    sys.path.append(_current_dir)
+try:
+    from detree.model.text_embedding import TextEmbeddingModel
+except ImportError as _e:
+    print(f"Warning: could not import TextEmbeddingModel ({_e}). Text embedding will return zeros.")
+    TextEmbeddingModel = None
+# ---------------------------------------------------------------------------
+# Configuration
+# ---------------------------------------------------------------------------
+MAX_LENGTH = 512
+POOLING    = "max"    # must match what was used during database construction
+DEVICE     = "cuda" if torch.cuda.is_available() else "cpu"
+# hugging face
+REPO_ID = "MAS-AI-0000/Authentica"
+TEXT_SUBFOLDER = "Lib/Models/Text"   # where config.json/model.safetensors live in the repo
+EMBEDDING_FILE = "priori1_center10k.pt"
+_TEXT_DIR = None
+try:
+    # download a local snapshot of just the Text folder and point _TEXT_DIR at it
+    print(f"Downloading/Checking model from {REPO_ID}...")
+    _snapshot_dir = snapshot_download(
+        repo_id=REPO_ID,
+        allow_patterns=[f"{TEXT_SUBFOLDER}/*"]
+    )
+    _TEXT_DIR = os.path.join(_snapshot_dir, TEXT_SUBFOLDER)
+    print(f"Model directory set to: {_TEXT_DIR}")
+except Exception as e:
+    print(f"Error downloading model from Hugging Face: {e}")
+# ---------------------------------------------------------------------------
+# Module-level initialisation
+# ---------------------------------------------------------------------------
+_model:     Optional[object] = None
+_tokenizer: Optional[object] = None
+def _init() -> None:
+    global _model, _tokenizer
+    if TextEmbeddingModel is None:
+        print("TextEmbedder: TextEmbeddingModel unavailable — embedding disabled.")
+        return
+    if not os.path.exists(_TEXT_DIR):
+        print(f"TextEmbedder: model directory not found at {_TEXT_DIR!r} — embedding disabled.")
+        return
+    try:
+        _model = TextEmbeddingModel(
+            _TEXT_DIR,
+            output_hidden_states=True,
+            infer=True,
+            use_pooling=POOLING,
+        ).to(DEVICE)
+        _model.eval()
+        _tokenizer = _model.tokenizer
+        print(f"TextEmbedder: model loaded from {_TEXT_DIR!r}")
+    except Exception as exc:
+        print(f"TextEmbedder: error loading model: {exc}")
+_init()
+# ---------------------------------------------------------------------------
+# Public API
+# ---------------------------------------------------------------------------
+@torch.no_grad()
+def get_text_embedding(
+    text: str,
+    *,
+    layer:      int = -1,           # which hidden-state layer to use (-1 = last)
+    max_length: int = MAX_LENGTH,
+) -> np.ndarray:
+    """Return a (1, embedding_dim) float32 numpy array for the given text.
+    The embedding is L2-normalised along the feature dimension so it can be
+    passed directly to ``detect_embedding``.
+    Args:
+        text:       The input string to embed.
+        layer:      Hidden-state layer index.  -1 selects the last layer,
+                    matching the default used when building the database.
+        max_length: Tokenisation truncation length; the input is also padded
+                    to this length.
+    Returns:
+        ``np.ndarray`` of shape ``(1, embedding_dim)`` and dtype float32.
+        If the model or tokenizer failed to load, a ``(1, 1)`` array of
+        zeros is returned instead, so callers should check the shape.
+    """
+    # Fallback when module initialisation did not produce a usable model.
+    if _model is None or _tokenizer is None:
+        return np.zeros((1, 1), dtype=np.float32)
+    encoded = _tokenizer.batch_encode_plus(
+        [text],
+        return_tensors="pt",
+        max_length=max_length,
+        padding="max_length",
+        truncation=True,
+    )
+    # Move every tokenizer output tensor to the device the model lives on.
+    encoded = {k: v.to(DEVICE) for k, v in encoded.items()}
+    # Shape returned by model with hidden_states=True: (batch, num_layers, dim)
+    embeddings = _model(encoded, hidden_states=True)
+    embeddings = F.normalize(embeddings, dim=-1)         # normalise feature dim
+    # embeddings: (1, num_layers, dim)  →  select layer  →  (1, dim)
+    selected = embeddings[:, layer, :]                   # supports negative indexing
+    return selected.cpu().numpy().astype(np.float32)
+@torch.no_grad()
+def get_text_embeddings_batch(
+    texts: list[str],
+    *,
+    layer:      int = -1,
+    max_length: int = MAX_LENGTH,
+    batch_size: int = 8,
+) -> np.ndarray:
+    """Return an (N, embedding_dim) float32 array for a list of strings.
+    Args:
+        texts:      List of input strings; each item is passed through
+                    ``str()`` before tokenisation.
+        layer:      Hidden-state layer index (-1 = last).
+        max_length: Tokenisation truncation length; inputs are also padded
+                    to this length.
+        batch_size: Number of strings to encode per forward pass.
+    Returns:
+        ``np.ndarray`` of shape ``(N, embedding_dim)`` and dtype float32.
+        If the model or tokenizer failed to load, a ``(N, 1)`` array of
+        zeros is returned instead.  An empty ``texts`` list yields a
+        ``(0, 1)`` array.
+    """
+    # Fallback when module initialisation did not produce a usable model.
+    if _model is None or _tokenizer is None:
+        return np.zeros((len(texts), 1), dtype=np.float32)
+    all_embeddings: list[np.ndarray] = []
+    # Process the inputs in consecutive slices of at most batch_size items.
+    for i in range(0, len(texts), batch_size):
+        batch = [str(t) for t in texts[i : i + batch_size]]
+        encoded = _tokenizer.batch_encode_plus(
+            batch,
+            return_tensors="pt",
+            max_length=max_length,
+            padding="max_length",
+            truncation=True,
+        )
+        # Move every tokenizer output tensor to the device the model lives on.
+        encoded = {k: v.to(DEVICE) for k, v in encoded.items()}
+        embeddings = _model(encoded, hidden_states=True)
+        embeddings = F.normalize(embeddings, dim=-1)         # (B, num_layers, dim)
+        selected   = embeddings[:, layer, :]                 # (B, dim)
+        all_embeddings.append(selected.cpu().numpy().astype(np.float32))
+    # No batches were produced (empty input): return an empty (0, 1) array.
+    return np.concatenate(all_embeddings, axis=0) if all_embeddings else np.zeros((0, 1), dtype=np.float32)