File size: 9,538 Bytes
ae7992d
 
219254d
324e34e
7aac00f
f466a35
 
ae7992d
 
f466a35
 
ae7992d
 
d874350
ae7992d
 
219254d
 
 
f466a35
 
 
7aac00f
219254d
 
 
c9ee224
 
 
219254d
 
 
ae7992d
7efb253
 
219254d
7efb253
 
 
 
 
c9ee224
 
 
 
 
ae7992d
 
219254d
 
 
 
324e34e
219254d
 
 
 
 
 
324e34e
 
 
 
 
219254d
324e34e
 
 
 
219254d
324e34e
 
 
 
 
 
 
 
 
c9ee224
219254d
 
 
ae7992d
7aac00f
 
 
ba7c3bb
 
7aac00f
219254d
f466a35
7aac00f
c9ee224
 
 
ae7992d
324e34e
219254d
c9ee224
ae7992d
c9ee224
219254d
c9ee224
219254d
c9ee224
 
 
219254d
 
c9ee224
 
219254d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d874350
219254d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d874350
 
219254d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d874350
219254d
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
# app.py
import os
import re
import unicodedata
import joblib
import torch
import gradio as gr
import numpy as np
import pandas as pd
import warnings
import nltk
from sentence_transformers import SentenceTransformer
from huggingface_hub import hf_hub_download

warnings.filterwarnings("ignore")

# -------------------------------------------------
# Hugging Face model config
# -------------------------------------------------
# Hub repo + file holding the trained classifier artifact (a joblib bundle
# with at least a "model" key; see load_embedding_model()).
REPO_ID = "Detecting-ai/text-detector-model-embedding"
FILENAME = "complete_trained_model_lite.joblib"
REPO_TYPE = "model"

# -------------------------------------------------
# Force 768-dim embedder (MPNet; English-optimized)
# -------------------------------------------------
# The classifier was trained against 768-dim sentence embeddings;
# load_embedding_model() verifies both embedder and classifier match this.
FORCED_EMBEDDER = "sentence-transformers/all-mpnet-base-v2"
FORCED_DIM = 768

# -------------------------------------------------
# Ensure NLTK deps (safe no-ops if already present)
# -------------------------------------------------
def ensure_nltk():
    """Make sure the NLTK tokenizer resources are available locally.

    Idempotent: resources already on disk are left alone, and download
    failures are swallowed so the app can still start offline.
    """
    wanted = (
        ("punkt", "tokenizers/punkt"),
        ("punkt_tab", "tokenizers/punkt_tab/english"),  # ok if missing on older NLTK
    )
    for package, locator in wanted:
        try:
            nltk.data.find(locator)
        except LookupError:
            try:
                nltk.download(package, quiet=True)
            except Exception:
                # best-effort only; absence is tolerated at runtime
                pass

ensure_nltk()

# -------------------------------------------------
# Minimal preprocessing for Transformer embeddings
# (DO NOT remove stopwords/lemmatize β€” keep raw text)
# -------------------------------------------------
def preprocess_text(text: str, max_chars: int = 100000) -> str:
    """Minimal, language-agnostic clean-up for transformer embedding.

    Applies NFKC Unicode normalization, strips surrounding whitespace,
    lowercases, and hard-caps the length at *max_chars* so enormous
    pastes stay memory/tokenizer safe. NaN/None-like input yields "".
    """
    if pd.isna(text):
        return ""
    cleaned = unicodedata.normalize("NFKC", str(text)).strip().lower()
    # Slicing is a no-op when the text is already within the cap.
    return cleaned[:max_chars]

def chunk_by_words(text: str, words_per_chunk: int = 350):
    """Split *text* on whitespace into chunks of at most *words_per_chunk* words.

    Returns a list of space-joined chunk strings; [] for blank input.
    """
    tokens = text.split()
    return [
        " ".join(tokens[start:start + words_per_chunk])
        for start in range(0, len(tokens), words_per_chunk)
    ]

# -------------------------------------------------
# Load classifier + embedder (forced 768-dim)
# -------------------------------------------------
def load_embedding_model():
    """Download the trained classifier from the Hub and pair it with the
    forced 768-dim MPNet embedder.

    Returns:
        dict with keys: "model" (classifier), "embedding_model",
        "resolved_embedding_model_name", "resolved_embedding_dim",
        "device", plus UI defaults ("max_chars", "words_per_chunk",
        "normalize_embeddings_default") read from the artifact.

    Raises:
        RuntimeError: when the artifact lacks a "model" key, or when the
        embedder / classifier feature dimensions disagree with FORCED_DIM.
    """
    local_path = hf_hub_download(
        repo_id=REPO_ID,
        filename=FILENAME,
        repo_type=REPO_TYPE,
        token=os.getenv("HF_TOKEN") or None,
    )
    print(f"βœ… Downloaded model from Hugging Face: {FILENAME}")

    payload = joblib.load(local_path)
    clf = payload.get("model")
    if clf is None:
        raise RuntimeError("Model file does not contain 'model' key.")

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"πŸ”§ Loading 768-dim embedder: {FORCED_EMBEDDER} on {device}")
    embedder = SentenceTransformer(FORCED_EMBEDDER, device=device)

    actual_dim = embedder.get_sentence_embedding_dimension()
    if actual_dim != FORCED_DIM:
        raise RuntimeError(f"Loaded embedder dim={actual_dim}, expected {FORCED_DIM}")

    # Sanity check: classifier feature count must match the embedder output.
    clf_dim = getattr(clf, "n_features_in_", None)
    if clf_dim and clf_dim != FORCED_DIM:
        raise RuntimeError(
            f"Classifier expects {clf_dim} features, but app is configured for {FORCED_DIM}. "
            f"Please retrain or load a 768-dim trained classifier."
        )

    model_data = {
        "model": clf,
        "embedding_model": embedder,
        "resolved_embedding_model_name": FORCED_EMBEDDER,
        "resolved_embedding_dim": actual_dim,
        "device": device,
        # UI defaults, falling back when absent from the artifact
        "max_chars": int(payload.get("max_chars", 100000)),
        "words_per_chunk": int(payload.get("words_per_chunk", 350)),
        # training-time normalize flag if stored; default True
        "normalize_embeddings_default": bool(payload.get("normalize_embeddings", True)),
    }

    classes = getattr(clf, "classes_", None)
    print(f"βœ… Using embedder: {FORCED_EMBEDDER} (dim={actual_dim}) β€” "
          f"classifier expects {getattr(clf,'n_features_in_','unknown')}, "
          f"classes={classes}")
    return model_data

# -------------------------------------------------
# Prediction with threshold + chunking
# -------------------------------------------------
def _infer_ai_index(clf) -> int:
    classes = [str(c).upper() for c in getattr(clf, "classes_", [])]
    if "AI" in classes:
        return classes.index("AI")
    # common fallback: binary {0,1} where 1=AI
    if set(classes) == {"0", "1"}:
        return classes.index("1")
    # last resort: assume last class is AI
    return len(classes) - 1 if classes else 0

def predict_with_threshold(
    text: str,
    model_data: dict,
    ai_threshold: float = 0.70,
    normalize_flag: bool = True,
    agg: str = "mean",  # "mean" or "median"
):
    """Classify *text* as AI or HUMAN.

    Pipeline: preprocess -> word-chunk -> embed each chunk -> per-chunk
    p(AI) from the classifier -> aggregate -> threshold.

    Returns:
        (label, confidence, meta): label is "AI"/"HUMAN", or
        "UNKNOWN"/"ERROR" with an "error" key in meta. Confidence is the
        probability of the returned label.
    """
    cleaned = preprocess_text(text, max_chars=model_data.get("max_chars", 100000))
    if not cleaned:
        return "UNKNOWN", 0.0, {"error": "Empty text after preprocessing"}

    pieces = chunk_by_words(cleaned, words_per_chunk=model_data.get("words_per_chunk", 350))
    if not pieces:
        return "UNKNOWN", 0.0, {"error": "Empty after chunking"}

    clf = model_data["model"]
    embedder = model_data["embedding_model"]
    ai_idx = _infer_ai_index(clf)
    scores = []

    with torch.no_grad():
        for piece in pieces:
            vec = embedder.encode(
                [piece], convert_to_numpy=True, normalize_embeddings=normalize_flag
            )
            if vec.ndim == 1:
                vec = vec.reshape(1, -1)

            expected = getattr(clf, "n_features_in_", vec.shape[1])
            if vec.shape[1] != expected:
                return "ERROR", 0.0, {
                    "error": f"Embedding dim {vec.shape[1]} != classifier requires {expected}"
                }

            if hasattr(clf, "predict_proba"):
                scores.append(float(clf.predict_proba(vec)[0][ai_idx]))
            else:
                # No probability support: map the hard label to a pseudo-proba.
                scores.append(1.0 if str(clf.predict(vec)[0]).upper() == "AI" else 0.0)

    reduce_fn = np.mean if agg == "mean" else np.median
    p_ai = float(reduce_fn(scores))
    label = "AI" if p_ai >= ai_threshold else "HUMAN"
    confidence = p_ai if label == "AI" else 1.0 - p_ai

    return label, confidence, {
        "p_ai": p_ai,
        "chunks": len(pieces),
        "threshold": ai_threshold,
        "agg": agg,
    }

# -------------------------------------------------
# Gradio App
# -------------------------------------------------
def create_app(model_data):
    """Build and return the Gradio Blocks UI for the detector.

    Wires a textbox plus threshold / normalize / aggregate controls to
    predict_with_threshold() and renders a headline verdict and a
    diagnostics panel.
    """
    with gr.Blocks(title="Embedding-based Human vs AI Detector") as demo:
        gr.Markdown("## πŸ€–πŸ‘€ Human vs AI Detector (Embedding-based)")
        gr.Markdown(
            "Transformer-friendly pipeline: **no stopword removal / lemmatization**, "
            "**chunking** for long texts, **thresholded** decision."
        )

        with gr.Row():
            inp = gr.Textbox(label="Enter English text", lines=10, placeholder="Paste text here...")
        with gr.Row():
            thr = gr.Slider(minimum=0.50, maximum=0.90, value=0.70, step=0.01,
                            label="AI threshold (p_AI β‰₯ threshold β†’ AI)")
            norm = gr.Checkbox(
                value=model_data.get("normalize_embeddings_default", True),
                label="Normalize embeddings (match training setting)"
            )
        with gr.Row():
            agg = gr.Dropdown(choices=["mean", "median"], value="mean", label="Aggregate across chunks")

        out = gr.Markdown()
        details = gr.Markdown()

        def _predict_ui(text, threshold, normalize_embeddings, agg_mode):
            label, conf, meta = predict_with_threshold(
                text, model_data,
                ai_threshold=float(threshold),
                normalize_flag=bool(normalize_embeddings),
                agg=agg_mode
            )
            if label == "AI":
                headline = f"πŸ€– **AI Generated** (Conf: {conf:.1%})"
            elif label == "HUMAN":
                headline = f"πŸ‘€ **Human Written** (Conf: {conf:.1%})"
            elif label == "ERROR":
                headline = f"❌ Error: {meta.get('error', 'Unknown')}"
            else:
                headline = f"❓ {label} (Conf: {conf:.1%})"

            # BUGFIX: on ERROR/UNKNOWN paths meta has no 'p_ai', and the old
            # f"{meta.get('p_ai','?'):.4f}" applied the float format spec to
            # the '?' string, raising ValueError before any output was shown.
            # Format only when the value is actually numeric.
            p_ai = meta.get("p_ai")
            p_ai_txt = f"{p_ai:.4f}" if isinstance(p_ai, (int, float)) else "?"

            det = (
                f"- p(AI): {p_ai_txt}\n"
                f"- Chunks: {meta.get('chunks','?')}\n"
                f"- Threshold: {meta.get('threshold','?')}\n"
                f"- Aggregate: {meta.get('agg','?')}\n"
                f"- Embedder: {model_data['resolved_embedding_model_name']} (dim={model_data['resolved_embedding_dim']})"
            )
            return headline, det

        # Enter-to-submit and an explicit button trigger the same handler.
        inp.submit(_predict_ui, [inp, thr, norm, agg], [out, details])
        gr.Button("πŸ” Predict").click(_predict_ui, [inp, thr, norm, agg], [out, details])

    return demo

# -------------------------------------------------
# Load + Launch
# -------------------------------------------------
# Module-level load so `demo` exists for Hugging Face Spaces / `gradio` CLI
# discovery even when this file is imported rather than executed.
_model_data = load_embedding_model()
demo = create_app(_model_data)

if __name__ == "__main__":
    # Pass share=True if you need a public URL
    demo.launch()