# text-detector / app.py — Hugging Face Space by Mrkomiljon ("Update app.py", commit 219254d)
# app.py
import os
import re
import unicodedata
import joblib
import torch
import gradio as gr
import numpy as np
import pandas as pd
import warnings
import nltk
from sentence_transformers import SentenceTransformer
from huggingface_hub import hf_hub_download
warnings.filterwarnings("ignore")
# -------------------------------------------------
# Hugging Face model config
# -------------------------------------------------
REPO_ID = "Detecting-ai/text-detector-model-embedding"
FILENAME = "complete_trained_model_lite.joblib"
REPO_TYPE = "model"
# -------------------------------------------------
# Force 768-dim embedder (MPNet; English-optimized)
# -------------------------------------------------
FORCED_EMBEDDER = "sentence-transformers/all-mpnet-base-v2"
FORCED_DIM = 768
# -------------------------------------------------
# Ensure NLTK deps (safe no-ops if already present)
# -------------------------------------------------
def ensure_nltk():
    """Make sure the NLTK tokenizer data is present, downloading it if needed.

    Both lookups and downloads are best-effort: a failed download must not
    crash startup, and "punkt_tab" may simply not exist on older NLTK
    versions, which is fine.
    """
    needed = (
        ("punkt", "tokenizers/punkt"),
        ("punkt_tab", "tokenizers/punkt_tab/english"),  # ok if missing on older NLTK
    )
    for package, lookup_path in needed:
        try:
            nltk.data.find(lookup_path)
            continue  # already installed — nothing to do
        except LookupError:
            pass
        try:
            nltk.download(package, quiet=True)
        except Exception:
            # Swallow download errors deliberately; the app can still run.
            pass
ensure_nltk()
# -------------------------------------------------
# Minimal preprocessing for Transformer embeddings
# (DO NOT remove stopwords/lemmatize β€” keep raw text)
# -------------------------------------------------
def preprocess_text(text: str, max_chars: int = 100000) -> str:
    """
    Minimal, language-agnostic clean-up:
      - Unicode normalize (NFKC)
      - strip surrounding whitespace and lowercase
      - hard cap on size (avoid insane inputs)

    NaN/None inputs map to the empty string.
    """
    if pd.isna(text):
        return ""
    cleaned = unicodedata.normalize("NFKC", str(text))
    cleaned = cleaned.strip().lower()
    # Hard limit keeps memory and the tokenizer stable on huge pastes;
    # slicing past the end is a no-op for short strings.
    return cleaned[:max_chars]
def chunk_by_words(text: str, words_per_chunk: int = 350):
    """Split *text* into chunks of at most *words_per_chunk* whitespace-separated words.

    Returns a list of chunk strings; an empty/whitespace-only input yields [].
    Rewritten as a comprehension: the original append-loop also re-checked
    `ch.strip()`, but every chunk joins at least one non-empty token, so that
    filter could never fire and has been dropped.
    """
    words = text.split()
    return [
        " ".join(words[i:i + words_per_chunk])
        for i in range(0, len(words), words_per_chunk)
    ]
# -------------------------------------------------
# Load classifier + embedder (forced 768-dim)
# -------------------------------------------------
def load_embedding_model():
    """Download the trained classifier from the Hub and pair it with the
    forced 768-dim MPNet embedder.

    Returns a dict bundling the classifier, the SentenceTransformer, resolved
    metadata, and UI defaults pulled from the saved payload.
    Raises RuntimeError when the payload lacks the 'model' key or when the
    embedder/classifier dimensionalities disagree with FORCED_DIM.
    """
    artifact_path = hf_hub_download(
        repo_id=REPO_ID,
        filename=FILENAME,
        repo_type=REPO_TYPE,
        token=os.getenv("HF_TOKEN") or None,
    )
    print(f"✅ Downloaded model from Hugging Face: {FILENAME}")

    payload = joblib.load(artifact_path)
    classifier = payload.get("model")
    if classifier is None:
        raise RuntimeError("Model file does not contain 'model' key.")

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"🔧 Loading 768-dim embedder: {FORCED_EMBEDDER} on {device}")
    embedder = SentenceTransformer(FORCED_EMBEDDER, device=device)

    actual_dim = embedder.get_sentence_embedding_dimension()
    if actual_dim != FORCED_DIM:
        raise RuntimeError(f"Loaded embedder dim={actual_dim}, expected {FORCED_DIM}")

    # Classifier sanity check: its trained feature count must match the embedder.
    expected_features = getattr(classifier, "n_features_in_", None)
    if expected_features and expected_features != FORCED_DIM:
        raise RuntimeError(
            f"Classifier expects {expected_features} features, but app is configured for {FORCED_DIM}. "
            f"Please retrain or load a 768-dim trained classifier."
        )

    bundle = {
        "model": classifier,
        "embedding_model": embedder,
        "resolved_embedding_model_name": FORCED_EMBEDDER,
        "resolved_embedding_dim": actual_dim,
        "device": device,
        # UI defaults (fall back to app-wide defaults when absent in payload)
        "max_chars": int(payload.get("max_chars", 100000)),
        "words_per_chunk": int(payload.get("words_per_chunk", 350)),
        # training-time normalize flag if it was stored; default True
        "normalize_embeddings_default": bool(payload.get("normalize_embeddings", True)),
    }

    print(f"✅ Using embedder: {FORCED_EMBEDDER} (dim={actual_dim}) — "
          f"classifier expects {getattr(classifier,'n_features_in_','unknown')}, "
          f"classes={getattr(classifier, 'classes_', None)}")
    return bundle
# -------------------------------------------------
# Prediction with threshold + chunking
# -------------------------------------------------
def _infer_ai_index(clf) -> int:
classes = [str(c).upper() for c in getattr(clf, "classes_", [])]
if "AI" in classes:
return classes.index("AI")
# common fallback: binary {0,1} where 1=AI
if set(classes) == {"0", "1"}:
return classes.index("1")
# last resort: assume last class is AI
return len(classes) - 1 if classes else 0
def predict_with_threshold(
    text: str,
    model_data: dict,
    ai_threshold: float = 0.70,
    normalize_flag: bool = True,
    agg: str = "mean",  # "mean" or "median"
):
    """Classify *text* as AI/HUMAN by aggregating per-chunk AI probabilities.

    Returns (label, confidence, meta): label is "AI"/"HUMAN" on success,
    "UNKNOWN" for empty input, or "ERROR" on a dimension mismatch; meta
    carries diagnostics, or an "error" key on the failure paths.
    """
    cleaned = preprocess_text(text, max_chars=model_data.get("max_chars", 100000))
    if not cleaned:
        return "UNKNOWN", 0.0, {"error": "Empty text after preprocessing"}

    pieces = chunk_by_words(cleaned, words_per_chunk=model_data.get("words_per_chunk", 350))
    if not pieces:
        return "UNKNOWN", 0.0, {"error": "Empty after chunking"}

    clf = model_data["model"]
    ai_idx = _infer_ai_index(clf)
    encoder = model_data["embedding_model"]

    ai_probs = []
    with torch.no_grad():
        for piece in pieces:
            vec = encoder.encode(
                [piece], convert_to_numpy=True, normalize_embeddings=normalize_flag
            )
            if vec.ndim == 1:
                vec = vec.reshape(1, -1)
            expected = getattr(clf, "n_features_in_", vec.shape[1])
            if vec.shape[1] != expected:
                return "ERROR", 0.0, {
                    "error": f"Embedding dim {vec.shape[1]} != classifier requires {expected}"
                }
            if hasattr(clf, "predict_proba"):
                ai_probs.append(float(clf.predict_proba(vec)[0][ai_idx]))
            else:
                # No probability support: turn the hard label into a 0/1 pseudo-proba.
                ai_probs.append(1.0 if str(clf.predict(vec)[0]).upper() == "AI" else 0.0)

    p_ai = float(np.mean(ai_probs) if agg == "mean" else np.median(ai_probs))
    label = "AI" if p_ai >= ai_threshold else "HUMAN"
    confidence = p_ai if label == "AI" else 1.0 - p_ai
    return label, confidence, {
        "p_ai": p_ai,
        "chunks": len(pieces),
        "threshold": ai_threshold,
        "agg": agg,
    }
# -------------------------------------------------
# Gradio App
# -------------------------------------------------
def create_app(model_data):
    """Build the Gradio Blocks UI around *model_data* (classifier + embedder bundle).

    Wires both Enter-in-textbox and the Predict button to the same handler.
    Returns the (unlaunched) gr.Blocks demo.
    """
    with gr.Blocks(title="Embedding-based Human vs AI Detector") as demo:
        gr.Markdown("## 🤖👤 Human vs AI Detector (Embedding-based)")
        gr.Markdown(
            "Transformer-friendly pipeline: **no stopword removal / lemmatization**, "
            "**chunking** for long texts, **thresholded** decision."
        )
        with gr.Row():
            inp = gr.Textbox(label="Enter English text", lines=10, placeholder="Paste text here...")
        with gr.Row():
            thr = gr.Slider(minimum=0.50, maximum=0.90, value=0.70, step=0.01,
                            label="AI threshold (p_AI ≥ threshold → AI)")
            norm = gr.Checkbox(
                value=model_data.get("normalize_embeddings_default", True),
                label="Normalize embeddings (match training setting)"
            )
        with gr.Row():
            agg = gr.Dropdown(choices=["mean", "median"], value="mean", label="Aggregate across chunks")
        out = gr.Markdown()
        details = gr.Markdown()

        def _predict_ui(text, threshold, normalize_embeddings, agg_mode):
            """Run prediction and render the headline + details markdown."""
            label, conf, meta = predict_with_threshold(
                text, model_data,
                ai_threshold=float(threshold),
                normalize_flag=bool(normalize_embeddings),
                agg=agg_mode
            )
            if label == "AI":
                headline = f"🤖 **AI Generated** (Conf: {conf:.1%})"
            elif label == "HUMAN":
                headline = f"👤 **Human Written** (Conf: {conf:.1%})"
            elif label == "ERROR":
                headline = f"❌ Error: {meta.get('error', 'Unknown')}"
            else:
                headline = f"❓ {label} (Conf: {conf:.1%})"
            # BUG FIX: on the UNKNOWN/ERROR paths meta has no "p_ai", and the
            # old code formatted the "?" fallback string with ":.4f", raising
            # ValueError and blanking the UI. Only apply the float format when
            # the value is actually numeric.
            p_ai = meta.get("p_ai")
            p_ai_text = f"{p_ai:.4f}" if isinstance(p_ai, (int, float)) else "?"
            det = (
                f"- p(AI): {p_ai_text}\n"
                f"- Chunks: {meta.get('chunks','?')}\n"
                f"- Threshold: {meta.get('threshold','?')}\n"
                f"- Aggregate: {meta.get('agg','?')}\n"
                f"- Embedder: {model_data['resolved_embedding_model_name']} (dim={model_data['resolved_embedding_dim']})"
            )
            return headline, det

        inp.submit(_predict_ui, [inp, thr, norm, agg], [out, details])
        gr.Button("🔍 Predict").click(_predict_ui, [inp, thr, norm, agg], [out, details])
    return demo
# -------------------------------------------------
# Load + Launch
# -------------------------------------------------
# NOTE: the model download + embedder load run at import time (not only under
# __main__) so hosting platforms that import `demo` directly (e.g. HF Spaces)
# get a fully warmed app object.
_model_data = load_embedding_model()
demo = create_app(_model_data)
if __name__ == "__main__":
    # Pass share=True if you need a public URL
    demo.launch()