# text-detector / app.py — Hugging Face Space by Mrkomiljon ("Update app.py", commit 219254d)
# app.py
import os
import re
import unicodedata
import joblib
import torch
import gradio as gr
import numpy as np
import pandas as pd
import warnings
import nltk
from sentence_transformers import SentenceTransformer
from huggingface_hub import hf_hub_download
warnings.filterwarnings("ignore")
# -------------------------------------------------
# Hugging Face model config
# -------------------------------------------------
REPO_ID = "Detecting-ai/text-detector-model-embedding"
FILENAME = "complete_trained_model_lite.joblib"
REPO_TYPE = "model"
# -------------------------------------------------
# Force 768-dim embedder (MPNet; English-optimized)
# -------------------------------------------------
FORCED_EMBEDDER = "sentence-transformers/all-mpnet-base-v2"
FORCED_DIM = 768
# -------------------------------------------------
# Ensure NLTK deps (safe no-ops if already present)
# -------------------------------------------------
def ensure_nltk():
    """Make sure the NLTK tokenizer data is present, downloading it if needed.

    Both lookups and downloads are best-effort: a failed download must not
    crash startup, and "punkt_tab" may simply not exist on older NLTK
    versions, which is fine.
    """
    needed = (
        ("punkt", "tokenizers/punkt"),
        ("punkt_tab", "tokenizers/punkt_tab/english"),  # ok if missing on older NLTK
    )
    for package, lookup_path in needed:
        try:
            nltk.data.find(lookup_path)
            continue  # already installed — nothing to do
        except LookupError:
            pass
        try:
            nltk.download(package, quiet=True)
        except Exception:
            # Swallow download errors deliberately; the app can still run.
            pass
ensure_nltk()
# -------------------------------------------------
# Minimal preprocessing for Transformer embeddings
# (DO NOT remove stopwords/lemmatize β€” keep raw text)
# -------------------------------------------------
def preprocess_text(text: str, max_chars: int = 100000) -> str:
    """
    Minimal, language-agnostic clean-up:
      - Unicode normalize (NFKC)
      - strip surrounding whitespace and lowercase
      - hard cap on size (avoid insane inputs)

    NaN/None inputs map to the empty string.
    """
    if pd.isna(text):
        return ""
    cleaned = unicodedata.normalize("NFKC", str(text))
    cleaned = cleaned.strip().lower()
    # Hard limit keeps memory and the tokenizer stable on huge pastes;
    # slicing past the end is a no-op for short strings.
    return cleaned[:max_chars]
def chunk_by_words(text: str, words_per_chunk: int = 350):
    """Split *text* into chunks of at most *words_per_chunk* whitespace-separated words.

    Returns a list of chunk strings; an empty/whitespace-only input yields [].
    Rewritten as a comprehension: the original append-loop also re-checked
    `ch.strip()`, but every chunk joins at least one non-empty token, so that
    filter could never fire and has been dropped.
    """
    words = text.split()
    return [
        " ".join(words[i:i + words_per_chunk])
        for i in range(0, len(words), words_per_chunk)
    ]
# -------------------------------------------------
# Load classifier + embedder (forced 768-dim)
# -------------------------------------------------
def load_embedding_model():
    """Download the trained classifier from the Hub and pair it with the
    forced 768-dim MPNet embedder.

    Returns a dict bundling the classifier, the SentenceTransformer, resolved
    metadata, and UI defaults pulled from the saved payload.
    Raises RuntimeError when the payload lacks the 'model' key or when the
    embedder/classifier dimensionalities disagree with FORCED_DIM.
    """
    artifact_path = hf_hub_download(
        repo_id=REPO_ID,
        filename=FILENAME,
        repo_type=REPO_TYPE,
        token=os.getenv("HF_TOKEN") or None,
    )
    print(f"✅ Downloaded model from Hugging Face: {FILENAME}")

    payload = joblib.load(artifact_path)
    classifier = payload.get("model")
    if classifier is None:
        raise RuntimeError("Model file does not contain 'model' key.")

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"🔧 Loading 768-dim embedder: {FORCED_EMBEDDER} on {device}")
    embedder = SentenceTransformer(FORCED_EMBEDDER, device=device)

    actual_dim = embedder.get_sentence_embedding_dimension()
    if actual_dim != FORCED_DIM:
        raise RuntimeError(f"Loaded embedder dim={actual_dim}, expected {FORCED_DIM}")

    # Classifier sanity check: its trained feature count must match the embedder.
    expected_features = getattr(classifier, "n_features_in_", None)
    if expected_features and expected_features != FORCED_DIM:
        raise RuntimeError(
            f"Classifier expects {expected_features} features, but app is configured for {FORCED_DIM}. "
            f"Please retrain or load a 768-dim trained classifier."
        )

    bundle = {
        "model": classifier,
        "embedding_model": embedder,
        "resolved_embedding_model_name": FORCED_EMBEDDER,
        "resolved_embedding_dim": actual_dim,
        "device": device,
        # UI defaults (fall back to app-wide defaults when absent in payload)
        "max_chars": int(payload.get("max_chars", 100000)),
        "words_per_chunk": int(payload.get("words_per_chunk", 350)),
        # training-time normalize flag if it was stored; default True
        "normalize_embeddings_default": bool(payload.get("normalize_embeddings", True)),
    }

    print(f"✅ Using embedder: {FORCED_EMBEDDER} (dim={actual_dim}) — "
          f"classifier expects {getattr(classifier,'n_features_in_','unknown')}, "
          f"classes={getattr(classifier, 'classes_', None)}")
    return bundle
# -------------------------------------------------
# Prediction with threshold + chunking
# -------------------------------------------------
def _infer_ai_index(clf) -> int:
classes = [str(c).upper() for c in getattr(clf, "classes_", [])]
if "AI" in classes:
return classes.index("AI")
# common fallback: binary {0,1} where 1=AI
if set(classes) == {"0", "1"}:
return classes.index("1")
# last resort: assume last class is AI
return len(classes) - 1 if classes else 0
def predict_with_threshold(
    text: str,
    model_data: dict,
    ai_threshold: float = 0.70,
    normalize_flag: bool = True,
    agg: str = "mean",  # "mean" or "median"
):
    """Classify *text* as AI/HUMAN by aggregating per-chunk AI probabilities.

    Returns (label, confidence, meta): label is "AI"/"HUMAN" on success,
    "UNKNOWN" for empty input, or "ERROR" on a dimension mismatch; meta
    carries diagnostics, or an "error" key on the failure paths.
    """
    cleaned = preprocess_text(text, max_chars=model_data.get("max_chars", 100000))
    if not cleaned:
        return "UNKNOWN", 0.0, {"error": "Empty text after preprocessing"}

    pieces = chunk_by_words(cleaned, words_per_chunk=model_data.get("words_per_chunk", 350))
    if not pieces:
        return "UNKNOWN", 0.0, {"error": "Empty after chunking"}

    clf = model_data["model"]
    ai_idx = _infer_ai_index(clf)
    encoder = model_data["embedding_model"]

    ai_probs = []
    with torch.no_grad():
        for piece in pieces:
            vec = encoder.encode(
                [piece], convert_to_numpy=True, normalize_embeddings=normalize_flag
            )
            if vec.ndim == 1:
                vec = vec.reshape(1, -1)
            expected = getattr(clf, "n_features_in_", vec.shape[1])
            if vec.shape[1] != expected:
                return "ERROR", 0.0, {
                    "error": f"Embedding dim {vec.shape[1]} != classifier requires {expected}"
                }
            if hasattr(clf, "predict_proba"):
                ai_probs.append(float(clf.predict_proba(vec)[0][ai_idx]))
            else:
                # No probability support: turn the hard label into a 0/1 pseudo-proba.
                ai_probs.append(1.0 if str(clf.predict(vec)[0]).upper() == "AI" else 0.0)

    p_ai = float(np.mean(ai_probs) if agg == "mean" else np.median(ai_probs))
    label = "AI" if p_ai >= ai_threshold else "HUMAN"
    confidence = p_ai if label == "AI" else 1.0 - p_ai
    return label, confidence, {
        "p_ai": p_ai,
        "chunks": len(pieces),
        "threshold": ai_threshold,
        "agg": agg,
    }
# -------------------------------------------------
# Gradio App
# -------------------------------------------------
def create_app(model_data):
    """Build the Gradio Blocks UI around *model_data* (classifier + embedder bundle).

    Wires both Enter-in-textbox and the Predict button to the same handler.
    Returns the (unlaunched) gr.Blocks demo.
    """
    with gr.Blocks(title="Embedding-based Human vs AI Detector") as demo:
        gr.Markdown("## 🤖👤 Human vs AI Detector (Embedding-based)")
        gr.Markdown(
            "Transformer-friendly pipeline: **no stopword removal / lemmatization**, "
            "**chunking** for long texts, **thresholded** decision."
        )
        with gr.Row():
            inp = gr.Textbox(label="Enter English text", lines=10, placeholder="Paste text here...")
        with gr.Row():
            thr = gr.Slider(minimum=0.50, maximum=0.90, value=0.70, step=0.01,
                            label="AI threshold (p_AI ≥ threshold → AI)")
            norm = gr.Checkbox(
                value=model_data.get("normalize_embeddings_default", True),
                label="Normalize embeddings (match training setting)"
            )
        with gr.Row():
            agg = gr.Dropdown(choices=["mean", "median"], value="mean", label="Aggregate across chunks")
        out = gr.Markdown()
        details = gr.Markdown()

        def _predict_ui(text, threshold, normalize_embeddings, agg_mode):
            """Run prediction and render the headline + details markdown."""
            label, conf, meta = predict_with_threshold(
                text, model_data,
                ai_threshold=float(threshold),
                normalize_flag=bool(normalize_embeddings),
                agg=agg_mode
            )
            if label == "AI":
                headline = f"🤖 **AI Generated** (Conf: {conf:.1%})"
            elif label == "HUMAN":
                headline = f"👤 **Human Written** (Conf: {conf:.1%})"
            elif label == "ERROR":
                headline = f"❌ Error: {meta.get('error', 'Unknown')}"
            else:
                headline = f"❓ {label} (Conf: {conf:.1%})"
            # BUG FIX: on the UNKNOWN/ERROR paths meta has no "p_ai", and the
            # old code formatted the "?" fallback string with ":.4f", raising
            # ValueError and blanking the UI. Only apply the float format when
            # the value is actually numeric.
            p_ai = meta.get("p_ai")
            p_ai_text = f"{p_ai:.4f}" if isinstance(p_ai, (int, float)) else "?"
            det = (
                f"- p(AI): {p_ai_text}\n"
                f"- Chunks: {meta.get('chunks','?')}\n"
                f"- Threshold: {meta.get('threshold','?')}\n"
                f"- Aggregate: {meta.get('agg','?')}\n"
                f"- Embedder: {model_data['resolved_embedding_model_name']} (dim={model_data['resolved_embedding_dim']})"
            )
            return headline, det

        inp.submit(_predict_ui, [inp, thr, norm, agg], [out, details])
        gr.Button("🔍 Predict").click(_predict_ui, [inp, thr, norm, agg], [out, details])
    return demo
# -------------------------------------------------
# Load + Launch
# -------------------------------------------------
# NOTE: the model download + embedder load run at import time (not only under
# __main__) so hosting platforms that import `demo` directly (e.g. HF Spaces)
# get a fully warmed app object.
_model_data = load_embedding_model()
demo = create_app(_model_data)
if __name__ == "__main__":
    # Pass share=True if you need a public URL
    demo.launch()