# app.py
import os
import re
import unicodedata
import warnings

import joblib
import torch
import gradio as gr
import numpy as np
import pandas as pd
import nltk
from sentence_transformers import SentenceTransformer
from huggingface_hub import hf_hub_download

warnings.filterwarnings("ignore")

# -------------------------------------------------
# Hugging Face model config
# -------------------------------------------------
REPO_ID = "Detecting-ai/text-detector-model-embedding"
FILENAME = "complete_trained_model_lite.joblib"
REPO_TYPE = "model"

# -------------------------------------------------
# Force 768-dim embedder (MPNet; English-optimized)
# -------------------------------------------------
FORCED_EMBEDDER = "sentence-transformers/all-mpnet-base-v2"
FORCED_DIM = 768


# -------------------------------------------------
# Ensure NLTK deps (safe no-ops if already present)
# -------------------------------------------------
def ensure_nltk():
    """Download NLTK tokenizer data if it is not already installed.

    Best-effort: both the lookup and the download are wrapped so a failure
    (offline machine, older NLTK without punkt_tab) never prevents startup.
    """
    resources = {
        "punkt": "tokenizers/punkt",
        "punkt_tab": "tokenizers/punkt_tab/english",  # ok if missing on older NLTK
    }
    for pkg, path in resources.items():
        try:
            nltk.data.find(path)
        except LookupError:
            try:
                nltk.download(pkg, quiet=True)
            except Exception:
                # Deliberate best-effort: tokenizer data is not hard-required below.
                pass


ensure_nltk()


# -------------------------------------------------
# Minimal preprocessing for Transformer embeddings
# (DO NOT remove stopwords/lemmatize — keep raw text)
# -------------------------------------------------
def preprocess_text(text: str, max_chars: int = 100000) -> str:
    """
    Minimal, language-agnostic clean-up:
    - Unicode normalize (NFKC)
    - Strip surrounding whitespace and lowercase
    - Hard cap on size (avoid insane inputs)

    Returns "" for NaN/None input so callers can treat it as "empty".
    """
    if pd.isna(text):
        return ""
    t = unicodedata.normalize("NFKC", str(text))
    t = t.strip().lower()
    # hard limit to keep memory/tokenizer stable on huge pastes
    return t[:max_chars]


def chunk_by_words(text: str, words_per_chunk: int = 350) -> list:
    """Split *text* into chunks of at most *words_per_chunk* whitespace-separated words.

    Returns [] for empty/whitespace-only input.
    """
    words = text.split()
    chunks = []
    for i in range(0, len(words), words_per_chunk):
        ch = " ".join(words[i:i + words_per_chunk])
        if ch.strip():
            chunks.append(ch)
    return chunks


# -------------------------------------------------
# Load classifier + embedder (forced 768-dim)
# -------------------------------------------------
def load_embedding_model() -> dict:
    """Download the trained classifier from the Hub and pair it with the
    forced 768-dim MPNet embedder.

    Returns a dict with the classifier, the SentenceTransformer instance,
    resolved embedder metadata, and UI defaults pulled from the saved bundle.

    Raises RuntimeError if the bundle lacks a 'model' key, if the embedder's
    dimension is not FORCED_DIM, or if the classifier expects a different
    feature count than FORCED_DIM.
    """
    path = hf_hub_download(
        repo_id=REPO_ID,
        filename=FILENAME,
        repo_type=REPO_TYPE,
        token=os.getenv("HF_TOKEN") or None,
    )
    print(f"✅ Downloaded model from Hugging Face: {FILENAME}")

    data = joblib.load(path)
    clf = data.get("model")
    if clf is None:
        raise RuntimeError("Model file does not contain 'model' key.")

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"🔧 Loading 768-dim embedder: {FORCED_EMBEDDER} on {device}")
    embedding_model = SentenceTransformer(FORCED_EMBEDDER, device=device)

    actual_dim = embedding_model.get_sentence_embedding_dimension()
    if actual_dim != FORCED_DIM:
        raise RuntimeError(f"Loaded embedder dim={actual_dim}, expected {FORCED_DIM}")

    # Classifier sanity check: features seen at fit time must match embedder dim.
    clf_dim = getattr(clf, "n_features_in_", None)
    if clf_dim and clf_dim != FORCED_DIM:
        raise RuntimeError(
            f"Classifier expects {clf_dim} features, but app is configured for {FORCED_DIM}. "
            f"Please retrain or load a 768-dim trained classifier."
        )

    # finalize model_data dict
    model_data = {
        "model": clf,
        "embedding_model": embedding_model,
        "resolved_embedding_model_name": FORCED_EMBEDDER,
        "resolved_embedding_dim": actual_dim,
        "device": device,
        # UI defaults
        "max_chars": int(data.get("max_chars", 100000)),
        "words_per_chunk": int(data.get("words_per_chunk", 350)),
        # remember training-time normalize flag if you stored it; default True
        "normalize_embeddings_default": bool(data.get("normalize_embeddings", True)),
    }

    classes = getattr(clf, "classes_", None)
    print(f"✅ Using embedder: {FORCED_EMBEDDER} (dim={actual_dim}) — "
          f"classifier expects {getattr(clf,'n_features_in_','unknown')}, "
          f"classes={classes}")
    return model_data


# -------------------------------------------------
# Prediction with threshold + chunking
# -------------------------------------------------
def _infer_ai_index(clf) -> int:
    """Return the column index of the 'AI' class in clf.predict_proba output.

    Tries, in order: an explicit 'AI' class label, the binary {0,1}
    convention (1 = AI), and finally the last class as a fallback.
    """
    classes = [str(c).upper() for c in getattr(clf, "classes_", [])]
    if "AI" in classes:
        return classes.index("AI")
    # common fallback: binary {0,1} where 1=AI
    if set(classes) == {"0", "1"}:
        return classes.index("1")
    # last resort: assume last class is AI
    return len(classes) - 1 if classes else 0


def predict_with_threshold(
    text: str,
    model_data: dict,
    ai_threshold: float = 0.70,
    normalize_flag: bool = True,
    agg: str = "mean",  # "mean" or "median"
):
    """Classify *text* as AI/HUMAN by chunking, embedding, and thresholding.

    Returns (label, confidence, meta) where label is one of
    "AI" / "HUMAN" / "UNKNOWN" / "ERROR" and meta carries diagnostics
    (per-text aggregated p_ai, chunk count) or an "error" message.
    """
    proc = preprocess_text(text, max_chars=model_data.get("max_chars", 100000))
    if not proc:
        return "UNKNOWN", 0.0, {"error": "Empty text after preprocessing"}

    chunks = chunk_by_words(proc, words_per_chunk=model_data.get("words_per_chunk", 350))
    if not chunks:
        return "UNKNOWN", 0.0, {"error": "Empty after chunking"}

    clf = model_data["model"]
    ai_idx = _infer_ai_index(clf)

    # Encode all chunks in one batched call (identical embeddings to a
    # per-chunk loop — normalization is row-wise — but much faster).
    with torch.no_grad():
        emb = model_data["embedding_model"].encode(
            chunks, convert_to_numpy=True, normalize_embeddings=normalize_flag
        )
    if emb.ndim == 1:
        emb = emb.reshape(1, -1)

    need = getattr(clf, "n_features_in_", emb.shape[1])
    if emb.shape[1] != need:
        return "ERROR", 0.0, {
            "error": f"Embedding dim {emb.shape[1]} != classifier requires {need}"
        }

    if hasattr(clf, "predict_proba"):
        p_ai_list = [float(row[ai_idx]) for row in clf.predict_proba(emb)]
    else:
        # fallback if no proba: convert predicted label to pseudo-proba
        p_ai_list = [
            1.0 if str(pred).upper() == "AI" else 0.0 for pred in clf.predict(emb)
        ]

    p_ai = float(np.mean(p_ai_list) if agg == "mean" else np.median(p_ai_list))
    label = "AI" if p_ai >= ai_threshold else "HUMAN"
    conf = p_ai if label == "AI" else 1.0 - p_ai

    return label, conf, {
        "p_ai": p_ai,
        "chunks": len(chunks),
        "threshold": ai_threshold,
        "agg": agg,
    }


# -------------------------------------------------
# Gradio App
# -------------------------------------------------
def create_app(model_data):
    """Build the Gradio Blocks UI wired to predict_with_threshold."""
    with gr.Blocks(title="Embedding-based Human vs AI Detector") as demo:
        gr.Markdown("## 🤖👤 Human vs AI Detector (Embedding-based)")
        gr.Markdown(
            "Transformer-friendly pipeline: **no stopword removal / lemmatization**, "
            "**chunking** for long texts, **thresholded** decision."
        )

        with gr.Row():
            inp = gr.Textbox(label="Enter English text", lines=10,
                             placeholder="Paste text here...")
        with gr.Row():
            thr = gr.Slider(minimum=0.50, maximum=0.90, value=0.70, step=0.01,
                            label="AI threshold (p_AI ≥ threshold → AI)")
            norm = gr.Checkbox(
                value=model_data.get("normalize_embeddings_default", True),
                label="Normalize embeddings (match training setting)"
            )
        with gr.Row():
            agg = gr.Dropdown(choices=["mean", "median"], value="mean",
                              label="Aggregate across chunks")

        out = gr.Markdown()
        details = gr.Markdown()

        def _predict_ui(text, threshold, normalize_embeddings, agg_mode):
            label, conf, meta = predict_with_threshold(
                text, model_data,
                ai_threshold=float(threshold),
                normalize_flag=bool(normalize_embeddings),
                agg=agg_mode,
            )
            if label == "AI":
                headline = f"🤖 **AI Generated** (Conf: {conf:.1%})"
            elif label == "HUMAN":
                headline = f"👤 **Human Written** (Conf: {conf:.1%})"
            elif label == "ERROR":
                headline = f"❌ Error: {meta.get('error', 'Unknown')}"
            else:
                headline = f"❓ {label} (Conf: {conf:.1%})"

            # BUGFIX: on the ERROR/UNKNOWN path meta has no 'p_ai', and
            # applying ':.4f' to the fallback string '?' raised ValueError.
            # Format the float only when it is actually present.
            p_ai_val = meta.get("p_ai")
            p_ai_txt = f"{p_ai_val:.4f}" if isinstance(p_ai_val, (int, float)) else "?"
            det = (
                f"- p(AI): {p_ai_txt}\n"
                f"- Chunks: {meta.get('chunks','?')}\n"
                f"- Threshold: {meta.get('threshold','?')}\n"
                f"- Aggregate: {meta.get('agg','?')}\n"
                f"- Embedder: {model_data['resolved_embedding_model_name']} "
                f"(dim={model_data['resolved_embedding_dim']})"
            )
            return headline, det

        inp.submit(_predict_ui, [inp, thr, norm, agg], [out, details])
        gr.Button("🔍 Predict").click(_predict_ui, [inp, thr, norm, agg], [out, details])

    return demo


# -------------------------------------------------
# Load + Launch
# -------------------------------------------------
_model_data = load_embedding_model()
demo = create_app(_model_data)

if __name__ == "__main__":
    # Pass share=True if you need a public URL
    demo.launch()