# NOTE: the "Spaces: Sleeping" banner above the original file is Hugging Face
# Space page chrome captured during extraction; it is not part of app.py.
# app.py

# Standard library
import os
import re
import unicodedata
import warnings

# Third-party
import gradio as gr
import joblib
import nltk
import numpy as np
import pandas as pd
import torch
from huggingface_hub import hf_hub_download
from sentence_transformers import SentenceTransformer

# Silence noisy library warning chatter (sklearn/transformers version notices).
warnings.filterwarnings("ignore")
# -------------------------------------------------
# Hugging Face model config
# -------------------------------------------------
# Hub repo and filename of the pickled classifier payload (joblib dict with a
# "model" key plus optional UI defaults — see load_embedding_model()).
REPO_ID = "Detecting-ai/text-detector-model-embedding"
FILENAME = "complete_trained_model_lite.joblib"
REPO_TYPE = "model"

# -------------------------------------------------
# Force 768-dim embedder (MPNet; English-optimized)
# -------------------------------------------------
# The classifier was trained on 768-dim embeddings; both the embedder's output
# dimension and the classifier's n_features_in_ are validated against FORCED_DIM.
FORCED_EMBEDDER = "sentence-transformers/all-mpnet-base-v2"
FORCED_DIM = 768
# -------------------------------------------------
# Ensure NLTK deps (safe no-ops if already present)
# -------------------------------------------------
def ensure_nltk():
    """Download NLTK tokenizer data if it is not already installed.

    Best-effort by design: a failed lookup triggers a quiet download, and
    download errors are swallowed so the app can still start offline.
    """
    needed = (
        ("punkt", "tokenizers/punkt"),
        ("punkt_tab", "tokenizers/punkt_tab/english"),  # ok if missing on older NLTK
    )
    for package, resource_path in needed:
        try:
            nltk.data.find(resource_path)
        except LookupError:
            try:
                nltk.download(package, quiet=True)
            except Exception:
                pass  # offline / permission issues: proceed without the data


ensure_nltk()
# -------------------------------------------------
# Minimal preprocessing for Transformer embeddings
# (DO NOT remove stopwords/lemmatize — keep raw text)
# -------------------------------------------------
def preprocess_text(text: str, max_chars: int = 100000) -> str:
    """Normalize raw text for embedding.

    Steps: NFKC Unicode normalization, strip surrounding whitespace,
    lowercase, then hard-cap the length at ``max_chars`` so enormous
    pastes cannot destabilize memory or the tokenizer.
    NaN/None inputs become "".
    """
    if pd.isna(text):
        return ""
    cleaned = unicodedata.normalize("NFKC", str(text))
    cleaned = cleaned.strip().lower()
    # hard limit keeps memory/tokenizer stable on huge pastes
    return cleaned[:max_chars]
def chunk_by_words(text: str, words_per_chunk: int = 350):
    """Split *text* on whitespace into chunks of at most ``words_per_chunk``
    words each, re-joined with single spaces. Returns [] for empty input."""
    tokens = text.split()
    return [
        " ".join(tokens[start:start + words_per_chunk])
        for start in range(0, len(tokens), words_per_chunk)
    ]
# -------------------------------------------------
# Load classifier + embedder (forced 768-dim)
# -------------------------------------------------
def load_embedding_model():
    """Fetch the trained classifier from the Hub and pair it with the
    forced 768-dim SentenceTransformer embedder.

    Returns a dict holding the classifier, the embedder, the resolved
    device, and UI defaults pulled from the payload (max_chars,
    words_per_chunk, normalize_embeddings_default).

    Raises RuntimeError when the payload lacks a 'model' key, or when the
    embedder / classifier feature dimensions disagree with FORCED_DIM.
    """
    local_path = hf_hub_download(
        repo_id=REPO_ID,
        filename=FILENAME,
        repo_type=REPO_TYPE,
        token=os.getenv("HF_TOKEN") or None,  # anonymous download when unset
    )
    print(f"β Downloaded model from Hugging Face: {FILENAME}")

    payload = joblib.load(local_path)
    clf = payload.get("model")
    if clf is None:
        raise RuntimeError("Model file does not contain 'model' key.")

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"π§ Loading 768-dim embedder: {FORCED_EMBEDDER} on {device}")
    embedder = SentenceTransformer(FORCED_EMBEDDER, device=device)

    actual_dim = embedder.get_sentence_embedding_dimension()
    if actual_dim != FORCED_DIM:
        raise RuntimeError(f"Loaded embedder dim={actual_dim}, expected {FORCED_DIM}")

    # Classifier sanity check: feature count must match the embedding size.
    clf_dim = getattr(clf, "n_features_in_", None)
    if clf_dim and clf_dim != FORCED_DIM:
        raise RuntimeError(
            f"Classifier expects {clf_dim} features, but app is configured for {FORCED_DIM}. "
            f"Please retrain or load a 768-dim trained classifier."
        )

    model_data = {
        "model": clf,
        "embedding_model": embedder,
        "resolved_embedding_model_name": FORCED_EMBEDDER,
        "resolved_embedding_dim": actual_dim,
        "device": device,
        # UI defaults (fall back to sane values when absent from the payload)
        "max_chars": int(payload.get("max_chars", 100000)),
        "words_per_chunk": int(payload.get("words_per_chunk", 350)),
        # training-time normalize flag if it was stored; default True
        "normalize_embeddings_default": bool(payload.get("normalize_embeddings", True)),
    }

    classes = getattr(clf, "classes_", None)
    print(f"β Using embedder: {FORCED_EMBEDDER} (dim={actual_dim}) β "
          f"classifier expects {getattr(clf,'n_features_in_','unknown')}, "
          f"classes={classes}")
    return model_data
# -------------------------------------------------
# Prediction with threshold + chunking
# -------------------------------------------------
def _infer_ai_index(clf) -> int:
    """Return the column index of the "AI" class in clf.predict_proba output.

    Falls back to the "1" label for binary {0, 1} classifiers, then to the
    final class; returns 0 when the classifier exposes no classes_ at all.
    """
    labels = [str(label).upper() for label in getattr(clf, "classes_", [])]
    if "AI" in labels:
        return labels.index("AI")
    # common fallback: binary {0,1} where 1=AI
    if set(labels) == {"0", "1"}:
        return labels.index("1")
    # last resort: assume the final class is AI (0 when there are none)
    return max(len(labels) - 1, 0)
def predict_with_threshold(
    text: str,
    model_data: dict,
    ai_threshold: float = 0.70,
    normalize_flag: bool = True,
    agg: str = "mean",  # "mean" or "median"
):
    """Classify *text* as AI vs HUMAN.

    Pipeline: preprocess -> chunk by words -> embed each chunk -> take the
    per-chunk AI probability -> aggregate (mean/median) -> threshold.

    Returns (label, confidence, meta) where label is "AI", "HUMAN",
    "UNKNOWN" (empty input) or "ERROR" (embedding/classifier dim mismatch).
    On success meta carries p_ai, chunk count, threshold and agg mode;
    on failure it carries only an "error" message.
    """
    cleaned = preprocess_text(text, max_chars=model_data.get("max_chars", 100000))
    if not cleaned:
        return "UNKNOWN", 0.0, {"error": "Empty text after preprocessing"}

    pieces = chunk_by_words(cleaned, words_per_chunk=model_data.get("words_per_chunk", 350))
    if not pieces:
        return "UNKNOWN", 0.0, {"error": "Empty after chunking"}

    clf = model_data["model"]
    ai_idx = _infer_ai_index(clf)
    embedder = model_data["embedding_model"]

    chunk_probs = []
    with torch.no_grad():  # pure inference — no autograd bookkeeping needed
        for piece in pieces:
            vec = embedder.encode(
                [piece], convert_to_numpy=True, normalize_embeddings=normalize_flag
            )
            if vec.ndim == 1:
                vec = vec.reshape(1, -1)
            expected = getattr(clf, "n_features_in_", vec.shape[1])
            if vec.shape[1] != expected:
                return "ERROR", 0.0, {
                    "error": f"Embedding dim {vec.shape[1]} != classifier requires {expected}"
                }
            if hasattr(clf, "predict_proba"):
                chunk_probs.append(float(clf.predict_proba(vec)[0][ai_idx]))
            else:
                # fallback if no proba: convert predicted label to pseudo-proba
                predicted = str(clf.predict(vec)[0]).upper()
                chunk_probs.append(1.0 if predicted == "AI" else 0.0)

    p_ai = float(np.mean(chunk_probs) if agg == "mean" else np.median(chunk_probs))
    label = "AI" if p_ai >= ai_threshold else "HUMAN"
    conf = p_ai if label == "AI" else 1.0 - p_ai
    return label, conf, {
        "p_ai": p_ai,
        "chunks": len(pieces),
        "threshold": ai_threshold,
        "agg": agg,
    }
# -------------------------------------------------
# Gradio App
# -------------------------------------------------
def create_app(model_data):
    """Build the Gradio Blocks UI around a loaded model bundle.

    model_data: dict returned by load_embedding_model(); the keys read here
    are "normalize_embeddings_default", "resolved_embedding_model_name" and
    "resolved_embedding_dim", plus whatever predict_with_threshold consumes.
    Returns the (unlaunched) gr.Blocks demo.

    BUGFIX: the original details string formatted meta.get('p_ai', '?') with
    :.4f — on the UNKNOWN/ERROR paths meta has no 'p_ai' key, so formatting
    the '?' string raised ValueError and the UI showed a stack trace instead
    of a result. p_ai is now formatted safely before interpolation.
    """
    with gr.Blocks(title="Embedding-based Human vs AI Detector") as demo:
        gr.Markdown("## π€π€ Human vs AI Detector (Embedding-based)")
        gr.Markdown(
            "Transformer-friendly pipeline: **no stopword removal / lemmatization**, "
            "**chunking** for long texts, **thresholded** decision."
        )
        with gr.Row():
            inp = gr.Textbox(label="Enter English text", lines=10, placeholder="Paste text here...")
        with gr.Row():
            thr = gr.Slider(minimum=0.50, maximum=0.90, value=0.70, step=0.01,
                            label="AI threshold (p_AI β₯ threshold β AI)")
            norm = gr.Checkbox(
                value=model_data.get("normalize_embeddings_default", True),
                label="Normalize embeddings (match training setting)"
            )
        with gr.Row():
            agg = gr.Dropdown(choices=["mean", "median"], value="mean", label="Aggregate across chunks")
        out = gr.Markdown()
        details = gr.Markdown()

        def _predict_ui(text, threshold, normalize_embeddings, agg_mode):
            # Bridge widget values to predict_with_threshold and render the
            # result as two Markdown strings (headline + details).
            label, conf, meta = predict_with_threshold(
                text, model_data,
                ai_threshold=float(threshold),
                normalize_flag=bool(normalize_embeddings),
                agg=agg_mode
            )
            if label == "AI":
                headline = f"π€ **AI Generated** (Conf: {conf:.1%})"
            elif label == "HUMAN":
                headline = f"π€ **Human Written** (Conf: {conf:.1%})"
            elif label == "ERROR":
                headline = f"β Error: {meta.get('error', 'Unknown')}"
            else:
                headline = f"β {label} (Conf: {conf:.1%})"
            # Format p_ai only when present; the UNKNOWN/ERROR metas carry no
            # 'p_ai' key and '?':.4f is a ValueError.
            p_ai_txt = f"{meta['p_ai']:.4f}" if "p_ai" in meta else "?"
            det = (
                f"- p(AI): {p_ai_txt}\n"
                f"- Chunks: {meta.get('chunks','?')}\n"
                f"- Threshold: {meta.get('threshold','?')}\n"
                f"- Aggregate: {meta.get('agg','?')}\n"
                f"- Embedder: {model_data['resolved_embedding_model_name']} (dim={model_data['resolved_embedding_dim']})"
            )
            return headline, det

        # Enter key in the textbox and the button both trigger prediction.
        inp.submit(_predict_ui, [inp, thr, norm, agg], [out, details])
        gr.Button("π Predict").click(_predict_ui, [inp, thr, norm, agg], [out, details])
    return demo
# -------------------------------------------------
# Load + Launch
# -------------------------------------------------
# Module-level init so platforms that import this file (e.g. Hugging Face
# Spaces) find a ready `demo`; the Hub download and model load run once here.
_model_data = load_embedding_model()
demo = create_app(_model_data)

if __name__ == "__main__":
    # Pass share=True if you need a public URL
    demo.launch()