Mrkomiljon committed on
Commit
324e34e
·
verified ·
1 Parent(s): c9ee224

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +144 -82
app.py CHANGED
@@ -1,6 +1,7 @@
1
  # app.py
2
  import os
3
  import re
 
4
  import joblib
5
  import torch
6
  import gradio as gr
@@ -8,9 +9,6 @@ import numpy as np
8
  import pandas as pd
9
  import warnings
10
  import nltk
11
- from nltk.corpus import stopwords
12
- from nltk.tokenize import word_tokenize
13
- from nltk.stem import WordNetLemmatizer
14
  from sentence_transformers import SentenceTransformer
15
  from huggingface_hub import hf_hub_download
16
 
@@ -24,21 +22,18 @@ FILENAME = "complete_trained_model_lite.joblib"
24
  REPO_TYPE = "model"
25
 
26
  # -------------------------------------------------
27
- # Force 768-dim embedder (MPNet)
28
  # -------------------------------------------------
29
  FORCED_EMBEDDER = "sentence-transformers/all-mpnet-base-v2"
30
  FORCED_DIM = 768
31
 
32
  # -------------------------------------------------
33
- # Ensure NLTK dependencies
34
  # -------------------------------------------------
35
  def ensure_nltk():
36
  resources = {
37
  "punkt": "tokenizers/punkt",
38
- # newer nltk introduces punkt_tab; harmless to try
39
- "punkt_tab": "tokenizers/punkt_tab/english",
40
- "stopwords": "corpora/stopwords",
41
- "wordnet": "corpora/wordnet",
42
  }
43
  for pkg, path in resources.items():
44
  try:
@@ -52,22 +47,39 @@ def ensure_nltk():
52
  ensure_nltk()
53
 
54
  # -------------------------------------------------
55
- # Helper functions
 
56
  # -------------------------------------------------
57
- def _to_stopword_set(sw):
58
- if sw is None:
59
- return set(stopwords.words("english"))
60
- if isinstance(sw, set):
61
- return sw
62
- if isinstance(sw, (list, tuple)):
63
- return set(sw)
64
- try:
65
- return set(sw)
66
- except Exception:
67
- return set(stopwords.words("english"))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
 
69
  # -------------------------------------------------
70
- # Load model bundle + forced 768-dim embedder
71
  # -------------------------------------------------
72
  def load_embedding_model():
73
  path = hf_hub_download(
@@ -79,20 +91,18 @@ def load_embedding_model():
79
  print(f"βœ… Downloaded model from Hugging Face: {FILENAME}")
80
 
81
  data = joblib.load(path)
82
-
83
- device = "cuda" if torch.cuda.is_available() else "cpu"
84
  clf = data.get("model")
85
  if clf is None:
86
  raise RuntimeError("Model file does not contain 'model' key.")
87
 
88
- # --- Always use 768-dim MPNet ---
89
  print(f"πŸ”§ Loading 768-dim embedder: {FORCED_EMBEDDER} on {device}")
90
  embedding_model = SentenceTransformer(FORCED_EMBEDDER, device=device)
91
  actual_dim = embedding_model.get_sentence_embedding_dimension()
92
  if actual_dim != FORCED_DIM:
93
  raise RuntimeError(f"Loaded embedder dim={actual_dim}, expected {FORCED_DIM}")
94
 
95
- # --- Classifier sanity check (must be trained on 768) ---
96
  clf_dim = getattr(clf, "n_features_in_", None)
97
  if clf_dim and clf_dim != FORCED_DIM:
98
  raise RuntimeError(
@@ -100,60 +110,90 @@ def load_embedding_model():
100
  f"Please retrain or load a 768-dim trained classifier."
101
  )
102
 
103
- # finalize
104
- data["embedding_model"] = embedding_model
105
- data["resolved_embedding_model_name"] = FORCED_EMBEDDER
106
- data["resolved_embedding_dim"] = actual_dim
107
- data["device"] = device
108
- data["lemmatizer"] = data.get("lemmatizer") or WordNetLemmatizer()
109
- data["stop_words"] = _to_stopword_set(data.get("stop_words"))
110
- data["max_tokens"] = data.get("max_tokens", 600)
 
 
 
 
 
111
 
112
- print(f"βœ… Using embedder: {FORCED_EMBEDDER} (dim={actual_dim}) β€” classifier expects {getattr(clf,'n_features_in_','unknown')}")
113
- return data
 
 
 
114
 
115
  # -------------------------------------------------
116
- # Preprocessing
117
  # -------------------------------------------------
118
- def preprocess_text(text, lemmatizer, stop_words, max_tokens=600):
119
- if pd.isna(text) or not str(text).strip():
120
- return ""
121
- text = str(text).lower()
122
- text = re.sub(r"[^a-zA-Z\s]", " ", text)
123
- tokens = [
124
- lemmatizer.lemmatize(tok)
125
- for tok in word_tokenize(text)
126
- if tok not in stop_words and len(tok) > 2
127
- ]
128
- return " ".join(tokens[:max_tokens])
129
 
130
- # -------------------------------------------------
131
- # Prediction
132
- # -------------------------------------------------
133
- def predict_text(text, model_data):
134
- proc = preprocess_text(text, model_data["lemmatizer"], model_data["stop_words"], model_data["max_tokens"])
 
 
 
135
  if not proc:
136
  return "UNKNOWN", 0.0, {"error": "Empty text after preprocessing"}
137
 
138
- with torch.no_grad():
139
- emb = model_data["embedding_model"].encode(
140
- [proc], convert_to_numpy=True, normalize_embeddings=False
141
- )
142
- if emb.ndim == 1:
143
- emb = emb.reshape(1, -1)
144
 
145
  clf = model_data["model"]
146
- need = getattr(clf, "n_features_in_", emb.shape[1])
147
- if emb.shape[1] != need:
148
- return "ERROR", 0.0, {"error": f"Embedding dim {emb.shape[1]} != classifier requires {need}"}
 
 
 
 
 
 
 
 
 
 
 
 
 
149
 
150
- try:
151
- pred = clf.predict(emb)[0]
152
- conf = float(np.max(clf.predict_proba(emb)[0])) if hasattr(clf, "predict_proba") else 0.5
153
- except Exception as e:
154
- return "ERROR", 0.0, {"error": str(e)}
 
 
 
 
 
 
155
 
156
- return str(pred), conf, {"tokens": len(proc.split())}
 
 
 
 
 
157
 
158
  # -------------------------------------------------
159
  # Gradio App
@@ -161,32 +201,54 @@ def predict_text(text, model_data):
161
  def create_app(model_data):
162
  with gr.Blocks(title="Embedding-based Human vs AI Detector") as demo:
163
  gr.Markdown("## πŸ€–πŸ‘€ Human vs AI Detector (Embedding-based)")
164
- inp = gr.Textbox(label="Matn kiriting", lines=6, placeholder="Enter text...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
  out = gr.Markdown()
166
  details = gr.Markdown()
167
 
168
- def _predict_ui(text):
169
- label, conf, meta = predict_text(text, model_data)
170
- if label.upper() == "AI":
 
 
 
 
 
171
  headline = f"πŸ€– **AI Generated** (Conf: {conf:.1%})"
172
- elif label.upper() == "HUMAN":
173
  headline = f"πŸ‘€ **Human Written** (Conf: {conf:.1%})"
174
- elif label.upper() == "ERROR":
175
  headline = f"❌ Error: {meta.get('error', 'Unknown')}"
176
- elif label.upper() == "UNKNOWN":
177
- headline = f"❓ Unknown (Conf: {conf:.1%})"
178
  else:
179
  headline = f"❓ {label} (Conf: {conf:.1%})"
180
 
181
  det = (
182
- f"- Tokens: {meta.get('tokens','?')}\n"
183
- f"- Embedding: {model_data['resolved_embedding_model_name']} "
184
- f"(dim={model_data['resolved_embedding_dim']})"
 
 
185
  )
186
  return headline, det
187
 
188
- inp.submit(_predict_ui, inp, [out, details])
189
- gr.Button("πŸ” Predict").click(_predict_ui, inp, [out, details])
 
190
  return demo
191
 
192
  # -------------------------------------------------
@@ -196,5 +258,5 @@ _model_data = load_embedding_model()
196
  demo = create_app(_model_data)
197
 
198
  if __name__ == "__main__":
199
- # You can pass share=True if you need a public URL
200
  demo.launch()
 
1
  # app.py
2
  import os
3
  import re
4
+ import unicodedata
5
  import joblib
6
  import torch
7
  import gradio as gr
 
9
  import pandas as pd
10
  import warnings
11
  import nltk
 
 
 
12
  from sentence_transformers import SentenceTransformer
13
  from huggingface_hub import hf_hub_download
14
 
 
22
  REPO_TYPE = "model"
23
 
24
  # -------------------------------------------------
25
+ # Force 768-dim embedder (MPNet; English-optimized)
26
  # -------------------------------------------------
27
  FORCED_EMBEDDER = "sentence-transformers/all-mpnet-base-v2"
28
  FORCED_DIM = 768
29
 
30
  # -------------------------------------------------
31
+ # Ensure NLTK deps (safe no-ops if already present)
32
  # -------------------------------------------------
33
  def ensure_nltk():
34
  resources = {
35
  "punkt": "tokenizers/punkt",
36
+ "punkt_tab": "tokenizers/punkt_tab/english", # ok if missing on older NLTK
 
 
 
37
  }
38
  for pkg, path in resources.items():
39
  try:
 
47
  ensure_nltk()
48
 
49
  # -------------------------------------------------
50
+ # Minimal preprocessing for Transformer embeddings
51
+ # (DO NOT remove stopwords/lemmatize β€” keep raw text)
52
  # -------------------------------------------------
53
def preprocess_text(text: str, max_chars: int = 100000) -> str:
    """Light-touch cleanup suited to Transformer embedders.

    Steps: NFKC Unicode normalization, whitespace strip, lowercasing,
    then a hard character cap so huge pastes stay cheap to tokenize.
    No stopword removal or lemmatization is performed.

    Returns "" for NaN/None-like input.
    """
    if pd.isna(text):
        return ""
    cleaned = unicodedata.normalize("NFKC", str(text)).strip().lower()
    # Cap only after normalization so the limit applies to the final text.
    return cleaned if len(cleaned) <= max_chars else cleaned[:max_chars]
69
+
70
def chunk_by_words(text: str, words_per_chunk: int = 350):
    """Split *text* into chunks of at most ``words_per_chunk``
    whitespace-separated words; returns [] for blank input."""
    tokens = text.split()
    return [
        " ".join(tokens[start:start + words_per_chunk])
        for start in range(0, len(tokens), words_per_chunk)
    ]
80
 
81
  # -------------------------------------------------
82
+ # Load classifier + embedder (forced 768-dim)
83
  # -------------------------------------------------
84
  def load_embedding_model():
85
  path = hf_hub_download(
 
91
  print(f"βœ… Downloaded model from Hugging Face: {FILENAME}")
92
 
93
  data = joblib.load(path)
 
 
94
  clf = data.get("model")
95
  if clf is None:
96
  raise RuntimeError("Model file does not contain 'model' key.")
97
 
98
+ device = "cuda" if torch.cuda.is_available() else "cpu"
99
  print(f"πŸ”§ Loading 768-dim embedder: {FORCED_EMBEDDER} on {device}")
100
  embedding_model = SentenceTransformer(FORCED_EMBEDDER, device=device)
101
  actual_dim = embedding_model.get_sentence_embedding_dimension()
102
  if actual_dim != FORCED_DIM:
103
  raise RuntimeError(f"Loaded embedder dim={actual_dim}, expected {FORCED_DIM}")
104
 
105
+ # Classifier sanity check
106
  clf_dim = getattr(clf, "n_features_in_", None)
107
  if clf_dim and clf_dim != FORCED_DIM:
108
  raise RuntimeError(
 
110
  f"Please retrain or load a 768-dim trained classifier."
111
  )
112
 
113
+ # finalize model_data dict
114
+ model_data = {
115
+ "model": clf,
116
+ "embedding_model": embedding_model,
117
+ "resolved_embedding_model_name": FORCED_EMBEDDER,
118
+ "resolved_embedding_dim": actual_dim,
119
+ "device": device,
120
+ # UI defaults
121
+ "max_chars": int(data.get("max_chars", 100000)),
122
+ "words_per_chunk": int(data.get("words_per_chunk", 350)),
123
+ # remember training-time normalize flag if you stored it; default True
124
+ "normalize_embeddings_default": bool(data.get("normalize_embeddings", True)),
125
+ }
126
 
127
+ classes = getattr(clf, "classes_", None)
128
+ print(f"βœ… Using embedder: {FORCED_EMBEDDER} (dim={actual_dim}) β€” "
129
+ f"classifier expects {getattr(clf,'n_features_in_','unknown')}, "
130
+ f"classes={classes}")
131
+ return model_data
132
 
133
  # -------------------------------------------------
134
+ # Prediction with threshold + chunking
135
  # -------------------------------------------------
136
+ def _infer_ai_index(clf) -> int:
137
+ classes = [str(c).upper() for c in getattr(clf, "classes_", [])]
138
+ if "AI" in classes:
139
+ return classes.index("AI")
140
+ # common fallback: binary {0,1} where 1=AI
141
+ if set(classes) == {"0", "1"}:
142
+ return classes.index("1")
143
+ # last resort: assume last class is AI
144
+ return len(classes) - 1 if classes else 0
 
 
145
 
146
def predict_with_threshold(
    text: str,
    model_data: dict,
    ai_threshold: float = 0.70,
    normalize_flag: bool = True,
    agg: str = "mean",  # "mean" or "median"
):
    """Classify *text* as AI/HUMAN by aggregating chunk-level AI probabilities.

    Returns ``(label, confidence, meta)``. Label is "AI" when the aggregated
    p(AI) reaches *ai_threshold*, else "HUMAN"; "UNKNOWN"/"ERROR" results
    carry an 'error' key in *meta* instead of the usual statistics.
    """
    proc = preprocess_text(text, max_chars=model_data.get("max_chars", 100000))
    if not proc:
        return "UNKNOWN", 0.0, {"error": "Empty text after preprocessing"}

    chunks = chunk_by_words(proc, words_per_chunk=model_data.get("words_per_chunk", 350))
    if not chunks:
        return "UNKNOWN", 0.0, {"error": "Empty after chunking"}

    clf = model_data["model"]
    ai_idx = _infer_ai_index(clf)

    # PERF: encode all chunks in ONE batched call instead of one model
    # invocation per chunk — SentenceTransformer batches internally and the
    # resulting embeddings are the same.
    with torch.no_grad():
        emb = model_data["embedding_model"].encode(
            chunks, convert_to_numpy=True, normalize_embeddings=normalize_flag
        )
    emb = np.atleast_2d(np.asarray(emb))

    # Dimension sanity check needs to run only once for the whole batch.
    need = getattr(clf, "n_features_in_", emb.shape[1])
    if emb.shape[1] != need:
        return "ERROR", 0.0, {
            "error": f"Embedding dim {emb.shape[1]} != classifier requires {need}"
        }

    # ROBUSTNESS: surface classifier failures as an ERROR result (as the
    # previous predict_text did) instead of letting the exception escape
    # into the UI callback.
    try:
        if hasattr(clf, "predict_proba"):
            p_ai_list = [float(row[ai_idx]) for row in clf.predict_proba(emb)]
        else:
            # fallback if no proba: convert predicted labels to pseudo-probas
            p_ai_list = [1.0 if str(p).upper() == "AI" else 0.0 for p in clf.predict(emb)]
    except Exception as e:
        return "ERROR", 0.0, {"error": str(e)}

    p_ai = float(np.mean(p_ai_list) if agg == "mean" else np.median(p_ai_list))
    label = "AI" if p_ai >= ai_threshold else "HUMAN"
    conf = p_ai if label == "AI" else 1.0 - p_ai

    return label, conf, {
        "p_ai": p_ai,
        "chunks": len(chunks),
        "threshold": ai_threshold,
        "agg": agg,
    }
197
 
198
  # -------------------------------------------------
199
  # Gradio App
 
201
def create_app(model_data):
    """Build the Gradio UI around the loaded classifier/embedder bundle.

    Exposes threshold, embedding-normalization and chunk-aggregation
    controls, and renders a headline plus a detail breakdown.
    """
    with gr.Blocks(title="Embedding-based Human vs AI Detector") as demo:
        gr.Markdown("## 🤖👤 Human vs AI Detector (Embedding-based)")
        gr.Markdown(
            "Transformer-friendly pipeline: **no stopword removal / lemmatization**, "
            "**chunking** for long texts, **thresholded** decision."
        )

        with gr.Row():
            inp = gr.Textbox(label="Enter English text", lines=10, placeholder="Paste text here...")
        with gr.Row():
            thr = gr.Slider(minimum=0.50, maximum=0.90, value=0.70, step=0.01,
                            label="AI threshold (p_AI ≥ threshold → AI)")
            norm = gr.Checkbox(
                value=model_data.get("normalize_embeddings_default", True),
                label="Normalize embeddings (match training setting)"
            )
        with gr.Row():
            agg = gr.Dropdown(choices=["mean", "median"], value="mean", label="Aggregate across chunks")

        out = gr.Markdown()
        details = gr.Markdown()

        def _predict_ui(text, threshold, normalize_embeddings, agg_mode):
            # Run the thresholded, chunked prediction with the UI settings.
            label, conf, meta = predict_with_threshold(
                text, model_data,
                ai_threshold=float(threshold),
                normalize_flag=bool(normalize_embeddings),
                agg=agg_mode
            )
            if label == "AI":
                headline = f"🤖 **AI Generated** (Conf: {conf:.1%})"
            elif label == "HUMAN":
                headline = f"👤 **Human Written** (Conf: {conf:.1%})"
            elif label == "ERROR":
                headline = f"❌ Error: {meta.get('error', 'Unknown')}"
            else:
                headline = f"❓ {label} (Conf: {conf:.1%})"

            # BUG FIX: on ERROR/UNKNOWN results meta has no 'p_ai', and
            # formatting the '?' string fallback with ':.4f' raised
            # ValueError. Apply the float format only to numeric values.
            p_ai = meta.get("p_ai")
            p_ai_text = f"{p_ai:.4f}" if isinstance(p_ai, (int, float)) else "?"
            det = (
                f"- p(AI): {p_ai_text}\n"
                f"- Chunks: {meta.get('chunks','?')}\n"
                f"- Threshold: {meta.get('threshold','?')}\n"
                f"- Aggregate: {meta.get('agg','?')}\n"
                f"- Embedder: {model_data['resolved_embedding_model_name']} (dim={model_data['resolved_embedding_dim']})"
            )
            return headline, det

        inp.submit(_predict_ui, [inp, thr, norm, agg], [out, details])
        gr.Button("🔍 Predict").click(_predict_ui, [inp, thr, norm, agg], [out, details])

    return demo
253
 
254
  # -------------------------------------------------
 
258
  demo = create_app(_model_data)
259
 
260
  if __name__ == "__main__":
261
+ # Pass share=True if you need a public URL
262
  demo.launch()