Mrkomiljon committed on
Commit
324e34e
·
verified ·
1 Parent(s): c9ee224

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +144 -82
app.py CHANGED
@@ -1,6 +1,7 @@
1
  # app.py
2
  import os
3
  import re
 
4
  import joblib
5
  import torch
6
  import gradio as gr
@@ -8,9 +9,6 @@ import numpy as np
8
  import pandas as pd
9
  import warnings
10
  import nltk
11
- from nltk.corpus import stopwords
12
- from nltk.tokenize import word_tokenize
13
- from nltk.stem import WordNetLemmatizer
14
  from sentence_transformers import SentenceTransformer
15
  from huggingface_hub import hf_hub_download
16
 
@@ -24,21 +22,18 @@ FILENAME = "complete_trained_model_lite.joblib"
24
  REPO_TYPE = "model"
25
 
26
  # -------------------------------------------------
27
- # Force 768-dim embedder (MPNet)
28
  # -------------------------------------------------
29
  FORCED_EMBEDDER = "sentence-transformers/all-mpnet-base-v2"
30
  FORCED_DIM = 768
31
 
32
  # -------------------------------------------------
33
- # Ensure NLTK dependencies
34
  # -------------------------------------------------
35
  def ensure_nltk():
36
  resources = {
37
  "punkt": "tokenizers/punkt",
38
- # newer nltk introduces punkt_tab; harmless to try
39
- "punkt_tab": "tokenizers/punkt_tab/english",
40
- "stopwords": "corpora/stopwords",
41
- "wordnet": "corpora/wordnet",
42
  }
43
  for pkg, path in resources.items():
44
  try:
@@ -52,22 +47,39 @@ def ensure_nltk():
52
  ensure_nltk()
53
 
54
  # -------------------------------------------------
55
- # Helper functions
 
56
  # -------------------------------------------------
57
- def _to_stopword_set(sw):
58
- if sw is None:
59
- return set(stopwords.words("english"))
60
- if isinstance(sw, set):
61
- return sw
62
- if isinstance(sw, (list, tuple)):
63
- return set(sw)
64
- try:
65
- return set(sw)
66
- except Exception:
67
- return set(stopwords.words("english"))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
 
69
  # -------------------------------------------------
70
- # Load model bundle + forced 768-dim embedder
71
  # -------------------------------------------------
72
  def load_embedding_model():
73
  path = hf_hub_download(
@@ -79,20 +91,18 @@ def load_embedding_model():
79
  print(f"βœ… Downloaded model from Hugging Face: {FILENAME}")
80
 
81
  data = joblib.load(path)
82
-
83
- device = "cuda" if torch.cuda.is_available() else "cpu"
84
  clf = data.get("model")
85
  if clf is None:
86
  raise RuntimeError("Model file does not contain 'model' key.")
87
 
88
- # --- Always use 768-dim MPNet ---
89
  print(f"πŸ”§ Loading 768-dim embedder: {FORCED_EMBEDDER} on {device}")
90
  embedding_model = SentenceTransformer(FORCED_EMBEDDER, device=device)
91
  actual_dim = embedding_model.get_sentence_embedding_dimension()
92
  if actual_dim != FORCED_DIM:
93
  raise RuntimeError(f"Loaded embedder dim={actual_dim}, expected {FORCED_DIM}")
94
 
95
- # --- Classifier sanity check (must be trained on 768) ---
96
  clf_dim = getattr(clf, "n_features_in_", None)
97
  if clf_dim and clf_dim != FORCED_DIM:
98
  raise RuntimeError(
@@ -100,60 +110,90 @@ def load_embedding_model():
100
  f"Please retrain or load a 768-dim trained classifier."
101
  )
102
 
103
- # finalize
104
- data["embedding_model"] = embedding_model
105
- data["resolved_embedding_model_name"] = FORCED_EMBEDDER
106
- data["resolved_embedding_dim"] = actual_dim
107
- data["device"] = device
108
- data["lemmatizer"] = data.get("lemmatizer") or WordNetLemmatizer()
109
- data["stop_words"] = _to_stopword_set(data.get("stop_words"))
110
- data["max_tokens"] = data.get("max_tokens", 600)
 
 
 
 
 
111
 
112
- print(f"βœ… Using embedder: {FORCED_EMBEDDER} (dim={actual_dim}) β€” classifier expects {getattr(clf,'n_features_in_','unknown')}")
113
- return data
 
 
 
114
 
115
  # -------------------------------------------------
116
- # Preprocessing
117
  # -------------------------------------------------
118
- def preprocess_text(text, lemmatizer, stop_words, max_tokens=600):
119
- if pd.isna(text) or not str(text).strip():
120
- return ""
121
- text = str(text).lower()
122
- text = re.sub(r"[^a-zA-Z\s]", " ", text)
123
- tokens = [
124
- lemmatizer.lemmatize(tok)
125
- for tok in word_tokenize(text)
126
- if tok not in stop_words and len(tok) > 2
127
- ]
128
- return " ".join(tokens[:max_tokens])
129
 
130
- # -------------------------------------------------
131
- # Prediction
132
- # -------------------------------------------------
133
- def predict_text(text, model_data):
134
- proc = preprocess_text(text, model_data["lemmatizer"], model_data["stop_words"], model_data["max_tokens"])
 
 
 
135
  if not proc:
136
  return "UNKNOWN", 0.0, {"error": "Empty text after preprocessing"}
137
 
138
- with torch.no_grad():
139
- emb = model_data["embedding_model"].encode(
140
- [proc], convert_to_numpy=True, normalize_embeddings=False
141
- )
142
- if emb.ndim == 1:
143
- emb = emb.reshape(1, -1)
144
 
145
  clf = model_data["model"]
146
- need = getattr(clf, "n_features_in_", emb.shape[1])
147
- if emb.shape[1] != need:
148
- return "ERROR", 0.0, {"error": f"Embedding dim {emb.shape[1]} != classifier requires {need}"}
 
 
 
 
 
 
 
 
 
 
 
 
 
149
 
150
- try:
151
- pred = clf.predict(emb)[0]
152
- conf = float(np.max(clf.predict_proba(emb)[0])) if hasattr(clf, "predict_proba") else 0.5
153
- except Exception as e:
154
- return "ERROR", 0.0, {"error": str(e)}
 
 
 
 
 
 
155
 
156
- return str(pred), conf, {"tokens": len(proc.split())}
 
 
 
 
 
157
 
158
  # -------------------------------------------------
159
  # Gradio App
@@ -161,32 +201,54 @@ def predict_text(text, model_data):
161
  def create_app(model_data):
162
  with gr.Blocks(title="Embedding-based Human vs AI Detector") as demo:
163
  gr.Markdown("## πŸ€–πŸ‘€ Human vs AI Detector (Embedding-based)")
164
- inp = gr.Textbox(label="Matn kiriting", lines=6, placeholder="Enter text...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
  out = gr.Markdown()
166
  details = gr.Markdown()
167
 
168
- def _predict_ui(text):
169
- label, conf, meta = predict_text(text, model_data)
170
- if label.upper() == "AI":
 
 
 
 
 
171
  headline = f"πŸ€– **AI Generated** (Conf: {conf:.1%})"
172
- elif label.upper() == "HUMAN":
173
  headline = f"πŸ‘€ **Human Written** (Conf: {conf:.1%})"
174
- elif label.upper() == "ERROR":
175
  headline = f"❌ Error: {meta.get('error', 'Unknown')}"
176
- elif label.upper() == "UNKNOWN":
177
- headline = f"❓ Unknown (Conf: {conf:.1%})"
178
  else:
179
  headline = f"❓ {label} (Conf: {conf:.1%})"
180
 
181
  det = (
182
- f"- Tokens: {meta.get('tokens','?')}\n"
183
- f"- Embedding: {model_data['resolved_embedding_model_name']} "
184
- f"(dim={model_data['resolved_embedding_dim']})"
 
 
185
  )
186
  return headline, det
187
 
188
- inp.submit(_predict_ui, inp, [out, details])
189
- gr.Button("πŸ” Predict").click(_predict_ui, inp, [out, details])
 
190
  return demo
191
 
192
  # -------------------------------------------------
@@ -196,5 +258,5 @@ _model_data = load_embedding_model()
196
  demo = create_app(_model_data)
197
 
198
  if __name__ == "__main__":
199
- # You can pass share=True if you need a public URL
200
  demo.launch()
 
1
  # app.py
2
  import os
3
  import re
4
+ import unicodedata
5
  import joblib
6
  import torch
7
  import gradio as gr
 
9
  import pandas as pd
10
  import warnings
11
  import nltk
 
 
 
12
  from sentence_transformers import SentenceTransformer
13
  from huggingface_hub import hf_hub_download
14
 
 
22
  REPO_TYPE = "model"
23
 
24
  # -------------------------------------------------
25
+ # Force 768-dim embedder (MPNet; English-optimized)
26
  # -------------------------------------------------
27
  FORCED_EMBEDDER = "sentence-transformers/all-mpnet-base-v2"
28
  FORCED_DIM = 768
29
 
30
  # -------------------------------------------------
31
+ # Ensure NLTK deps (safe no-ops if already present)
32
  # -------------------------------------------------
33
  def ensure_nltk():
34
  resources = {
35
  "punkt": "tokenizers/punkt",
36
+ "punkt_tab": "tokenizers/punkt_tab/english", # ok if missing on older NLTK
 
 
 
37
  }
38
  for pkg, path in resources.items():
39
  try:
 
47
  ensure_nltk()
48
 
49
  # -------------------------------------------------
50
+ # Minimal preprocessing for Transformer embeddings
51
+ # (DO NOT remove stopwords/lemmatize β€” keep raw text)
52
  # -------------------------------------------------
53
def preprocess_text(text: str, max_chars: int = 100000) -> str:
    """Light-touch cleanup suited to Transformer embedders.

    Steps: NFKC Unicode normalization, whitespace strip, lowercasing,
    then a hard character cap so huge pastes stay cheap to tokenize.
    No stopword removal or lemmatization is performed.

    Returns "" for NaN/None-like input.
    """
    if pd.isna(text):
        return ""
    cleaned = unicodedata.normalize("NFKC", str(text)).strip().lower()
    # Cap only after normalization so the limit applies to the final text.
    return cleaned if len(cleaned) <= max_chars else cleaned[:max_chars]
69
+
70
def chunk_by_words(text: str, words_per_chunk: int = 350):
    """Split *text* into chunks of at most ``words_per_chunk``
    whitespace-separated words; returns [] for blank input."""
    tokens = text.split()
    return [
        " ".join(tokens[start:start + words_per_chunk])
        for start in range(0, len(tokens), words_per_chunk)
    ]
80
 
81
  # -------------------------------------------------
82
+ # Load classifier + embedder (forced 768-dim)
83
  # -------------------------------------------------
84
  def load_embedding_model():
85
  path = hf_hub_download(
 
91
  print(f"βœ… Downloaded model from Hugging Face: {FILENAME}")
92
 
93
  data = joblib.load(path)
 
 
94
  clf = data.get("model")
95
  if clf is None:
96
  raise RuntimeError("Model file does not contain 'model' key.")
97
 
98
+ device = "cuda" if torch.cuda.is_available() else "cpu"
99
  print(f"πŸ”§ Loading 768-dim embedder: {FORCED_EMBEDDER} on {device}")
100
  embedding_model = SentenceTransformer(FORCED_EMBEDDER, device=device)
101
  actual_dim = embedding_model.get_sentence_embedding_dimension()
102
  if actual_dim != FORCED_DIM:
103
  raise RuntimeError(f"Loaded embedder dim={actual_dim}, expected {FORCED_DIM}")
104
 
105
+ # Classifier sanity check
106
  clf_dim = getattr(clf, "n_features_in_", None)
107
  if clf_dim and clf_dim != FORCED_DIM:
108
  raise RuntimeError(
 
110
  f"Please retrain or load a 768-dim trained classifier."
111
  )
112
 
113
+ # finalize model_data dict
114
+ model_data = {
115
+ "model": clf,
116
+ "embedding_model": embedding_model,
117
+ "resolved_embedding_model_name": FORCED_EMBEDDER,
118
+ "resolved_embedding_dim": actual_dim,
119
+ "device": device,
120
+ # UI defaults
121
+ "max_chars": int(data.get("max_chars", 100000)),
122
+ "words_per_chunk": int(data.get("words_per_chunk", 350)),
123
+ # remember training-time normalize flag if you stored it; default True
124
+ "normalize_embeddings_default": bool(data.get("normalize_embeddings", True)),
125
+ }
126
 
127
+ classes = getattr(clf, "classes_", None)
128
+ print(f"βœ… Using embedder: {FORCED_EMBEDDER} (dim={actual_dim}) β€” "
129
+ f"classifier expects {getattr(clf,'n_features_in_','unknown')}, "
130
+ f"classes={classes}")
131
+ return model_data
132
 
133
  # -------------------------------------------------
134
+ # Prediction with threshold + chunking
135
  # -------------------------------------------------
136
+ def _infer_ai_index(clf) -> int:
137
+ classes = [str(c).upper() for c in getattr(clf, "classes_", [])]
138
+ if "AI" in classes:
139
+ return classes.index("AI")
140
+ # common fallback: binary {0,1} where 1=AI
141
+ if set(classes) == {"0", "1"}:
142
+ return classes.index("1")
143
+ # last resort: assume last class is AI
144
+ return len(classes) - 1 if classes else 0
 
 
145
 
146
def predict_with_threshold(
    text: str,
    model_data: dict,
    ai_threshold: float = 0.70,
    normalize_flag: bool = True,
    agg: str = "mean",  # "mean" or "median"
):
    """Classify *text* as AI/HUMAN by aggregating chunk-level AI probabilities.

    Returns ``(label, confidence, meta)``. Label is "AI" when the aggregated
    p(AI) reaches *ai_threshold*, else "HUMAN"; "UNKNOWN"/"ERROR" results
    carry an 'error' key in *meta* instead of the usual statistics.
    """
    proc = preprocess_text(text, max_chars=model_data.get("max_chars", 100000))
    if not proc:
        return "UNKNOWN", 0.0, {"error": "Empty text after preprocessing"}

    chunks = chunk_by_words(proc, words_per_chunk=model_data.get("words_per_chunk", 350))
    if not chunks:
        return "UNKNOWN", 0.0, {"error": "Empty after chunking"}

    clf = model_data["model"]
    ai_idx = _infer_ai_index(clf)

    # PERF: encode all chunks in ONE batched call instead of one model
    # invocation per chunk — SentenceTransformer batches internally and the
    # resulting embeddings are the same.
    with torch.no_grad():
        emb = model_data["embedding_model"].encode(
            chunks, convert_to_numpy=True, normalize_embeddings=normalize_flag
        )
    emb = np.atleast_2d(np.asarray(emb))

    # Dimension sanity check needs to run only once for the whole batch.
    need = getattr(clf, "n_features_in_", emb.shape[1])
    if emb.shape[1] != need:
        return "ERROR", 0.0, {
            "error": f"Embedding dim {emb.shape[1]} != classifier requires {need}"
        }

    # ROBUSTNESS: surface classifier failures as an ERROR result (as the
    # previous predict_text did) instead of letting the exception escape
    # into the UI callback.
    try:
        if hasattr(clf, "predict_proba"):
            p_ai_list = [float(row[ai_idx]) for row in clf.predict_proba(emb)]
        else:
            # fallback if no proba: convert predicted labels to pseudo-probas
            p_ai_list = [1.0 if str(p).upper() == "AI" else 0.0 for p in clf.predict(emb)]
    except Exception as e:
        return "ERROR", 0.0, {"error": str(e)}

    p_ai = float(np.mean(p_ai_list) if agg == "mean" else np.median(p_ai_list))
    label = "AI" if p_ai >= ai_threshold else "HUMAN"
    conf = p_ai if label == "AI" else 1.0 - p_ai

    return label, conf, {
        "p_ai": p_ai,
        "chunks": len(chunks),
        "threshold": ai_threshold,
        "agg": agg,
    }
197
 
198
  # -------------------------------------------------
199
  # Gradio App
 
201
def create_app(model_data):
    """Build the Gradio UI around the loaded classifier/embedder bundle.

    Exposes threshold, embedding-normalization and chunk-aggregation
    controls, and renders a headline plus a detail breakdown.
    """
    with gr.Blocks(title="Embedding-based Human vs AI Detector") as demo:
        gr.Markdown("## 🤖👤 Human vs AI Detector (Embedding-based)")
        gr.Markdown(
            "Transformer-friendly pipeline: **no stopword removal / lemmatization**, "
            "**chunking** for long texts, **thresholded** decision."
        )

        with gr.Row():
            inp = gr.Textbox(label="Enter English text", lines=10, placeholder="Paste text here...")
        with gr.Row():
            thr = gr.Slider(minimum=0.50, maximum=0.90, value=0.70, step=0.01,
                            label="AI threshold (p_AI ≥ threshold → AI)")
            norm = gr.Checkbox(
                value=model_data.get("normalize_embeddings_default", True),
                label="Normalize embeddings (match training setting)"
            )
        with gr.Row():
            agg = gr.Dropdown(choices=["mean", "median"], value="mean", label="Aggregate across chunks")

        out = gr.Markdown()
        details = gr.Markdown()

        def _predict_ui(text, threshold, normalize_embeddings, agg_mode):
            # Run the thresholded, chunked prediction with the UI settings.
            label, conf, meta = predict_with_threshold(
                text, model_data,
                ai_threshold=float(threshold),
                normalize_flag=bool(normalize_embeddings),
                agg=agg_mode
            )
            if label == "AI":
                headline = f"🤖 **AI Generated** (Conf: {conf:.1%})"
            elif label == "HUMAN":
                headline = f"👤 **Human Written** (Conf: {conf:.1%})"
            elif label == "ERROR":
                headline = f"❌ Error: {meta.get('error', 'Unknown')}"
            else:
                headline = f"❓ {label} (Conf: {conf:.1%})"

            # BUG FIX: on ERROR/UNKNOWN results meta has no 'p_ai', and
            # formatting the '?' string fallback with ':.4f' raised
            # ValueError. Apply the float format only to numeric values.
            p_ai = meta.get("p_ai")
            p_ai_text = f"{p_ai:.4f}" if isinstance(p_ai, (int, float)) else "?"
            det = (
                f"- p(AI): {p_ai_text}\n"
                f"- Chunks: {meta.get('chunks','?')}\n"
                f"- Threshold: {meta.get('threshold','?')}\n"
                f"- Aggregate: {meta.get('agg','?')}\n"
                f"- Embedder: {model_data['resolved_embedding_model_name']} (dim={model_data['resolved_embedding_dim']})"
            )
            return headline, det

        inp.submit(_predict_ui, [inp, thr, norm, agg], [out, details])
        gr.Button("🔍 Predict").click(_predict_ui, [inp, thr, norm, agg], [out, details])

    return demo
253
 
254
  # -------------------------------------------------
 
258
  demo = create_app(_model_data)
259
 
260
  if __name__ == "__main__":
261
+ # Pass share=True if you need a public URL
262
  demo.launch()