Mrkomiljon commited on
Commit
c9ee224
·
verified ·
1 Parent(s): ba7c3bb

Update app.py

Browse files

Updated the app to use a 768-dimensional embedding model

Files changed (1) hide show
  1. app.py +84 -32
app.py CHANGED
@@ -16,15 +16,26 @@ from huggingface_hub import hf_hub_download
16
 
17
  warnings.filterwarnings("ignore")
18
 
 
19
  # Hugging Face model config
 
20
  REPO_ID = "Detecting-ai/text-detector-model-embedding"
21
  FILENAME = "complete_trained_model_lite.joblib"
22
  REPO_TYPE = "model"
23
 
24
- # --- Ensure NLTK dependencies ---
 
 
 
 
 
 
 
 
25
  def ensure_nltk():
26
  resources = {
27
  "punkt": "tokenizers/punkt",
 
28
  "punkt_tab": "tokenizers/punkt_tab/english",
29
  "stopwords": "corpora/stopwords",
30
  "wordnet": "corpora/wordnet",
@@ -33,26 +44,31 @@ def ensure_nltk():
33
  try:
34
  nltk.data.find(path)
35
  except LookupError:
36
- nltk.download(pkg)
 
 
 
 
37
  ensure_nltk()
38
 
39
- # --- Helper Functions ---
 
 
40
  def _to_stopword_set(sw):
41
- if sw is None: return set(stopwords.words("english"))
42
- if isinstance(sw, (list, tuple)): return set(sw)
43
- if isinstance(sw, set): return sw
44
- try: return set(sw)
45
- except: return set(stopwords.words("english"))
46
-
47
- def _guess_model_by_dim(dim: int):
48
- if dim == 768:
49
- return "sentence-transformers/all-mpnet-base-v2"
50
- if dim == 384:
51
- return "sentence-transformers/all-MiniLM-L6-v2"
52
- # default
53
- return "sentence-transformers/all-MiniLM-L6-v2"
54
-
55
- # --- Load Model Bundle ---
56
  def load_embedding_model():
57
  path = hf_hub_download(
58
  repo_id=REPO_ID,
@@ -65,27 +81,43 @@ def load_embedding_model():
65
  data = joblib.load(path)
66
 
67
  device = "cuda" if torch.cuda.is_available() else "cpu"
68
- stored_name = (data.get("embedding_model_name") or data.get("embedding_model_path") or "").strip()
69
- expected_dim = int(data.get("embedding_dim", 0)) if data.get("embedding_dim") else 0
70
- emb_name = stored_name or _guess_model_by_dim(expected_dim)
71
 
72
- print(f"πŸ”§ Loading embedding model: {emb_name} on {device}")
73
- embedding_model = SentenceTransformer(emb_name, device=device)
 
74
  actual_dim = embedding_model.get_sentence_embedding_dimension()
75
-
 
 
 
 
 
 
 
 
 
 
 
76
  data["embedding_model"] = embedding_model
77
- data["resolved_embedding_model_name"] = emb_name
78
  data["resolved_embedding_dim"] = actual_dim
79
  data["device"] = device
80
  data["lemmatizer"] = data.get("lemmatizer") or WordNetLemmatizer()
81
  data["stop_words"] = _to_stopword_set(data.get("stop_words"))
82
  data["max_tokens"] = data.get("max_tokens", 600)
83
 
 
84
  return data
85
 
86
- # --- Preprocessing ---
 
 
87
  def preprocess_text(text, lemmatizer, stop_words, max_tokens=600):
88
- if pd.isna(text) or not text: return ""
 
89
  text = str(text).lower()
90
  text = re.sub(r"[^a-zA-Z\s]", " ", text)
91
  tokens = [
@@ -95,18 +127,26 @@ def preprocess_text(text, lemmatizer, stop_words, max_tokens=600):
95
  ]
96
  return " ".join(tokens[:max_tokens])
97
 
98
- # --- Prediction ---
 
 
99
  def predict_text(text, model_data):
100
  proc = preprocess_text(text, model_data["lemmatizer"], model_data["stop_words"], model_data["max_tokens"])
101
  if not proc:
102
  return "UNKNOWN", 0.0, {"error": "Empty text after preprocessing"}
103
 
104
  with torch.no_grad():
105
- emb = model_data["embedding_model"].encode([proc], convert_to_numpy=True, normalize_embeddings=False)
 
 
106
  if emb.ndim == 1:
107
  emb = emb.reshape(1, -1)
108
 
109
  clf = model_data["model"]
 
 
 
 
110
  try:
111
  pred = clf.predict(emb)[0]
112
  conf = float(np.max(clf.predict_proba(emb)[0])) if hasattr(clf, "predict_proba") else 0.5
@@ -115,7 +155,9 @@ def predict_text(text, model_data):
115
 
116
  return str(pred), conf, {"tokens": len(proc.split())}
117
 
118
- # --- Gradio App ---
 
 
119
  def create_app(model_data):
120
  with gr.Blocks(title="Embedding-based Human vs AI Detector") as demo:
121
  gr.Markdown("## πŸ€–πŸ‘€ Human vs AI Detector (Embedding-based)")
@@ -131,18 +173,28 @@ def create_app(model_data):
131
  headline = f"πŸ‘€ **Human Written** (Conf: {conf:.1%})"
132
  elif label.upper() == "ERROR":
133
  headline = f"❌ Error: {meta.get('error', 'Unknown')}"
 
 
134
  else:
135
  headline = f"❓ {label} (Conf: {conf:.1%})"
136
- det = f"- Tokens: {meta.get('tokens','?')}\n- Embedding: {model_data['resolved_embedding_model_name']} (dim={model_data['resolved_embedding_dim']})"
 
 
 
 
 
137
  return headline, det
138
 
139
  inp.submit(_predict_ui, inp, [out, details])
140
  gr.Button("πŸ” Predict").click(_predict_ui, inp, [out, details])
141
  return demo
142
 
143
- # --- Load + Launch ---
 
 
144
  _model_data = load_embedding_model()
145
  demo = create_app(_model_data)
146
 
147
  if __name__ == "__main__":
 
148
  demo.launch()
 
16
 
17
  warnings.filterwarnings("ignore")
18
 
19
+ # -------------------------------------------------
20
  # Hugging Face model config
21
+ # -------------------------------------------------
22
  REPO_ID = "Detecting-ai/text-detector-model-embedding"
23
  FILENAME = "complete_trained_model_lite.joblib"
24
  REPO_TYPE = "model"
25
 
26
+ # -------------------------------------------------
27
+ # Force 768-dim embedder (MPNet)
28
+ # -------------------------------------------------
29
+ FORCED_EMBEDDER = "sentence-transformers/all-mpnet-base-v2"
30
+ FORCED_DIM = 768
31
+
32
+ # -------------------------------------------------
33
+ # Ensure NLTK dependencies
34
+ # -------------------------------------------------
35
  def ensure_nltk():
36
  resources = {
37
  "punkt": "tokenizers/punkt",
38
+ # newer nltk introduces punkt_tab; harmless to try
39
  "punkt_tab": "tokenizers/punkt_tab/english",
40
  "stopwords": "corpora/stopwords",
41
  "wordnet": "corpora/wordnet",
 
44
  try:
45
  nltk.data.find(path)
46
  except LookupError:
47
+ try:
48
+ nltk.download(pkg, quiet=True)
49
+ except Exception:
50
+ pass
51
+
52
  ensure_nltk()
53
 
54
+ # -------------------------------------------------
55
+ # Helper functions
56
+ # -------------------------------------------------
57
  def _to_stopword_set(sw):
58
+ if sw is None:
59
+ return set(stopwords.words("english"))
60
+ if isinstance(sw, set):
61
+ return sw
62
+ if isinstance(sw, (list, tuple)):
63
+ return set(sw)
64
+ try:
65
+ return set(sw)
66
+ except Exception:
67
+ return set(stopwords.words("english"))
68
+
69
+ # -------------------------------------------------
70
+ # Load model bundle + forced 768-dim embedder
71
+ # -------------------------------------------------
 
72
  def load_embedding_model():
73
  path = hf_hub_download(
74
  repo_id=REPO_ID,
 
81
  data = joblib.load(path)
82
 
83
  device = "cuda" if torch.cuda.is_available() else "cpu"
84
+ clf = data.get("model")
85
+ if clf is None:
86
+ raise RuntimeError("Model file does not contain 'model' key.")
87
 
88
+ # --- Always use 768-dim MPNet ---
89
+ print(f"πŸ”§ Loading 768-dim embedder: {FORCED_EMBEDDER} on {device}")
90
+ embedding_model = SentenceTransformer(FORCED_EMBEDDER, device=device)
91
  actual_dim = embedding_model.get_sentence_embedding_dimension()
92
+ if actual_dim != FORCED_DIM:
93
+ raise RuntimeError(f"Loaded embedder dim={actual_dim}, expected {FORCED_DIM}")
94
+
95
+ # --- Classifier sanity check (must be trained on 768) ---
96
+ clf_dim = getattr(clf, "n_features_in_", None)
97
+ if clf_dim and clf_dim != FORCED_DIM:
98
+ raise RuntimeError(
99
+ f"Classifier expects {clf_dim} features, but app is configured for {FORCED_DIM}. "
100
+ f"Please retrain or load a 768-dim trained classifier."
101
+ )
102
+
103
+ # finalize
104
  data["embedding_model"] = embedding_model
105
+ data["resolved_embedding_model_name"] = FORCED_EMBEDDER
106
  data["resolved_embedding_dim"] = actual_dim
107
  data["device"] = device
108
  data["lemmatizer"] = data.get("lemmatizer") or WordNetLemmatizer()
109
  data["stop_words"] = _to_stopword_set(data.get("stop_words"))
110
  data["max_tokens"] = data.get("max_tokens", 600)
111
 
112
+ print(f"βœ… Using embedder: {FORCED_EMBEDDER} (dim={actual_dim}) β€” classifier expects {getattr(clf,'n_features_in_','unknown')}")
113
  return data
114
 
115
+ # -------------------------------------------------
116
+ # Preprocessing
117
+ # -------------------------------------------------
118
  def preprocess_text(text, lemmatizer, stop_words, max_tokens=600):
119
+ if pd.isna(text) or not str(text).strip():
120
+ return ""
121
  text = str(text).lower()
122
  text = re.sub(r"[^a-zA-Z\s]", " ", text)
123
  tokens = [
 
127
  ]
128
  return " ".join(tokens[:max_tokens])
129
 
130
+ # -------------------------------------------------
131
+ # Prediction
132
+ # -------------------------------------------------
133
  def predict_text(text, model_data):
134
  proc = preprocess_text(text, model_data["lemmatizer"], model_data["stop_words"], model_data["max_tokens"])
135
  if not proc:
136
  return "UNKNOWN", 0.0, {"error": "Empty text after preprocessing"}
137
 
138
  with torch.no_grad():
139
+ emb = model_data["embedding_model"].encode(
140
+ [proc], convert_to_numpy=True, normalize_embeddings=False
141
+ )
142
  if emb.ndim == 1:
143
  emb = emb.reshape(1, -1)
144
 
145
  clf = model_data["model"]
146
+ need = getattr(clf, "n_features_in_", emb.shape[1])
147
+ if emb.shape[1] != need:
148
+ return "ERROR", 0.0, {"error": f"Embedding dim {emb.shape[1]} != classifier requires {need}"}
149
+
150
  try:
151
  pred = clf.predict(emb)[0]
152
  conf = float(np.max(clf.predict_proba(emb)[0])) if hasattr(clf, "predict_proba") else 0.5
 
155
 
156
  return str(pred), conf, {"tokens": len(proc.split())}
157
 
158
+ # -------------------------------------------------
159
+ # Gradio App
160
+ # -------------------------------------------------
161
  def create_app(model_data):
162
  with gr.Blocks(title="Embedding-based Human vs AI Detector") as demo:
163
  gr.Markdown("## πŸ€–πŸ‘€ Human vs AI Detector (Embedding-based)")
 
173
  headline = f"πŸ‘€ **Human Written** (Conf: {conf:.1%})"
174
  elif label.upper() == "ERROR":
175
  headline = f"❌ Error: {meta.get('error', 'Unknown')}"
176
+ elif label.upper() == "UNKNOWN":
177
+ headline = f"❓ Unknown (Conf: {conf:.1%})"
178
  else:
179
  headline = f"❓ {label} (Conf: {conf:.1%})"
180
+
181
+ det = (
182
+ f"- Tokens: {meta.get('tokens','?')}\n"
183
+ f"- Embedding: {model_data['resolved_embedding_model_name']} "
184
+ f"(dim={model_data['resolved_embedding_dim']})"
185
+ )
186
  return headline, det
187
 
188
  inp.submit(_predict_ui, inp, [out, details])
189
  gr.Button("πŸ” Predict").click(_predict_ui, inp, [out, details])
190
  return demo
191
 
192
# -------------------------------------------------
# Load + Launch
# -------------------------------------------------
_model_data = load_embedding_model()

# NOTE(review): `demo` is kept at module level — presumably so the hosting
# platform can discover it; confirm before moving it into a main() function.
demo = create_app(_model_data)

if __name__ == "__main__":
    # You can pass share=True if you need a public URL
    demo.launch()