Mrkomiljon commited on
Commit
7aac00f
·
verified ·
1 Parent(s): 9e17dfb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -106
app.py CHANGED
@@ -2,10 +2,9 @@
2
  # -*- coding: utf-8 -*-
3
 
4
  import os
5
- import pickle
6
  import re
7
  import warnings
8
-
9
  import numpy as np
10
  import pandas as pd
11
  import torch
@@ -19,18 +18,11 @@ from huggingface_hub import hf_hub_download
19
 
20
  warnings.filterwarnings("ignore")
21
 
22
- # ---------------------------
23
- # Hugging Face Hub config
24
- # ---------------------------
25
- HF_REPO_ID = os.getenv("HF_REPO_ID", "Detecting-ai/text-detector-model-embedding")
26
- # Agar siz envda HF_FILENAME bermasangiz, dastur avval lite, keyin heavy ni sinab ko'radi:
27
- HF_FILENAME = os.getenv("HF_FILENAME", "").strip()
28
- HF_REPO_TYPE = os.getenv("HF_REPO_TYPE", "model").strip() # "model" yoki "dataset"
29
- HF_TOKEN = os.getenv("HF_TOKEN", "").strip() # private repo bo'lsa Secrets'dan keladi
30
-
31
- # ---------------------------
32
- # NLTK bootstrap
33
- # ---------------------------
34
  def ensure_nltk():
35
  try: nltk.data.find("tokenizers/punkt")
36
  except LookupError: nltk.download("punkt")
@@ -41,7 +33,6 @@ def ensure_nltk():
41
 
42
  ensure_nltk()
43
 
44
-
45
  def _to_stopword_set(sw):
46
  if sw is None:
47
  return set(stopwords.words("english"))
@@ -54,80 +45,22 @@ def _to_stopword_set(sw):
54
  except Exception:
55
  return set(stopwords.words("english"))
56
 
57
-
58
  def _guess_model_by_dim(dim: int) -> str:
59
- if dim == 768:
60
- return "sentence-transformers/all-mpnet-base-v2"
61
- if dim == 384:
62
- return "sentence-transformers/all-MiniLM-L6-v2"
63
  return "sentence-transformers/all-mpnet-base-v2"
64
 
65
-
66
- # ---------------------------
67
- # Hub loader (CPU-safe)
68
- # ---------------------------
69
- def _hf_download(filename: str) -> str:
70
- return hf_hub_download(
71
- repo_id=HF_REPO_ID,
72
- filename=filename,
73
- repo_type=HF_REPO_TYPE,
74
- token=HF_TOKEN or None
75
- )
76
-
77
- def _safe_load_pickle_cpu(path: str):
78
- """
79
- CUDA-da saqlangan obyektlarni CPU-only muhitda ochish:
80
- 1) torch.load(map_location='cpu') bilan urinamiz
81
- 2) bo'lmasa oddiy pickle.load
82
- """
83
- # 1) Torch formatini sinab ko'ramiz
84
- try:
85
- obj = torch.load(path, map_location="cpu")
86
- print("✅ Loaded with torch.load(map_location='cpu')")
87
- return obj
88
- except Exception as e:
89
- print(f"torch.load failed: {e}. Falling back to pickle.load...")
90
-
91
- # 2) Oddiy pickle
92
- with open(path, "rb") as f:
93
- obj = pickle.load(f)
94
- print("✅ Loaded with pickle.load")
95
- return obj
96
-
97
- def _load_bundle_from_hub():
98
- # Yuklash tartibi:
99
- # - Agar env HF_FILENAME berilgan bo'lsa, o'shani ishlatamiz
100
- # - Aks holda avval 'complete_trained_model_lite.pkl' (kichik), bo'lmasa 'complete_trained_model.pkl'
101
- candidates = []
102
- if HF_FILENAME:
103
- candidates = [HF_FILENAME]
104
- else:
105
- candidates = ["complete_trained_model.pkl", "complete_trained_model_lite.pkl"]
106
-
107
- last_err = None
108
- for fname in candidates:
109
- try:
110
- print(f"↓ Trying to download from hub: {HF_REPO_ID}/{fname} ({HF_REPO_TYPE})")
111
- pkl_path = _hf_download(fname)
112
- data = _safe_load_pickle_cpu(pkl_path)
113
- print(f"✅ Bundle loaded from hub: {HF_REPO_ID}/{fname}")
114
- return data
115
- except Exception as e:
116
- print(f"⚠️ Failed to load {fname}: {e}")
117
- last_err = e
118
- continue
119
-
120
- raise RuntimeError(f"❌ Could not load any bundle from hub. Last error: {last_err}")
121
-
122
-
123
- # ---------------------------
124
- # Main loader
125
- # ---------------------------
126
  def load_embedding_model():
127
- # 1) Trained classifier bundle (.pkl) ni yuklab olamiz (CPU-safe)
128
- data = _load_bundle_from_hub()
 
 
 
 
 
 
 
129
 
130
- # 2) Embedding modelni tayyorlaymiz
131
  device = "cuda" if torch.cuda.is_available() else "cpu"
132
  env_name = os.getenv("EMBEDDING_MODEL_NAME", "").strip()
133
  stored_name = (data.get("embedding_model_name") or data.get("embedding_model_path") or "").strip()
@@ -138,7 +71,6 @@ def load_embedding_model():
138
  embedding_model = SentenceTransformer(emb_name, device=device)
139
  actual_dim = embedding_model.get_sentence_embedding_dimension()
140
 
141
- # 3) Qo'shimcha maydonlar
142
  data["embedding_model"] = embedding_model
143
  data["resolved_embedding_model_name"] = emb_name
144
  data["resolved_embedding_dim"] = actual_dim
@@ -153,13 +85,8 @@ def load_embedding_model():
153
  print(f"ℹ️ Expect dim={expected_dim}, using {emb_name} (dim={actual_dim})")
154
  return data
155
 
156
-
157
- # ---------------------------
158
- # Preprocess + Predict
159
- # ---------------------------
160
  def preprocess_text(text, lemmatizer, stop_words, max_tokens=600):
161
- if pd.isna(text) or not text:
162
- return ""
163
  text = str(text).lower()
164
  text = re.sub(r"[^a-zA-Z\s]", " ", text)
165
  tokens = [
@@ -171,17 +98,14 @@ def preprocess_text(text, lemmatizer, stop_words, max_tokens=600):
171
  tokens = tokens[:max_tokens]
172
  return " ".join(tokens)
173
 
174
-
175
  def predict_text(text, model_data):
176
  proc = preprocess_text(text, model_data["lemmatizer"], model_data["stop_words"], model_data["max_tokens"])
177
  if not proc:
178
  return "UNKNOWN", 0.0, {"error": "Empty text after preprocessing"}
179
-
180
  with torch.no_grad():
181
  emb = model_data["embedding_model"].encode([proc], convert_to_numpy=True, normalize_embeddings=False)
182
  if emb.ndim == 1:
183
  emb = emb.reshape(1, -1)
184
-
185
  clf = model_data["model"]
186
  try:
187
  pred = clf.predict(emb)[0]
@@ -191,13 +115,8 @@ def predict_text(text, model_data):
191
  conf = 0.5
192
  except ValueError as e:
193
  return "ERROR_DIM_MISMATCH", 0.0, {"error": str(e)}
194
-
195
  return str(pred), conf, {"tokens": len(proc.split())}
196
 
197
-
198
- # ---------------------------
199
- # Gradio UI
200
- # ---------------------------
201
  def create_app(model_data):
202
  with gr.Blocks(title="Embedding-based Human vs AI Detector") as demo:
203
  gr.Markdown("## 🤖👤 Human vs AI Detector (Embedding-based)")
@@ -220,16 +139,10 @@ def create_app(model_data):
220
 
221
  inp.submit(_predict_ui, inp, [out, details])
222
  gr.Button("🔍 Predict").click(_predict_ui, inp, [out, details])
223
-
224
  return demo
225
 
226
-
227
- # ---------------------------
228
- # Build app
229
- # ---------------------------
230
  _model_data = load_embedding_model()
231
  demo = create_app(_model_data)
232
 
233
  if __name__ == "__main__":
234
- # Spaces-da share kerak emas; public URLni Space beradi.
235
- demo.launch(server_name="0.0.0.0", server_port=7860, share=False, show_api=True)
 
2
  # -*- coding: utf-8 -*-
3
 
4
  import os
 
5
  import re
6
  import warnings
7
+ import joblib
8
  import numpy as np
9
  import pandas as pd
10
  import torch
 
18
 
19
  warnings.filterwarnings("ignore")
20
 
21
# ---------------------------
# Hugging Face Hub configuration (overridable via environment)
# ---------------------------
# Repository that hosts the trained classifier bundle.
REPO_ID = os.getenv("HF_REPO_ID", "Detecting-ai/text-detector-model-embedding")
# Bundle filename inside the repo (lite joblib artifact by default).
FILENAME = os.getenv("HF_FILENAME", "complete_trained_model_lite.joblib")
# Hub repo type: "model" or "dataset".
REPO_TYPE = os.getenv("HF_REPO_TYPE", "model")
# Access token for private repos; empty string collapses to None.
HF_TOKEN = os.getenv("HF_TOKEN") or None
25
+
 
 
 
 
 
 
 
26
  def ensure_nltk():
27
  try: nltk.data.find("tokenizers/punkt")
28
  except LookupError: nltk.download("punkt")
 
33
 
34
  ensure_nltk()
35
 
 
36
  def _to_stopword_set(sw):
37
  if sw is None:
38
  return set(stopwords.words("english"))
 
45
  except Exception:
46
  return set(stopwords.words("english"))
47
 
 
48
  def _guess_model_by_dim(dim: int) -> str:
49
+ if dim == 768: return "sentence-transformers/all-mpnet-base-v2"
50
+ if dim == 384: return "sentence-transformers/all-MiniLM-L6-v2"
 
 
51
  return "sentence-transformers/all-mpnet-base-v2"
52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  def load_embedding_model():
54
+ # Load the joblib-compressed lite bundle (small and fast)
55
+ path = hf_hub_download(
56
+ repo_id=REPO_ID,
57
+ filename=FILENAME,
58
+ repo_type=REPO_TYPE,
59
+ token=HF_TOKEN
60
+ )
61
+ data = joblib.load(path)
62
+ print(f"✅ Loaded lite bundle: {REPO_ID}/{FILENAME}")
63
 
 
64
  device = "cuda" if torch.cuda.is_available() else "cpu"
65
  env_name = os.getenv("EMBEDDING_MODEL_NAME", "").strip()
66
  stored_name = (data.get("embedding_model_name") or data.get("embedding_model_path") or "").strip()
 
71
  embedding_model = SentenceTransformer(emb_name, device=device)
72
  actual_dim = embedding_model.get_sentence_embedding_dimension()
73
 
 
74
  data["embedding_model"] = embedding_model
75
  data["resolved_embedding_model_name"] = emb_name
76
  data["resolved_embedding_dim"] = actual_dim
 
85
  print(f"ℹ️ Expect dim={expected_dim}, using {emb_name} (dim={actual_dim})")
86
  return data
87
 
 
 
 
 
88
  def preprocess_text(text, lemmatizer, stop_words, max_tokens=600):
89
+ if pd.isna(text) or not text: return ""
 
90
  text = str(text).lower()
91
  text = re.sub(r"[^a-zA-Z\s]", " ", text)
92
  tokens = [
 
98
  tokens = tokens[:max_tokens]
99
  return " ".join(tokens)
100
 
 
101
  def predict_text(text, model_data):
102
  proc = preprocess_text(text, model_data["lemmatizer"], model_data["stop_words"], model_data["max_tokens"])
103
  if not proc:
104
  return "UNKNOWN", 0.0, {"error": "Empty text after preprocessing"}
 
105
  with torch.no_grad():
106
  emb = model_data["embedding_model"].encode([proc], convert_to_numpy=True, normalize_embeddings=False)
107
  if emb.ndim == 1:
108
  emb = emb.reshape(1, -1)
 
109
  clf = model_data["model"]
110
  try:
111
  pred = clf.predict(emb)[0]
 
115
  conf = 0.5
116
  except ValueError as e:
117
  return "ERROR_DIM_MISMATCH", 0.0, {"error": str(e)}
 
118
  return str(pred), conf, {"tokens": len(proc.split())}
119
 
 
 
 
 
120
  def create_app(model_data):
121
  with gr.Blocks(title="Embedding-based Human vs AI Detector") as demo:
122
  gr.Markdown("## 🤖👤 Human vs AI Detector (Embedding-based)")
 
139
 
140
  inp.submit(_predict_ui, inp, [out, details])
141
  gr.Button("🔍 Predict").click(_predict_ui, inp, [out, details])
 
142
  return demo
143
 
 
 
 
 
144
  _model_data = load_embedding_model()
145
  demo = create_app(_model_data)
146
 
147
  if __name__ == "__main__":
148
+ demo.launch()