Mrkomiljon commited on
Commit
f466a35
·
verified ·
1 Parent(s): 7aac00f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +41 -53
app.py CHANGED
@@ -1,28 +1,27 @@
1
  # app.py
2
- # -*- coding: utf-8 -*-
3
-
4
  import os
5
  import re
6
- import warnings
7
  import joblib
 
 
8
  import numpy as np
9
  import pandas as pd
10
- import torch
 
 
11
  from nltk.tokenize import word_tokenize
12
  from nltk.stem import WordNetLemmatizer
13
- from nltk.corpus import stopwords
14
- import nltk
15
- import gradio as gr
16
  from sentence_transformers import SentenceTransformer
17
  from huggingface_hub import hf_hub_download
18
 
19
  warnings.filterwarnings("ignore")
20
 
21
- REPO_ID = os.getenv("HF_REPO_ID", "Detecting-ai/text-detector-model-embedding")
22
- FILENAME = os.getenv("HF_FILENAME", "complete_trained_model_lite.joblib")
23
- REPO_TYPE = os.getenv("HF_REPO_TYPE", "model")
24
- HF_TOKEN = os.getenv("HF_TOKEN") or None
25
 
 
26
  def ensure_nltk():
27
  try: nltk.data.find("tokenizers/punkt")
28
  except LookupError: nltk.download("punkt")
@@ -34,40 +33,35 @@ def ensure_nltk():
34
  ensure_nltk()
35
 
36
  def _to_stopword_set(sw):
37
- if sw is None:
38
- return set(stopwords.words("english"))
39
- if isinstance(sw, (list, tuple)):
40
- return set(sw)
41
- if isinstance(sw, set):
42
- return sw
43
- try:
44
- return set(sw)
45
- except Exception:
46
- return set(stopwords.words("english"))
47
-
48
- def _guess_model_by_dim(dim: int) -> str:
49
- if dim == 768: return "sentence-transformers/all-mpnet-base-v2"
50
- if dim == 384: return "sentence-transformers/all-MiniLM-L6-v2"
51
- return "sentence-transformers/all-mpnet-base-v2"
52
 
53
  def load_embedding_model():
54
- # Joblib siqilgan lite bundle ni yuklash (kichik va tez)
55
  path = hf_hub_download(
56
  repo_id=REPO_ID,
57
  filename=FILENAME,
58
- repo_type=REPO_TYPE,
59
- token=HF_TOKEN
60
  )
 
 
61
  data = joblib.load(path)
62
- print(f"βœ… Loaded lite bundle: {REPO_ID}/{FILENAME}")
63
 
64
  device = "cuda" if torch.cuda.is_available() else "cpu"
65
- env_name = os.getenv("EMBEDDING_MODEL_NAME", "").strip()
66
  stored_name = (data.get("embedding_model_name") or data.get("embedding_model_path") or "").strip()
67
- expected_dim = int(data.get("embedding_dim", 0)) if data.get("embedding_dim") is not None else 0
68
- emb_name = env_name or stored_name or _guess_model_by_dim(expected_dim)
69
 
70
- print(f"πŸ”§ Loading ST: {emb_name} on {device}")
71
  embedding_model = SentenceTransformer(emb_name, device=device)
72
  actual_dim = embedding_model.get_sentence_embedding_dimension()
73
 
@@ -75,14 +69,10 @@ def load_embedding_model():
75
  data["resolved_embedding_model_name"] = emb_name
76
  data["resolved_embedding_dim"] = actual_dim
77
  data["device"] = device
78
-
79
- if "lemmatizer" not in data or data["lemmatizer"] is None:
80
- data["lemmatizer"] = WordNetLemmatizer()
81
  data["stop_words"] = _to_stopword_set(data.get("stop_words"))
82
- if "max_tokens" not in data:
83
- data["max_tokens"] = 600
84
 
85
- print(f"ℹ️ Expect dim={expected_dim}, using {emb_name} (dim={actual_dim})")
86
  return data
87
 
88
  def preprocess_text(text, lemmatizer, stop_words, max_tokens=600):
@@ -94,27 +84,24 @@ def preprocess_text(text, lemmatizer, stop_words, max_tokens=600):
94
  for tok in word_tokenize(text)
95
  if tok not in stop_words and len(tok) > 2
96
  ]
97
- if len(tokens) > max_tokens:
98
- tokens = tokens[:max_tokens]
99
- return " ".join(tokens)
100
 
101
  def predict_text(text, model_data):
102
  proc = preprocess_text(text, model_data["lemmatizer"], model_data["stop_words"], model_data["max_tokens"])
103
  if not proc:
104
  return "UNKNOWN", 0.0, {"error": "Empty text after preprocessing"}
 
105
  with torch.no_grad():
106
  emb = model_data["embedding_model"].encode([proc], convert_to_numpy=True, normalize_embeddings=False)
107
- if emb.ndim == 1:
108
- emb = emb.reshape(1, -1)
109
  clf = model_data["model"]
110
  try:
111
  pred = clf.predict(emb)[0]
112
- if hasattr(clf, "predict_proba"):
113
- conf = float(np.max(clf.predict_proba(emb)[0]))
114
- else:
115
- conf = 0.5
116
- except ValueError as e:
117
- return "ERROR_DIM_MISMATCH", 0.0, {"error": str(e)}
118
  return str(pred), conf, {"tokens": len(proc.split())}
119
 
120
  def create_app(model_data):
@@ -130,8 +117,8 @@ def create_app(model_data):
130
  headline = f"πŸ€– **AI Generated** (Conf: {conf:.1%})"
131
  elif label.upper() == "HUMAN":
132
  headline = f"πŸ‘€ **Human Written** (Conf: {conf:.1%})"
133
- elif label == "ERROR_DIM_MISMATCH":
134
- headline = f"❌ Dim mismatch (Conf: {conf:.1%})"
135
  else:
136
  headline = f"❓ {label} (Conf: {conf:.1%})"
137
  det = f"- Tokens: {meta.get('tokens','?')}\n- Embedding: {model_data['resolved_embedding_model_name']} (dim={model_data['resolved_embedding_dim']})"
@@ -139,6 +126,7 @@ def create_app(model_data):
139
 
140
  inp.submit(_predict_ui, inp, [out, details])
141
  gr.Button("πŸ” Predict").click(_predict_ui, inp, [out, details])
 
142
  return demo
143
 
144
  _model_data = load_embedding_model()
 
1
  # app.py
 
 
2
  import os
3
  import re
 
4
  import joblib
5
+ import torch
6
+ import gradio as gr
7
  import numpy as np
8
  import pandas as pd
9
+ import warnings
10
+ import nltk
11
+ from nltk.corpus import stopwords
12
  from nltk.tokenize import word_tokenize
13
  from nltk.stem import WordNetLemmatizer
 
 
 
14
  from sentence_transformers import SentenceTransformer
15
  from huggingface_hub import hf_hub_download
16
 
17
  warnings.filterwarnings("ignore")
18
 
19
# Hugging Face repo config.
# Overridable via environment variables (as in the previous revision) so
# deployments can point at a different repo/file without a code change;
# the defaults keep current behavior identical when no env vars are set.
REPO_ID = os.getenv("HF_REPO_ID", "Detecting-ai/text-detector-model-embedding")
FILENAME = os.getenv("HF_FILENAME", "complete_trained_model_lite.joblib")
REPO_TYPE = os.getenv("HF_REPO_TYPE", "model")
23
 
24
+ # NLTK ensure
25
  def ensure_nltk():
26
  try: nltk.data.find("tokenizers/punkt")
27
  except LookupError: nltk.download("punkt")
 
33
  ensure_nltk()
34
 
35
  def _to_stopword_set(sw):
36
+ if sw is None: return set(stopwords.words("english"))
37
+ if isinstance(sw, (list, tuple)): return set(sw)
38
+ if isinstance(sw, set): return sw
39
+ try: return set(sw)
40
+ except: return set(stopwords.words("english"))
41
+
42
+ def _guess_model_by_dim(dim):
43
+ if dim == 768:
44
+ return "sentence-transformers/all-mpnet-base-v2"
45
+ if dim == 384:
46
+ return "sentence-transformers/all-MiniLM-L6-v2"
47
+ return "sentence-transformers/all-MiniLM-L6-v2"
 
 
 
48
 
49
def load_embedding_model():
    """Download the classifier bundle from the Hub and prepare it for inference.

    Returns the joblib bundle dict augmented with:
      - "embedding_model": the loaded SentenceTransformer
      - "resolved_embedding_model_name" / "resolved_embedding_dim" / "device"
      - "lemmatizer", "stop_words", "max_tokens" (defaults filled in)
    """
    path = hf_hub_download(
        repo_id=REPO_ID,
        filename=FILENAME,
        repo_type=REPO_TYPE,
    )
    print(f"✅ Downloaded bundle from Hugging Face: {FILENAME}")

    data = joblib.load(path)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    stored_name = (data.get("embedding_model_name") or data.get("embedding_model_path") or "").strip()
    # Hoist the dict lookup instead of calling data.get("embedding_dim") twice.
    embedding_dim = data.get("embedding_dim")
    expected_dim = int(embedding_dim) if embedding_dim else 0
    emb_name = stored_name or _guess_model_by_dim(expected_dim)

    print(f"🔧 Loading embedding model: {emb_name} on {device}")
    embedding_model = SentenceTransformer(emb_name, device=device)
    actual_dim = embedding_model.get_sentence_embedding_dimension()

    # Required by predict_text, which reads model_data["embedding_model"].
    data["embedding_model"] = embedding_model
    data["resolved_embedding_model_name"] = emb_name
    data["resolved_embedding_dim"] = actual_dim
    data["device"] = device
    data["lemmatizer"] = data.get("lemmatizer") or WordNetLemmatizer()
    data["stop_words"] = _to_stopword_set(data.get("stop_words"))
    data["max_tokens"] = data.get("max_tokens", 600)

    return data
77
 
78
  def preprocess_text(text, lemmatizer, stop_words, max_tokens=600):
 
84
  for tok in word_tokenize(text)
85
  if tok not in stop_words and len(tok) > 2
86
  ]
87
+ return " ".join(tokens[:max_tokens])
 
 
88
 
89
def predict_text(text, model_data):
    """Classify *text* with the bundled embedding model and classifier.

    Returns a ``(label, confidence, meta)`` tuple; ``meta`` carries either
    the token count of the preprocessed text or an error description.
    """
    cleaned = preprocess_text(
        text, model_data["lemmatizer"], model_data["stop_words"], model_data["max_tokens"]
    )
    if not cleaned:
        return "UNKNOWN", 0.0, {"error": "Empty text after preprocessing"}

    with torch.no_grad():
        vectors = model_data["embedding_model"].encode(
            [cleaned], convert_to_numpy=True, normalize_embeddings=False
        )
    if vectors.ndim == 1:
        vectors = vectors.reshape(1, -1)

    classifier = model_data["model"]
    try:
        label = classifier.predict(vectors)[0]
        # Fall back to a neutral 0.5 when the classifier has no probabilities.
        if hasattr(classifier, "predict_proba"):
            confidence = float(np.max(classifier.predict_proba(vectors)[0]))
        else:
            confidence = 0.5
    except Exception as exc:
        return "ERROR", 0.0, {"error": str(exc)}

    return str(label), confidence, {"tokens": len(cleaned.split())}
106
 
107
  def create_app(model_data):
 
117
  headline = f"πŸ€– **AI Generated** (Conf: {conf:.1%})"
118
  elif label.upper() == "HUMAN":
119
  headline = f"πŸ‘€ **Human Written** (Conf: {conf:.1%})"
120
+ elif label.upper() == "ERROR":
121
+ headline = f"❌ Error: {meta.get('error', 'Unknown')}"
122
  else:
123
  headline = f"❓ {label} (Conf: {conf:.1%})"
124
  det = f"- Tokens: {meta.get('tokens','?')}\n- Embedding: {model_data['resolved_embedding_model_name']} (dim={model_data['resolved_embedding_dim']})"
 
126
 
127
  inp.submit(_predict_ui, inp, [out, details])
128
  gr.Button("πŸ” Predict").click(_predict_ui, inp, [out, details])
129
+
130
  return demo
131
 
# Load the model bundle once at import time.
_model_data = load_embedding_model()