Spaces:
Sleeping
Sleeping
Update app.py
Browse files
updated 768 dim
app.py
CHANGED
|
@@ -16,15 +16,26 @@ from huggingface_hub import hf_hub_download
|
|
| 16 |
|
| 17 |
warnings.filterwarnings("ignore")
|
| 18 |
|
|
|
|
| 19 |
# Hugging Face model config
|
|
|
|
| 20 |
REPO_ID = "Detecting-ai/text-detector-model-embedding"
|
| 21 |
FILENAME = "complete_trained_model_lite.joblib"
|
| 22 |
REPO_TYPE = "model"
|
| 23 |
|
| 24 |
-
# ---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
def ensure_nltk():
|
| 26 |
resources = {
|
| 27 |
"punkt": "tokenizers/punkt",
|
|
|
|
| 28 |
"punkt_tab": "tokenizers/punkt_tab/english",
|
| 29 |
"stopwords": "corpora/stopwords",
|
| 30 |
"wordnet": "corpora/wordnet",
|
|
@@ -33,26 +44,31 @@ def ensure_nltk():
|
|
| 33 |
try:
|
| 34 |
nltk.data.find(path)
|
| 35 |
except LookupError:
|
| 36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
ensure_nltk()
|
| 38 |
|
| 39 |
-
# ---
|
|
|
|
|
|
|
| 40 |
def _to_stopword_set(sw):
|
| 41 |
-
if sw is None:
|
| 42 |
-
|
| 43 |
-
if isinstance(sw, set):
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
# --- Load Model Bundle ---
|
| 56 |
def load_embedding_model():
|
| 57 |
path = hf_hub_download(
|
| 58 |
repo_id=REPO_ID,
|
|
@@ -65,27 +81,43 @@ def load_embedding_model():
|
|
| 65 |
data = joblib.load(path)
|
| 66 |
|
| 67 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
|
| 72 |
-
|
| 73 |
-
|
|
|
|
| 74 |
actual_dim = embedding_model.get_sentence_embedding_dimension()
|
| 75 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
data["embedding_model"] = embedding_model
|
| 77 |
-
data["resolved_embedding_model_name"] =
|
| 78 |
data["resolved_embedding_dim"] = actual_dim
|
| 79 |
data["device"] = device
|
| 80 |
data["lemmatizer"] = data.get("lemmatizer") or WordNetLemmatizer()
|
| 81 |
data["stop_words"] = _to_stopword_set(data.get("stop_words"))
|
| 82 |
data["max_tokens"] = data.get("max_tokens", 600)
|
| 83 |
|
|
|
|
| 84 |
return data
|
| 85 |
|
| 86 |
-
# ---
|
|
|
|
|
|
|
| 87 |
def preprocess_text(text, lemmatizer, stop_words, max_tokens=600):
|
| 88 |
-
if pd.isna(text) or not text:
|
|
|
|
| 89 |
text = str(text).lower()
|
| 90 |
text = re.sub(r"[^a-zA-Z\s]", " ", text)
|
| 91 |
tokens = [
|
|
@@ -95,18 +127,26 @@ def preprocess_text(text, lemmatizer, stop_words, max_tokens=600):
|
|
| 95 |
]
|
| 96 |
return " ".join(tokens[:max_tokens])
|
| 97 |
|
| 98 |
-
# ---
|
|
|
|
|
|
|
| 99 |
def predict_text(text, model_data):
|
| 100 |
proc = preprocess_text(text, model_data["lemmatizer"], model_data["stop_words"], model_data["max_tokens"])
|
| 101 |
if not proc:
|
| 102 |
return "UNKNOWN", 0.0, {"error": "Empty text after preprocessing"}
|
| 103 |
|
| 104 |
with torch.no_grad():
|
| 105 |
-
emb = model_data["embedding_model"].encode(
|
|
|
|
|
|
|
| 106 |
if emb.ndim == 1:
|
| 107 |
emb = emb.reshape(1, -1)
|
| 108 |
|
| 109 |
clf = model_data["model"]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
try:
|
| 111 |
pred = clf.predict(emb)[0]
|
| 112 |
conf = float(np.max(clf.predict_proba(emb)[0])) if hasattr(clf, "predict_proba") else 0.5
|
|
@@ -115,7 +155,9 @@ def predict_text(text, model_data):
|
|
| 115 |
|
| 116 |
return str(pred), conf, {"tokens": len(proc.split())}
|
| 117 |
|
| 118 |
-
# ---
|
|
|
|
|
|
|
| 119 |
def create_app(model_data):
|
| 120 |
with gr.Blocks(title="Embedding-based Human vs AI Detector") as demo:
|
| 121 |
gr.Markdown("## π€π€ Human vs AI Detector (Embedding-based)")
|
|
@@ -131,18 +173,28 @@ def create_app(model_data):
|
|
| 131 |
headline = f"π€ **Human Written** (Conf: {conf:.1%})"
|
| 132 |
elif label.upper() == "ERROR":
|
| 133 |
headline = f"β Error: {meta.get('error', 'Unknown')}"
|
|
|
|
|
|
|
| 134 |
else:
|
| 135 |
headline = f"β {label} (Conf: {conf:.1%})"
|
| 136 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
return headline, det
|
| 138 |
|
| 139 |
inp.submit(_predict_ui, inp, [out, details])
|
| 140 |
gr.Button("π Predict").click(_predict_ui, inp, [out, details])
|
| 141 |
return demo
|
| 142 |
|
| 143 |
-
# ---
|
|
|
|
|
|
|
| 144 |
_model_data = load_embedding_model()
|
| 145 |
demo = create_app(_model_data)
|
| 146 |
|
| 147 |
if __name__ == "__main__":
|
|
|
|
| 148 |
demo.launch()
|
|
|
|
| 16 |
|
| 17 |
warnings.filterwarnings("ignore")
|
| 18 |
|
| 19 |
+
# -------------------------------------------------
|
| 20 |
# Hugging Face model config
|
| 21 |
+
# -------------------------------------------------
|
| 22 |
REPO_ID = "Detecting-ai/text-detector-model-embedding"
|
| 23 |
FILENAME = "complete_trained_model_lite.joblib"
|
| 24 |
REPO_TYPE = "model"
|
| 25 |
|
| 26 |
+
# -------------------------------------------------
|
| 27 |
+
# Force 768-dim embedder (MPNet)
|
| 28 |
+
# -------------------------------------------------
|
| 29 |
+
FORCED_EMBEDDER = "sentence-transformers/all-mpnet-base-v2"
|
| 30 |
+
FORCED_DIM = 768
|
| 31 |
+
|
| 32 |
+
# -------------------------------------------------
|
| 33 |
+
# Ensure NLTK dependencies
|
| 34 |
+
# -------------------------------------------------
|
| 35 |
def ensure_nltk():
|
| 36 |
resources = {
|
| 37 |
"punkt": "tokenizers/punkt",
|
| 38 |
+
# newer nltk introduces punkt_tab; harmless to try
|
| 39 |
"punkt_tab": "tokenizers/punkt_tab/english",
|
| 40 |
"stopwords": "corpora/stopwords",
|
| 41 |
"wordnet": "corpora/wordnet",
|
|
|
|
| 44 |
try:
|
| 45 |
nltk.data.find(path)
|
| 46 |
except LookupError:
|
| 47 |
+
try:
|
| 48 |
+
nltk.download(pkg, quiet=True)
|
| 49 |
+
except Exception:
|
| 50 |
+
pass
|
| 51 |
+
|
| 52 |
ensure_nltk()
|
| 53 |
|
| 54 |
+
# -------------------------------------------------
|
| 55 |
+
# Helper functions
|
| 56 |
+
# -------------------------------------------------
|
| 57 |
def _to_stopword_set(sw):
|
| 58 |
+
if sw is None:
|
| 59 |
+
return set(stopwords.words("english"))
|
| 60 |
+
if isinstance(sw, set):
|
| 61 |
+
return sw
|
| 62 |
+
if isinstance(sw, (list, tuple)):
|
| 63 |
+
return set(sw)
|
| 64 |
+
try:
|
| 65 |
+
return set(sw)
|
| 66 |
+
except Exception:
|
| 67 |
+
return set(stopwords.words("english"))
|
| 68 |
+
|
| 69 |
+
# -------------------------------------------------
|
| 70 |
+
# Load model bundle + forced 768-dim embedder
|
| 71 |
+
# -------------------------------------------------
|
|
|
|
| 72 |
def load_embedding_model():
|
| 73 |
path = hf_hub_download(
|
| 74 |
repo_id=REPO_ID,
|
|
|
|
| 81 |
data = joblib.load(path)
|
| 82 |
|
| 83 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 84 |
+
clf = data.get("model")
|
| 85 |
+
if clf is None:
|
| 86 |
+
raise RuntimeError("Model file does not contain 'model' key.")
|
| 87 |
|
| 88 |
+
# --- Always use 768-dim MPNet ---
|
| 89 |
+
print(f"π§ Loading 768-dim embedder: {FORCED_EMBEDDER} on {device}")
|
| 90 |
+
embedding_model = SentenceTransformer(FORCED_EMBEDDER, device=device)
|
| 91 |
actual_dim = embedding_model.get_sentence_embedding_dimension()
|
| 92 |
+
if actual_dim != FORCED_DIM:
|
| 93 |
+
raise RuntimeError(f"Loaded embedder dim={actual_dim}, expected {FORCED_DIM}")
|
| 94 |
+
|
| 95 |
+
# --- Classifier sanity check (must be trained on 768) ---
|
| 96 |
+
clf_dim = getattr(clf, "n_features_in_", None)
|
| 97 |
+
if clf_dim and clf_dim != FORCED_DIM:
|
| 98 |
+
raise RuntimeError(
|
| 99 |
+
f"Classifier expects {clf_dim} features, but app is configured for {FORCED_DIM}. "
|
| 100 |
+
f"Please retrain or load a 768-dim trained classifier."
|
| 101 |
+
)
|
| 102 |
+
|
| 103 |
+
# finalize
|
| 104 |
data["embedding_model"] = embedding_model
|
| 105 |
+
data["resolved_embedding_model_name"] = FORCED_EMBEDDER
|
| 106 |
data["resolved_embedding_dim"] = actual_dim
|
| 107 |
data["device"] = device
|
| 108 |
data["lemmatizer"] = data.get("lemmatizer") or WordNetLemmatizer()
|
| 109 |
data["stop_words"] = _to_stopword_set(data.get("stop_words"))
|
| 110 |
data["max_tokens"] = data.get("max_tokens", 600)
|
| 111 |
|
| 112 |
+
print(f"✅ Using embedder: {FORCED_EMBEDDER} (dim={actual_dim}) β classifier expects {getattr(clf,'n_features_in_','unknown')}")
|
| 113 |
return data
|
| 114 |
|
| 115 |
+
# -------------------------------------------------
|
| 116 |
+
# Preprocessing
|
| 117 |
+
# -------------------------------------------------
|
| 118 |
def preprocess_text(text, lemmatizer, stop_words, max_tokens=600):
|
| 119 |
+
if pd.isna(text) or not str(text).strip():
|
| 120 |
+
return ""
|
| 121 |
text = str(text).lower()
|
| 122 |
text = re.sub(r"[^a-zA-Z\s]", " ", text)
|
| 123 |
tokens = [
|
|
|
|
| 127 |
]
|
| 128 |
return " ".join(tokens[:max_tokens])
|
| 129 |
|
| 130 |
+
# -------------------------------------------------
|
| 131 |
+
# Prediction
|
| 132 |
+
# -------------------------------------------------
|
| 133 |
def predict_text(text, model_data):
|
| 134 |
proc = preprocess_text(text, model_data["lemmatizer"], model_data["stop_words"], model_data["max_tokens"])
|
| 135 |
if not proc:
|
| 136 |
return "UNKNOWN", 0.0, {"error": "Empty text after preprocessing"}
|
| 137 |
|
| 138 |
with torch.no_grad():
|
| 139 |
+
emb = model_data["embedding_model"].encode(
|
| 140 |
+
[proc], convert_to_numpy=True, normalize_embeddings=False
|
| 141 |
+
)
|
| 142 |
if emb.ndim == 1:
|
| 143 |
emb = emb.reshape(1, -1)
|
| 144 |
|
| 145 |
clf = model_data["model"]
|
| 146 |
+
need = getattr(clf, "n_features_in_", emb.shape[1])
|
| 147 |
+
if emb.shape[1] != need:
|
| 148 |
+
return "ERROR", 0.0, {"error": f"Embedding dim {emb.shape[1]} != classifier requires {need}"}
|
| 149 |
+
|
| 150 |
try:
|
| 151 |
pred = clf.predict(emb)[0]
|
| 152 |
conf = float(np.max(clf.predict_proba(emb)[0])) if hasattr(clf, "predict_proba") else 0.5
|
|
|
|
| 155 |
|
| 156 |
return str(pred), conf, {"tokens": len(proc.split())}
|
| 157 |
|
| 158 |
+
# -------------------------------------------------
|
| 159 |
+
# Gradio App
|
| 160 |
+
# -------------------------------------------------
|
| 161 |
def create_app(model_data):
|
| 162 |
with gr.Blocks(title="Embedding-based Human vs AI Detector") as demo:
|
| 163 |
gr.Markdown("## π€π€ Human vs AI Detector (Embedding-based)")
|
|
|
|
| 173 |
headline = f"π€ **Human Written** (Conf: {conf:.1%})"
|
| 174 |
elif label.upper() == "ERROR":
|
| 175 |
headline = f"β Error: {meta.get('error', 'Unknown')}"
|
| 176 |
+
elif label.upper() == "UNKNOWN":
|
| 177 |
+
headline = f"β Unknown (Conf: {conf:.1%})"
|
| 178 |
else:
|
| 179 |
headline = f"β {label} (Conf: {conf:.1%})"
|
| 180 |
+
|
| 181 |
+
det = (
|
| 182 |
+
f"- Tokens: {meta.get('tokens','?')}\n"
|
| 183 |
+
f"- Embedding: {model_data['resolved_embedding_model_name']} "
|
| 184 |
+
f"(dim={model_data['resolved_embedding_dim']})"
|
| 185 |
+
)
|
| 186 |
return headline, det
|
| 187 |
|
| 188 |
inp.submit(_predict_ui, inp, [out, details])
|
| 189 |
gr.Button("π Predict").click(_predict_ui, inp, [out, details])
|
| 190 |
return demo
|
| 191 |
|
| 192 |
+
# -------------------------------------------------
|
| 193 |
+
# Load + Launch
|
| 194 |
+
# -------------------------------------------------
|
| 195 |
_model_data = load_embedding_model()
|
| 196 |
demo = create_app(_model_data)
|
| 197 |
|
| 198 |
if __name__ == "__main__":
|
| 199 |
+
# You can pass share=True if you need a public URL
|
| 200 |
demo.launch()
|