Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,28 +1,27 @@
|
|
| 1 |
# app.py
|
| 2 |
-
# -*- coding: utf-8 -*-
|
| 3 |
-
|
| 4 |
import os
|
| 5 |
import re
|
| 6 |
-
import warnings
|
| 7 |
import joblib
|
|
|
|
|
|
|
| 8 |
import numpy as np
|
| 9 |
import pandas as pd
|
| 10 |
-
import
|
|
|
|
|
|
|
| 11 |
from nltk.tokenize import word_tokenize
|
| 12 |
from nltk.stem import WordNetLemmatizer
|
| 13 |
-
from nltk.corpus import stopwords
|
| 14 |
-
import nltk
|
| 15 |
-
import gradio as gr
|
| 16 |
from sentence_transformers import SentenceTransformer
|
| 17 |
from huggingface_hub import hf_hub_download
|
| 18 |
|
| 19 |
warnings.filterwarnings("ignore")
|
| 20 |
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
|
|
|
|
| 26 |
def ensure_nltk():
|
| 27 |
try: nltk.data.find("tokenizers/punkt")
|
| 28 |
except LookupError: nltk.download("punkt")
|
|
@@ -34,40 +33,35 @@ def ensure_nltk():
|
|
| 34 |
ensure_nltk()
|
| 35 |
|
| 36 |
def _to_stopword_set(sw):
|
| 37 |
-
if sw is None:
|
| 38 |
-
|
| 39 |
-
if isinstance(sw,
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
if dim == 768: return "sentence-transformers/all-mpnet-base-v2"
|
| 50 |
-
if dim == 384: return "sentence-transformers/all-MiniLM-L6-v2"
|
| 51 |
-
return "sentence-transformers/all-mpnet-base-v2"
|
| 52 |
|
| 53 |
def load_embedding_model():
|
| 54 |
-
# Joblib siqilgan lite bundle ni yuklash (kichik va tez)
|
| 55 |
path = hf_hub_download(
|
| 56 |
repo_id=REPO_ID,
|
| 57 |
filename=FILENAME,
|
| 58 |
-
repo_type=REPO_TYPE
|
| 59 |
-
token=HF_TOKEN
|
| 60 |
)
|
|
|
|
|
|
|
| 61 |
data = joblib.load(path)
|
| 62 |
-
print(f"β
Loaded lite bundle: {REPO_ID}/{FILENAME}")
|
| 63 |
|
| 64 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 65 |
-
env_name = os.getenv("EMBEDDING_MODEL_NAME", "").strip()
|
| 66 |
stored_name = (data.get("embedding_model_name") or data.get("embedding_model_path") or "").strip()
|
| 67 |
-
expected_dim = int(data.get("embedding_dim", 0)) if data.get("embedding_dim")
|
| 68 |
-
emb_name =
|
| 69 |
|
| 70 |
-
print(f"π§ Loading
|
| 71 |
embedding_model = SentenceTransformer(emb_name, device=device)
|
| 72 |
actual_dim = embedding_model.get_sentence_embedding_dimension()
|
| 73 |
|
|
@@ -75,14 +69,10 @@ def load_embedding_model():
|
|
| 75 |
data["resolved_embedding_model_name"] = emb_name
|
| 76 |
data["resolved_embedding_dim"] = actual_dim
|
| 77 |
data["device"] = device
|
| 78 |
-
|
| 79 |
-
if "lemmatizer" not in data or data["lemmatizer"] is None:
|
| 80 |
-
data["lemmatizer"] = WordNetLemmatizer()
|
| 81 |
data["stop_words"] = _to_stopword_set(data.get("stop_words"))
|
| 82 |
-
|
| 83 |
-
data["max_tokens"] = 600
|
| 84 |
|
| 85 |
-
print(f"βΉοΈ Expect dim={expected_dim}, using {emb_name} (dim={actual_dim})")
|
| 86 |
return data
|
| 87 |
|
| 88 |
def preprocess_text(text, lemmatizer, stop_words, max_tokens=600):
|
|
@@ -94,27 +84,24 @@ def preprocess_text(text, lemmatizer, stop_words, max_tokens=600):
|
|
| 94 |
for tok in word_tokenize(text)
|
| 95 |
if tok not in stop_words and len(tok) > 2
|
| 96 |
]
|
| 97 |
-
|
| 98 |
-
tokens = tokens[:max_tokens]
|
| 99 |
-
return " ".join(tokens)
|
| 100 |
|
| 101 |
def predict_text(text, model_data):
|
| 102 |
proc = preprocess_text(text, model_data["lemmatizer"], model_data["stop_words"], model_data["max_tokens"])
|
| 103 |
if not proc:
|
| 104 |
return "UNKNOWN", 0.0, {"error": "Empty text after preprocessing"}
|
|
|
|
| 105 |
with torch.no_grad():
|
| 106 |
emb = model_data["embedding_model"].encode([proc], convert_to_numpy=True, normalize_embeddings=False)
|
| 107 |
-
if emb.ndim == 1:
|
| 108 |
-
|
| 109 |
clf = model_data["model"]
|
| 110 |
try:
|
| 111 |
pred = clf.predict(emb)[0]
|
| 112 |
-
if hasattr(clf, "predict_proba")
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
except ValueError as e:
|
| 117 |
-
return "ERROR_DIM_MISMATCH", 0.0, {"error": str(e)}
|
| 118 |
return str(pred), conf, {"tokens": len(proc.split())}
|
| 119 |
|
| 120 |
def create_app(model_data):
|
|
@@ -130,8 +117,8 @@ def create_app(model_data):
|
|
| 130 |
headline = f"π€ **AI Generated** (Conf: {conf:.1%})"
|
| 131 |
elif label.upper() == "HUMAN":
|
| 132 |
headline = f"π€ **Human Written** (Conf: {conf:.1%})"
|
| 133 |
-
elif label == "
|
| 134 |
-
headline = f"β
|
| 135 |
else:
|
| 136 |
headline = f"β {label} (Conf: {conf:.1%})"
|
| 137 |
det = f"- Tokens: {meta.get('tokens','?')}\n- Embedding: {model_data['resolved_embedding_model_name']} (dim={model_data['resolved_embedding_dim']})"
|
|
@@ -139,6 +126,7 @@ def create_app(model_data):
|
|
| 139 |
|
| 140 |
inp.submit(_predict_ui, inp, [out, details])
|
| 141 |
gr.Button("π Predict").click(_predict_ui, inp, [out, details])
|
|
|
|
| 142 |
return demo
|
| 143 |
|
| 144 |
_model_data = load_embedding_model()
|
|
|
|
| 1 |
# app.py
|
|
|
|
|
|
|
| 2 |
import os
|
| 3 |
import re
|
|
|
|
| 4 |
import joblib
|
| 5 |
+
import torch
|
| 6 |
+
import gradio as gr
|
| 7 |
import numpy as np
|
| 8 |
import pandas as pd
|
| 9 |
+
import warnings
|
| 10 |
+
import nltk
|
| 11 |
+
from nltk.corpus import stopwords
|
| 12 |
from nltk.tokenize import word_tokenize
|
| 13 |
from nltk.stem import WordNetLemmatizer
|
|
|
|
|
|
|
|
|
|
| 14 |
from sentence_transformers import SentenceTransformer
|
| 15 |
from huggingface_hub import hf_hub_download
|
| 16 |
|
| 17 |
warnings.filterwarnings("ignore")
|
| 18 |
|
| 19 |
+
# Hugging Face repo config
|
| 20 |
+
REPO_ID = "Detecting-ai/text-detector-model-embedding"
|
| 21 |
+
FILENAME = "complete_trained_model_lite.joblib"
|
| 22 |
+
REPO_TYPE = "model"
|
| 23 |
|
| 24 |
+
# NLTK ensure
|
| 25 |
def ensure_nltk():
|
| 26 |
try: nltk.data.find("tokenizers/punkt")
|
| 27 |
except LookupError: nltk.download("punkt")
|
|
|
|
| 33 |
ensure_nltk()
|
| 34 |
|
| 35 |
def _to_stopword_set(sw):
|
| 36 |
+
if sw is None: return set(stopwords.words("english"))
|
| 37 |
+
if isinstance(sw, (list, tuple)): return set(sw)
|
| 38 |
+
if isinstance(sw, set): return sw
|
| 39 |
+
try: return set(sw)
|
| 40 |
+
except: return set(stopwords.words("english"))
|
| 41 |
+
|
| 42 |
+
def _guess_model_by_dim(dim):
|
| 43 |
+
if dim == 768:
|
| 44 |
+
return "sentence-transformers/all-mpnet-base-v2"
|
| 45 |
+
if dim == 384:
|
| 46 |
+
return "sentence-transformers/all-MiniLM-L6-v2"
|
| 47 |
+
return "sentence-transformers/all-MiniLM-L6-v2"
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
def load_embedding_model():
|
|
|
|
| 50 |
path = hf_hub_download(
|
| 51 |
repo_id=REPO_ID,
|
| 52 |
filename=FILENAME,
|
| 53 |
+
repo_type=REPO_TYPE
|
|
|
|
| 54 |
)
|
| 55 |
+
print(f"β
Downloaded bundle from Hugging Face: {FILENAME}")
|
| 56 |
+
|
| 57 |
data = joblib.load(path)
|
|
|
|
| 58 |
|
| 59 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
|
|
| 60 |
stored_name = (data.get("embedding_model_name") or data.get("embedding_model_path") or "").strip()
|
| 61 |
+
expected_dim = int(data.get("embedding_dim", 0)) if data.get("embedding_dim") else 0
|
| 62 |
+
emb_name = stored_name or _guess_model_by_dim(expected_dim)
|
| 63 |
|
| 64 |
+
print(f"π§ Loading embedding model: {emb_name} on {device}")
|
| 65 |
embedding_model = SentenceTransformer(emb_name, device=device)
|
| 66 |
actual_dim = embedding_model.get_sentence_embedding_dimension()
|
| 67 |
|
|
|
|
| 69 |
data["resolved_embedding_model_name"] = emb_name
|
| 70 |
data["resolved_embedding_dim"] = actual_dim
|
| 71 |
data["device"] = device
|
| 72 |
+
data["lemmatizer"] = data.get("lemmatizer") or WordNetLemmatizer()
|
|
|
|
|
|
|
| 73 |
data["stop_words"] = _to_stopword_set(data.get("stop_words"))
|
| 74 |
+
data["max_tokens"] = data.get("max_tokens", 600)
|
|
|
|
| 75 |
|
|
|
|
| 76 |
return data
|
| 77 |
|
| 78 |
def preprocess_text(text, lemmatizer, stop_words, max_tokens=600):
|
|
|
|
| 84 |
for tok in word_tokenize(text)
|
| 85 |
if tok not in stop_words and len(tok) > 2
|
| 86 |
]
|
| 87 |
+
return " ".join(tokens[:max_tokens])
|
|
|
|
|
|
|
| 88 |
|
| 89 |
def predict_text(text, model_data):
    """Classify *text* with the embedding model and classifier in *model_data*.

    Args:
        text: Raw input text.
        model_data: Mapping that provides the ``"lemmatizer"``,
            ``"stop_words"``, ``"max_tokens"``, ``"embedding_model"`` and
            ``"model"`` entries read below.

    Returns:
        A ``(label, confidence, meta)`` tuple:
        - ``("UNKNOWN", 0.0, {"error": ...})`` when preprocessing leaves
          nothing to classify;
        - ``("ERROR", 0.0, {"error": ...})`` when the classifier raises;
        - otherwise the predicted label as a string, the highest
          ``predict_proba`` value (or 0.5 when the classifier has no
          ``predict_proba``), and ``{"tokens": <token count>}``.
    """
    cleaned = preprocess_text(
        text,
        model_data["lemmatizer"],
        model_data["stop_words"],
        model_data["max_tokens"],
    )
    if not cleaned:
        return "UNKNOWN", 0.0, {"error": "Empty text after preprocessing"}

    with torch.no_grad():
        vectors = model_data["embedding_model"].encode(
            [cleaned],
            convert_to_numpy=True,
            normalize_embeddings=False,
        )
    # The classifier is handed a 2-D array; promote a flat vector to one row.
    if vectors.ndim == 1:
        vectors = vectors.reshape(1, -1)

    classifier = model_data["model"]
    try:
        label = classifier.predict(vectors)[0]
        if hasattr(classifier, "predict_proba"):
            confidence = float(np.max(classifier.predict_proba(vectors)[0]))
        else:
            # No probability estimates available: use a fixed placeholder.
            confidence = 0.5
    except Exception as exc:
        return "ERROR", 0.0, {"error": str(exc)}

    return str(label), confidence, {"tokens": len(cleaned.split())}
|
| 106 |
|
| 107 |
def create_app(model_data):
|
|
|
|
| 117 |
headline = f"π€ **AI Generated** (Conf: {conf:.1%})"
|
| 118 |
elif label.upper() == "HUMAN":
|
| 119 |
headline = f"π€ **Human Written** (Conf: {conf:.1%})"
|
| 120 |
+
elif label.upper() == "ERROR":
|
| 121 |
+
headline = f"β Error: {meta.get('error', 'Unknown')}"
|
| 122 |
else:
|
| 123 |
headline = f"β {label} (Conf: {conf:.1%})"
|
| 124 |
det = f"- Tokens: {meta.get('tokens','?')}\n- Embedding: {model_data['resolved_embedding_model_name']} (dim={model_data['resolved_embedding_dim']})"
|
|
|
|
| 126 |
|
| 127 |
inp.submit(_predict_ui, inp, [out, details])
|
| 128 |
gr.Button("π Predict").click(_predict_ui, inp, [out, details])
|
| 129 |
+
|
| 130 |
return demo
|
| 131 |
|
| 132 |
_model_data = load_embedding_model()
|