Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -15,17 +15,17 @@ dtype = torch.bfloat16 if (device.type=="cuda" and torch.cuda.is_bf16_supported(
|
|
| 15 |
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, dtype=dtype).to(device).eval()
|
| 16 |
|
| 17 |
# -----------------------------
|
| 18 |
-
#
|
| 19 |
# -----------------------------
|
| 20 |
-
THRESHOLD = 0.
|
| 21 |
|
| 22 |
# -----------------------------
|
| 23 |
# SENTENCE SPLITTING UTILITIES
|
| 24 |
# -----------------------------
|
| 25 |
ABBR = [
|
| 26 |
"e.g", "i.e", "mr", "mrs", "ms", "dr", "prof", "vs", "etc", "fig", "al",
|
| 27 |
-
"jr", "sr",
|
| 28 |
-
"a.m", "p.m"
|
| 29 |
]
|
| 30 |
ABBR_REGEX = re.compile(r"\b(" + "|".join(map(re.escape, ABBR)) + r")\.", flags=re.IGNORECASE)
|
| 31 |
|
|
@@ -50,6 +50,7 @@ def sentence_split(text: str):
|
|
| 50 |
if not t:
|
| 51 |
return []
|
| 52 |
parts = re.split(r"([.?!])\s+(?=(?:[\"“”‘’']?\s*[A-Z(])|$)", t)
|
|
|
|
| 53 |
sentences, buf = [], ""
|
| 54 |
for i, chunk in enumerate(parts):
|
| 55 |
if i % 2 == 0:
|
|
@@ -58,8 +59,10 @@ def sentence_split(text: str):
|
|
| 58 |
buf += chunk
|
| 59 |
sentences.append(buf.strip())
|
| 60 |
buf = ""
|
|
|
|
| 61 |
if buf.strip():
|
| 62 |
sentences.append(buf.strip())
|
|
|
|
| 63 |
return [_restore(s).strip() for s in sentences if s.strip()]
|
| 64 |
|
| 65 |
# -----------------------------
|
|
@@ -68,8 +71,7 @@ def sentence_split(text: str):
|
|
| 68 |
def group_sentences(sents, size=3):
|
| 69 |
grouped = []
|
| 70 |
for i in range(0, len(sents), size):
|
| 71 |
-
|
| 72 |
-
grouped.append(chunk)
|
| 73 |
return grouped
|
| 74 |
|
| 75 |
# -----------------------------
|
|
@@ -80,14 +82,14 @@ def analyze(text, max_len=512):
|
|
| 80 |
if not sents:
|
| 81 |
return "—", "—", "<em>Paste some text to analyze.</em>", None
|
| 82 |
|
| 83 |
-
# GROUP sentences
|
| 84 |
grouped = group_sentences(sents, size=3)
|
| 85 |
clean_grouped = [re.sub(r"\s+", " ", g).strip() for g in grouped]
|
| 86 |
|
| 87 |
# tokenize grouped
|
| 88 |
inputs = tokenizer(
|
| 89 |
-
clean_grouped, return_tensors="pt",
|
| 90 |
-
truncation=True, max_length=max_len
|
| 91 |
).to(device)
|
| 92 |
|
| 93 |
# model inference
|
|
@@ -95,7 +97,7 @@ def analyze(text, max_len=512):
|
|
| 95 |
logits = model(**inputs).logits
|
| 96 |
chunk_probs = F.softmax(logits, dim=-1)[:, 1].detach().cpu().tolist()
|
| 97 |
|
| 98 |
-
# EXPAND chunk-level probabilities
|
| 99 |
ai_probs = []
|
| 100 |
for idx, prob in enumerate(chunk_probs):
|
| 101 |
start = idx * 3
|
|
@@ -106,28 +108,34 @@ def analyze(text, max_len=512):
|
|
| 106 |
# overall AI score
|
| 107 |
overall_ai = sum(ai_probs) / len(ai_probs)
|
| 108 |
overall_pct = f"{overall_ai * 100:.1f}%"
|
|
|
|
|
|
|
| 109 |
overall_label = (
|
| 110 |
"🤖 Likely AI Written" if overall_ai >= THRESHOLD else "🧑 Likely Human Written"
|
| 111 |
)
|
| 112 |
|
| 113 |
# HIGHLIGHTS + TABLE
|
| 114 |
rows, highlights = [], []
|
|
|
|
| 115 |
for i, orig in enumerate(sents, start=1):
|
| 116 |
ai_p = float(ai_probs[i-1])
|
|
|
|
|
|
|
|
|
|
| 117 |
label = "AI" if ai_p >= THRESHOLD else "Human"
|
| 118 |
-
pct = f"{ai_p*100:.1f}%"
|
| 119 |
|
| 120 |
-
# color logic
|
| 121 |
if ai_p < 0.30:
|
| 122 |
-
color = "#11823b"
|
| 123 |
elif ai_p < 0.70:
|
| 124 |
-
color = "#b8860b"
|
| 125 |
else:
|
| 126 |
-
color = "#b80d0d"
|
| 127 |
|
| 128 |
normalized = re.sub(r"\s+", " ", orig)
|
|
|
|
| 129 |
highlights.append(
|
| 130 |
-
"<div style='margin:6px 0; padding:6px 8px; border-radius:6px;
|
| 131 |
"background:rgba(0,0,0,0.03)'>"
|
| 132 |
f"<strong style='color:{color}'>[{pct} {label}]</strong> "
|
| 133 |
f"{normalized}</div>"
|
|
@@ -144,7 +152,7 @@ def analyze(text, max_len=512):
|
|
| 144 |
# GRADIO UI
|
| 145 |
# -----------------------------
|
| 146 |
with gr.Blocks() as demo:
|
| 147 |
-
gr.Markdown("### 🕵️ AI Written Text Detector — Fakespot Model (
|
| 148 |
|
| 149 |
text_input = gr.Textbox(label="Paste text", lines=14, placeholder="Your content…")
|
| 150 |
btn = gr.Button("Analyze")
|
|
|
|
| 15 |
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, dtype=dtype).to(device).eval()
|
| 16 |
|
| 17 |
# -----------------------------
|
| 18 |
+
# AI DECISION THRESHOLD (80%)
|
| 19 |
# -----------------------------
|
| 20 |
+
THRESHOLD = 0.80 # AI from 80% and above
|
| 21 |
|
| 22 |
# -----------------------------
|
| 23 |
# SENTENCE SPLITTING UTILITIES
|
| 24 |
# -----------------------------
|
| 25 |
ABBR = [
|
| 26 |
"e.g", "i.e", "mr", "mrs", "ms", "dr", "prof", "vs", "etc", "fig", "al",
|
| 27 |
+
"jr", "sr", "st", "no", "vol", "pp", "mt", "inc", "ltd", "co",
|
| 28 |
+
"u.s", "u.k", "a.m", "p.m"
|
| 29 |
]
|
| 30 |
ABBR_REGEX = re.compile(r"\b(" + "|".join(map(re.escape, ABBR)) + r")\.", flags=re.IGNORECASE)
|
| 31 |
|
|
|
|
| 50 |
if not t:
|
| 51 |
return []
|
| 52 |
parts = re.split(r"([.?!])\s+(?=(?:[\"“”‘’']?\s*[A-Z(])|$)", t)
|
| 53 |
+
|
| 54 |
sentences, buf = [], ""
|
| 55 |
for i, chunk in enumerate(parts):
|
| 56 |
if i % 2 == 0:
|
|
|
|
| 59 |
buf += chunk
|
| 60 |
sentences.append(buf.strip())
|
| 61 |
buf = ""
|
| 62 |
+
|
| 63 |
if buf.strip():
|
| 64 |
sentences.append(buf.strip())
|
| 65 |
+
|
| 66 |
return [_restore(s).strip() for s in sentences if s.strip()]
|
| 67 |
|
| 68 |
# -----------------------------
|
|
|
|
| 71 |
def group_sentences(sents, size=3):
|
| 72 |
grouped = []
|
| 73 |
for i in range(0, len(sents), size):
|
| 74 |
+
grouped.append(" ".join(sents[i:i+size]))
|
|
|
|
| 75 |
return grouped
|
| 76 |
|
| 77 |
# -----------------------------
|
|
|
|
| 82 |
if not sents:
|
| 83 |
return "—", "—", "<em>Paste some text to analyze.</em>", None
|
| 84 |
|
| 85 |
+
# GROUP sentences (3 at a time)
|
| 86 |
grouped = group_sentences(sents, size=3)
|
| 87 |
clean_grouped = [re.sub(r"\s+", " ", g).strip() for g in grouped]
|
| 88 |
|
| 89 |
# tokenize grouped
|
| 90 |
inputs = tokenizer(
|
| 91 |
+
clean_grouped, return_tensors="pt",
|
| 92 |
+
padding=True, truncation=True, max_length=max_len
|
| 93 |
).to(device)
|
| 94 |
|
| 95 |
# model inference
|
|
|
|
| 97 |
logits = model(**inputs).logits
|
| 98 |
chunk_probs = F.softmax(logits, dim=-1)[:, 1].detach().cpu().tolist()
|
| 99 |
|
| 100 |
+
# EXPAND chunk-level probabilities to per-sentence
|
| 101 |
ai_probs = []
|
| 102 |
for idx, prob in enumerate(chunk_probs):
|
| 103 |
start = idx * 3
|
|
|
|
| 108 |
# overall AI score
|
| 109 |
overall_ai = sum(ai_probs) / len(ai_probs)
|
| 110 |
overall_pct = f"{overall_ai * 100:.1f}%"
|
| 111 |
+
|
| 112 |
+
# UPDATED THRESHOLD (80%)
|
| 113 |
overall_label = (
|
| 114 |
"🤖 Likely AI Written" if overall_ai >= THRESHOLD else "🧑 Likely Human Written"
|
| 115 |
)
|
| 116 |
|
| 117 |
# HIGHLIGHTS + TABLE
|
| 118 |
rows, highlights = [], []
|
| 119 |
+
|
| 120 |
for i, orig in enumerate(sents, start=1):
|
| 121 |
ai_p = float(ai_probs[i-1])
|
| 122 |
+
pct = f"{ai_p * 100:.1f}%"
|
| 123 |
+
|
| 124 |
+
# UPDATED — label decided by 80%
|
| 125 |
label = "AI" if ai_p >= THRESHOLD else "Human"
|
|
|
|
| 126 |
|
| 127 |
+
# color logic (unchanged)
|
| 128 |
if ai_p < 0.30:
|
| 129 |
+
color = "#11823b" # green
|
| 130 |
elif ai_p < 0.70:
|
| 131 |
+
color = "#b8860b" # amber
|
| 132 |
else:
|
| 133 |
+
color = "#b80d0d" # red
|
| 134 |
|
| 135 |
normalized = re.sub(r"\s+", " ", orig)
|
| 136 |
+
|
| 137 |
highlights.append(
|
| 138 |
+
"<div style='margin:6px 0; padding:6px 8px; border-radius:6px;"
|
| 139 |
"background:rgba(0,0,0,0.03)'>"
|
| 140 |
f"<strong style='color:{color}'>[{pct} {label}]</strong> "
|
| 141 |
f"{normalized}</div>"
|
|
|
|
| 152 |
# GRADIO UI
|
| 153 |
# -----------------------------
|
| 154 |
with gr.Blocks() as demo:
|
| 155 |
+
gr.Markdown("### 🕵️ AI Written Text Detector — Fakespot Model (80% Threshold)")
|
| 156 |
|
| 157 |
text_input = gr.Textbox(label="Paste text", lines=14, placeholder="Your content…")
|
| 158 |
btn = gr.Button("Analyze")
|