Spaces:
Running
Running
File size: 6,179 Bytes
6d8431a 41a5821 6d8431a 49d2f3f ceeca7d 0d83dcd 6d8431a 0d83dcd 6d8431a 814a384 0d83dcd 21a21f1 41a5821 0d83dcd 96ab1a6 0d83dcd 96ab1a6 6d8431a 96ab1a6 21a21f1 96ab1a6 6d8431a 21a21f1 96ab1a6 21a21f1 96ab1a6 21a21f1 96ab1a6 21a21f1 96ab1a6 21a21f1 96ab1a6 21a21f1 96ab1a6 21a21f1 96ab1a6 21a21f1 96ab1a6 814a384 96ab1a6 814a384 96ab1a6 ea83121 96ab1a6 6d8431a 96ab1a6 26af59c 21a21f1 96ab1a6 26af59c 814a384 6d8431a 96ab1a6 6d8431a 21a21f1 814a384 96ab1a6 814a384 26af59c 814a384 96ab1a6 ea83121 26af59c 96ab1a6 26af59c 96ab1a6 23b2adf 26af59c 96ab1a6 26af59c 96ab1a6 26af59c 96ab1a6 26af59c 96ab1a6 26af59c 96ab1a6 26af59c 96ab1a6 26af59c 96ab1a6 26af59c 6d8431a 26af59c ea83121 6d8431a 26af59c 6d8431a 96ab1a6 b0b36a6 96ab1a6 6d8431a 0d83dcd 96ab1a6 21a21f1 0d83dcd 6d8431a 49d2f3f 6d8431a | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 | import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import re
import pandas as pd
import gradio as gr
# -----------------------------
# MODEL (Fakespot 2025)
# -----------------------------
# RoBERTa-base sequence classifier fine-tuned for AI-text detection.
MODEL_NAME = "fakespot-ai/roberta-base-ai-text-detection-v1"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Prefer GPU when available; bf16 only on CUDA devices that support it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dtype = torch.bfloat16 if (device.type=="cuda" and torch.cuda.is_bf16_supported()) else torch.float32
# eval() disables dropout; inference-only usage below (no grad).
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, dtype=dtype).to(device).eval()
# Overall-document probability at or above this is labeled "Likely AI Written".
THRESHOLD = 0.80
# -----------------------------
# ABBREVIATION PROTECTION
# -----------------------------
# Abbreviations whose trailing period must NOT end a sentence.
# Entries are listed without the final dot; dotted forms like "e.g"/"u.s"
# cover their internal periods ("e.g." -> matches "e.g" + ".").
ABBR = [
    "e.g", "i.e", "mr", "mrs", "ms", "dr", "prof", "vs", "etc", "fig", "al",
    "jr", "sr", "st", "no", "vol", "pp", "mt", "inc", "ltd", "co",
    "u.s", "u.k", "a.m", "p.m"
]
# Matches any abbreviation (word-boundary anchored, case-insensitive)
# immediately followed by a period, so that period can be masked.
ABBR_REGEX = re.compile(r"\b(" + "|".join(map(re.escape, ABBR)) + r")\.", re.IGNORECASE)
def _protect(text):
    """Mask ellipses, decimal points, and abbreviation dots with sentinel
    tokens so the sentence splitter does not treat them as terminators."""
    masked = text.replace("...", "⟨ELLIPSIS⟩")
    # A dot with a digit on both sides is a decimal point, not punctuation.
    masked = re.sub(r"(?<=\d)\.(?=\d)", "⟨DECIMAL⟩", masked)
    # Dots that belong to known abbreviations (e.g., "Dr.", "i.e.").
    return ABBR_REGEX.sub(r"\1⟨ABBRDOT⟩", masked)
def _restore(text):
return (
text.replace("⟨ABBRDOT⟩", ".")
.replace("⟨DECIMAL⟩", ".")
.replace("⟨ELLIPSIS⟩", "...")
)
# -----------------------------
# PERFECT PARAGRAPH-PRESERVING SPLITTER
# -----------------------------
def split_preserving_structure(text):
    """
    Split *text* into an ordered list of blocks that, concatenated, should
    reproduce the original layout:

    - runs of newlines ("\\n", "\\n\\n", ...) are kept verbatim,
    - whitespace-only segments between newline runs are kept verbatim,
    - everything else is split into sentences, with each sentence's
      trailing whitespace emitted as its own block.

    Fix: the original dropped whitespace-only non-newline blocks (e.g. a
    line of spaces between two blank lines) because they failed the
    ``whole.strip()`` check, so reconstruction was not exact even though
    ``analyze`` already handles such blocks via ``isspace()``.
    """
    blocks = re.split(r"(\n+)", text)  # capturing group keeps the separators
    final_blocks = []
    for block in blocks:
        if block.startswith("\n"):
            # Newline separator: preserve EXACT paragraph spacing.
            final_blocks.append(block)
            continue
        if block and not block.strip():
            # Whitespace-only segment (no newlines): keep it verbatim so
            # the reconstruction stays byte-exact.
            final_blocks.append(block)
            continue
        protected = _protect(block)
        # Split into (sentence, terminator, following-whitespace) triples.
        parts = re.split(r"([.?!])(\s+)", protected)
        for i in range(0, len(parts), 3):
            sentence = parts[i]
            punct = parts[i + 1] if i + 1 < len(parts) else ""
            space = parts[i + 2] if i + 2 < len(parts) else ""
            whole = sentence + punct
            if whole.strip():
                final_blocks.append(_restore(whole))
            if space:
                final_blocks.append(space)
    return final_blocks
def extract_sentences_only(blocks):
    """Filter *blocks* down to sentence blocks, dropping newline runs and
    whitespace-only entries."""
    sentences = []
    for block in blocks:
        if block.startswith("\n") or block.isspace() or not block.strip():
            continue
        sentences.append(block)
    return sentences
# -----------------------------
# GROUPING
# -----------------------------
def group_sentences(sents, size=3):
    """Join consecutive sentences into space-separated windows of *size*
    (the final window may be shorter)."""
    grouped = []
    for start in range(0, len(sents), size):
        grouped.append(" ".join(sents[start:start + size]))
    return grouped
# -----------------------------
# ANALYSIS LOGIC
# -----------------------------
def analyze(text, max_len=512):
    """
    Score *text* sentence-by-sentence with the AI-detection model.

    Parameters:
        text: raw user input (may contain arbitrary paragraph spacing).
        max_len: tokenizer truncation length per 3-sentence window.

    Returns a 4-tuple matching the Gradio outputs:
        (verdict label, overall AI-probability string,
         HTML with per-sentence highlighting, DataFrame of per-sentence scores)
    or placeholder values when no sentences are found.

    Fix: the original appended an extra " " after every highlighted
    sentence, even though the splitter already emits the original
    inter-sentence whitespace as separate blocks — so sentences were
    double-spaced and stray spaces appeared before newlines, breaking the
    "exact structure" reconstruction. The redundant append is removed.
    """
    # Structured block split (newlines/whitespace preserved as blocks).
    blocks = split_preserving_structure(text)
    pure_sentences = extract_sentences_only(blocks)
    if not pure_sentences:
        return "—", "—", "<em>Paste text to analyze.</em>", None

    # Score 3-sentence windows (Turnitin style: more context per inference).
    grouped = group_sentences(pure_sentences, 3)
    clean_grouped = [re.sub(r"\s+", " ", g).strip() for g in grouped]

    # Run the model on all windows in one batch.
    inputs = tokenizer(clean_grouped, return_tensors="pt",
                       padding=True, truncation=True,
                       max_length=max_len).to(device)
    with torch.no_grad():
        logits = model(**inputs).logits
    # NOTE(review): assumes class index 1 is the "AI-written" label for this
    # checkpoint — confirm against the model card.
    chunk_probs = F.softmax(logits, dim=-1)[:, 1].cpu().tolist()

    # Broadcast each window's probability back onto its member sentences.
    ai_scores = []
    for idx, prob in enumerate(chunk_probs):
        start = idx * 3
        end = min(start + 3, len(pure_sentences))
        ai_scores.extend([prob] * (end - start))

    # -----------------------------
    # RECONSTRUCT ORIGINAL TEXT W/ HIGHLIGHTING
    # -----------------------------
    highlighted = ""
    current_sentence = 0
    for block in blocks:
        # Newline runs and whitespace blocks are reproduced verbatim; they
        # already carry the exact original spacing.
        if block.startswith("\n") or block.isspace():
            highlighted += block
            continue
        # Real sentence → wrap in a colored, percentage-labeled span.
        ai_p = ai_scores[current_sentence]
        current_sentence += 1
        pct = f"{ai_p * 100:.1f}%"
        if ai_p < 0.30:
            color = "#11823b"   # green: likely human
        elif ai_p < 0.70:
            color = "#b8860b"   # amber: uncertain
        else:
            color = "#b80d0d"   # red: likely AI
        highlighted += (
            f"<span style='background:rgba(0,0,0,0.03); padding:3px 4px; "
            f"border-radius:4px;'><strong style='color:{color}'>[{pct}]</strong> "
            f"{block.strip()}</span>"
        )

    # -----------------------------
    # OVERALL SCORE
    # -----------------------------
    # pure_sentences is non-empty here, so ai_scores is too (no div-by-zero).
    overall = sum(ai_scores) / len(ai_scores)
    overall_pct = f"{overall * 100:.1f}%"
    overall_label = "🤖 Likely AI Written" if overall >= THRESHOLD else "🧒 Likely Human Written"

    # Tabular per-sentence breakdown for the Dataframe output.
    df = pd.DataFrame(
        [[i + 1, s, ai_scores[i]] for i, s in enumerate(pure_sentences)],
        columns=["#", "Sentence", "AI_Prob"]
    )
    return overall_label, overall_pct, highlighted, df
# -----------------------------
# UI
# -----------------------------
# Gradio interface: one text box in, verdict/score labels, highlighted HTML
# reconstruction, and a per-sentence score table out.
with gr.Blocks() as demo:
    gr.Markdown("### 🕵️ AI Sentence-Level Detector — Exact Structure Highlighting")
    text_input = gr.Textbox(label="Paste text", lines=14, placeholder="Your text…")
    btn = gr.Button("Analyze")
    verdict = gr.Label(label="Verdict (Overall)")
    score = gr.Label(label="AI Score")
    highlights = gr.HTML(label="Highlighted Text (Exact Structure)")
    table = gr.Dataframe(headers=["#", "Sentence", "AI_Prob"], wrap=True)
    # Outputs map 1:1 to analyze()'s 4-tuple return.
    btn.click(analyze, inputs=[text_input], outputs=[verdict, score, highlights, table])

if __name__ == "__main__":
    demo.launch()
|