import html
import re

import gradio as gr
import pandas as pd
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# -----------------------------
# MODEL (Fakespot 2025)
# -----------------------------
# RoBERTa-base binary classifier for AI-generated-text detection.
MODEL_NAME = "fakespot-ai/roberta-base-ai-text-detection-v1"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Prefer GPU when available; bf16 only when the GPU actually supports it,
# otherwise fall back to full fp32.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dtype = torch.bfloat16 if (device.type == "cuda" and torch.cuda.is_bf16_supported()) else torch.float32
# NOTE(review): the `dtype=` kwarg requires a recent transformers release
# (older versions spell it `torch_dtype=`) — confirm the pinned version.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, dtype=dtype).to(device).eval()
# Overall-document probability above which the verdict flips to "Likely AI".
THRESHOLD = 0.80
# -----------------------------
# ABBREVIATION PROTECTION
# -----------------------------
# Abbreviations whose trailing "." must NOT be treated as a sentence
# terminator. Entries are stored without the final dot; some contain
# internal dots ("u.s", "a.m") which re.escape handles below.
ABBR = [
    "e.g", "i.e", "mr", "mrs", "ms", "dr", "prof", "vs", "etc",
    "fig", "al", "jr", "sr", "st", "no", "vol", "pp", "mt",
    "inc", "ltd", "co", "u.s", "u.k", "a.m", "p.m"
]
# Matches any listed abbreviation followed by a dot (case-insensitive);
# group 1 keeps the abbreviation text so only the dot gets masked.
ABBR_REGEX = re.compile(r"\b(" + "|".join(map(re.escape, ABBR)) + r")\.", re.IGNORECASE)
def _protect(text):
    """Mask dots that do not terminate a sentence with placeholder tokens.

    Order matters: ellipses are masked before decimal points and
    abbreviation dots so "1...2" is not mistaken for a decimal number.
    """
    masked = text.replace("...", "⟨ELLIPSIS⟩")
    masked = re.sub(r"(?<=\d)\.(?=\d)", "⟨DECIMAL⟩", masked)
    return ABBR_REGEX.sub(r"\1⟨ABBRDOT⟩", masked)
def _restore(text):
return (
text.replace("⟨ABBRDOT⟩", ".")
.replace("⟨DECIMAL⟩", ".")
.replace("⟨ELLIPSIS⟩", "...")
)
# -----------------------------
# PERFECT PARAGRAPH-PRESERVING SPLITTER
# -----------------------------
def split_preserving_structure(text):
    """Split *text* into sentence pieces while keeping layout intact.

    Every newline run and every inter-sentence whitespace run is emitted as
    its own list element, so concatenating the returned list reproduces the
    original text's structure.
    """
    pieces = []
    # Capturing the \n+ separator makes newline runs survive the split.
    for segment in re.split(r"(\n+)", text):
        if segment.startswith("\n"):
            pieces.append(segment)
            continue
        shielded = _protect(segment)
        # The two capture groups keep the terminator and trailing whitespace,
        # so `chunks` arrives in (sentence, punct, space) triples.
        chunks = re.split(r"([.?!])(\s+)", shielded)
        for start in range(0, len(chunks), 3):
            body = chunks[start]
            punct = chunks[start + 1] if start + 1 < len(chunks) else ""
            gap = chunks[start + 2] if start + 2 < len(chunks) else ""
            candidate = body + punct
            if candidate.strip():
                pieces.append(_restore(candidate))
            if gap:
                pieces.append(gap)
    return pieces
def extract_sentences_only(blocks):
    """Filter a structure-preserving block list down to real sentences,
    dropping newline runs, whitespace spacers, and empty strings."""
    sentences = []
    for candidate in blocks:
        if not candidate.strip():
            continue
        if candidate.startswith("\n") or candidate.isspace():
            continue
        sentences.append(candidate)
    return sentences
# -----------------------------
# GROUPING
# -----------------------------
def group_sentences(sents, size=3):
    """Join consecutive sentences into space-separated windows of *size*
    (the final window may be shorter)."""
    windows = []
    for start in range(0, len(sents), size):
        windows.append(" ".join(sents[start:start + size]))
    return windows
# -----------------------------
# ANALYSIS LOGIC
# -----------------------------
def analyze(text, max_len=512):
    """Score *text* for AI authorship, sentence by sentence.

    Parameters
    ----------
    text : str
        Raw user input; paragraph/space structure is preserved in the output.
    max_len : int
        Tokenizer truncation length per 3-sentence window (default 512).

    Returns
    -------
    tuple
        (verdict label, overall percentage string, highlighted HTML,
        per-sentence pandas DataFrame) — or placeholders and ``None`` for the
        DataFrame when no sentences are found.
    """
    import html  # stdlib; used to escape user text before HTML embedding

    blocks = split_preserving_structure(text)
    pure_sentences = extract_sentences_only(blocks)
    if not pure_sentences:
        return "—", "—", "<em>Paste text to analyze.</em>", None
    # Score 3-sentence windows rather than lone sentences (less noisy).
    grouped = group_sentences(pure_sentences, 3)
    clean_grouped = [re.sub(r"\s+", " ", g).strip() for g in grouped]
    # Single batched forward pass over all windows.
    inputs = tokenizer(clean_grouped, return_tensors="pt",
                       padding=True, truncation=True,
                       max_length=max_len).to(device)
    with torch.no_grad():
        logits = model(**inputs).logits
    # NOTE(review): assumes class index 1 is the "AI" label — confirm against
    # the model card's id2label mapping.
    chunk_probs = F.softmax(logits, dim=-1)[:, 1].cpu().tolist()
    # Expand each window score back to one score per member sentence.
    ai_scores = []
    for idx, prob in enumerate(chunk_probs):
        start = idx * 3
        end = min(start + 3, len(pure_sentences))
        ai_scores.extend([prob] * (end - start))
    # -----------------------------
    # RECONSTRUCTION WITH HIGHLIGHT
    # -----------------------------
    highlighted = ""
    sentence_index = 0
    for block in blocks:
        if block.startswith("\n"):
            # Bug fix: literal "\n" collapses when rendered as HTML; emit
            # explicit <br> tags so paragraph breaks actually show.
            highlighted += "<br>" * len(block)
            continue
        if block.isspace():
            highlighted += block
            continue
        # Safety net: never index past the end if grouping/expansion ever
        # drifts out of sync with the block list.
        if sentence_index >= len(ai_scores):
            ai_p = ai_scores[-1]
        else:
            ai_p = ai_scores[sentence_index]
            sentence_index += 1
        pct = f"{ai_p * 100:.1f}%"
        # Traffic-light colouring: green < 30%, amber < 70%, red otherwise.
        if ai_p < 0.30:
            color = "#11823b"
        elif ai_p < 0.70:
            color = "#b8860b"
        else:
            color = "#b80d0d"
        # Bug fix: escape user text before embedding it in HTML — otherwise
        # pasted markup or scripts are injected into the rendered page (XSS).
        highlighted += (
            f"<span style='background:rgba(0,0,0,0.03); padding:3px 4px; "
            f"border-radius:4px;'><strong style='color:{color}'>[{pct}]</strong> "
            f"{html.escape(block.strip())}</span> "
        )
    # -----------------------------
    # OVERALL SCORE
    # -----------------------------
    overall = sum(ai_scores) / len(ai_scores)
    overall_pct = f"{overall * 100:.1f}%"
    overall_label = "🤖 Likely AI Written" if overall >= THRESHOLD else "🧒 Likely Human Written"
    df = pd.DataFrame(
        [[i + 1, s, ai_scores[i]] for i, s in enumerate(pure_sentences)],
        columns=["#", "Sentence", "AI_Prob"]
    )
    return overall_label, overall_pct, highlighted, df
# -----------------------------
# UI
# -----------------------------
with gr.Blocks() as demo:
    gr.Markdown("### 🕵️ AI Sentence-Level Detector — Exact Structure Highlighting")
    # Input widgets.
    input_box = gr.Textbox(label="Paste text", lines=14, placeholder="Your text…")
    analyze_btn = gr.Button("Analyze")
    # Output widgets, in the order `analyze` returns its values.
    verdict_label = gr.Label(label="Verdict (Overall)")
    score_label = gr.Label(label="AI Score")
    html_view = gr.HTML(label="Highlighted Text (Exact Structure)")
    results_table = gr.Dataframe(headers=["#", "Sentence", "AI_Prob"], wrap=True)
    # Wire the button to the analysis entry point.
    analyze_btn.click(
        analyze,
        inputs=[input_box],
        outputs=[verdict_label, score_label, html_view, results_table],
    )
if __name__ == "__main__":
    demo.launch()