AIDetector / app.py
VictorM-Coder's picture
Update app.py
7f4b27e verified
raw
history blame
4.74 kB
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import re
import pandas as pd
import gradio as gr
# -----------------------------
# MODEL (Fakespot 2025)
# -----------------------------
# Hugging Face model id of the RoBERTa-base sequence classifier used as detector.
MODEL_NAME = "fakespot-ai/roberta-base-ai-text-detection-v1"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Prefer CUDA when available; otherwise run on CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# bfloat16 only on CUDA devices that support it; float32 everywhere else.
dtype = torch.bfloat16 if (device.type == "cuda" and torch.cuda.is_bf16_supported()) else torch.float32
# NOTE(review): `dtype=` is the current from_pretrained kwarg name; older
# transformers releases expect `torch_dtype=` — confirm the pinned version.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, dtype=dtype
).to(device).eval()
# -----------------------------
# AI DECISION THRESHOLD (80%)
# -----------------------------
# Sentences (and the document-level average) at or above this probability
# are labeled "AI"; everything below is labeled "Human".
THRESHOLD = 0.80
# -----------------------------
# SENTENCE SPLITTING UTILITIES
# -----------------------------
ABBR = [
"e.g", "i.e", "mr", "mrs", "ms", "dr", "prof", "vs", "etc", "fig", "al",
"jr", "sr", "st", "no", "vol", "pp", "mt", "inc", "ltd", "co",
"u.s", "u.k", "a.m", "p.m"
]
ABBR_REGEX = re.compile(
r"\b(" + "|".join(map(re.escape, ABBR)) + r")\.",
flags=re.IGNORECASE
)
def _protect(text: str) -> str:
t = text.strip()
if not t:
return ""
t = re.sub(r"\s*\n+\s*", " ", t)
t = t.replace("...", "⟨ELLIPSIS⟩")
t = re.sub(r"(?<=\d)\.(?=\d)", "⟨DECIMAL⟩", t)
t = ABBR_REGEX.sub(r"\1⟨ABBRDOT⟩", t)
return t
def _restore(text: str) -> str:
return (
text.replace("⟨ABBRDOT⟩", ".")
.replace("⟨DECIMAL⟩", ".")
.replace("⟨ELLIPSIS⟩", "...")
)
def sentence_split(text: str):
t = _protect(text)
if not t:
return []
# hard sentence boundary detection
parts = re.split(r"([.?!])\s+(?=[\"“”‘’']?\s*[A-Z(]|$)", t)
sentences, buf = [], ""
for i, chunk in enumerate(parts):
if i % 2 == 0:
buf += chunk
else:
buf += chunk
sentences.append(buf.strip())
buf = ""
if buf.strip():
sentences.append(buf.strip())
return [_restore(s).strip() for s in sentences if s.strip()]
# -----------------------------
# CORE ANALYSIS — PER SENTENCE
# -----------------------------
def analyze(text, max_len=512):
    """Score every sentence of *text* with the detector and aggregate.

    Returns a 4-tuple matching the Gradio outputs: the overall verdict
    label, the overall AI percentage string, per-sentence HTML highlights,
    and a pandas DataFrame with one row per sentence (None when empty).
    """
    sents = sentence_split(text)
    if not sents:
        # Nothing to score: placeholder verdict/score, a hint, no table.
        return "—", "—", "<em>Paste some text to analyze.</em>", None
    # Whitespace-normalized copies are what actually get tokenized.
    clean_sents = [re.sub(r"\s+", " ", s).strip() for s in sents]
    encoded = tokenizer(
        clean_sents,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_len
    ).to(device)
    # Single batched forward pass scores all sentences at once.
    with torch.no_grad():
        logits = model(**encoded).logits
    # NOTE(review): assumes class index 1 is the "AI" label — confirm
    # against the model's id2label mapping.
    probs = F.softmax(logits, dim=-1)[:, 1].detach().cpu().tolist()
    # Document-level score is the plain mean of the sentence scores.
    overall_ai = sum(probs) / len(probs)
    overall_pct = f"{overall_ai * 100:.1f}%"
    overall_label = (
        "🤖 Likely AI Written" if overall_ai >= THRESHOLD else "🧒 Likely Human Written"
    )
    rows = []
    highlights = []
    for idx, (sentence, prob) in enumerate(zip(sents, probs), start=1):
        ai_p = float(prob)
        pct = f"{ai_p * 100:.1f}%"
        label = "AI" if ai_p >= THRESHOLD else "Human"
        # Traffic-light coloring: green below 30%, amber below 70%, red above.
        if ai_p < 0.30:
            color = "#11823b"
        elif ai_p < 0.70:
            color = "#b8860b"
        else:
            color = "#b80d0d"
        normalized = re.sub(r"\s+", " ", sentence)
        highlights.append(
            "<div style='margin:6px 0; padding:6px 8px; border-radius:6px;"
            "background:rgba(0,0,0,0.03)'>"
            f"<strong style='color:{color}'>[{pct} {label}]</strong> "
            f"{normalized}</div>"
        )
        rows.append([idx, sentence, round(ai_p, 4), label])
    df = pd.DataFrame(rows, columns=["#", "Sentence", "AI_Prob", "Label"])
    return overall_label, overall_pct, "\n".join(highlights), df
# -----------------------------
# GRADIO UI
# -----------------------------
with gr.Blocks() as demo:
    gr.Markdown("### 🕵️ AI Written Text Detector — Fakespot Model (80% Threshold)")
    # Input area and trigger button.
    text_input = gr.Textbox(label="Paste text", lines=14, placeholder="Your content…")
    btn = gr.Button("Analyze")
    # Output widgets: overall verdict, average score, per-sentence HTML, detail table.
    verdict = gr.Label(label="Verdict (Overall)")
    score = gr.Label(label="AI Score (Average across sentences)")
    highlights = gr.HTML(label="Per-Sentence Highlights")
    table = gr.Dataframe(headers=["#", "Sentence", "AI_Prob", "Label"], wrap=True)
    # Wire the button to analyze(); output order matches its 4-tuple return.
    btn.click(analyze, inputs=[text_input], outputs=[verdict, score, highlights, table])
if __name__ == "__main__":
    demo.launch()