# NOTE: removed a Hugging Face Spaces status banner ("Spaces: Running")
# that was captured in the copy/paste; it was not part of the program.
import re

import gradio as gr
import pandas as pd
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# -----------------------------
# MODEL (Fakespot 2025)
# -----------------------------
MODEL_NAME = "fakespot-ai/roberta-base-ai-text-detection-v1"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Prefer the GPU when available; bf16 halves memory on cards that support it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda" and torch.cuda.is_bf16_supported():
    dtype = torch.bfloat16
else:
    dtype = torch.float32

# NOTE(review): `dtype=` is the newer transformers spelling of this kwarg;
# older releases expect `torch_dtype=` — confirm against the pinned version.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, dtype=dtype)
model = model.to(device).eval()
# -----------------------------
# AI DECISION THRESHOLD (80%)
# -----------------------------
# A sentence (or the document-level average) whose AI probability is at or
# above this value is labelled "AI"; everything below is labelled "Human".
THRESHOLD = 0.80
# -----------------------------
# SENTENCE SPLITTING UTILITIES
# -----------------------------
# Dotted tokens that must NOT be treated as sentence terminators.
ABBR = [
    "e.g", "i.e", "mr", "mrs", "ms", "dr", "prof", "vs", "etc", "fig", "al",
    "jr", "sr", "st", "no", "vol", "pp", "mt", "inc", "ltd", "co",
    "u.s", "u.k", "a.m", "p.m"
]

# Matches any abbreviation from ABBR together with its trailing period
# (e.g. "Dr." or "e.g."), case-insensitively.
ABBR_REGEX = re.compile(
    r"\b(" + "|".join(re.escape(a) for a in ABBR) + r")\.",
    flags=re.IGNORECASE,
)
def _protect(text: str) -> str:
    """Collapse newlines and mask dots that do not end a sentence.

    Ellipses, decimal points, and abbreviation periods are swapped for
    sentinel tokens so the sentence splitter will not break on them;
    ``_restore`` reverses the masking. Returns "" for blank input.
    """
    stripped = text.strip()
    if not stripped:
        return ""
    # Newline runs (with surrounding spaces) become a single space.
    masked = re.sub(r"\s*\n+\s*", " ", stripped)
    masked = masked.replace("...", "⟨ELLIPSIS⟩")
    # A dot with digits on both sides is a decimal point, not a terminator.
    masked = re.sub(r"(?<=\d)\.(?=\d)", "⟨DECIMAL⟩", masked)
    # Periods belonging to known abbreviations (Dr., e.g., …).
    return ABBR_REGEX.sub(r"\1⟨ABBRDOT⟩", masked)
| def _restore(text: str) -> str: | |
| return ( | |
| text.replace("⟨ABBRDOT⟩", ".") | |
| .replace("⟨DECIMAL⟩", ".") | |
| .replace("⟨ELLIPSIS⟩", "...") | |
| ) | |
def sentence_split(text: str):
    """Split *text* into sentences, protecting non-terminal dots.

    The text is masked by ``_protect``, split on ., ?, ! followed by
    whitespace and an apparent sentence start (optional quote, then an
    uppercase letter or open paren), and each piece is unmasked with
    ``_restore``. Returns a list of stripped, non-empty sentences.
    """
    protected = _protect(text)
    if not protected:
        return []
    # re.split with one capture group alternates text chunks (even indices)
    # with the captured terminator punctuation (odd indices).
    pieces = re.split(r"([.?!])\s+(?=[\"“”‘’']?\s*[A-Z(]|$)", protected)
    collected, current = [], ""
    for idx, piece in enumerate(pieces):
        current += piece
        if idx % 2 == 1:  # terminator reached — flush the sentence
            collected.append(current.strip())
            current = ""
    if current.strip():
        collected.append(current.strip())
    return [_restore(s).strip() for s in collected if s.strip()]
# -----------------------------
# CORE ANALYSIS — PER SENTENCE
# -----------------------------
def analyze(text, max_len=512):
    """Score each sentence of *text* for AI authorship.

    Parameters
    ----------
    text : str
        Raw text to analyze; split into sentences via ``sentence_split``.
    max_len : int, optional
        Per-sentence token truncation length (default 512).

    Returns
    -------
    tuple
        ``(overall_label, overall_pct, highlights_html, dataframe)``; a
        placeholder 4-tuple with ``None`` dataframe when *text* is empty.
    """
    sents = sentence_split(text)
    if not sents:
        return "—", "—", "<em>Paste some text to analyze.</em>", None

    # One whitespace-normalized copy per sentence, reused for both the model
    # input and the highlight rendering (the original recomputed it per loop).
    clean_sents = [re.sub(r"\s+", " ", s).strip() for s in sents]

    # Tokenize the whole batch of sentences at once.
    inputs = tokenizer(
        clean_sents,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_len,
    ).to(device)

    # Model inference (one logit row per sentence).
    with torch.no_grad():
        logits = model(**inputs).logits
    # Column 1 is taken as the "AI" class probability — assumes the model's
    # id2label maps index 1 to AI; verify against the model card.
    probs = F.softmax(logits, dim=-1)[:, 1].detach().cpu().tolist()

    # Overall score = plain average of per-sentence AI probabilities.
    overall_ai = sum(probs) / len(probs)
    overall_pct = f"{overall_ai * 100:.1f}%"
    overall_label = (
        "🤖 Likely AI Written" if overall_ai >= THRESHOLD else "🧒 Likely Human Written"
    )

    # Build per-sentence highlights + the results table.
    rows, highlights = [], []
    for i, (sentence, normalized, p) in enumerate(
        zip(sents, clean_sents, probs), start=1
    ):
        ai_p = float(p)
        pct = f"{ai_p * 100:.1f}%"
        label = "AI" if ai_p >= THRESHOLD else "Human"
        # Traffic-light colors for the highlight view; bands are deliberately
        # looser than THRESHOLD so mid-range scores read as amber.
        if ai_p < 0.30:
            color = "#11823b"   # green — confidently human
        elif ai_p < 0.70:
            color = "#b8860b"   # amber — uncertain
        else:
            color = "#b80d0d"   # red — confidently AI
        highlights.append(
            "<div style='margin:6px 0; padding:6px 8px; border-radius:6px;"
            "background:rgba(0,0,0,0.03)'>"
            f"<strong style='color:{color}'>[{pct} {label}]</strong> "
            f"{normalized}</div>"
        )
        rows.append([i, sentence, round(ai_p, 4), label])

    df = pd.DataFrame(rows, columns=["#", "Sentence", "AI_Prob", "Label"])
    return overall_label, overall_pct, "\n".join(highlights), df
# -----------------------------
# GRADIO UI
# -----------------------------
with gr.Blocks() as demo:
    gr.Markdown("### 🕵️ AI Written Text Detector — Fakespot Model (80% Threshold)")
    input_box = gr.Textbox(label="Paste text", lines=14, placeholder="Your content…")
    analyze_btn = gr.Button("Analyze")
    verdict_out = gr.Label(label="Verdict (Overall)")
    score_out = gr.Label(label="AI Score (Average across sentences)")
    highlights_out = gr.HTML(label="Per-Sentence Highlights")
    table_out = gr.Dataframe(headers=["#", "Sentence", "AI_Prob", "Label"], wrap=True)
    # Single action: run analyze() and fan its 4-tuple out to the widgets.
    analyze_btn.click(
        analyze,
        inputs=[input_box],
        outputs=[verdict_out, score_out, highlights_out, table_out],
    )
if __name__ == "__main__":
    # Start the Gradio server only when run as a script, not on import.
    demo.launch()