"""Sentence-level AI-text detector with a Gradio front end.

The pasted text is split into sentence blocks and whitespace/newline
separator blocks, the sentences are scored in fixed-size windows by a
sequence-classification model, and the text is re-assembled as HTML with a
per-sentence score and colour.
"""

import html
import re

import gradio as gr
import pandas as pd
import torch
import torch.nn.functional as F
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# -----------------------------
# MODEL (Fakespot 2025)
# -----------------------------
MODEL_NAME = "fakespot-ai/roberta-base-ai-text-detection-v1"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# bfloat16 only on CUDA devices that report support for it; float32 otherwise.
dtype = (
    torch.bfloat16
    if (device.type == "cuda" and torch.cuda.is_bf16_supported())
    else torch.float32
)
# NOTE(review): the `dtype=` keyword is what this file has always passed;
# some transformers releases spell it `torch_dtype=` -- confirm against the
# pinned transformers version.
model = (
    AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, dtype=dtype)
    .to(device)
    .eval()
)

# Mean per-sentence probability at or above this value yields the "AI" verdict.
THRESHOLD = 0.80

# Number of chunks sent through the model per forward pass, so that a very
# long paste does not have to fit into memory as one batch.
BATCH_SIZE = 16

# (exclusive upper bound, colour) pairs, checked in order; anything at or
# above the last bound uses _COLOR_HIGH.
_COLOR_BANDS = ((0.30, "#11823b"), (0.70, "#b8860b"))
_COLOR_HIGH = "#b80d0d"

# -----------------------------
# ABBREVIATION PROTECTION
# -----------------------------
ABBR = [
    "e.g", "i.e", "mr", "mrs", "ms", "dr", "prof", "vs", "etc", "fig", "al",
    "jr", "sr", "st", "no", "vol", "pp", "mt", "inc", "ltd", "co", "u.s",
    "u.k", "a.m", "p.m",
]

# Matches any listed abbreviation at a word boundary followed by a dot,
# case-insensitively. This is a heuristic: an ordinary word that happens to
# be in the list (e.g. "no." ending a sentence) is matched as well.
ABBR_REGEX = re.compile(
    r"\b(" + "|".join(map(re.escape, ABBR)) + r")\.", re.IGNORECASE
)

_DECIMAL_DOT_RE = re.compile(r"(?<=\d)\.(?=\d)")
_NEWLINE_RUN_RE = re.compile(r"(\n+)")
_SENTENCE_END_RE = re.compile(r"([.?!])(\s+)")
_WHITESPACE_RUN_RE = re.compile(r"\s+")


def _protect(text):
    """Replace dots that do not end a sentence with placeholder tokens.

    Ellipses, dots between two digits and dots after a listed abbreviation
    are swapped for markers so the sentence splitter does not break on them.
    """
    text = text.replace("...", "⟨ELLIPSIS⟩")
    text = _DECIMAL_DOT_RE.sub("⟨DECIMAL⟩", text)
    text = ABBR_REGEX.sub(r"\1⟨ABBRDOT⟩", text)
    return text


def _restore(text):
    """Undo :func:`_protect`, turning the placeholder tokens back into dots."""
    return (
        text.replace("⟨ABBRDOT⟩", ".")
        .replace("⟨DECIMAL⟩", ".")
        .replace("⟨ELLIPSIS⟩", "...")
    )


# -----------------------------
# PARAGRAPH-PRESERVING SPLITTER
# -----------------------------
def split_preserving_structure(text):
    """Split *text* into sentence blocks and separator blocks, in order.

    Returns a list of strings in which runs of newlines and the whitespace
    that followed a sentence terminator appear as their own elements, and
    every other element is one sentence (including its ``.``, ``?`` or ``!``).
    """
    final_blocks = []
    for block in _NEWLINE_RUN_RE.split(text):
        if block.startswith("\n"):
            # Newline runs are kept verbatim as separators.
            final_blocks.append(block)
            continue

        # With two capture groups the split result is laid out as
        # [sentence, punct, space, sentence, punct, space, ..., tail].
        parts = _SENTENCE_END_RE.split(_protect(block))
        for i in range(0, len(parts), 3):
            whole = "".join(parts[i:i + 2])  # sentence + its terminator
            space = parts[i + 2] if i + 2 < len(parts) else ""
            if whole.strip():
                final_blocks.append(_restore(whole))
            if space:
                final_blocks.append(space)
    return final_blocks


def extract_sentences_only(blocks):
    """Return the elements of *blocks* that are sentences, not separators.

    An element is dropped when it is empty, consists only of whitespace, or
    starts with a newline.
    """
    return [b for b in blocks if b.strip() and not b.startswith("\n")]


# -----------------------------
# GROUPING
# -----------------------------
def group_sentences(sents, size=3):
    """Join consecutive runs of *size* sentences with single spaces.

    The final group holds the remainder and may be shorter than *size*.
    """
    return [" ".join(sents[i:i + size]) for i in range(0, len(sents), size)]


# -----------------------------
# ANALYSIS LOGIC
# -----------------------------
def _score_chunks(chunks, max_len, batch_size=BATCH_SIZE):
    """Return the class-1 softmax probability for every string in *chunks*.

    Chunks are tokenized with padding and truncation to *max_len* and run
    through the model *batch_size* at a time.
    """
    probs = []
    for start in range(0, len(chunks), batch_size):
        inputs = tokenizer(
            chunks[start:start + batch_size],
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_len,
        ).to(device)
        with torch.no_grad():
            logits = model(**inputs).logits
        # Softmax in float32: the model may be running in bfloat16.
        # NOTE(review): column 1 is taken to be the "AI" class -- confirm
        # against the model's id2label mapping.
        probs.extend(F.softmax(logits.float(), dim=-1)[:, 1].cpu().tolist())
    return probs


def _color_for(prob):
    """Return the hex colour of the band that *prob* falls into."""
    for upper_bound, color in _COLOR_BANDS:
        if prob < upper_bound:
            return color
    return _COLOR_HIGH


def _render_highlight(blocks, ai_scores):
    """Rebuild the text as HTML with a coloured, score-prefixed span per sentence.

    *ai_scores* must be non-empty and is consumed in order, one score per
    sentence block. Separator blocks are emitted unchanged inside a
    ``white-space: pre-wrap`` container so that newlines and spacing survive
    HTML rendering.
    """
    scores = iter(ai_scores)
    fallback = ai_scores[-1]  # safety net if blocks outnumber scores
    pieces = ['<div style="white-space: pre-wrap;">']
    for block in blocks:
        if block.startswith("\n") or not block.strip():
            pieces.append(block)
            continue
        prob = next(scores, fallback)
        # The sentence is user-supplied text going into HTML: escape it.
        # No trailing space is added; the separator blocks already carry
        # the original spacing.
        pieces.append(
            f'<span style="color: {_color_for(prob)};">'
            f"[{prob * 100:.1f}%] {html.escape(block.strip())}</span>"
        )
    pieces.append("</div>")
    return "".join(pieces)


def analyze(text, max_len=512, group_size=3):
    """Score *text* sentence by sentence and build the UI outputs.

    Args:
        text: The text to analyze. ``None`` is treated like an empty string.
        max_len: Maximum token length passed to the tokenizer (longer chunks
            are truncated).
        group_size: Number of consecutive sentences scored together; every
            sentence in a group receives that group's probability.

    Returns:
        A 4-tuple ``(verdict, overall_percent, highlighted_html, table)``.
        When *text* contains no sentences the tuple is
        ``("—", "—", "Paste text to analyze.", None)``; otherwise ``table``
        is a DataFrame with columns ``#``, ``Sentence`` and ``AI_Prob``.
    """
    blocks = split_preserving_structure(text or "")
    pure_sentences = extract_sentences_only(blocks)
    if not pure_sentences:
        return "—", "—", "Paste text to analyze.", None

    # Group into windows and collapse internal whitespace before scoring.
    grouped = group_sentences(pure_sentences, group_size)
    clean_grouped = [_WHITESPACE_RUN_RE.sub(" ", g).strip() for g in grouped]
    chunk_probs = _score_chunks(clean_grouped, max_len)

    # Expand each window's probability back onto the sentences it covers;
    # the last window may be shorter than group_size.
    ai_scores = []
    for idx, prob in enumerate(chunk_probs):
        window_len = min(group_size, len(pure_sentences) - idx * group_size)
        ai_scores.extend([prob] * window_len)

    highlighted = _render_highlight(blocks, ai_scores)

    # -----------------------------
    # OVERALL SCORE
    # -----------------------------
    overall = sum(ai_scores) / len(ai_scores)
    overall_pct = f"{overall * 100:.1f}%"
    overall_label = (
        "🤖 Likely AI Written" if overall >= THRESHOLD else "🧒 Likely Human Written"
    )

    df = pd.DataFrame(
        [
            [number, sentence, score]
            for number, (sentence, score) in enumerate(
                zip(pure_sentences, ai_scores), start=1
            )
        ],
        columns=["#", "Sentence", "AI_Prob"],
    )
    return overall_label, overall_pct, highlighted, df


# -----------------------------
# UI
# -----------------------------
with gr.Blocks() as demo:
    gr.Markdown("### 🕵️ AI Sentence-Level Detector — Exact Structure Highlighting")
    text_input = gr.Textbox(label="Paste text", lines=14, placeholder="Your text…")
    btn = gr.Button("Analyze")
    verdict = gr.Label(label="Verdict (Overall)")
    score = gr.Label(label="AI Score")
    highlights = gr.HTML(label="Highlighted Text (Exact Structure)")
    table = gr.Dataframe(headers=["#", "Sentence", "AI_Prob"], wrap=True)
    btn.click(
        analyze,
        inputs=[text_input],
        outputs=[verdict, score, highlights, table],
    )

if __name__ == "__main__":
    demo.launch()