# Hugging Face Space — AI-written-text detector (Gradio demo)
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import re
import pandas as pd
import gradio as gr

# -----------------------------
# MODEL (Fakespot 2025)
# -----------------------------
# Sequence-classification checkpoint used to score text as AI- vs human-written.
MODEL_NAME = "fakespot-ai/roberta-base-ai-text-detection-v1"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Prefer GPU when present; use bfloat16 only if the GPU actually supports it,
# otherwise fall back to full float32 (also the CPU path).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dtype = torch.bfloat16 if (device.type=="cuda" and torch.cuda.is_bf16_supported()) else torch.float32
# NOTE(review): the `dtype=` kwarg to from_pretrained requires a recent
# transformers release (older versions spell it `torch_dtype=`) — confirm the
# pinned transformers version supports it.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, dtype=dtype).to(device).eval()

# -----------------------------
# AI DECISION THRESHOLD (80%)
# -----------------------------
THRESHOLD = 0.80  # AI from 80% and above
| # ----------------------------- | |
| # SENTENCE SPLITTING UTILITIES | |
| # ----------------------------- | |
| ABBR = [ | |
| "e.g", "i.e", "mr", "mrs", "ms", "dr", "prof", "vs", "etc", "fig", "al", | |
| "jr", "sr", "st", "no", "vol", "pp", "mt", "inc", "ltd", "co", | |
| "u.s", "u.k", "a.m", "p.m" | |
| ] | |
| ABBR_REGEX = re.compile(r"\b(" + "|".join(map(re.escape, ABBR)) + r")\.", flags=re.IGNORECASE) | |
| def _protect(text: str) -> str: | |
| t = text.strip() | |
| if not t: | |
| return "" | |
| t = re.sub(r"\s*\n+\s*", " ", t) | |
| t = t.replace("...", "⟨ELLIPSIS⟩") | |
| t = re.sub(r"(?<=\d)\.(?=\d)", "⟨DECIMAL⟩", t) | |
| t = ABBR_REGEX.sub(r"\1⟨ABBRDOT⟩", t) | |
| return t | |
| def _restore(text: str) -> str: | |
| return (text | |
| .replace("⟨ABBRDOT⟩", ".") | |
| .replace("⟨DECIMAL⟩", ".") | |
| .replace("⟨ELLIPSIS⟩", "...")) | |
def sentence_split(text: str):
    """Split *text* into sentences, then restore protected punctuation.

    Relies on ``_protect`` to shield abbreviation dots, decimals, and
    ellipses, splits on terminal punctuation that is followed by whitespace
    plus an (optionally quoted) capital letter or opening paren, and finally
    runs ``_restore`` on each sentence. Returns [] for blank input.
    """
    protected = _protect(text)
    if not protected:
        return []
    # Capturing group keeps the terminal punctuation as the odd-indexed pieces.
    pieces = re.split(r"([.?!])\s+(?=(?:[\"“”‘’']?\s*[A-Z(])|$)", protected)
    collected = []
    current = ""
    for idx, piece in enumerate(pieces):
        current += piece
        if idx % 2:  # odd pieces are the punctuation marks: close the sentence
            collected.append(current.strip())
            current = ""
    if current.strip():  # trailing text without terminal punctuation
        collected.append(current.strip())
    return [_restore(s).strip() for s in collected if s.strip()]
# -----------------------------
# GROUP SENTENCES (TURNITIN STYLE)
# -----------------------------
def group_sentences(sents, size=3):
    """Join consecutive sentences into space-separated windows of *size*.

    The final window may contain fewer than *size* sentences; empty input
    yields an empty list. (Idiomatic comprehension replaces the manual
    append loop — behavior is unchanged.)
    """
    return [" ".join(sents[i:i + size]) for i in range(0, len(sents), size)]
# -----------------------------
# CORE ANALYSIS (3 SENTENCE WINDOWS)
# -----------------------------
def analyze(text, max_len=512):
    """Score *text* for AI authorship, one 3-sentence window at a time.

    Returns a 4-tuple for the Gradio outputs:
    (overall verdict string, overall percentage string, per-sentence
    highlight HTML, pandas DataFrame of per-sentence scores). When *text*
    yields no sentences, placeholders and ``None`` are returned instead.
    """
    sents = sentence_split(text)
    if not sents:
        return "—", "—", "<em>Paste some text to analyze.</em>", None
    # GROUP sentences (3 at a time)
    grouped = group_sentences(sents, size=3)
    clean_grouped = [re.sub(r"\s+", " ", g).strip() for g in grouped]
    # tokenize grouped chunks (padded/truncated to max_len tokens each)
    inputs = tokenizer(
        clean_grouped, return_tensors="pt",
        padding=True, truncation=True, max_length=max_len
    ).to(device)
    # model inference
    with torch.no_grad():
        logits = model(**inputs).logits
    # NOTE(review): assumes class index 1 is the "AI" label — confirm against
    # the model's id2label mapping.
    chunk_probs = F.softmax(logits, dim=-1)[:, 1].detach().cpu().tolist()
    # EXPAND chunk-level probabilities to per-sentence (each chunk contributes to its 3 sentences)
    ai_probs = []
    for idx, prob in enumerate(chunk_probs):
        start = idx * 3
        end = min(start + 3, len(sents))  # last chunk may cover < 3 sentences
        for _ in range(start, end):
            ai_probs.append(prob)
    # overall AI score: plain mean of the per-sentence probabilities
    overall_ai = sum(ai_probs) / len(ai_probs)
    overall_pct = f"{overall_ai * 100:.1f}%"
    overall_label = (
        "🤖 Likely AI Written" if overall_ai >= THRESHOLD else "🧒 Likely Human Written"
    )
    # HIGHLIGHTS + TABLE
    rows, highlights = [], []
    for i, orig in enumerate(sents, start=1):
        ai_p = float(ai_probs[i-1])
        pct = f"{ai_p * 100:.1f}%"
        label = "AI" if ai_p >= THRESHOLD else "Human"
        # Traffic-light color bands (0.30 / 0.70) are looser than THRESHOLD
        # (0.80), so a 0.75 sentence renders red yet is still labeled "Human".
        if ai_p < 0.30:
            color = "#11823b"
        elif ai_p < 0.70:
            color = "#b8860b"
        else:
            color = "#b80d0d"
        normalized = re.sub(r"\s+", " ", orig)
        highlights.append(
            "<div style='margin:6px 0; padding:6px 8px; border-radius:6px;"
            "background:rgba(0,0,0,0.03)'>"
            f"<strong style='color:{color}'>[{pct} {label}]</strong> "
            f"{normalized}</div>"
        )
        rows.append([i, orig, round(ai_p, 4), label])
    df = pd.DataFrame(rows, columns=["#", "Sentence", "AI_Prob", "Label"])
    html = "\n".join(highlights)
    return overall_label, overall_pct, html, df
# -----------------------------
# GRADIO UI
# -----------------------------
with gr.Blocks() as demo:
    gr.Markdown("### 🕵️ AI Written Text Detector — Fakespot Model (80% Threshold)")
    text_input = gr.Textbox(label="Paste text", lines=14, placeholder="Your content…")
    btn = gr.Button("Analyze")
    verdict = gr.Label(label="Verdict (Overall)")
    score = gr.Label(label="AI Score (Average across sentences)")
    highlights = gr.HTML(label="Per-Sentence Highlights")
    table = gr.Dataframe(headers=["#", "Sentence", "AI_Prob", "Label"], wrap=True)
    # Outputs map 1:1 onto analyze()'s 4-tuple return.
    btn.click(analyze, inputs=[text_input], outputs=[verdict, score, highlights, table])

if __name__ == "__main__":
    demo.launch()