# NOTE: removed a Hugging Face Spaces status banner ("Spaces: Running")
# that was captured in the copy/paste; it was not part of the program.
import re

import gradio as gr
import pandas as pd
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# -----------------------------
# MODEL (Fakespot 2025)
# -----------------------------
MODEL_NAME = "fakespot-ai/roberta-base-ai-text-detection-v1"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Prefer the GPU when available; bf16 halves memory on cards that support it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda" and torch.cuda.is_bf16_supported():
    dtype = torch.bfloat16
else:
    dtype = torch.float32

# NOTE(review): `dtype=` is the newer transformers spelling of this kwarg;
# older releases expect `torch_dtype=` — confirm against the pinned version.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, dtype=dtype)
model = model.to(device).eval()
# -----------------------------
# AI DECISION THRESHOLD (80%)
# -----------------------------
# A sentence (or the document-level average) whose AI probability is at or
# above this value is labelled "AI"; everything below is labelled "Human".
THRESHOLD = 0.80
# -----------------------------
# SENTENCE SPLITTING UTILITIES
# -----------------------------
# Dotted tokens that must NOT be treated as sentence terminators.
ABBR = [
    "e.g", "i.e", "mr", "mrs", "ms", "dr", "prof", "vs", "etc", "fig", "al",
    "jr", "sr", "st", "no", "vol", "pp", "mt", "inc", "ltd", "co",
    "u.s", "u.k", "a.m", "p.m"
]

# Matches any abbreviation from ABBR together with its trailing period
# (e.g. "Dr." or "e.g."), case-insensitively.
ABBR_REGEX = re.compile(
    r"\b(" + "|".join(re.escape(a) for a in ABBR) + r")\.",
    flags=re.IGNORECASE,
)
def _protect(text: str) -> str:
    """Collapse newlines and mask dots that do not end a sentence.

    Ellipses, decimal points, and abbreviation periods are swapped for
    sentinel tokens so the sentence splitter will not break on them;
    ``_restore`` reverses the masking. Returns "" for blank input.
    """
    stripped = text.strip()
    if not stripped:
        return ""
    # Newline runs (with surrounding spaces) become a single space.
    masked = re.sub(r"\s*\n+\s*", " ", stripped)
    masked = masked.replace("...", "⟨ELLIPSIS⟩")
    # A dot with digits on both sides is a decimal point, not a terminator.
    masked = re.sub(r"(?<=\d)\.(?=\d)", "⟨DECIMAL⟩", masked)
    # Periods belonging to known abbreviations (Dr., e.g., …).
    return ABBR_REGEX.sub(r"\1⟨ABBRDOT⟩", masked)
| def _restore(text: str) -> str: | |
| return ( | |
| text.replace("⟨ABBRDOT⟩", ".") | |
| .replace("⟨DECIMAL⟩", ".") | |
| .replace("⟨ELLIPSIS⟩", "...") | |
| ) | |
def sentence_split(text: str):
    """Split *text* into sentences, protecting non-terminal dots.

    The text is masked by ``_protect``, split on ., ?, ! followed by
    whitespace and an apparent sentence start (optional quote, then an
    uppercase letter or open paren), and each piece is unmasked with
    ``_restore``. Returns a list of stripped, non-empty sentences.
    """
    protected = _protect(text)
    if not protected:
        return []
    # re.split with one capture group alternates text chunks (even indices)
    # with the captured terminator punctuation (odd indices).
    pieces = re.split(r"([.?!])\s+(?=[\"“”‘’']?\s*[A-Z(]|$)", protected)
    collected, current = [], ""
    for idx, piece in enumerate(pieces):
        current += piece
        if idx % 2 == 1:  # terminator reached — flush the sentence
            collected.append(current.strip())
            current = ""
    if current.strip():
        collected.append(current.strip())
    return [_restore(s).strip() for s in collected if s.strip()]
# -----------------------------
# CORE ANALYSIS — PER SENTENCE
# -----------------------------
def analyze(text, max_len=512):
    """Score each sentence of *text* for AI authorship.

    Parameters
    ----------
    text : str
        Raw text to analyze; split into sentences via ``sentence_split``.
    max_len : int, optional
        Per-sentence token truncation length (default 512).

    Returns
    -------
    tuple
        ``(overall_label, overall_pct, highlights_html, dataframe)``; a
        placeholder 4-tuple with ``None`` dataframe when *text* is empty.
    """
    sents = sentence_split(text)
    if not sents:
        return "—", "—", "<em>Paste some text to analyze.</em>", None

    # One whitespace-normalized copy per sentence, reused for both the model
    # input and the highlight rendering (the original recomputed it per loop).
    clean_sents = [re.sub(r"\s+", " ", s).strip() for s in sents]

    # Tokenize the whole batch of sentences at once.
    inputs = tokenizer(
        clean_sents,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_len,
    ).to(device)

    # Model inference (one logit row per sentence).
    with torch.no_grad():
        logits = model(**inputs).logits
    # Column 1 is taken as the "AI" class probability — assumes the model's
    # id2label maps index 1 to AI; verify against the model card.
    probs = F.softmax(logits, dim=-1)[:, 1].detach().cpu().tolist()

    # Overall score = plain average of per-sentence AI probabilities.
    overall_ai = sum(probs) / len(probs)
    overall_pct = f"{overall_ai * 100:.1f}%"
    overall_label = (
        "🤖 Likely AI Written" if overall_ai >= THRESHOLD else "🧒 Likely Human Written"
    )

    # Build per-sentence highlights + the results table.
    rows, highlights = [], []
    for i, (sentence, normalized, p) in enumerate(
        zip(sents, clean_sents, probs), start=1
    ):
        ai_p = float(p)
        pct = f"{ai_p * 100:.1f}%"
        label = "AI" if ai_p >= THRESHOLD else "Human"
        # Traffic-light colors for the highlight view; bands are deliberately
        # looser than THRESHOLD so mid-range scores read as amber.
        if ai_p < 0.30:
            color = "#11823b"   # green — confidently human
        elif ai_p < 0.70:
            color = "#b8860b"   # amber — uncertain
        else:
            color = "#b80d0d"   # red — confidently AI
        highlights.append(
            "<div style='margin:6px 0; padding:6px 8px; border-radius:6px;"
            "background:rgba(0,0,0,0.03)'>"
            f"<strong style='color:{color}'>[{pct} {label}]</strong> "
            f"{normalized}</div>"
        )
        rows.append([i, sentence, round(ai_p, 4), label])

    df = pd.DataFrame(rows, columns=["#", "Sentence", "AI_Prob", "Label"])
    return overall_label, overall_pct, "\n".join(highlights), df
# -----------------------------
# GRADIO UI
# -----------------------------
with gr.Blocks() as demo:
    gr.Markdown("### 🕵️ AI Written Text Detector — Fakespot Model (80% Threshold)")
    input_box = gr.Textbox(label="Paste text", lines=14, placeholder="Your content…")
    analyze_btn = gr.Button("Analyze")
    verdict_out = gr.Label(label="Verdict (Overall)")
    score_out = gr.Label(label="AI Score (Average across sentences)")
    highlights_out = gr.HTML(label="Per-Sentence Highlights")
    table_out = gr.Dataframe(headers=["#", "Sentence", "AI_Prob", "Label"], wrap=True)
    # Single action: run analyze() and fan its 4-tuple out to the widgets.
    analyze_btn.click(
        analyze,
        inputs=[input_box],
        outputs=[verdict_out, score_out, highlights_out, table_out],
    )
if __name__ == "__main__":
    # Start the Gradio server only when run as a script, not on import.
    demo.launch()