# Hugging Face Space: AI Written Text Detector (Fakespot RoBERTa model)
| import torch | |
| import torch.nn.functional as F | |
| from transformers import AutoTokenizer, AutoModelForSequenceClassification | |
| import re | |
| import pandas as pd | |
| import gradio as gr | |
# -----------------------------
# MODEL (Fakespot 2025)
# -----------------------------
MODEL_NAME = "fakespot-ai/roberta-base-ai-text-detection-v1"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Pick the best available device; use bf16 only when the GPU supports it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dtype = torch.bfloat16 if (device.type == "cuda" and torch.cuda.is_bf16_supported()) else torch.float32

# FIX: `from_pretrained` takes `torch_dtype=` for the load dtype; the bare
# `dtype=` keyword is only recognized by very recent transformers releases,
# while `torch_dtype=` works across versions (backward compatible).
model = (
    AutoModelForSequenceClassification
    .from_pretrained(MODEL_NAME, torch_dtype=dtype)
    .to(device)
    .eval()
)

# -----------------------------
# INTERNAL THRESHOLD for sentence labels/colors
# -----------------------------
THRESHOLD = 0.70  # used only for per-sentence "AI/Human" tags & color
| # ----------------------------- | |
| # SENTENCE SPLITTER (protect → split → restore; no lookbehinds) | |
| # ----------------------------- | |
| ABBR = [ | |
| "e.g", "i.e", "mr", "mrs", "ms", "dr", "prof", "vs", "etc", "fig", "al", | |
| "jr", "sr", "st", "no", "vol", "pp", "mt", "inc", "ltd", "co", "u.s", "u.k", | |
| "a.m", "p.m" | |
| ] | |
| ABBR_REGEX = re.compile(r"\b(" + "|".join(map(re.escape, ABBR)) + r")\.", flags=re.IGNORECASE) | |
| def _protect(text: str) -> str: | |
| t = text.strip() | |
| if not t: | |
| return "" | |
| t = re.sub(r"\s*\n+\s*", " ", t) # normalize newlines | |
| t = t.replace("...", "⟨ELLIPSIS⟩") # ellipses | |
| t = re.sub(r"(?<=\d)\.(?=\d)", "⟨DECIMAL⟩", t) # decimals like 3.14 | |
| t = ABBR_REGEX.sub(r"\1⟨ABBRDOT⟩", t) # abbreviations' dot | |
| return t | |
| def _restore(text: str) -> str: | |
| return (text | |
| .replace("⟨ABBRDOT⟩", ".") | |
| .replace("⟨DECIMAL⟩", ".") | |
| .replace("⟨ELLIPSIS⟩", "...")) | |
def sentence_split(text: str):
    """Split *text* into a list of sentences.

    Strategy: mask non-terminal dots (_protect), split on [.?!] followed by
    whitespace and a plausible sentence opener, then restore the masked
    characters (_restore). Avoids regex lookbehinds for portability.
    """
    protected = _protect(text)
    if not protected:
        return []
    # re.split with a capturing group alternates text chunks and the
    # punctuation mark that ended them: [text, punct, text, punct, ..., tail]
    pieces = re.split(r"([.?!])\s+(?=(?:[\"“”‘’']?\s*[A-Z(])|$)", protected)
    sentences = []
    current = ""
    for idx, piece in enumerate(pieces):
        current += piece
        if idx % 2 == 1:  # odd index == the terminating punctuation mark
            sentences.append(current.strip())
            current = ""
    if current.strip():
        sentences.append(current.strip())
    return [_restore(s).strip() for s in sentences if s.strip()]
| # ----------------------------- | |
| # CORE: overall AI score + highlights | |
| # ----------------------------- | |
def analyze(text, max_len=512):
    """Score *text* for AI authorship, sentence by sentence.

    Splits the input into sentences, runs the classifier over the batch
    (softmax column 0 = Human, column 1 = AI), and returns a 4-tuple:
    overall verdict string, overall percentage string, per-sentence
    highlight HTML, and a DataFrame of per-sentence probabilities.
    Returns placeholder values when the input contains no sentences.
    """
    sentences = sentence_split(text)
    if not sentences:
        return "—", "—", "<em>Paste some text to analyze.</em>", None

    # Light whitespace normalization before tokenizing (per model card vibe).
    cleaned = [re.sub(r"\s+", " ", s).strip() for s in sentences]
    encoded = tokenizer(
        cleaned,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_len,
    ).to(device)

    with torch.no_grad():
        logits = model(**encoded).logits
    probs = F.softmax(logits, dim=-1)  # [:,0]=Human, [:,1]=AI
    ai_probs = probs[:, 1].detach().cpu().tolist()

    # Overall verdict: mean AI probability across all sentences.
    mean_ai = sum(ai_probs) / len(ai_probs)
    overall_pct = f"{mean_ai * 100:.1f}%"
    overall_label = "🤖 Likely AI Written" if mean_ai >= THRESHOLD else "🧒 Likely Human Written"

    # Per-sentence highlights (THRESHOLD only picks the tag; color is banded).
    rows = []
    highlights = []
    for idx, (sentence, raw_p) in enumerate(zip(sentences, ai_probs), start=1):
        ai_p = float(raw_p)
        label = "AI" if ai_p >= THRESHOLD else "Human"
        pct = f"{ai_p*100:.1f}%"
        if ai_p < 0.30:
            color = "#11823b"  # green
        elif ai_p < 0.70:
            color = "#b8860b"  # amber
        else:
            color = "#b80d0d"  # red
        normalized = re.sub(r"\s+", " ", sentence)
        highlights.append(
            "<div style='margin:6px 0; padding:6px 8px; border-radius:6px; background:rgba(0,0,0,0.03)'>"
            f"<strong style='color:{color}'>[{pct} {label}]</strong> {normalized}</div>"
        )
        rows.append([idx, sentence, round(ai_p, 4), label])

    html = "\n".join(highlights)
    df = pd.DataFrame(rows, columns=["#", "Sentence", "AI_Prob", "Label"])
    return overall_label, overall_pct, html, df
| # ----------------------------- | |
| # GRADIO UI (verdict + score, plus highlights) | |
| # ----------------------------- | |
# -----------------------------
# GRADIO UI (verdict + score, plus highlights)
# -----------------------------
with gr.Blocks() as demo:
    gr.Markdown("### 🕵️ AI Written Text Detector — Fakespot Model")

    # Input side: one textbox and one trigger button.
    input_box = gr.Textbox(label="Paste text", lines=14, placeholder="Your content…")
    analyze_btn = gr.Button("Analyze")

    # Output side, top-to-bottom: verdict, numeric score, highlight HTML, table.
    verdict_out = gr.Label(label="Verdict (Overall)")
    score_out = gr.Label(label="AI Score (Average across sentences)")
    highlights_out = gr.HTML(label="Per-Sentence Highlights")
    table_out = gr.Dataframe(headers=["#", "Sentence", "AI_Prob", "Label"], wrap=True)

    analyze_btn.click(
        analyze,
        inputs=[input_box],
        outputs=[verdict_out, score_out, highlights_out, table_out],
    )

if __name__ == "__main__":
    demo.launch()