# Source: Hugging Face Space "VictorM-Coder/AIDetector" (status: Running).
# File size: 5,302 bytes.
# Revisions contributing lines to this file (from the blame column):
#   6d8431a, 41a5821, 49d2f3f, ceeca7d, 0d83dcd, 814a384, 23b2adf, ea83121,
#   7f4b27e, b0b36a6

import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import re
import pandas as pd
import gradio as gr

# -----------------------------
# MODEL (Fakespot 2025)
# -----------------------------
MODEL_NAME = "fakespot-ai/roberta-base-ai-text-detection-v1"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Use the GPU when CUDA is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# bfloat16 is chosen only on CUDA devices that report support for it;
# every other configuration runs in float32.
if device.type == "cuda" and torch.cuda.is_bf16_supported():
    dtype = torch.bfloat16
else:
    dtype = torch.float32

# Load the classifier once at import time, move it to the selected device
# and switch it to evaluation mode.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, dtype=dtype)
model = model.to(device).eval()

# -----------------------------
# AI DECISION THRESHOLD (80%)
# -----------------------------
# Inclusive cut-off on the AI probability: analyze() labels a sentence (and
# the overall text) as AI when its probability is >= this value.
THRESHOLD: float = 0.80  # AI from 80% and above

# -----------------------------
# SENTENCE SPLITTING UTILITIES
# -----------------------------
# Abbreviations (written without their trailing dot) whose final period must
# not be treated as a sentence boundary. Matching is case-insensitive.
ABBR = (
    "e.g i.e mr mrs ms dr prof vs etc fig al "
    "jr sr st no vol pp mt inc ltd co "
    "u.s u.k a.m p.m"
).split()

# Matches any listed abbreviation at a word boundary followed by a literal
# dot; group 1 captures the abbreviation itself (without the dot).
ABBR_REGEX = re.compile(
    r"\b(" + "|".join(re.escape(abbr) for abbr in ABBR) + r")\.",
    flags=re.IGNORECASE,
)

def _protect(text: str) -> str:
    """Mask periods that are not sentence boundaries.

    The input is stripped, line breaks (with surrounding whitespace) are
    collapsed to a single space, and then three kinds of dots are replaced
    by placeholder tokens: literal ``...`` sequences, dots between two
    digits, and the dot following an abbreviation matched by ``ABBR_REGEX``.

    Args:
        text: Raw input text.

    Returns:
        The masked, single-line text, or ``""`` when the input is empty or
        whitespace only.
    """
    stripped = text.strip()
    if stripped == "":
        return ""

    # Line breaks become plain spaces so the splitter sees one line.
    single_line = re.sub(r"\s*\n+\s*", " ", stripped)
    without_ellipsis = single_line.replace("...", "⟨ELLIPSIS⟩")
    without_decimals = re.sub(r"(?<=\d)\.(?=\d)", "⟨DECIMAL⟩", without_ellipsis)
    # Keep the abbreviation text (group 1) and swap only its trailing dot.
    return ABBR_REGEX.sub(r"\1⟨ABBRDOT⟩", without_decimals)

def _restore(text: str) -> str:
    """Turn the placeholder tokens back into the dots they stand for.

    Args:
        text: Text containing zero or more placeholder tokens.

    Returns:
        The text with abbreviation and decimal placeholders replaced by
        ``.`` and ellipsis placeholders replaced by ``...``.
    """
    replacements = (
        ("⟨ABBRDOT⟩", "."),
        ("⟨DECIMAL⟩", "."),
        ("⟨ELLIPSIS⟩", "..."),
    )
    for placeholder, literal in replacements:
        text = text.replace(placeholder, literal)
    return text

def sentence_split(text: str):
    """Split text into a list of sentences.

    Dots that are not boundaries are masked first via ``_protect``. A split
    happens after ``.``, ``?`` or ``!`` when it is followed by whitespace
    and then an optional quote character, optional whitespace, and either
    an uppercase ASCII letter or ``(``. The terminating punctuation stays
    attached to its sentence, and the masked dots are restored afterwards.

    Args:
        text: Raw input text.

    Returns:
        A list of stripped, non-empty sentence strings; ``[]`` when the
        input is empty or whitespace only.
    """
    protected = _protect(text)
    if not protected:
        return []

    pieces = re.split(r"([.?!])\s+(?=(?:[\"“”‘’']?\s*[A-Z(])|$)", protected)

    # With one capture group, re.split yields: body, mark, body, mark, ...,
    # tail. Pair every body with the mark that ended it; the tail is the
    # trailing text after the last split point.
    *bodies, tail = pieces[0::2]
    marks = pieces[1::2]

    found = [(body + mark).strip() for body, mark in zip(bodies, marks)]
    if tail.strip():
        found.append(tail.strip())

    return [_restore(sentence).strip() for sentence in found if sentence.strip()]

# -----------------------------
# GROUP SENTENCES (TURNITIN STYLE)
# -----------------------------
def group_sentences(sents, size=3):
    """Join consecutive sentences into space-separated groups.

    Args:
        sents: Sequence of sentence strings.
        size: Maximum number of sentences per group; the last group may be
            shorter.

    Returns:
        A list of strings, one per group, in the original order.
    """
    group_starts = range(0, len(sents), size)
    return [" ".join(sents[start:start + size]) for start in group_starts]

# -----------------------------
# CORE ANALYSIS (3 SENTENCE WINDOWS)
# -----------------------------
def analyze(text, max_len=512, group_size=3):
    """Score text for AI authorship, overall and per sentence.

    The text is split into sentences, consecutive sentences are joined into
    groups of ``group_size``, and all groups are classified in one batched
    forward pass. Every sentence inherits the probability of its group.

    Args:
        text: Raw user text. ``None`` is treated like an empty string.
        max_len: Token limit per group; longer groups are truncated by the
            tokenizer.
        group_size: Number of sentences scored together. Must be >= 1.

    Returns:
        A 4-tuple ``(verdict, score, html, table)``:

        * verdict: overall label string, decided by comparing the mean
          per-sentence AI probability with ``THRESHOLD``.
        * score: that mean probability formatted as a percentage string.
        * html: one highlighted ``<div>`` per sentence; the sentence text is
          HTML-escaped.
        * table: ``pandas.DataFrame`` with columns ``#``, ``Sentence``,
          ``AI_Prob`` and ``Label``.

        When no sentence is found the result is
        ``("—", "—", <placeholder html>, None)``.

    Raises:
        ValueError: If ``group_size`` is smaller than 1.
    """
    # Function-scope import so this block carries its own dependency.
    from html import escape

    if group_size < 1:
        raise ValueError("group_size must be >= 1")

    # Guard against a None payload, which would otherwise fail in .strip().
    sents = sentence_split(text or "")
    if not sents:
        return "—", "—", "<em>Paste some text to analyze.</em>", None

    # Group sentences and normalize whitespace inside each group.
    grouped = group_sentences(sents, size=group_size)
    clean_grouped = [re.sub(r"\s+", " ", g).strip() for g in grouped]

    inputs = tokenizer(
        clean_grouped, return_tensors="pt",
        padding=True, truncation=True, max_length=max_len,
    ).to(device)

    with torch.no_grad():
        logits = model(**inputs).logits
        # Upcast before softmax: the model may be loaded in bfloat16, whose
        # short mantissa would coarsen the reported probabilities.
        # NOTE(review): column 1 is taken as the "AI" class — confirm
        # against the model's id2label mapping.
        chunk_probs = F.softmax(logits.float(), dim=-1)[:, 1].cpu().tolist()

    # Expand group-level probabilities to one value per sentence. The last
    # group may hold fewer than ``group_size`` sentences.
    ai_probs = []
    for idx, prob in enumerate(chunk_probs):
        members = min(group_size, len(sents) - idx * group_size)
        ai_probs.extend([prob] * members)

    overall_ai = sum(ai_probs) / len(ai_probs)
    overall_pct = f"{overall_ai * 100:.1f}%"
    overall_label = (
        "🤖 Likely AI Written" if overall_ai >= THRESHOLD else "🧒 Likely Human Written"
    )

    rows, highlights = [], []
    for i, (orig, prob) in enumerate(zip(sents, ai_probs), start=1):
        ai_p = float(prob)
        pct = f"{ai_p * 100:.1f}%"
        label = "AI" if ai_p >= THRESHOLD else "Human"

        # Colour bands are independent of THRESHOLD: green < 30%,
        # amber < 70%, red otherwise.
        if ai_p < 0.30:
            color = "#11823b"
        elif ai_p < 0.70:
            color = "#b8860b"
        else:
            color = "#b80d0d"

        # The sentence is user-supplied: escape it so pasted markup is shown
        # as text instead of being interpreted by the browser.
        safe_sentence = escape(re.sub(r"\s+", " ", orig))

        highlights.append(
            "<div style='margin:6px 0; padding:6px 8px; border-radius:6px;"
            "background:rgba(0,0,0,0.03)'>"
            f"<strong style='color:{color}'>[{pct} {label}]</strong> "
            f"{safe_sentence}</div>"
        )

        rows.append([i, orig, round(ai_p, 4), label])

    df = pd.DataFrame(rows, columns=["#", "Sentence", "AI_Prob", "Label"])
    highlights_html = "\n".join(highlights)

    return overall_label, overall_pct, highlights_html, df

# -----------------------------
# GRADIO UI
# -----------------------------
# Build the UI. Components are created in display order inside the Blocks
# context, so the statement order below is also the page layout.
with gr.Blocks() as demo:
    gr.Markdown("### 🕵️ AI Written Text Detector — Fakespot Model (80% Threshold)")

    # Input area and the button that triggers the analysis.
    text_input = gr.Textbox(label="Paste text", lines=14, placeholder="Your content…")
    btn = gr.Button("Analyze")

    # Output components; they receive the four values returned by analyze().
    verdict = gr.Label(label="Verdict (Overall)")
    score = gr.Label(label="AI Score (Average across sentences)")
    highlights = gr.HTML(label="Per-Sentence Highlights")
    table = gr.Dataframe(headers=["#", "Sentence", "AI_Prob", "Label"], wrap=True)

    # analyze() returns (verdict, score, html, dataframe); the outputs list
    # maps those values positionally onto the components above.
    btn.click(analyze, inputs=[text_input], outputs=[verdict, score, highlights, table])

# Start the Gradio server only when the file is run as a script.
if __name__ == "__main__":
    demo.launch()