File size: 5,302 Bytes
6d8431a
 
 
41a5821
6d8431a
49d2f3f
ceeca7d
0d83dcd
6d8431a
0d83dcd
6d8431a
 
 
814a384
 
0d83dcd
 
23b2adf
0d83dcd
814a384
41a5821
0d83dcd
ea83121
0d83dcd
6d8431a
 
814a384
23b2adf
6d8431a
814a384
6d8431a
 
 
 
 
ea83121
 
 
 
6d8431a
 
 
814a384
 
6d8431a
814a384
6d8431a
 
 
 
 
814a384
23b2adf
6d8431a
 
 
 
 
 
ea83121
 
23b2adf
6d8431a
 
23b2adf
6d8431a
0d83dcd
814a384
 
 
 
 
 
 
 
ea83121
 
814a384
6d8431a
 
 
 
 
 
814a384
 
 
6d8431a
814a384
6d8431a
814a384
 
6d8431a
 
814a384
6d8431a
 
814a384
 
 
 
 
 
 
 
 
ea83121
 
814a384
6d8431a
23b2adf
ea83121
 
 
6d8431a
814a384
6d8431a
23b2adf
814a384
 
23b2adf
 
6d8431a
ea83121
6d8431a
7f4b27e
6d8431a
7f4b27e
6d8431a
7f4b27e
ea83121
814a384
23b2adf
6d8431a
23b2adf
ea83121
 
 
6d8431a
ea83121
814a384
6d8431a
 
ea83121
 
6d8431a
b0b36a6
6d8431a
ea83121
6d8431a
 
23b2adf
b0b36a6
6d8431a
 
0d83dcd
6d8431a
 
 
 
0d83dcd
6d8431a
49d2f3f
 
6d8431a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import re
import pandas as pd
import gradio as gr

# -----------------------------
# MODEL (Fakespot 2025)
# -----------------------------
# Checkpoint identifier passed to both `from_pretrained` calls below.
MODEL_NAME = "fakespot-ai/roberta-base-ai-text-detection-v1"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Use CUDA when torch reports it as available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# bfloat16 only on a CUDA device that reports bf16 support; float32 everywhere else.
if device.type == "cuda" and torch.cuda.is_bf16_supported():
    dtype = torch.bfloat16
else:
    dtype = torch.float32

# NOTE(review): `dtype=` is assumed to be accepted by the installed transformers
# release (older releases spell it `torch_dtype=`) — confirm against the pinned version.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, dtype=dtype)
model = model.to(device)
model = model.eval()

# -----------------------------
# AI DECISION THRESHOLD (80%)
# -----------------------------
# Probability cut-off used by `analyze`: a score >= this value is labelled AI,
# both for the overall verdict and for each per-sentence label.
THRESHOLD = 0.80  # AI from 80% and above

# -----------------------------
# SENTENCE SPLITTING UTILITIES
# -----------------------------
# Abbreviations (lower-case, without their final dot) whose trailing period
# should not be treated as a sentence terminator.
ABBR = [
    "e.g", "i.e",
    "mr", "mrs", "ms", "dr", "prof",
    "vs", "etc", "fig", "al",
    "jr", "sr", "st",
    "no", "vol", "pp", "mt",
    "inc", "ltd", "co",
    "u.s", "u.k", "a.m", "p.m",
]

# Case-insensitive pattern: a word boundary, one of the abbreviations above
# (captured as group 1), then a literal dot.
ABBR_REGEX = re.compile(
    r"\b({})\.".format("|".join(re.escape(abbr) for abbr in ABBR)),
    flags=re.IGNORECASE,
)

def _protect(text: str) -> str:
    """Mask periods that are not sentence terminators.

    The input is stripped, runs of newlines (with surrounding whitespace) are
    collapsed to one space, and then three kinds of dots are replaced with
    placeholder tokens: a literal ``...``, a dot between two digits, and the
    dot following an abbreviation matched by ``ABBR_REGEX``.

    Returns an empty string when the stripped input is empty.
    """
    stripped = text.strip()
    if stripped == "":
        return ""

    single_line = re.sub(r"\s*\n+\s*", " ", stripped)

    # Order matters: ellipses are masked first so their dots are gone before
    # the decimal and abbreviation patterns run.
    masked = single_line.replace("...", "⟨ELLIPSIS⟩")
    masked = re.sub(r"(?<=\d)\.(?=\d)", "⟨DECIMAL⟩", masked)
    return ABBR_REGEX.sub(r"\1⟨ABBRDOT⟩", masked)

def _restore(text: str) -> str:
    """Turn the placeholder tokens back into the dots they stand for."""
    substitutions = (
        ("⟨ABBRDOT⟩", "."),
        ("⟨DECIMAL⟩", "."),
        ("⟨ELLIPSIS⟩", "..."),
    )
    for placeholder, literal in substitutions:
        text = text.replace(placeholder, literal)
    return text

def sentence_split(text: str):
    """Split text into a list of sentences.

    Non-terminating dots are masked with ``_protect`` first. The text is then
    cut after ``.``, ``?`` or ``!`` when the following whitespace is followed
    by an optional quote character and an upper-case ASCII letter or ``(``
    (or by the end of the string). Each piece keeps its terminator, has the
    masked dots put back with ``_restore`` and is stripped.

    Returns an empty list when there is nothing to split.
    """
    protected = _protect(text)
    if not protected:
        return []

    # With a single capture group, re.split returns
    # [text, terminator, text, terminator, ..., text].
    pieces = re.split(r"([.?!])\s+(?=(?:[\"“”‘’']?\s*[A-Z(])|$)", protected)
    bodies = pieces[0::2]
    terminators = pieces[1::2]

    # zip stops at the shorter sequence, so the final body is handled separately.
    candidates = [(body + mark).strip() for body, mark in zip(bodies, terminators)]
    candidates.append(bodies[-1].strip())

    return [_restore(item).strip() for item in candidates if item.strip()]

# -----------------------------
# GROUP SENTENCES (TURNITIN STYLE)
# -----------------------------
def group_sentences(sents, size=3):
    """Join consecutive sentences into space-separated chunks.

    Each chunk holds up to ``size`` sentences; the last chunk may be shorter.
    An empty input gives an empty list.
    """
    chunk_starts = range(0, len(sents), size)
    return [" ".join(sents[lo:lo + size]) for lo in chunk_starts]

# -----------------------------
# CORE ANALYSIS (3 SENTENCE WINDOWS)
# -----------------------------
def analyze(text, max_len=512):
    """Score text with the classifier and build the values shown in the UI.

    The text is split into sentences, consecutive sentences are grouped into
    chunks, every chunk is scored once by the model, and each sentence is
    given the score of the chunk it belongs to.

    Args:
        text: Raw text to analyse. ``None`` is treated like an empty string.
        max_len: Maximum number of tokens per chunk; longer chunks are
            truncated by the tokenizer.

    Returns:
        A 4-tuple ``(overall_label, overall_pct, html, df)``:
        the overall verdict string, the mean per-sentence score formatted as
        a percentage, an HTML fragment with one highlighted ``<div>`` per
        sentence, and a DataFrame with columns ``#``, ``Sentence``,
        ``AI_Prob`` and ``Label``. When no sentence is found the tuple is
        ``("—", "—", <hint message>, None)``.
    """
    # Function-scope import: sentence text is embedded in HTML below and must
    # be escaped. (A module named `html` must not be shadowed by a local.)
    from html import escape

    # Number of sentences scored together. Used for both grouping and the
    # per-sentence expansion so the two can never disagree.
    group_size = 3

    sents = sentence_split(text or "")
    if not sents:
        return "—", "—", "<em>Paste some text to analyze.</em>", None

    grouped = group_sentences(sents, size=group_size)
    clean_grouped = [re.sub(r"\s+", " ", g).strip() for g in grouped]

    # Tokenize all chunks as one padded batch and move it to the model's device.
    inputs = tokenizer(
        clean_grouped, return_tensors="pt",
        padding=True, truncation=True, max_length=max_len
    ).to(device)

    with torch.no_grad():
        logits = model(**inputs).logits
        # The model may run in bfloat16; do the softmax in float32 so the
        # reported probabilities are not rounded to bf16 precision.
        # Column 1 is taken as the AI class — assumes the checkpoint's label
        # index 1 means "AI"; confirm against the model config.
        chunk_probs = F.softmax(logits.float(), dim=-1)[:, 1].cpu().tolist()

    # Sentence i belongs to chunk i // group_size.
    ai_probs = [chunk_probs[i // group_size] for i in range(len(sents))]

    overall_ai = sum(ai_probs) / len(ai_probs)
    overall_pct = f"{overall_ai * 100:.1f}%"
    overall_label = (
        "🤖 Likely AI Written" if overall_ai >= THRESHOLD else "🧒 Likely Human Written"
    )

    rows, highlights = [], []
    for i, (orig, ai_p) in enumerate(zip(sents, ai_probs), start=1):
        pct = f"{ai_p * 100:.1f}%"
        label = "AI" if ai_p >= THRESHOLD else "Human"

        # Colour bands are independent of THRESHOLD: green / amber / red.
        if ai_p < 0.30:
            color = "#11823b"
        elif ai_p < 0.70:
            color = "#b8860b"
        else:
            color = "#b80d0d"

        # Escape user-supplied text so characters such as < > & are shown
        # literally instead of being interpreted as markup.
        safe_text = escape(re.sub(r"\s+", " ", orig))

        highlights.append(
            "<div style='margin:6px 0; padding:6px 8px; border-radius:6px;"
            "background:rgba(0,0,0,0.03)'>"
            f"<strong style='color:{color}'>[{pct} {label}]</strong> "
            f"{safe_text}</div>"
        )

        # The table keeps the original (unescaped) sentence text.
        rows.append([i, orig, round(ai_p, 4), label])

    df = pd.DataFrame(rows, columns=["#", "Sentence", "AI_Prob", "Label"])
    html_fragment = "\n".join(highlights)

    return overall_label, overall_pct, html_fragment, df

# -----------------------------
# GRADIO UI
# -----------------------------
# Build the Gradio interface. Components are created in the order below
# (presumably also the order in which they are rendered — confirm against the
# Gradio Blocks documentation).
with gr.Blocks() as demo:
    gr.Markdown("### 🕵️ AI Written Text Detector — Fakespot Model (80% Threshold)")

    # Input area and the button that triggers the analysis.
    text_input = gr.Textbox(label="Paste text", lines=14, placeholder="Your content…")
    btn = gr.Button("Analyze")

    # Output components; their order matches the 4-tuple returned by `analyze`.
    verdict = gr.Label(label="Verdict (Overall)")
    score = gr.Label(label="AI Score (Average across sentences)")
    highlights = gr.HTML(label="Per-Sentence Highlights")
    # Headers mirror the DataFrame columns built in `analyze`.
    table = gr.Dataframe(headers=["#", "Sentence", "AI_Prob", "Label"], wrap=True)

    # Clicking the button calls analyze(text_input) and routes its four return
    # values to the four output components, in order.
    btn.click(analyze, inputs=[text_input], outputs=[verdict, score, highlights, table])

# Start the app only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()