File size: 5,787 Bytes
6d8431a
 
 
41a5821
6d8431a
49d2f3f
ceeca7d
0d83dcd
6d8431a
0d83dcd
6d8431a
 
 
fdd45e5
814a384
0d83dcd
21a21f1
41a5821
0d83dcd
70fc9f3
0d83dcd
70fc9f3
fdd45e5
 
 
6d8431a
fdd45e5
70fc9f3
21a21f1
fdd45e5
21a21f1
70fc9f3
 
 
 
6d8431a
fdd45e5
21a21f1
70fc9f3
 
 
 
 
21a21f1
fdd45e5
70fc9f3
 
 
 
 
 
21a21f1
70fc9f3
 
8d27116
70fc9f3
 
 
21a21f1
70fc9f3
 
fdd45e5
 
21a21f1
70fc9f3
 
 
 
 
21a21f1
70fc9f3
21a21f1
fdd45e5
70fc9f3
 
 
 
 
 
fdd45e5
814a384
70fc9f3
814a384
 
70fc9f3
ea83121
fdd45e5
ea83121
70fc9f3
6d8431a
 
 
70fc9f3
 
26af59c
 
 
21a21f1
fdd45e5
26af59c
814a384
6d8431a
fdd45e5
70fc9f3
 
 
6d8431a
 
 
814a384
fdd45e5
 
 
70fc9f3
814a384
 
26af59c
814a384
70fc9f3
ea83121
26af59c
fdd45e5
26af59c
 
fdd45e5
23b2adf
26af59c
70fc9f3
 
 
 
26af59c
70fc9f3
 
26af59c
fdd45e5
 
 
 
 
 
 
70fc9f3
 
 
fdd45e5
70fc9f3
fdd45e5
70fc9f3
fdd45e5
96ab1a6
70fc9f3
fdd45e5
 
8d27116
70fc9f3
668274d
70fc9f3
fdd45e5
70fc9f3
 
 
26af59c
 
 
70fc9f3
26af59c
 
6d8431a
26af59c
ea83121
fdd45e5
6d8431a
fdd45e5
6d8431a
 
fdd45e5
b0b36a6
70fc9f3
6d8431a
0d83dcd
70fc9f3
 
 
21a21f1
0d83dcd
6d8431a
49d2f3f
 
6d8431a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import re
import pandas as pd
import gradio as gr

# -----------------------------
# MODEL (Fakespot 2025)
# -----------------------------
# Binary sequence classifier; downstream code reads class index 1 as the
# "AI-written" probability — TODO confirm against the model card.
MODEL_NAME = "fakespot-ai/roberta-base-ai-text-detection-v1"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Prefer GPU when available; use bfloat16 only on GPUs that support it,
# otherwise fall back to full float32.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dtype = torch.bfloat16 if (device.type == "cuda" and torch.cuda.is_bf16_supported()) else torch.float32
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, dtype=dtype).to(device).eval()

# Overall verdict cut-off: mean AI probability >= THRESHOLD => "Likely AI Written".
THRESHOLD = 0.80

# -----------------------------
# ABBREVIATION PROTECTION
# -----------------------------
# Abbreviations whose trailing dot must NOT be treated as a sentence
# terminator. Entries are listed without the final dot and matched
# case-insensitively by ABBR_REGEX below.
# NOTE(review): short entries such as "no", "st", and "co" can also match
# ordinary words at the end of a sentence — verify with representative input.
ABBR = [
    "e.g", "i.e", "mr", "mrs", "ms", "dr", "prof", "vs", "etc",
    "fig", "al", "jr", "sr", "st", "no", "vol", "pp", "mt",
    "inc", "ltd", "co", "u.s", "u.k", "a.m", "p.m"
]

# Matches any listed abbreviation followed by a literal dot (e.g. "Dr.");
# group 1 captures the abbreviation so the dot alone can be replaced.
ABBR_REGEX = re.compile(r"\b(" + "|".join(map(re.escape, ABBR)) + r")\.", re.IGNORECASE)


def _protect(text):
    """Mask dots that do not end a sentence with placeholder tokens.

    Ellipses, decimal points inside numbers, and abbreviation dots are
    replaced with unique markers so the sentence splitter will not break
    on them; `_restore` reverses the substitution.
    """
    masked = text.replace("...", "⟨ELLIPSIS⟩")
    masked = re.sub(r"(?<=\d)\.(?=\d)", "⟨DECIMAL⟩", masked)
    return ABBR_REGEX.sub(r"\1⟨ABBRDOT⟩", masked)


def _restore(text):
    return (
        text.replace("⟨ABBRDOT⟩", ".")
            .replace("⟨DECIMAL⟩", ".")
            .replace("⟨ELLIPSIS⟩", "...")
    )


# -----------------------------
# PERFECT PARAGRAPH-PRESERVING SPLITTER
# -----------------------------
def split_preserving_structure(text):
    """Split *text* into sentence and separator fragments, in order.

    Newline runs and inter-sentence whitespace are emitted as their own
    list entries, so concatenating the returned fragments reproduces the
    original layout exactly.
    """
    fragments = []

    # Capturing the \n+ separators keeps paragraph breaks in the output.
    for segment in re.split(r"(\n+)", text):
        if segment.startswith("\n"):
            fragments.append(segment)
            continue

        # re.split with two capture groups yields repeating triples of
        # (sentence body, terminator punctuation, trailing whitespace).
        pieces = re.split(r"([.?!])(\s+)", _protect(segment))
        for start in range(0, len(pieces), 3):
            body = pieces[start]
            terminator = pieces[start + 1] if start + 1 < len(pieces) else ""
            trailing = pieces[start + 2] if start + 2 < len(pieces) else ""

            candidate = body + terminator
            if candidate.strip():
                fragments.append(_restore(candidate))
            if trailing:
                fragments.append(trailing)

    return fragments


def extract_sentences_only(blocks):
    """Filter a fragment list down to the actual sentences.

    Drops newline separators, pure-whitespace entries, and anything that
    strips to empty, preserving the remaining order.
    """
    sentences = []
    for fragment in blocks:
        if not fragment.strip() or fragment.startswith("\n") or fragment.isspace():
            continue
        sentences.append(fragment)
    return sentences


# -----------------------------
# GROUPING
# -----------------------------
def group_sentences(sents, size=3):
    """Join consecutive sentences into space-separated windows of *size*.

    The final window may be shorter when len(sents) is not a multiple of
    *size*; an empty input yields an empty list.
    """
    windows = []
    for start in range(0, len(sents), size):
        windows.append(" ".join(sents[start:start + size]))
    return windows


# -----------------------------
# ANALYSIS LOGIC
# -----------------------------
def analyze(text, max_len=512):
    """Score *text* sentence-by-sentence for AI authorship.

    Sentences are scored in 3-sentence windows (each sentence then
    inherits its window's probability), and the original whitespace
    structure is rebuilt around highlighted spans.

    Args:
        text: Raw user text, possibly multi-paragraph.
        max_len: Tokenizer truncation length per window.

    Returns:
        (verdict label, overall percentage string, highlighted HTML,
        DataFrame of per-sentence probabilities) — the four Gradio outputs.
    """

    blocks = split_preserving_structure(text)
    pure_sentences = extract_sentences_only(blocks)

    # Nothing to score: still return the four values the UI expects.
    if not pure_sentences:
        return "—", "—", "<em>Paste text to analyze.</em>", None

    # Group into 3-sentence windows; collapse internal whitespace before
    # tokenization so layout does not affect the model input.
    grouped = group_sentences(pure_sentences, 3)
    clean_grouped = [re.sub(r"\s+", " ", g).strip() for g in grouped]

    # Model forward pass (all windows batched together)
    inputs = tokenizer(clean_grouped, return_tensors="pt",
                       padding=True, truncation=True,
                       max_length=max_len).to(device)

    with torch.no_grad():
        logits = model(**inputs).logits

    # Class index 1 is read as the "AI" probability — presumably per the
    # model card; verify against the model's label mapping.
    chunk_probs = F.softmax(logits, dim=-1)[:, 1].cpu().tolist()

    # expand back: every sentence in a window gets the window's probability,
    # so ai_scores lines up one-to-one with pure_sentences.
    ai_scores = []
    for idx, prob in enumerate(chunk_probs):
        start = idx * 3
        end = min(start + 3, len(pure_sentences))
        for _ in range(start, end):
            ai_scores.append(prob)

    # -----------------------------
    # RECONSTRUCTION WITH HIGHLIGHT
    # -----------------------------
    # Walk the original fragment list: separators pass through verbatim,
    # each sentence fragment consumes the next score in order.
    highlighted = ""
    sentence_index = 0

    for block in blocks:
        if block.startswith("\n"):
            highlighted += block
            continue

        if block.isspace():
            highlighted += block
            continue

        # safety: clamp to the last score if counts ever drift apart
        if sentence_index >= len(ai_scores):
            ai_p = ai_scores[-1]
        else:
            ai_p = ai_scores[sentence_index]
        sentence_index += 1

        pct = f"{ai_p * 100:.1f}%"

        # Traffic-light coloring: green < 0.30 <= amber < 0.70 <= red.
        if ai_p < 0.30:
            color = "#11823b"
        elif ai_p < 0.70:
            color = "#b8860b"
        else:
            color = "#b80d0d"

        highlighted += (
            f"<span style='background:rgba(0,0,0,0.03); padding:3px 4px; "
            f"border-radius:4px;'><strong style='color:{color}'>[{pct}]</strong> "
            f"{block.strip()}</span> "
        )

    # -----------------------------
    # OVERALL SCORE
    # -----------------------------
    # Unweighted mean of the per-sentence probabilities.
    overall = sum(ai_scores) / len(ai_scores)
    overall_pct = f"{overall * 100:.1f}%"
    overall_label = "🤖 Likely AI Written" if overall >= THRESHOLD else "🧒 Likely Human Written"

    df = pd.DataFrame(
        [[i + 1, s, ai_scores[i]] for i, s in enumerate(pure_sentences)],
        columns=["#", "Sentence", "AI_Prob"]
    )

    return overall_label, overall_pct, highlighted, df


# -----------------------------
# UI
# -----------------------------
# Gradio front-end: one textbox in, four outputs (verdict, score,
# highlighted HTML reconstruction, per-sentence table) wired to analyze().
with gr.Blocks() as demo:
    gr.Markdown("### 🕵️ AI Sentence-Level Detector — Exact Structure Highlighting")

    text_input = gr.Textbox(label="Paste text", lines=14, placeholder="Your text…")
    btn = gr.Button("Analyze")

    verdict = gr.Label(label="Verdict (Overall)")
    score = gr.Label(label="AI Score")
    highlights = gr.HTML(label="Highlighted Text (Exact Structure)")
    table = gr.Dataframe(headers=["#", "Sentence", "AI_Prob"], wrap=True)

    # Output order must match analyze()'s return tuple.
    btn.click(analyze, inputs=[text_input], outputs=[verdict, score, highlights, table])

if __name__ == "__main__":
    demo.launch()