File size: 6,179 Bytes
6d8431a
 
 
41a5821
6d8431a
49d2f3f
ceeca7d
0d83dcd
6d8431a
0d83dcd
6d8431a
 
 
814a384
 
0d83dcd
21a21f1
41a5821
0d83dcd
96ab1a6
0d83dcd
96ab1a6
 
 
 
6d8431a
96ab1a6
21a21f1
 
96ab1a6
 
 
 
6d8431a
21a21f1
96ab1a6
 
 
 
 
21a21f1
96ab1a6
 
 
 
 
 
 
 
 
 
 
21a21f1
96ab1a6
 
 
 
 
 
21a21f1
96ab1a6
 
 
 
21a21f1
96ab1a6
 
 
21a21f1
96ab1a6
 
21a21f1
96ab1a6
21a21f1
 
96ab1a6
 
 
 
 
 
 
814a384
96ab1a6
814a384
 
96ab1a6
ea83121
 
96ab1a6
6d8431a
 
 
96ab1a6
 
 
26af59c
 
 
21a21f1
96ab1a6
26af59c
814a384
6d8431a
96ab1a6
 
 
 
6d8431a
 
 
21a21f1
814a384
96ab1a6
 
814a384
 
26af59c
814a384
96ab1a6
ea83121
26af59c
96ab1a6
26af59c
 
96ab1a6
23b2adf
26af59c
96ab1a6
 
 
 
 
 
 
26af59c
96ab1a6
 
26af59c
96ab1a6
 
 
26af59c
96ab1a6
 
 
 
 
 
 
 
26af59c
96ab1a6
 
 
 
 
26af59c
96ab1a6
 
 
 
 
 
 
 
26af59c
 
 
 
96ab1a6
26af59c
 
6d8431a
26af59c
ea83121
6d8431a
26af59c
6d8431a
 
96ab1a6
b0b36a6
96ab1a6
6d8431a
0d83dcd
96ab1a6
 
 
21a21f1
0d83dcd
6d8431a
49d2f3f
 
6d8431a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import re
import pandas as pd
import gradio as gr

# -----------------------------
# MODEL (Fakespot 2025)
# -----------------------------
# HF Hub checkpoint: binary sequence classifier (human vs AI text).
MODEL_NAME = "fakespot-ai/roberta-base-ai-text-detection-v1"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Use bf16 only on CUDA devices that support it; everything else runs fp32.
dtype = torch.bfloat16 if (device.type=="cuda" and torch.cuda.is_bf16_supported()) else torch.float32
# NOTE(review): the `dtype=` kwarg is only accepted by recent transformers
# releases (older versions spell it `torch_dtype=`) — confirm against the
# pinned transformers version. `.eval()` disables dropout for inference.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, dtype=dtype).to(device).eval()

# Overall AI-probability at or above which the verdict reads "Likely AI Written".
THRESHOLD = 0.80

# -----------------------------
# ABBREVIATION PROTECTION
# -----------------------------
# Abbreviations whose trailing period must NOT be treated as a sentence
# terminator by the splitter.
ABBR = [
    "e.g", "i.e", "mr", "mrs", "ms", "dr", "prof", "vs", "etc", "fig", "al",
    "jr", "sr", "st", "no", "vol", "pp", "mt", "inc", "ltd", "co",
    "u.s", "u.k", "a.m", "p.m"
]
# Case-insensitive alternation over the escaped abbreviations, each followed
# by a literal dot; group 1 captures the abbreviation so only the dot is
# rewritten by the substitution in _protect().
_alternatives = "|".join(re.escape(abbr) for abbr in ABBR)
ABBR_REGEX = re.compile(rf"\b({_alternatives})\.", re.IGNORECASE)

def _protect(text):
    """Mask dots that are not sentence terminators with sentinel tokens.

    Ellipses, decimal points, and abbreviation periods are replaced so the
    sentence splitter can treat every remaining '.' as a terminator;
    _restore() reverses the masking.
    """
    masked = text.replace("...", "⟨ELLIPSIS⟩")
    masked = re.sub(r"(?<=\d)\.(?=\d)", "⟨DECIMAL⟩", masked)
    return ABBR_REGEX.sub(r"\1⟨ABBRDOT⟩", masked)

def _restore(text):
    return (
        text.replace("⟨ABBRDOT⟩", ".")
            .replace("⟨DECIMAL⟩", ".")
            .replace("⟨ELLIPSIS⟩", "...")
    )

# -----------------------------
# PERFECT PARAGRAPH-PRESERVING SPLITTER
# -----------------------------
def split_preserving_structure(text):
    """
    Break *text* into an ordered list of fragments:
    - runs of newlines, kept verbatim so paragraph layout survives;
    - individual sentences (with their terminator) from each text run;
    - the whitespace that followed each sentence terminator.
    """
    fragments = []

    # The capture group makes re.split keep the newline runs as list items.
    for run in re.split(r"(\n+)", text):
        if run.startswith("\n"):
            fragments.append(run)  # exact paragraph spacing, untouched
            continue

        # Mask non-terminal dots, then split on terminator + whitespace.
        # With two groups, re.split yields [body, punct, gap, body, ...].
        pieces = re.split(r"([.?!])(\s+)", _protect(run))

        for start in range(0, len(pieces), 3):
            body = pieces[start]
            punct = pieces[start + 1] if start + 1 < len(pieces) else ""
            gap = pieces[start + 2] if start + 2 < len(pieces) else ""

            sentence = body + punct
            if sentence.strip():
                fragments.append(_restore(sentence))
            if gap:
                fragments.append(gap)

    return fragments


def extract_sentences_only(blocks):
    """Filter *blocks* down to real sentence fragments.

    Drops newline runs, pure-whitespace gaps, and empty strings; keeps
    everything else in order.
    """
    sentences = []
    for fragment in blocks:
        if fragment.startswith("\n") or fragment.isspace() or not fragment.strip():
            continue
        sentences.append(fragment)
    return sentences

# -----------------------------
# GROUPING
# -----------------------------
def group_sentences(sents, size=3):
    """Join consecutive sentences into space-separated windows of *size*.

    The final window may hold fewer than *size* sentences.
    """
    windows = []
    for start in range(0, len(sents), size):
        windows.append(" ".join(sents[start:start + size]))
    return windows

# -----------------------------
# ANALYSIS LOGIC
# -----------------------------
def analyze(text, max_len=512):
    """Run the detector over *text* and build the four Gradio outputs.

    Sentences are scored in 3-sentence windows; every sentence inherits its
    window's probability. Returns (verdict label, overall percentage string,
    highlighted HTML, per-sentence DataFrame), or placeholder strings and
    None for the table when the input contains no sentences.

    max_len caps tokenizer truncation per window (tokens, not characters).
    """

    # Structured block split
    blocks = split_preserving_structure(text)
    pure_sentences = extract_sentences_only(blocks)

    if not pure_sentences:
        return "—", "—", "<em>Paste text to analyze.</em>", None

    # Group into 3-sentence windows (Turnitin style)
    grouped = group_sentences(pure_sentences, 3)
    # Collapse internal whitespace so the model sees clean single-spaced text.
    clean_grouped = [re.sub(r"\s+", " ", g).strip() for g in grouped]

    # Run model
    inputs = tokenizer(clean_grouped, return_tensors="pt",
                       padding=True, truncation=True,
                       max_length=max_len).to(device)

    with torch.no_grad():
        logits = model(**inputs).logits
        # NOTE(review): assumes class index 1 is the "AI" label for this
        # checkpoint — confirm against model.config.id2label.
        chunk_probs = F.softmax(logits, dim=-1)[:, 1].cpu().tolist()

    # Expand group scores back to individual sentences
    # (each sentence in window idx gets that window's probability).
    ai_scores = []
    for idx, prob in enumerate(chunk_probs):
        start = idx * 3
        end = min(start + 3, len(pure_sentences))
        for _ in range(start, end):
            ai_scores.append(prob)

    # -----------------------------
    # RECONSTRUCT ORIGINAL TEXT W/ HIGHLIGHTING
    # -----------------------------
    highlighted = ""
    current_sentence = 0  # index into ai_scores, advanced per sentence block

    for block in blocks:

        # newline block → keep EXACT
        if block.startswith("\n"):
            highlighted += block
            continue

        # whitespace block → keep
        if block.isspace():
            highlighted += block
            continue

        # real sentence → highlight
        ai_p = ai_scores[current_sentence]
        current_sentence += 1

        pct = f"{ai_p * 100:.1f}%"

        # Traffic-light color: green below 0.30, amber below 0.70, else red.
        if ai_p < 0.30:
            color = "#11823b"
        elif ai_p < 0.70:
            color = "#b8860b"
        else:
            color = "#b80d0d"

        highlighted += (
            f"<span style='background:rgba(0,0,0,0.03); padding:3px 4px; "
            f"border-radius:4px;'><strong style='color:{color}'>[{pct}]</strong> "
            f"{block.strip()}</span>"
        )

        # maintain spacing after sentence
        # NOTE(review): this space is appended even when a preserved
        # whitespace block follows, so spacing can double — harmless in
        # rendered HTML where consecutive spaces collapse, but verify if
        # the output is ever used outside HTML.
        highlighted += " "

    # -----------------------------
    # OVERALL SCORE
    # -----------------------------
    # Simple mean over per-sentence scores; compared against THRESHOLD.
    overall = sum(ai_scores) / len(ai_scores)
    overall_pct = f"{overall * 100:.1f}%"
    overall_label = "🤖 Likely AI Written" if overall >= THRESHOLD else "🧒 Likely Human Written"

    # Table output
    df = pd.DataFrame(
        [[i + 1, s, ai_scores[i]] for i, s in enumerate(pure_sentences)],
        columns=["#", "Sentence", "AI_Prob"]
    )

    return overall_label, overall_pct, highlighted, df

# -----------------------------
# UI
# -----------------------------
with gr.Blocks() as demo:
    gr.Markdown("### 🕵️ AI Sentence-Level Detector — Exact Structure Highlighting")

    # Input controls.
    input_box = gr.Textbox(label="Paste text", lines=14, placeholder="Your text…")
    analyze_btn = gr.Button("Analyze")

    # Output widgets, created in render order.
    verdict_out = gr.Label(label="Verdict (Overall)")
    score_out = gr.Label(label="AI Score")
    html_out = gr.HTML(label="Highlighted Text (Exact Structure)")
    table_out = gr.Dataframe(headers=["#", "Sentence", "AI_Prob"], wrap=True)

    # Wire the button to the analysis function.
    analyze_btn.click(
        analyze,
        inputs=[input_box],
        outputs=[verdict_out, score_out, html_out, table_out],
    )

if __name__ == "__main__":
    demo.launch()