File size: 7,732 Bytes
ea2c6a2
 
38debf0
ea2c6a2
 
 
24abfdf
ea2c6a2
 
 
c059497
ea2c6a2
 
205f405
ea2c6a2
 
 
 
205f405
 
 
38debf0
dfecc14
205f405
ea2c6a2
 
 
34ea950
ea2c6a2
 
c1d0bb0
ea2c6a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205f405
ea2c6a2
 
205f405
7e7e4c6
 
ea2c6a2
205f405
7e7e4c6
205f405
 
 
7e7e4c6
ea2c6a2
 
 
205f405
ea2c6a2
 
 
 
f6d6e24
ea2c6a2
205f405
ea2c6a2
205f405
ea2c6a2
 
 
 
 
 
205f405
 
24abfdf
 
 
 
 
205f405
 
 
 
 
 
 
24abfdf
 
205f405
ea2c6a2
 
 
 
205f405
 
 
 
ea2c6a2
f6d6e24
ea2c6a2
 
 
 
205f405
ea2c6a2
 
205f405
 
 
 
 
 
 
 
ea2c6a2
205f405
 
 
 
ea2c6a2
 
 
41bba56
205f405
 
 
c1d0bb0
205f405
 
ea2c6a2
 
 
 
205f405
 
 
f6d6e24
ea2c6a2
 
205f405
 
 
 
 
ea2c6a2
205f405
 
 
 
 
 
ea2c6a2
 
205f405
 
ea2c6a2
 
 
205f405
ea2c6a2
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
import html
import re

import gradio as gr
import pandas as pd
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# -----------------------------
# MODEL INITIALIZATION
# -----------------------------
# Hugging Face identifier of the checkpoint that get_model() loads.
MODEL_NAME = "desklib/ai-text-detector-v1.01"

# Both stay None until get_model() populates them on first use.
tokenizer = model = None

# Use the GPU when torch reports one, otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

def get_model():
    """Return the shared ``(tokenizer, model)`` pair, loading both on first call.

    The loaded objects are stored in the module-level ``tokenizer`` and
    ``model`` globals, so later calls return them without loading again.
    The model is moved to ``device`` and switched to eval mode.

    NOTE(review): ``ignore_mismatched_sizes=True`` lets loading continue when
    checkpoint tensor shapes differ from the instantiated model's - confirm
    the classification head really comes from the checkpoint.
    """
    global tokenizer, model

    # Already loaded: hand back the cached pair.
    if model is not None:
        return tokenizer, model

    print(f"Loading Specialized Model: {MODEL_NAME} on {device}")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    # Keep the checkpoint's own label count; analyze() copes with both a
    # single-logit head and a multi-label head.
    loaded = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        ignore_mismatched_sizes=True,
    )
    model = loaded.to(device).eval()
    return tokenizer, model

# Per-sentence scores at or above this value are drawn in red in the heatmap
# built by analyze(); the UI header also quotes it as a percentage.
THRESHOLD = 0.59 

# -----------------------------
# UTILITIES
# -----------------------------
# Abbreviations (written without their trailing dot) whose final period must
# not be treated as the end of a sentence.
ABBR = [
    "e.g", "i.e",
    "mr", "mrs", "ms", "dr", "prof",
    "vs", "etc", "fig", "al",
    "jr", "sr", "st",
    "inc", "ltd",
    "u.s", "u.k",
]

# Case-insensitive match of any abbreviation at a word boundary followed by a
# literal dot; group 1 captures the abbreviation itself.
ABBR_REGEX = re.compile(
    r"\b(" + "|".join(re.escape(abbr) for abbr in ABBR) + r")\.",
    re.IGNORECASE,
)

def _protect(text):
    """Mask dots that do not end a sentence with placeholder tokens.

    Three kinds of dot are replaced, in this order: a literal ``...``,
    a dot sitting between two digits, and the dot that follows an
    abbreviation matched by ``ABBR_REGEX``. ``_restore`` reverses the masking.
    """
    masked = text.replace("...", "⟨ELLIPSIS⟩")
    # Look-behind/look-ahead keep the surrounding digits untouched.
    masked = re.sub(r"(?<=\d)\.(?=\d)", "⟨DECIMAL⟩", masked)
    return ABBR_REGEX.sub(r"\1⟨ABBRDOT⟩", masked)

def _restore(text):
    return text.replace("⟨ABBRDOT⟩", ".").replace("⟨DECIMAL⟩", ".").replace("⟨ELLIPSIS⟩", "...")

def split_preserving_structure(text):
    """Break ``text`` into an ordered list of sentence, whitespace and newline pieces.

    Runs of newlines are kept as their own pieces. Every other stretch of
    text is split after ``.``, ``?`` or ``!`` when whitespace follows; the
    punctuation stays attached to its sentence and the whitespace that
    followed it becomes a separate piece. Dots masked by ``_protect`` do not
    trigger a split and are put back by ``_restore``.
    """
    pieces = []
    for chunk in re.split(r"(\n+)", text):
        if chunk.startswith("\n"):
            pieces.append(chunk)
            continue

        # The split yields sentence, punctuation, whitespace, sentence, ...
        tokens = re.split(r"([.?!])(\s+)", _protect(chunk))
        # Pad so the trailing sentence also forms a full triple.
        tokens.extend([""] * (-len(tokens) % 3))

        for body, mark, gap in zip(tokens[0::3], tokens[1::3], tokens[2::3]):
            if body.strip():
                pieces.append(_restore(body + mark))
            if gap:
                pieces.append(gap)
    return pieces

# -----------------------------
# ANALYSIS
# -----------------------------
def _score_windows(tok, mod, windows, batch_size=8):
    """Return one AI probability per window, running the model in small batches."""
    probs = []
    for start in range(0, len(windows), batch_size):
        batch = windows[start:start + batch_size]
        inputs = tok(batch, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
        logits = mod(**inputs).logits

        if logits.shape[1] > 1:
            # Multi-label head: assumes label index 1 is the "AI" class.
            batch_probs = F.softmax(logits, dim=-1)[:, 1]
        else:
            # Single-logit head: squash the logit into a probability.
            batch_probs = torch.sigmoid(logits).flatten()
        probs.extend(batch_probs.cpu().tolist())
    return probs


def _render_heatmap(blocks, prob_map):
    """Build the heatmap HTML from the text pieces and their per-piece scores."""
    parts = [
        "<div style='font-family: -apple-system, BlinkMacSystemFont, \"Segoe UI\", Roboto, sans-serif; line-height: 1.8;'>"
    ]
    for i, block in enumerate(blocks):
        if block.startswith("\n") or block.isspace():
            parts.append(block.replace("\n", "<br>"))
            continue

        # The text comes straight from the user: escape it so characters such
        # as "<" or "&" are displayed instead of being parsed as markup.
        safe_block = html.escape(block)
        if i not in prob_map:
            parts.append(safe_block)
            continue

        score = prob_map[i]
        # Colour depends on which side of THRESHOLD the score falls.
        if score >= THRESHOLD:
            color, bg = "#d32f2f", "rgba(211, 47, 47, 0.12)"  # Soft Red
            border = "2px solid #d32f2f"
        else:
            color, bg = "#2e7d32", "rgba(46, 125, 50, 0.08)"  # Soft Green
            border = "1px solid transparent"

        parts.append(
            f"<span style='background:{bg}; padding:1px 2px; border-radius:3px; border-bottom: {border}; cursor: help;' "
            f"title='AI Confidence: {score:.2%}'>"
            f"<span style='color:{color}; font-weight: bold; font-size: 0.75em; vertical-align: super; margin-right: 2px;'>{score:.0%}</span>"
            f"{safe_block}</span>"
        )
    parts.append("</div>")
    return "".join(parts)


@torch.inference_mode()
def analyze(text, min_words=300):
    """Score ``text`` sentence by sentence and build the UI outputs.

    Args:
        text: Raw user input. ``None`` is treated like an empty string.
        min_words: Minimum number of whitespace-separated words required
            before the model is run; shorter inputs get a warning instead.

    Returns:
        A 4-tuple ``(verdict, score, html, table)``: the verdict label, the
        weighted score as a percentage string, the heatmap (or a message) as
        HTML, and a ``pandas.DataFrame`` with one row per sentence. ``table``
        is ``None`` whenever no analysis was performed (empty input, too
        short, model failed to load, no sentences found).
    """
    text = (text or "").strip()
    if not text:
        return "—", "—", "<em>Please enter text...</em>", None

    word_count = len(text.split())
    if word_count < min_words:
        # Report the limit that is actually enforced so the message can never
        # contradict the check above.
        warning_msg = (
            f"⚠️ <b>Insufficient Text:</b> Your input has {word_count} words. "
            f"Please enter at least {min_words} words for accurate results."
        )
        return (
            "Too Short",
            "N/A",
            f"<div style='color: #b80d0d; padding: 20px; border: 1px solid #b80d0d; border-radius: 8px;'>{warning_msg}</div>",
            None,
        )

    try:
        tok, mod = get_model()
    except Exception as e:
        # UI boundary: surface the failure instead of crashing the app. The
        # message is escaped because it is rendered as HTML.
        return "ERROR", "0%", f"Failed to load model: {html.escape(str(e))}", None

    blocks = split_preserving_structure(text)
    pure_sents_indices = [i for i, b in enumerate(blocks) if b.strip() and not b.startswith("\n")]
    pure_sents = [blocks[i] for i in pure_sents_indices]

    if not pure_sents:
        return "—", "—", "<em>No sentences detected.</em>", None

    # Each sentence is scored together with its previous and next neighbour
    # (where they exist), i.e. a window of up to three sentences.
    windows = [
        " ".join(pure_sents[max(0, i - 1):i + 2])
        for i in range(len(pure_sents))
    ]
    probs = _score_windows(tok, mod, windows)

    # Final score: per-sentence probabilities weighted by sentence word count.
    lengths = [len(s.split()) for s in pure_sents]
    total_words = sum(lengths)
    weighted_avg = sum(p * l for p, l in zip(probs, lengths)) / total_words if total_words > 0 else 0

    highlighted_html = _render_heatmap(blocks, dict(zip(pure_sents_indices, probs)))

    label = f"{weighted_avg:.1%} AI Written"
    display_score = f"{weighted_avg:.2%}"
    df = pd.DataFrame({"Sentence": pure_sents, "AI Confidence": [f"{p:.2%}" for p in probs]})

    return label, display_score, highlighted_html, df

# -----------------------------
# GRADIO INTERFACE
# -----------------------------
# Page layout: input box and buttons on the left, verdict and legend on the
# right, and the detailed results in two tabs underneath.
with gr.Blocks(theme=gr.themes.Soft(), title="AI Detector Pro") as demo:
    gr.Markdown("# 🕵️ AI Detector Pro")
    gr.Markdown(f"Utilizing **{MODEL_NAME}**. Values above **{THRESHOLD*100:.0f}%** are flagged as highly likely AI.")
    
    with gr.Row():
        with gr.Column(scale=3):
            # The placeholder quotes the same 300-word minimum that analyze()
            # enforces, so users are not told a shorter text is acceptable.
            text_input = gr.Textbox(label="Input Text", lines=15, placeholder="Paste your essay here (minimum 300 words for accuracy)...")
            with gr.Row():
                clear_btn = gr.Button("Clear")
                run_btn = gr.Button("Analyze Text", variant="primary")
        
        with gr.Column(scale=1):
            verdict_out = gr.Label(label="Global Verdict")
            score_out = gr.Label(label="Weighted Probability")
            gr.Markdown("---")
            gr.Markdown("### How to read:")
            gr.Markdown("- **Red Highlight:** High AI probability\n- **Green Highlight:** Likely Human\n- **Super-script:** Exact sentence-level AI score")
    
    with gr.Tabs():
        with gr.TabItem("Visual Heatmap"):
            html_out = gr.HTML(label="Heatmap")
        with gr.TabItem("Data Breakdown"):
            table_out = gr.Dataframe(headers=["Sentence", "AI Confidence"], wrap=True)
            
    # analyze() returns (verdict, score, html, table) in exactly this order.
    run_btn.click(analyze, inputs=text_input, outputs=[verdict_out, score_out, html_out, table_out])
    # One reset value per output component, input box included.
    clear_btn.click(lambda: ["", "", "", "", None], outputs=[text_input, verdict_out, score_out, html_out, table_out])

# Start the web server only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()