Spaces:
Sleeping
Sleeping
File size: 7,732 Bytes
ea2c6a2 38debf0 ea2c6a2 24abfdf ea2c6a2 c059497 ea2c6a2 205f405 ea2c6a2 205f405 38debf0 dfecc14 205f405 ea2c6a2 34ea950 ea2c6a2 c1d0bb0 ea2c6a2 205f405 ea2c6a2 205f405 7e7e4c6 ea2c6a2 205f405 7e7e4c6 205f405 7e7e4c6 ea2c6a2 205f405 ea2c6a2 f6d6e24 ea2c6a2 205f405 ea2c6a2 205f405 ea2c6a2 205f405 24abfdf 205f405 24abfdf 205f405 ea2c6a2 205f405 ea2c6a2 f6d6e24 ea2c6a2 205f405 ea2c6a2 205f405 ea2c6a2 205f405 ea2c6a2 41bba56 205f405 c1d0bb0 205f405 ea2c6a2 205f405 f6d6e24 ea2c6a2 205f405 ea2c6a2 205f405 ea2c6a2 205f405 ea2c6a2 205f405 ea2c6a2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 | import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import re
import pandas as pd
import gradio as gr
# -----------------------------
# MODEL INITIALIZATION
# -----------------------------
# Hugging Face model id of the specialized AI-text detector.
MODEL_NAME = "desklib/ai-text-detector-v1.01"
# Tokenizer and model are loaded lazily by get_model() on first use;
# None means "not loaded yet".
tokenizer = None
model = None
# Run inference on GPU when one is available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def get_model():
    """Lazily load the detector tokenizer/model pair, caching them as module globals.

    Returns:
        tuple: (tokenizer, model) — the model already moved to `device` and in eval mode.
    """
    global tokenizer, model
    # Guard clause: reuse the cached pair after the first load.
    if model is not None:
        return tokenizer, model
    print(f"Loading Specialized Model: {MODEL_NAME} on {device}")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    # Load with default labels; if the model has 2 (Human/AI), we handle it in analyze()
    model = (
        AutoModelForSequenceClassification.from_pretrained(
            MODEL_NAME,
            ignore_mismatched_sizes=True,
        )
        .to(device)
        .eval()
    )
    return tokenizer, model
# Sentences whose AI probability meets or exceeds this value are flagged as AI.
THRESHOLD = 0.59
# -----------------------------
# UTILITIES
# -----------------------------
# Abbreviations (without their trailing dot) whose period must NOT be treated
# as a sentence terminator by the splitter.
ABBR = ["e.g", "i.e", "mr", "mrs", "ms", "dr", "prof", "vs", "etc", "fig", "al", "jr", "sr", "st", "inc", "ltd", "u.s", "u.k"]
# Matches any of the abbreviations above followed by a literal dot, case-insensitive;
# each entry is re.escape()d so the internal dots in e.g./i.e./u.s./u.k match literally.
ABBR_REGEX = re.compile(r"\b(" + "|".join(map(re.escape, ABBR)) + r")\.", re.IGNORECASE)
def _protect(text):
    """Mask dots that do not end a sentence (ellipses, decimals, abbreviations)
    with placeholder tokens so the sentence splitter will not break on them."""
    masked = text.replace("...", "⟨ELLIPSIS⟩")
    # A dot with a digit on both sides is a decimal point, not a terminator.
    masked = re.sub(r"(?<=\d)\.(?=\d)", "⟨DECIMAL⟩", masked)
    # Abbreviation dots (e.g., "Dr.", "etc.") are masked via the shared regex.
    return ABBR_REGEX.sub(r"\1⟨ABBRDOT⟩", masked)
def _restore(text):
return text.replace("⟨ABBRDOT⟩", ".").replace("⟨DECIMAL⟩", ".").replace("⟨ELLIPSIS⟩", "...")
def split_preserving_structure(text):
    """Split *text* into sentences while keeping newline runs as separate items,
    so the original paragraph layout can be reconstructed later.

    Returns a flat list whose items are either sentences (with their terminal
    punctuation), inter-sentence whitespace, or newline runs.
    """
    pieces = []
    for chunk in re.split(r"(\n+)", text):
        # Newline runs are passed through untouched to preserve structure.
        if chunk.startswith("\n"):
            pieces.append(chunk)
            continue
        guarded = _protect(chunk)
        # Split on terminators; the two capture groups make segments come in
        # triples of (sentence, punctuation, trailing whitespace).
        segments = re.split(r"([.?!])(\s+)", guarded)
        for j in range(0, len(segments), 3):
            body = segments[j]
            mark = segments[j + 1] if j + 1 < len(segments) else ""
            gap = segments[j + 2] if j + 2 < len(segments) else ""
            if body.strip():
                pieces.append(_restore(body + mark))
            if gap:
                pieces.append(gap)
    return pieces
# -----------------------------
# ANALYSIS
# -----------------------------
@torch.inference_mode()
def analyze(text):
    """Score *text* for AI authorship at sentence level and build the UI outputs.

    Returns a 4-tuple consumed by the Gradio handlers:
        (verdict label str, display score str, highlighted HTML str, DataFrame or None)
    """
    text = text.strip()
    if not text:
        return "—", "—", "<em>Please enter text...</em>", None
    word_count = len(text.split())
    # NOTE(review): the gate is 300 words but the message says "250-300" — the
    # comment below suggests the gate was meant to be lowered; confirm intent.
    if word_count < 300: # Slightly lowered for testing flexibility
        warning_msg = f"⚠️ <b>Insufficient Text:</b> Your input has {word_count} words. Please enter at least 250-300 words for accurate results."
        return "Too Short", "N/A", f"<div style='color: #b80d0d; padding: 20px; border: 1px solid #b80d0d; border-radius: 8px;'>{warning_msg}</div>", None
    try:
        tok, mod = get_model()
    except Exception as e:
        # Surface load failures in the UI rather than crashing the handler.
        return "ERROR", "0%", f"Failed to load model: {str(e)}", None
    # blocks interleaves sentences, whitespace and newline runs; keep the
    # indices of the real sentences so scores can be mapped back for rendering.
    blocks = split_preserving_structure(text)
    pure_sents_indices = [i for i, b in enumerate(blocks) if b.strip() and not b.startswith("\n")]
    pure_sents = [blocks[i] for i in pure_sents_indices]
    if not pure_sents:
        return "—", "—", "<em>No sentences detected.</em>", None
    # Sliding Window Generation (Context of 3 sentences)
    # One window per sentence: the sentence plus its immediate neighbours,
    # so each score reflects local context, not the sentence in isolation.
    windows = []
    for i in range(len(pure_sents)):
        start = max(0, i - 1)
        end = min(len(pure_sents), i + 2)
        windows.append(" ".join(pure_sents[start:end]))
    # --- BATCHED INFERENCE (Prevents OOM) ---
    batch_size = 8
    probs = []
    for i in range(0, len(windows), batch_size):
        batch = windows[i : i + batch_size]
        inputs = tok(batch, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
        output = mod(**inputs)
        # Check if model is binary classification (2 labels) or regression (1 label)
        if output.logits.shape[1] > 1:
            # Assumes Label 1 is 'AI'
            batch_probs = F.softmax(output.logits, dim=-1)[:, 1].cpu().numpy().tolist()
        else:
            batch_probs = torch.sigmoid(output.logits).cpu().numpy().flatten().tolist()
        probs.extend(batch_probs)
    # Calculation for Final Score
    # Length-weighted mean: longer sentences contribute proportionally more.
    lengths = [len(s.split()) for s in pure_sents]
    total_words = sum(lengths)
    weighted_avg = sum(p * l for p, l in zip(probs, lengths)) / total_words if total_words > 0 else 0
    # -----------------------------
    # HTML RECONSTRUCTION
    # -----------------------------
    highlighted_html = "<div style='font-family: -apple-system, BlinkMacSystemFont, \"Segoe UI\", Roboto, sans-serif; line-height: 1.8;'>"
    # Map block index -> probability so the walk below can recolor sentences
    # while copying whitespace/newline blocks through verbatim.
    prob_map = {idx: probs[i] for i, idx in enumerate(pure_sents_indices)}
    for i, block in enumerate(blocks):
        if block.startswith("\n") or block.isspace():
            highlighted_html += block.replace("\n", "<br>")
            continue
        if i in prob_map:
            score = prob_map[i]
            # Color logic based on Threshold
            if score >= THRESHOLD:
                color, bg = "#d32f2f", "rgba(211, 47, 47, 0.12)" # Soft Red
                border = "2px solid #d32f2f"
            else:
                color, bg = "#2e7d32", "rgba(46, 125, 50, 0.08)" # Soft Green
                border = "1px solid transparent"
            highlighted_html += (
                f"<span style='background:{bg}; padding:1px 2px; border-radius:3px; border-bottom: {border}; cursor: help;' "
                f"title='AI Confidence: {score:.2%}'>"
                f"<span style='color:{color}; font-weight: bold; font-size: 0.75em; vertical-align: super; margin-right: 2px;'>{score:.0%}</span>"
                f"{block}</span>"
            )
        else:
            highlighted_html += block
    highlighted_html += "</div>"
    label = f"{weighted_avg:.1%} AI Written"
    display_score = f"{weighted_avg:.2%}"
    # Per-sentence breakdown for the "Data Breakdown" tab.
    df = pd.DataFrame({"Sentence": pure_sents, "AI Confidence": [f"{p:.2%}" for p in probs]})
    return label, display_score, highlighted_html, df
# -----------------------------
# GRADIO INTERFACE
# -----------------------------
# Declarative UI: left column holds input + action buttons, right column the
# verdict/score labels and a legend; results render in two tabs below.
with gr.Blocks(theme=gr.themes.Soft(), title="AI Detector Pro") as demo:
    gr.Markdown("# 🕵️ AI Detector Pro")
    gr.Markdown(f"Utilizing **{MODEL_NAME}**. Values above **{THRESHOLD*100:.0f}%** are flagged as highly likely AI.")
    with gr.Row():
        with gr.Column(scale=3):
            text_input = gr.Textbox(label="Input Text", lines=15, placeholder="Paste your essay here (minimum 250 words for accuracy)...")
            with gr.Row():
                clear_btn = gr.Button("Clear")
                run_btn = gr.Button("Analyze Text", variant="primary")
        with gr.Column(scale=1):
            verdict_out = gr.Label(label="Global Verdict")
            score_out = gr.Label(label="Weighted Probability")
            gr.Markdown("---")
            gr.Markdown("### How to read:")
            gr.Markdown("- **Red Highlight:** High AI probability\n- **Green Highlight:** Likely Human\n- **Super-script:** Exact sentence-level AI score")
    with gr.Tabs():
        with gr.TabItem("Visual Heatmap"):
            html_out = gr.HTML(label="Heatmap")
        with gr.TabItem("Data Breakdown"):
            table_out = gr.Dataframe(headers=["Sentence", "AI Confidence"], wrap=True)
    # analyze() returns (label, score, html, dataframe), matching outputs in order.
    run_btn.click(analyze, inputs=text_input, outputs=[verdict_out, score_out, html_out, table_out])
    # Clear resets all five components: four text-like outputs plus the table (None).
    clear_btn.click(lambda: ["", "", "", "", None], outputs=[text_input, verdict_out, score_out, html_out, table_out])
if __name__ == "__main__":
    demo.launch()