File size: 5,787 Bytes
6d8431a
 
 
41a5821
6d8431a
49d2f3f
ceeca7d
0d83dcd
6d8431a
0d83dcd
6d8431a
 
 
fdd45e5
814a384
0d83dcd
21a21f1
41a5821
0d83dcd
70fc9f3
0d83dcd
70fc9f3
fdd45e5
 
 
6d8431a
fdd45e5
70fc9f3
21a21f1
fdd45e5
21a21f1
70fc9f3
 
 
 
6d8431a
fdd45e5
21a21f1
70fc9f3
 
 
 
 
21a21f1
fdd45e5
70fc9f3
 
 
 
 
 
21a21f1
70fc9f3
 
8d27116
70fc9f3
 
 
21a21f1
70fc9f3
 
fdd45e5
 
21a21f1
70fc9f3
 
 
 
 
21a21f1
70fc9f3
21a21f1
fdd45e5
70fc9f3
 
 
 
 
 
fdd45e5
814a384
70fc9f3
814a384
 
70fc9f3
ea83121
fdd45e5
ea83121
70fc9f3
6d8431a
 
 
70fc9f3
 
26af59c
 
 
21a21f1
fdd45e5
26af59c
814a384
6d8431a
fdd45e5
70fc9f3
 
 
6d8431a
 
 
814a384
fdd45e5
 
 
70fc9f3
814a384
 
26af59c
814a384
70fc9f3
ea83121
26af59c
fdd45e5
26af59c
 
fdd45e5
23b2adf
26af59c
70fc9f3
 
 
 
26af59c
70fc9f3
 
26af59c
fdd45e5
 
 
 
 
 
 
70fc9f3
 
 
fdd45e5
70fc9f3
fdd45e5
70fc9f3
fdd45e5
96ab1a6
70fc9f3
fdd45e5
 
8d27116
70fc9f3
668274d
70fc9f3
fdd45e5
70fc9f3
 
 
26af59c
 
 
70fc9f3
26af59c
 
6d8431a
26af59c
ea83121
fdd45e5
6d8431a
fdd45e5
6d8431a
 
fdd45e5
b0b36a6
70fc9f3
6d8431a
0d83dcd
70fc9f3
 
 
21a21f1
0d83dcd
6d8431a
49d2f3f
 
6d8431a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import re
import pandas as pd
import gradio as gr

# -----------------------------
# MODEL (Fakespot 2025)
# -----------------------------
# Binary sequence classifier; downstream code reads class index 1 as the
# "AI-written" probability — TODO confirm against the model card.
MODEL_NAME = "fakespot-ai/roberta-base-ai-text-detection-v1"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Prefer GPU when available; use bfloat16 only on GPUs that support it,
# otherwise fall back to full float32.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dtype = torch.bfloat16 if (device.type == "cuda" and torch.cuda.is_bf16_supported()) else torch.float32
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, dtype=dtype).to(device).eval()

# Overall verdict cut-off: mean AI probability >= THRESHOLD => "Likely AI Written".
THRESHOLD = 0.80

# -----------------------------
# ABBREVIATION PROTECTION
# -----------------------------
# Abbreviations whose trailing dot must NOT be treated as a sentence
# terminator. Entries are listed without the final dot and matched
# case-insensitively by ABBR_REGEX below.
# NOTE(review): short entries such as "no", "st", and "co" can also match
# ordinary words at the end of a sentence — verify with representative input.
ABBR = [
    "e.g", "i.e", "mr", "mrs", "ms", "dr", "prof", "vs", "etc",
    "fig", "al", "jr", "sr", "st", "no", "vol", "pp", "mt",
    "inc", "ltd", "co", "u.s", "u.k", "a.m", "p.m"
]

# Matches any listed abbreviation followed by a literal dot (e.g. "Dr.");
# group 1 captures the abbreviation so the dot alone can be replaced.
ABBR_REGEX = re.compile(r"\b(" + "|".join(map(re.escape, ABBR)) + r")\.", re.IGNORECASE)


def _protect(text):
    """Mask dots that do not end a sentence with placeholder tokens.

    Ellipses, decimal points inside numbers, and abbreviation dots are
    replaced with unique markers so the sentence splitter will not break
    on them; `_restore` reverses the substitution.
    """
    masked = text.replace("...", "⟨ELLIPSIS⟩")
    masked = re.sub(r"(?<=\d)\.(?=\d)", "⟨DECIMAL⟩", masked)
    return ABBR_REGEX.sub(r"\1⟨ABBRDOT⟩", masked)


def _restore(text):
    return (
        text.replace("⟨ABBRDOT⟩", ".")
            .replace("⟨DECIMAL⟩", ".")
            .replace("⟨ELLIPSIS⟩", "...")
    )


# -----------------------------
# PERFECT PARAGRAPH-PRESERVING SPLITTER
# -----------------------------
def split_preserving_structure(text):
    """Split *text* into sentence and separator fragments, in order.

    Newline runs and inter-sentence whitespace are emitted as their own
    list entries, so concatenating the returned fragments reproduces the
    original layout exactly.
    """
    fragments = []

    # Capturing the \n+ separators keeps paragraph breaks in the output.
    for segment in re.split(r"(\n+)", text):
        if segment.startswith("\n"):
            fragments.append(segment)
            continue

        # re.split with two capture groups yields repeating triples of
        # (sentence body, terminator punctuation, trailing whitespace).
        pieces = re.split(r"([.?!])(\s+)", _protect(segment))
        for start in range(0, len(pieces), 3):
            body = pieces[start]
            terminator = pieces[start + 1] if start + 1 < len(pieces) else ""
            trailing = pieces[start + 2] if start + 2 < len(pieces) else ""

            candidate = body + terminator
            if candidate.strip():
                fragments.append(_restore(candidate))
            if trailing:
                fragments.append(trailing)

    return fragments


def extract_sentences_only(blocks):
    """Filter a fragment list down to the actual sentences.

    Drops newline separators, pure-whitespace entries, and anything that
    strips to empty, preserving the remaining order.
    """
    sentences = []
    for fragment in blocks:
        if not fragment.strip() or fragment.startswith("\n") or fragment.isspace():
            continue
        sentences.append(fragment)
    return sentences


# -----------------------------
# GROUPING
# -----------------------------
def group_sentences(sents, size=3):
    """Join consecutive sentences into space-separated windows of *size*.

    The final window may be shorter when len(sents) is not a multiple of
    *size*; an empty input yields an empty list.
    """
    windows = []
    for start in range(0, len(sents), size):
        windows.append(" ".join(sents[start:start + size]))
    return windows


# -----------------------------
# ANALYSIS LOGIC
# -----------------------------
def analyze(text, max_len=512):
    """Score *text* sentence-by-sentence for AI authorship.

    Sentences are scored in 3-sentence windows (each sentence then
    inherits its window's probability), and the original whitespace
    structure is rebuilt around highlighted spans.

    Args:
        text: Raw user text, possibly multi-paragraph.
        max_len: Tokenizer truncation length per window.

    Returns:
        (verdict label, overall percentage string, highlighted HTML,
        DataFrame of per-sentence probabilities) — the four Gradio outputs.
    """

    blocks = split_preserving_structure(text)
    pure_sentences = extract_sentences_only(blocks)

    # Nothing to score: still return the four values the UI expects.
    if not pure_sentences:
        return "—", "—", "<em>Paste text to analyze.</em>", None

    # Group into 3-sentence windows; collapse internal whitespace before
    # tokenization so layout does not affect the model input.
    grouped = group_sentences(pure_sentences, 3)
    clean_grouped = [re.sub(r"\s+", " ", g).strip() for g in grouped]

    # Model forward pass (all windows batched together)
    inputs = tokenizer(clean_grouped, return_tensors="pt",
                       padding=True, truncation=True,
                       max_length=max_len).to(device)

    with torch.no_grad():
        logits = model(**inputs).logits

    # Class index 1 is read as the "AI" probability — presumably per the
    # model card; verify against the model's label mapping.
    chunk_probs = F.softmax(logits, dim=-1)[:, 1].cpu().tolist()

    # expand back: every sentence in a window gets the window's probability,
    # so ai_scores lines up one-to-one with pure_sentences.
    ai_scores = []
    for idx, prob in enumerate(chunk_probs):
        start = idx * 3
        end = min(start + 3, len(pure_sentences))
        for _ in range(start, end):
            ai_scores.append(prob)

    # -----------------------------
    # RECONSTRUCTION WITH HIGHLIGHT
    # -----------------------------
    # Walk the original fragment list: separators pass through verbatim,
    # each sentence fragment consumes the next score in order.
    highlighted = ""
    sentence_index = 0

    for block in blocks:
        if block.startswith("\n"):
            highlighted += block
            continue

        if block.isspace():
            highlighted += block
            continue

        # safety: clamp to the last score if counts ever drift apart
        if sentence_index >= len(ai_scores):
            ai_p = ai_scores[-1]
        else:
            ai_p = ai_scores[sentence_index]
        sentence_index += 1

        pct = f"{ai_p * 100:.1f}%"

        # Traffic-light coloring: green < 0.30 <= amber < 0.70 <= red.
        if ai_p < 0.30:
            color = "#11823b"
        elif ai_p < 0.70:
            color = "#b8860b"
        else:
            color = "#b80d0d"

        highlighted += (
            f"<span style='background:rgba(0,0,0,0.03); padding:3px 4px; "
            f"border-radius:4px;'><strong style='color:{color}'>[{pct}]</strong> "
            f"{block.strip()}</span> "
        )

    # -----------------------------
    # OVERALL SCORE
    # -----------------------------
    # Unweighted mean of the per-sentence probabilities.
    overall = sum(ai_scores) / len(ai_scores)
    overall_pct = f"{overall * 100:.1f}%"
    overall_label = "🤖 Likely AI Written" if overall >= THRESHOLD else "🧒 Likely Human Written"

    df = pd.DataFrame(
        [[i + 1, s, ai_scores[i]] for i, s in enumerate(pure_sentences)],
        columns=["#", "Sentence", "AI_Prob"]
    )

    return overall_label, overall_pct, highlighted, df


# -----------------------------
# UI
# -----------------------------
# Gradio front-end: one textbox in, four outputs (verdict, score,
# highlighted HTML reconstruction, per-sentence table) wired to analyze().
with gr.Blocks() as demo:
    gr.Markdown("### 🕵️ AI Sentence-Level Detector — Exact Structure Highlighting")

    text_input = gr.Textbox(label="Paste text", lines=14, placeholder="Your text…")
    btn = gr.Button("Analyze")

    verdict = gr.Label(label="Verdict (Overall)")
    score = gr.Label(label="AI Score")
    highlights = gr.HTML(label="Highlighted Text (Exact Structure)")
    table = gr.Dataframe(headers=["#", "Sentence", "AI_Prob"], wrap=True)

    # Output order must match analyze()'s return tuple.
    btn.click(analyze, inputs=[text_input], outputs=[verdict, score, highlights, table])

if __name__ == "__main__":
    demo.launch()