File size: 6,179 Bytes
6d8431a
 
 
41a5821
6d8431a
49d2f3f
ceeca7d
0d83dcd
6d8431a
0d83dcd
6d8431a
 
 
814a384
 
0d83dcd
21a21f1
41a5821
0d83dcd
96ab1a6
0d83dcd
96ab1a6
 
 
 
6d8431a
96ab1a6
21a21f1
 
96ab1a6
 
 
 
6d8431a
21a21f1
96ab1a6
 
 
 
 
21a21f1
96ab1a6
 
 
 
 
 
 
 
 
 
 
21a21f1
96ab1a6
 
 
 
 
 
21a21f1
96ab1a6
 
 
 
21a21f1
96ab1a6
 
 
21a21f1
96ab1a6
 
21a21f1
96ab1a6
21a21f1
 
96ab1a6
 
 
 
 
 
 
814a384
96ab1a6
814a384
 
96ab1a6
ea83121
 
96ab1a6
6d8431a
 
 
96ab1a6
 
 
26af59c
 
 
21a21f1
96ab1a6
26af59c
814a384
6d8431a
96ab1a6
 
 
 
6d8431a
 
 
21a21f1
814a384
96ab1a6
 
814a384
 
26af59c
814a384
96ab1a6
ea83121
26af59c
96ab1a6
26af59c
 
96ab1a6
23b2adf
26af59c
96ab1a6
 
 
 
 
 
 
26af59c
96ab1a6
 
26af59c
96ab1a6
 
 
26af59c
96ab1a6
 
 
 
 
 
 
 
26af59c
96ab1a6
 
 
 
 
26af59c
96ab1a6
 
 
 
 
 
 
 
26af59c
 
 
 
96ab1a6
26af59c
 
6d8431a
26af59c
ea83121
6d8431a
26af59c
6d8431a
 
96ab1a6
b0b36a6
96ab1a6
6d8431a
0d83dcd
96ab1a6
 
 
21a21f1
0d83dcd
6d8431a
49d2f3f
 
6d8431a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import re
import pandas as pd
import gradio as gr

# -----------------------------
# MODEL (Fakespot 2025)
# -----------------------------
# HF Hub checkpoint: binary sequence classifier (human vs AI text).
MODEL_NAME = "fakespot-ai/roberta-base-ai-text-detection-v1"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Use bf16 only on CUDA devices that support it; everything else runs fp32.
dtype = torch.bfloat16 if (device.type=="cuda" and torch.cuda.is_bf16_supported()) else torch.float32
# NOTE(review): the `dtype=` kwarg is only accepted by recent transformers
# releases (older versions spell it `torch_dtype=`) — confirm against the
# pinned transformers version. `.eval()` disables dropout for inference.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, dtype=dtype).to(device).eval()

# Overall AI-probability at or above which the verdict reads "Likely AI Written".
THRESHOLD = 0.80

# -----------------------------
# ABBREVIATION PROTECTION
# -----------------------------
# Abbreviations whose trailing period must NOT be treated as a sentence
# terminator by the splitter.
ABBR = [
    "e.g", "i.e", "mr", "mrs", "ms", "dr", "prof", "vs", "etc", "fig", "al",
    "jr", "sr", "st", "no", "vol", "pp", "mt", "inc", "ltd", "co",
    "u.s", "u.k", "a.m", "p.m"
]
# Case-insensitive alternation over the escaped abbreviations, each followed
# by a literal dot; group 1 captures the abbreviation so only the dot is
# rewritten by the substitution in _protect().
_alternatives = "|".join(re.escape(abbr) for abbr in ABBR)
ABBR_REGEX = re.compile(rf"\b({_alternatives})\.", re.IGNORECASE)

def _protect(text):
    """Mask dots that are not sentence terminators with sentinel tokens.

    Ellipses, decimal points, and abbreviation periods are replaced so the
    sentence splitter can treat every remaining '.' as a terminator;
    _restore() reverses the masking.
    """
    masked = text.replace("...", "⟨ELLIPSIS⟩")
    masked = re.sub(r"(?<=\d)\.(?=\d)", "⟨DECIMAL⟩", masked)
    return ABBR_REGEX.sub(r"\1⟨ABBRDOT⟩", masked)

def _restore(text):
    return (
        text.replace("⟨ABBRDOT⟩", ".")
            .replace("⟨DECIMAL⟩", ".")
            .replace("⟨ELLIPSIS⟩", "...")
    )

# -----------------------------
# PERFECT PARAGRAPH-PRESERVING SPLITTER
# -----------------------------
def split_preserving_structure(text):
    """
    Break *text* into an ordered list of fragments:
    - runs of newlines, kept verbatim so paragraph layout survives;
    - individual sentences (with their terminator) from each text run;
    - the whitespace that followed each sentence terminator.
    """
    fragments = []

    # The capture group makes re.split keep the newline runs as list items.
    for run in re.split(r"(\n+)", text):
        if run.startswith("\n"):
            fragments.append(run)  # exact paragraph spacing, untouched
            continue

        # Mask non-terminal dots, then split on terminator + whitespace.
        # With two groups, re.split yields [body, punct, gap, body, ...].
        pieces = re.split(r"([.?!])(\s+)", _protect(run))

        for start in range(0, len(pieces), 3):
            body = pieces[start]
            punct = pieces[start + 1] if start + 1 < len(pieces) else ""
            gap = pieces[start + 2] if start + 2 < len(pieces) else ""

            sentence = body + punct
            if sentence.strip():
                fragments.append(_restore(sentence))
            if gap:
                fragments.append(gap)

    return fragments


def extract_sentences_only(blocks):
    """Filter *blocks* down to real sentence fragments.

    Drops newline runs, pure-whitespace gaps, and empty strings; keeps
    everything else in order.
    """
    sentences = []
    for fragment in blocks:
        if fragment.startswith("\n") or fragment.isspace() or not fragment.strip():
            continue
        sentences.append(fragment)
    return sentences

# -----------------------------
# GROUPING
# -----------------------------
def group_sentences(sents, size=3):
    """Join consecutive sentences into space-separated windows of *size*.

    The final window may hold fewer than *size* sentences.
    """
    windows = []
    for start in range(0, len(sents), size):
        windows.append(" ".join(sents[start:start + size]))
    return windows

# -----------------------------
# ANALYSIS LOGIC
# -----------------------------
def analyze(text, max_len=512):
    """Run the detector over *text* and build the four Gradio outputs.

    Sentences are scored in 3-sentence windows; every sentence inherits its
    window's probability. Returns (verdict label, overall percentage string,
    highlighted HTML, per-sentence DataFrame), or placeholder strings and
    None for the table when the input contains no sentences.

    max_len caps tokenizer truncation per window (tokens, not characters).
    """

    # Structured block split
    blocks = split_preserving_structure(text)
    pure_sentences = extract_sentences_only(blocks)

    if not pure_sentences:
        return "—", "—", "<em>Paste text to analyze.</em>", None

    # Group into 3-sentence windows (Turnitin style)
    grouped = group_sentences(pure_sentences, 3)
    # Collapse internal whitespace so the model sees clean single-spaced text.
    clean_grouped = [re.sub(r"\s+", " ", g).strip() for g in grouped]

    # Run model
    inputs = tokenizer(clean_grouped, return_tensors="pt",
                       padding=True, truncation=True,
                       max_length=max_len).to(device)

    with torch.no_grad():
        logits = model(**inputs).logits
        # NOTE(review): assumes class index 1 is the "AI" label for this
        # checkpoint — confirm against model.config.id2label.
        chunk_probs = F.softmax(logits, dim=-1)[:, 1].cpu().tolist()

    # Expand group scores back to individual sentences
    # (each sentence in window idx gets that window's probability).
    ai_scores = []
    for idx, prob in enumerate(chunk_probs):
        start = idx * 3
        end = min(start + 3, len(pure_sentences))
        for _ in range(start, end):
            ai_scores.append(prob)

    # -----------------------------
    # RECONSTRUCT ORIGINAL TEXT W/ HIGHLIGHTING
    # -----------------------------
    highlighted = ""
    current_sentence = 0  # index into ai_scores, advanced per sentence block

    for block in blocks:

        # newline block → keep EXACT
        if block.startswith("\n"):
            highlighted += block
            continue

        # whitespace block → keep
        if block.isspace():
            highlighted += block
            continue

        # real sentence → highlight
        ai_p = ai_scores[current_sentence]
        current_sentence += 1

        pct = f"{ai_p * 100:.1f}%"

        # Traffic-light color: green below 0.30, amber below 0.70, else red.
        if ai_p < 0.30:
            color = "#11823b"
        elif ai_p < 0.70:
            color = "#b8860b"
        else:
            color = "#b80d0d"

        highlighted += (
            f"<span style='background:rgba(0,0,0,0.03); padding:3px 4px; "
            f"border-radius:4px;'><strong style='color:{color}'>[{pct}]</strong> "
            f"{block.strip()}</span>"
        )

        # maintain spacing after sentence
        # NOTE(review): this space is appended even when a preserved
        # whitespace block follows, so spacing can double — harmless in
        # rendered HTML where consecutive spaces collapse, but verify if
        # the output is ever used outside HTML.
        highlighted += " "

    # -----------------------------
    # OVERALL SCORE
    # -----------------------------
    # Simple mean over per-sentence scores; compared against THRESHOLD.
    overall = sum(ai_scores) / len(ai_scores)
    overall_pct = f"{overall * 100:.1f}%"
    overall_label = "🤖 Likely AI Written" if overall >= THRESHOLD else "🧒 Likely Human Written"

    # Table output
    df = pd.DataFrame(
        [[i + 1, s, ai_scores[i]] for i, s in enumerate(pure_sentences)],
        columns=["#", "Sentence", "AI_Prob"]
    )

    return overall_label, overall_pct, highlighted, df

# -----------------------------
# UI
# -----------------------------
with gr.Blocks() as demo:
    gr.Markdown("### 🕵️ AI Sentence-Level Detector — Exact Structure Highlighting")

    # Input controls.
    input_box = gr.Textbox(label="Paste text", lines=14, placeholder="Your text…")
    analyze_btn = gr.Button("Analyze")

    # Output widgets, created in render order.
    verdict_out = gr.Label(label="Verdict (Overall)")
    score_out = gr.Label(label="AI Score")
    html_out = gr.HTML(label="Highlighted Text (Exact Structure)")
    table_out = gr.Dataframe(headers=["#", "Sentence", "AI_Prob"], wrap=True)

    # Wire the button to the analysis function.
    analyze_btn.click(
        analyze,
        inputs=[input_box],
        outputs=[verdict_out, score_out, html_out, table_out],
    )

if __name__ == "__main__":
    demo.launch()