import os
import re
from typing import List, Tuple

import gradio as gr
from transformers import pipeline

# -----------------------------
# Model & simple pre-processing
# -----------------------------

MODEL_ID = "fakespot-ai/roberta-base-ai-text-detection-v1"

# If you’re on a CPU-only Space and want to be explicit, uncomment device=-1
# clf = pipeline("text-classification", model=MODEL_ID, device=-1)
clf = pipeline("text-classification", model=MODEL_ID)
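# Note (assumption): each prediction from the pipeline is expected to look like
# {"label": "AI", "score": 0.97}; the exact label strings depend on the model's
# config, so they are normalized to uppercase before aggregation below.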

def clean_text(s: str) -> str:
    s = s.strip()
    s = re.sub(r"\s+", " ", s)
    return s

def chunk_text(text: str, max_words: int = 300) -> List[str]:
    words = text.split()
    if len(words) <= max_words:
        return [" ".join(words)]
    chunks = []
    for i in range(0, len(words), max_words):
        chunks.append(" ".join(words[i : i + max_words]))
    return chunks
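
# Illustrative behavior (hypothetical input):
#   chunk_text("one two three four", max_words=2) -> ["one two", "three four"]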

# -----------------------------
# Core inference
# -----------------------------

def detect_ai(text: str) -> Tuple[str, float, str]:
    """
    Returns (label, score_float, explanation)
    - label: "AI" or "Human"
    - score_float: mean AI likelihood in [0,1]
    - explanation: short narrative with a few heuristic cues
    """
    if not text or not text.strip():
        return "—", 0.0, "Please paste some text to analyze."

    chunks = [clean_text(c) for c in chunk_text(text, max_words=300)]

    # Batch for speed and lower overhead; truncate so chunks that tokenize to
    # more than the model's 512-token limit don't raise an error
    preds = clf(chunks, truncation=True)

    # Aggregate AI likelihood: if a chunk label is 'AI', use score; if 'Human', use (1-score)
    ai_probs = []
    for p in preds:
        label = str(p.get("label", "")).upper()
        score = float(p.get("score", 0.0))
        ai_prob = score if label.startswith("AI") else (1.0 - score)
        ai_probs.append(ai_prob)

    mean_ai = sum(ai_probs) / len(ai_probs)
    label = "AI" if mean_ai >= 0.5 else "Human"

    explanation = build_explanation(text, mean_ai, len(chunks))
    return label, float(mean_ai), explanation
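
# Illustrative programmatic use (a sketch, not part of the UI wiring below):
#   label, score, expl = detect_ai("Some paragraph to score...")
#   print(label, f"{score:.2%}")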

def build_explanation(text: str, ai_prob: float, n_chunks: int) -> str:
    words = re.findall(r"\w+", text)
    sentences = re.split(r"[.!?]+", text)
    words = [w for w in words if w.strip()]
    sentences = [s for s in sentences if s.strip()]

    avg_len = (
        sum(len(s.split()) for s in sentences) / max(1, len(sentences))
        if sentences else 0
    )
    vocab = set(w.lower() for w in words)
    ttr = len(vocab) / max(1, len(words))  # type-token ratio

    cues = []
    if ai_prob >= 0.75:
        cues.append("very strong statistical signal matching AI-generated patterns")
    elif ai_prob >= 0.6:
        cues.append("moderate signal matching AI-generated patterns")
    elif ai_prob <= 0.25:
        cues.append("very low likelihood of AI, text patterns align with human writing")
    else:
        cues.append("mixed indicators, borderline case")

    if avg_len > 25:
        cues.append("longer-than-usual sentences")
    elif avg_len < 10:
        cues.append("very short, choppy sentences")

    if ttr < 0.35:
        cues.append("lower lexical variety")
    elif ttr > 0.6:
        cues.append("high lexical variety")

    cues.append(f"analyzed in {n_chunks} chunk(s)")

    return (
        f"Overall this text is estimated to be {ai_prob:.2%} likely AI-generated. "
        f"Notable cues: " + "; ".join(cues) + ". "
        "Reminder: detectors can be wrong—use results as a hint, not proof."
    )

# -----------------------------
# Gradio UI
# -----------------------------

with gr.Blocks(title="AI Text Detector") as demo:
    gr.Markdown(
        "## 🕵️ AI Text Detector (Simple)\n"
        "Paste text and get an approximate AI-likeness score.\n\n"
        "> Model: `fakespot-ai/roberta-base-ai-text-detection-v1`"
    )

    with gr.Row():
        inp = gr.Textbox(label="Input Text", lines=14, placeholder="Paste your text here...")

    with gr.Row():
        label_out = gr.Label(label="Predicted Class")
        score_out = gr.Slider(label="AI Likelihood", minimum=0.0, maximum=1.0, step=0.001, interactive=False)

    explain = gr.Textbox(label="Explanation", lines=6)

    def _run(t: str):
        label, score, expl = detect_ai(t)
        # gr.Label expects a dict of {class_name: confidence} for pretty display;
        # report both classes so the shown confidence matches the AI likelihood.
        confidences = {"AI": score, "Human": 1.0 - score} if label != "—" else {label: 1.0}
        return {label_out: confidences, score_out: score, explain: expl}

    gr.Button("Analyze").click(_run, inputs=inp, outputs=[label_out, score_out, explain])

if __name__ == "__main__":
    # On Spaces the app is served on port 7860 by default; PORT is honored if set.
    # Note: `concurrency_count` is the Gradio 3.x queue argument; on Gradio 4.x
    # use demo.queue(default_concurrency_limit=1) instead.
    demo.queue(concurrency_count=1).launch(
        server_name="0.0.0.0",
        server_port=int(os.getenv("PORT", 7860))
    )