import os
import re
from typing import List, Tuple

import gradio as gr
from transformers import pipeline

# -----------------------------
# Model & simple pre-processing
# -----------------------------

MODEL_ID = "fakespot-ai/roberta-base-ai-text-detection-v1"

# If you’re on a CPU-only Space and want to be explicit, uncomment device=-1
# clf = pipeline("text-classification", model=MODEL_ID, device=-1)
clf = pipeline("text-classification", model=MODEL_ID)
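# Note (assumption): each prediction from the pipeline is expected to look like
# {"label": "AI", "score": 0.97}; the exact label strings depend on the model's
# config, so they are normalized to uppercase before aggregation below.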

def clean_text(s: str) -> str:
    s = s.strip()
    s = re.sub(r"\s+", " ", s)
    return s

def chunk_text(text: str, max_words: int = 300) -> List[str]:
    words = text.split()
    if len(words) <= max_words:
        return [" ".join(words)]
    chunks = []
    for i in range(0, len(words), max_words):
        chunks.append(" ".join(words[i : i + max_words]))
    return chunks
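
# Illustrative behavior (hypothetical input):
#   chunk_text("one two three four", max_words=2) -> ["one two", "three four"]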

# -----------------------------
# Core inference
# -----------------------------

def detect_ai(text: str) -> Tuple[str, float, str]:
    """
    Returns (label, score_float, explanation)
    - label: "AI" or "Human"
    - score_float: mean AI likelihood in [0,1]
    - explanation: short narrative with a few heuristic cues
    """
    if not text or not text.strip():
        return "—", 0.0, "Please paste some text to analyze."

    chunks = [clean_text(c) for c in chunk_text(text, max_words=300)]

    # Batch for speed and lower overhead; truncate so chunks that tokenize to
    # more than the model's 512-token limit don't raise an error
    preds = clf(chunks, truncation=True)

    # Aggregate AI likelihood: if a chunk label is 'AI', use score; if 'Human', use (1-score)
    ai_probs = []
    for p in preds:
        label = str(p.get("label", "")).upper()
        score = float(p.get("score", 0.0))
        ai_prob = score if label.startswith("AI") else (1.0 - score)
        ai_probs.append(ai_prob)

    mean_ai = sum(ai_probs) / len(ai_probs)
    label = "AI" if mean_ai >= 0.5 else "Human"

    explanation = build_explanation(text, mean_ai, len(chunks))
    return label, float(mean_ai), explanation
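
# Illustrative programmatic use (a sketch, not part of the UI wiring below):
#   label, score, expl = detect_ai("Some paragraph to score...")
#   print(label, f"{score:.2%}")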

def build_explanation(text: str, ai_prob: float, n_chunks: int) -> str:
    words = re.findall(r"\w+", text)
    sentences = re.split(r"[.!?]+", text)
    words = [w for w in words if w.strip()]
    sentences = [s for s in sentences if s.strip()]

    avg_len = (
        sum(len(s.split()) for s in sentences) / max(1, len(sentences))
        if sentences else 0
    )
    vocab = set(w.lower() for w in words)
    ttr = len(vocab) / max(1, len(words))  # type-token ratio

    cues = []
    if ai_prob >= 0.75:
        cues.append("very strong statistical signal matching AI-generated patterns")
    elif ai_prob >= 0.6:
        cues.append("moderate signal matching AI-generated patterns")
    elif ai_prob <= 0.25:
        cues.append("very low likelihood of AI, text patterns align with human writing")
    else:
        cues.append("mixed indicators, borderline case")

    if avg_len > 25:
        cues.append("longer-than-usual sentences")
    elif avg_len < 10:
        cues.append("very short, choppy sentences")

    if ttr < 0.35:
        cues.append("lower lexical variety")
    elif ttr > 0.6:
        cues.append("high lexical variety")

    cues.append(f"analyzed in {n_chunks} chunk(s)")

    return (
        f"Overall this text is estimated to be {ai_prob:.2%} likely AI-generated. "
        f"Notable cues: " + "; ".join(cues) + ". "
        "Reminder: detectors can be wrong—use results as a hint, not proof."
    )

# -----------------------------
# Gradio UI
# -----------------------------

with gr.Blocks(title="AI Text Detector") as demo:
    gr.Markdown(
        "## 🕵️ AI Text Detector (Simple)\n"
        "Paste text and get an approximate AI-likeness score.\n\n"
        "> Model: `fakespot-ai/roberta-base-ai-text-detection-v1`"
    )

    with gr.Row():
        inp = gr.Textbox(label="Input Text", lines=14, placeholder="Paste your text here...")

    with gr.Row():
        label_out = gr.Label(label="Predicted Class")
        score_out = gr.Slider(label="AI Likelihood", minimum=0.0, maximum=1.0, step=0.001, interactive=False)

    explain = gr.Textbox(label="Explanation", lines=6)

    def _run(t: str):
        label, score, expl = detect_ai(t)
        # gr.Label expects a dict of {class_name: confidence} for pretty display;
        # report both classes so the shown confidence matches the AI likelihood.
        confidences = {"AI": score, "Human": 1.0 - score} if label != "—" else {label: 1.0}
        return {label_out: confidences, score_out: score, explain: expl}

    gr.Button("Analyze").click(_run, inputs=inp, outputs=[label_out, score_out, explain])

if __name__ == "__main__":
    # On Spaces the app is served on port 7860 by default; PORT is honored if set.
    # Note: `concurrency_count` is the Gradio 3.x queue argument; on Gradio 4.x
    # use demo.queue(default_concurrency_limit=1) instead.
    demo.queue(concurrency_count=1).launch(
        server_name="0.0.0.0",
        server_port=int(os.getenv("PORT", 7860))
    )