Spaces:
Running
Running
File size: 4,600 Bytes
b0b36a6 41a5821 c40b953 b0b36a6 49d2f3f c40b953 ceeca7d 0d83dcd c40b953 0d83dcd c40b953 b0b36a6 c40b953 27d1d53 c40b953 0d83dcd c40b953 0d83dcd c40b953 b0b36a6 c40b953 b0b36a6 c40b953 b0b36a6 c40b953 b0b36a6 c40b953 b0b36a6 c40b953 b0b36a6 c40b953 41a5821 0d83dcd c40b953 0d83dcd c40b953 b0b36a6 c40b953 b0b36a6 c40b953 b0b36a6 c40b953 0d83dcd b0b36a6 c40b953 b0b36a6 c40b953 0d83dcd c40b953 49d2f3f b0b36a6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 |
import os
import re
from typing import List, Tuple
import gradio as gr
from transformers import pipeline
# -----------------------------
# Model & simple pre-processing
# -----------------------------
# Hugging Face Hub id of the text-classification model used for detection.
MODEL_ID = "fakespot-ai/roberta-base-ai-text-detection-v1"
# If you’re on CPU-only Space and want to be explicit, uncomment device=-1
# clf = pipeline("text-classification", model=MODEL_ID, device=-1)
# Shared pipeline, loaded once at import time so every request reuses the
# same weights; device selection is left to transformers' default.
clf = pipeline("text-classification", model=MODEL_ID)
def clean_text(s: str) -> str:
    """Trim surrounding whitespace and collapse internal runs into single spaces."""
    return re.sub(r"\s+", " ", s.strip())
def chunk_text(text: str, max_words: int = 300) -> List[str]:
    """Split *text* into consecutive windows of at most *max_words* words.

    Always returns at least one chunk (an empty string for empty input),
    so callers can iterate unconditionally.
    """
    words = text.split()
    if len(words) <= max_words:
        # Short input: keep everything in a single chunk.
        return [" ".join(words)]
    return [
        " ".join(words[start : start + max_words])
        for start in range(0, len(words), max_words)
    ]
# -----------------------------
# Core inference
# -----------------------------
def detect_ai(text: str) -> Tuple[str, float, str]:
    """
    Classify *text* as AI-generated or human-written.

    Returns (label, score_float, explanation):
    - label: "AI" or "Human" ("—" for empty input)
    - score_float: mean AI likelihood in [0, 1]
    - explanation: short narrative with a few heuristic cues
    """
    if not text or not text.strip():
        return "—", 0.0, "Please paste some text to analyze."
    chunks = [clean_text(c) for c in chunk_text(text, max_words=300)]
    # Batch all chunks in one call for speed and lower overhead.
    # truncation=True guards against chunks that tokenize past the model's
    # max sequence length (300 words can exceed 512 RoBERTa tokens), which
    # would otherwise raise at inference time.
    preds = clf(chunks, truncation=True)
    # Aggregate AI likelihood: a chunk labeled 'AI' contributes its score,
    # a chunk labeled 'Human' contributes (1 - score).
    ai_probs = []
    for p in preds:
        label = str(p.get("label", "")).upper()
        score = float(p.get("score", 0.0))
        ai_probs.append(score if label.startswith("AI") else 1.0 - score)
    # len(ai_probs) >= 1 here: chunk_text always returns at least one chunk.
    mean_ai = sum(ai_probs) / len(ai_probs)
    label = "AI" if mean_ai >= 0.5 else "Human"
    explanation = build_explanation(text, mean_ai, len(chunks))
    return label, float(mean_ai), explanation
def build_explanation(text: str, ai_prob: float, n_chunks: int) -> str:
    """Compose a human-readable summary: the AI score plus simple stylometric cues."""
    tokens = [t for t in re.findall(r"\w+", text) if t.strip()]
    sents = [s for s in re.split(r"[.!?]+", text) if s.strip()]
    # Mean words per sentence; 0 when no sentences were found.
    if sents:
        mean_sentence_words = sum(len(s.split()) for s in sents) / max(1, len(sents))
    else:
        mean_sentence_words = 0
    # Type-token ratio as a crude lexical-variety measure.
    distinct = {t.lower() for t in tokens}
    type_token_ratio = len(distinct) / max(1, len(tokens))

    cues = []
    # Bucket the model signal strength.
    if ai_prob >= 0.75:
        cues.append("very strong statistical signal matching AI-generated patterns")
    elif ai_prob >= 0.6:
        cues.append("moderate signal matching AI-generated patterns")
    elif ai_prob <= 0.25:
        cues.append("very low likelihood of AI, text patterns align with human writing")
    else:
        cues.append("mixed indicators, borderline case")
    # Sentence-length heuristic.
    if mean_sentence_words > 25:
        cues.append("longer-than-usual sentences")
    elif mean_sentence_words < 10:
        cues.append("very short, choppy sentences")
    # Lexical-variety heuristic.
    if type_token_ratio < 0.35:
        cues.append("lower lexical variety")
    elif type_token_ratio > 0.6:
        cues.append("high lexical variety")
    cues.append(f"analyzed in {n_chunks} chunk(s)")

    return (
        f"Overall this text is estimated to be {ai_prob:.2%} likely AI-generated. "
        f"Notable cues: " + "; ".join(cues) + ". "
        "Reminder: detectors can be wrong—use results as a hint, not proof."
    )
# -----------------------------
# Gradio UI
# -----------------------------
# Builds the Blocks app at import time; `demo` is launched from __main__ below.
with gr.Blocks(title="AI Text Detector") as demo:
    # Header / usage blurb shown above the controls.
    gr.Markdown(
        "## 🕵️ AI Text Detector (Simple)\n"
        "Paste text and get an approximate AI-likeness score.\n\n"
        "> Model: `fakespot-ai/roberta-base-ai-text-detection-v1`"
    )
    with gr.Row():
        inp = gr.Textbox(label="Input Text", lines=14, placeholder="Paste your text here...")
    with gr.Row():
        label_out = gr.Label(label="Predicted Class")
        # Read-only slider doubles as a 0-1 probability gauge.
        score_out = gr.Slider(label="AI Likelihood", minimum=0.0, maximum=1.0, step=0.001, interactive=False)
    explain = gr.Textbox(label="Explanation", lines=6)
    def _run(t: str):
        # Thin adapter between detect_ai() and the three output components.
        label, score, expl = detect_ai(t)
        # gr.Label expects a dict of {class_name: confidence} for pretty display
        return {label_out: {label: 1.0}, score_out: score, explain: expl}
    gr.Button("Analyze").click(_run, inputs=inp, outputs=[label_out, score_out, explain])
if __name__ == "__main__":
    # Hugging Face Spaces supplies PORT; fall back to Gradio's default locally.
    port = int(os.getenv("PORT", 7860))
    # NOTE(review): queue(concurrency_count=...) exists only in Gradio 3.x;
    # Gradio 4+ removed it in favor of default_concurrency_limit — confirm
    # the pinned gradio version before deploying.
    demo.queue(concurrency_count=1).launch(server_name="0.0.0.0", server_port=port)
|