# AIDetector / app.py
# NOTE(review): the original paste carried Hugging Face Spaces page chrome
# ("raw / history / blame", author VictorM-Coder, commit 41a5821, 4.83 kB) —
# preserved here as a comment so the module parses.
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import numpy as np
import pandas as pd
import re
import gradio as gr
# ----------------------------------------------
# LOAD FAST MODEL (DistilGPT2)
# ----------------------------------------------
# Small causal LM used purely as a scorer (perplexity / token entropy),
# never for generation. Loaded once at import time; the globals below are
# read by every function in this file.
MODEL_NAME = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Prefer GPU when available; all encodings below are moved to this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# .eval() disables dropout so repeated scoring of the same text is deterministic.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(device).eval()
# ----------------------------------------------
# SENTENCE SPLITTER
# ----------------------------------------------
def sentence_split(text):
    """Split *text* into a list of non-empty, stripped sentences.

    Newlines are treated as hard sentence boundaries; within each line,
    sentences end after terminal punctuation (. ! ?).

    Fix: the original replaced each newline with ". ", which injected a
    period even when the line already ended with one, yielding ".."
    artifacts in the returned sentences (and appended a period to lines
    that had no terminal punctuation at all).
    """
    pieces = []
    for line in text.split("\n"):
        pieces.extend(re.split(r'(?<=[.!?])\s+', line))
    return [p.strip() for p in pieces if p.strip()]
# ----------------------------------------------
# PERPLEXITY
# ----------------------------------------------
def perplexity(sentence):
    """Perplexity of *sentence* under the module-level LM.

    Computed as exp(cross-entropy loss) with the input ids used as their
    own labels; lower values mean the model finds the text more predictable.
    """
    encoded = tokenizer(sentence, return_tensors="pt").to(device)
    input_ids = encoded["input_ids"]
    with torch.no_grad():
        result = model(**encoded, labels=input_ids)
    loss = result.loss
    return float(torch.exp(loss))
# ----------------------------------------------
# TOKEN-LEVEL ENTROPY
# ----------------------------------------------
def token_entropy(sentence):
    """Mean and std of the model's next-token predictive entropy for *sentence*.

    For each position i >= 1, the Shannon entropy of the model's
    distribution over token i (taken from the logits at position i-1) is
    collected; returns (mean, population std) as plain floats, matching
    the original np.mean / np.std (ddof=0) semantics.

    Fixes vs. the original:
    - drops the unused ``labels=`` argument (the loss was never read);
    - replaces the Python loop + ``log(p + 1e-10)`` fudge with a single
      vectorized, numerically stable ``log_softmax`` pass;
    - returns (0.0, 0.0) for inputs of fewer than two tokens instead of
      the NaN produced by ``np.mean([])``, which poisoned downstream
      aggregates.
    """
    enc = tokenizer(sentence, return_tensors="pt").to(device)
    input_ids = enc["input_ids"][0]
    # A single token has no "next token" to predict; avoid mean-of-empty NaN.
    if input_ids.numel() < 2:
        return 0.0, 0.0
    with torch.no_grad():
        logits = model(enc["input_ids"]).logits[0]
    # Positions 0 .. n-2 predict tokens 1 .. n-1 (same span the old loop used).
    log_probs = torch.log_softmax(logits[:-1], dim=-1)
    entropies = -(log_probs.exp() * log_probs).sum(dim=-1)
    return float(entropies.mean()), float(entropies.std(unbiased=False))
# ----------------------------------------------
# TURNITIN-STYLE SCORING PIPELINE
# ----------------------------------------------
def analyze_sentence(sentence):
    """Collect the per-sentence statistics consumed by classify_text.

    Returns a dict with the sentence itself plus its perplexity, mean and
    std of token-level entropy, word count, and punctuation count.
    """
    perp = perplexity(sentence)
    mean_entropy, std_entropy = token_entropy(sentence)
    stats = {
        "sentence": sentence,
        "perplexity": perp,
        "entropy_mean": mean_entropy,
        "entropy_std": std_entropy,
        "length": len(sentence.split()),
        "punctuation": sum(sentence.count(ch) for ch in ".,;:!?"),
    }
    return stats
# ----------------------------------------------
# MAIN TURNITIN STYLE DETECTOR
# ----------------------------------------------
def classify_text(text):
    """Score *text* for AI-likeness and return (score string, html, dataframe).

    The heuristic: low variance across sentences (in perplexity, entropy,
    length, punctuation) reads as AI-like. Each variance signal is squashed
    with exp(-x) into (0, 1] and combined into a weighted ensemble score.

    Fixes vs. the original:
    - empty / whitespace-only input no longer raises KeyError on an empty
      DataFrame — returns a 0.0% result with an empty table;
    - single-sentence input no longer yields NaN (pandas ``Series.std()``
      with one row is NaN under ddof=1); NaN aggregates are coerced to 0;
    - removed the unused ``entropy_mean`` local and the unused ``iterrows``
      index.
    """
    sentences = sentence_split(text)
    display_cols = ["sentence", "perplexity", "entropy_mean", "entropy_std", "length", "punctuation"]
    if not sentences:
        empty_df = pd.DataFrame(columns=display_cols)
        return "βš–οΈ Estimated AI Probability (Turnitin-style): 0.0%", "", empty_df
    df = pd.DataFrame([analyze_sentence(s) for s in sentences])
    # ---------- TURNITIN STYLE METRICS ----------
    perplexity_mean = df["perplexity"].mean()
    # std() is NaN for a single row; treat that as zero variance. Entropy
    # columns can also carry NaN for degenerate (single-token) sentences.
    perplexity_std = float(np.nan_to_num(df["perplexity"].std()))
    entropy_std = float(np.nan_to_num(df["entropy_std"].mean()))
    length_std = float(np.nan_to_num(df["length"].std()))
    punct_std = float(np.nan_to_num(df["punctuation"].std()))
    # ---------- NORMALIZED SCORES ----------
    # Low variance = AI-like; exp(-x) maps variance 0 -> score 1.
    burstiness_score = np.exp(-perplexity_std)
    entropy_smoothness = np.exp(-entropy_std)
    length_uniformity = np.exp(-length_std / (df["length"].mean() + 1e-5))
    punct_uniformity = np.exp(-punct_std / (df["punctuation"].mean() + 1e-5))
    # ---------- ENSEMBLE SCORE (Turnitin-like) ----------
    ai_score = (
        0.35 * burstiness_score +
        0.25 * entropy_smoothness +
        0.20 * length_uniformity +
        0.20 * punct_uniformity
    )
    ai_percent = float(ai_score * 100)
    # ---------- PER-SENTENCE LABELS ----------
    # A sentence is flagged AI-like when it is both unusually predictable
    # (low perplexity vs. the document mean) and unusually smooth (low
    # entropy spread vs. the document mean).
    highlighted = []
    for _, row in df.iterrows():
        is_ai = (row["perplexity"] < perplexity_mean * 0.75
                 and row["entropy_std"] < entropy_std * 0.8)
        color = "red" if is_ai else "green"
        highlighted.append(f"<p style='color:{color};font-weight:bold'>{row['sentence']}</p>")
    html = "\n".join(highlighted)
    # Display readable columns
    df_display = df[display_cols]
    return f"βš–οΈ Estimated AI Probability (Turnitin-style): {ai_percent:.1f}%", html, df_display
# ----------------------------------------------
# GRADIO UI
# ----------------------------------------------
# Declarative layout: components appear in the page in creation order.
with gr.Blocks() as demo:
    gr.Markdown("## 🧠 Writenix β€” Turnitin-Style AI Detector")
    # Input: free-form text to analyze.
    text_input = gr.Textbox(label="Enter text", lines=10, placeholder="Paste your essay...")
    classify_btn = gr.Button("πŸš€ Analyze")
    # Outputs, matching classify_text's 3-tuple: score string, per-sentence
    # HTML highlights, and the raw per-sentence statistics table.
    ai_score = gr.Label(label="Turnitin-Style AI Likelihood")
    highlighted = gr.HTML()
    table = gr.Dataframe(headers=["Sentence", "Perplexity", "Entropy Mean", "Entropy Std", "Length", "Punctuation"], wrap=True)
    # Wire the button click to the detector.
    classify_btn.click(classify_text, text_input, [ai_score, highlighted, table])

# Launch only when run as a script (Spaces also executes this entry point).
if __name__ == "__main__":
    demo.launch()