# Writenix — Turnitin-style AI detector (Hugging Face Space)
import html
import re

import gradio as gr
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
# ----------------------------------------------
# LOAD FAST MODEL (DistilGPT2)
# ----------------------------------------------
# DistilGPT2 is small enough for CPU inference, which keeps the Space fast.
MODEL_NAME = "distilgpt2"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Prefer GPU when available; otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# eval() disables dropout so scoring is deterministic.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(device).eval()
| # ---------------------------------------------- | |
| # SENTENCE SPLITTER | |
| # ---------------------------------------------- | |
| def sentence_split(text): | |
| text = text.replace("\n", ". ") | |
| s = re.split(r'(?<=[.!?])\s+', text) | |
| return [x.strip() for x in s if x.strip()] | |
# ----------------------------------------------
# PERPLEXITY
# ----------------------------------------------
def perplexity(sentence):
    """Return the model's perplexity on *sentence*.

    Perplexity is exp(mean next-token negative log-likelihood) under the
    module-level causal LM; lower values mean the text is more predictable.
    """
    encoded = tokenizer(sentence, return_tensors="pt").to(device)
    with torch.no_grad():
        # Passing input_ids as labels makes the model return the LM loss.
        result = model(**encoded, labels=encoded["input_ids"])
    return float(torch.exp(result.loss))
# ----------------------------------------------
# TOKEN-LEVEL ENTROPY
# ----------------------------------------------
def token_entropy(sentence):
    """Return (mean, std) of per-position predictive entropy for *sentence*.

    For each token i >= 1, the entropy of the model's next-token
    distribution given the prefix (i.e. the softmax of the logits at
    position i-1) is computed.

    Fix: for inputs that encode to fewer than two tokens there are no
    predictions, and the original ``np.mean([])`` returned NaN (with a
    RuntimeWarning) that then poisoned the ensemble score; such inputs now
    return (0.0, 0.0).
    """
    enc = tokenizer(sentence, return_tensors="pt").to(device)
    input_ids = enc["input_ids"][0]
    with torch.no_grad():
        outputs = model(enc["input_ids"], labels=enc["input_ids"])
    logits = outputs.logits[0]
    entropies = []
    for i in range(1, len(input_ids)):
        probs = torch.softmax(logits[i - 1], dim=-1)
        # 1e-10 guards log(0) for numerically-zero probabilities.
        entropy = -torch.sum(probs * torch.log(probs + 1e-10))
        entropies.append(float(entropy))
    if not entropies:
        # Single-token sentence: no next-token prediction exists.
        return 0.0, 0.0
    return np.mean(entropies), np.std(entropies)
# ----------------------------------------------
# TURNITIN-STYLE SCORING PIPELINE
# ----------------------------------------------
def analyze_sentence(sentence):
    """Compute the per-sentence feature dict used by the detector.

    Features: model perplexity, mean/std token entropy, word count, and a
    count of common punctuation characters.
    """
    perp = perplexity(sentence)
    mean_ent, std_ent = token_entropy(sentence)
    word_count = len(sentence.split())
    punct_count = sum(sentence.count(ch) for ch in ".,;:!?")
    return {
        "sentence": sentence,
        "perplexity": perp,
        "entropy_mean": mean_ent,
        "entropy_std": std_ent,
        "length": word_count,
        "punctuation": punct_count,
    }
# ----------------------------------------------
# MAIN TURNITIN STYLE DETECTOR
# ----------------------------------------------
def classify_text(text):
    """Score *text* for AI-likeness.

    Returns a 3-tuple consumed by the Gradio UI:
      1. headline string with the ensemble AI-probability estimate,
      2. per-sentence HTML (suspect sentences in red, others green),
      3. the per-sentence statistics as a DataFrame.

    Fixes vs. the original:
      * empty/whitespace-only input no longer raises KeyError on an empty
        DataFrame — it returns a 0.0% result;
      * single-sentence input no longer yields NaN (pandas ``.std()`` of
        one row is NaN, which propagated through ``np.exp``);
      * user text is HTML-escaped before being interpolated into markup,
        closing an HTML-injection hole;
      * the unused ``entropy_mean`` aggregate was removed.
    """
    sentences = sentence_split(text)
    if not sentences:
        return "⚖️ Estimated AI Probability (Turnitin-style): 0.0%", "", pd.DataFrame()

    df = pd.DataFrame([analyze_sentence(s) for s in sentences])

    # ---------- TURNITIN STYLE METRICS ----------
    perplexity_mean = df["perplexity"].mean()
    # .std() over a single row is NaN; treat "no variance information"
    # as zero variance so the score stays finite.
    perplexity_std = float(np.nan_to_num(df["perplexity"].std()))
    entropy_std = df["entropy_std"].mean()
    length_std = float(np.nan_to_num(df["length"].std()))
    punct_std = float(np.nan_to_num(df["punctuation"].std()))

    # ---------- NORMALIZED SCORES ----------
    # Low variance = AI-like.
    burstiness_score = np.exp(-perplexity_std)
    entropy_smoothness = np.exp(-entropy_std)
    length_uniformity = np.exp(-length_std / (df["length"].mean() + 1e-5))
    punct_uniformity = np.exp(-punct_std / (df["punctuation"].mean() + 1e-5))

    # ---------- ENSEMBLE SCORE (Turnitin-like) ----------
    ai_score = (
        0.35 * burstiness_score +
        0.25 * entropy_smoothness +
        0.20 * length_uniformity +
        0.20 * punct_uniformity
    )
    ai_percent = float(ai_score * 100)

    # ---------- PER-SENTENCE LABELS ----------
    highlighted = []
    for _, row in df.iterrows():
        is_ai = (row["perplexity"] < perplexity_mean * 0.75
                 and row["entropy_std"] < entropy_std * 0.8)
        color = "red" if is_ai else "green"
        # Escape user text so it cannot inject markup/script into the page.
        safe_sentence = html.escape(row["sentence"])
        highlighted.append(
            f"<p style='color:{color};font-weight:bold'>{safe_sentence}</p>"
        )
    html_out = "\n".join(highlighted)

    # Display readable columns.
    df_display = df[["sentence", "perplexity", "entropy_mean", "entropy_std", "length", "punctuation"]]
    return f"⚖️ Estimated AI Probability (Turnitin-style): {ai_percent:.1f}%", html_out, df_display
# ----------------------------------------------
# GRADIO UI
# ----------------------------------------------
# Fix: the UI strings contained mojibake ("π§ ", "π", "βοΈ") from a bad
# encoding round-trip; restored to the intended emoji.
with gr.Blocks() as demo:
    gr.Markdown("## 🧠 Writenix — Turnitin-Style AI Detector")
    text_input = gr.Textbox(label="Enter text", lines=10, placeholder="Paste your essay...")
    classify_btn = gr.Button("🔍 Analyze")
    ai_score = gr.Label(label="Turnitin-Style AI Likelihood")
    highlighted = gr.HTML()
    table = gr.Dataframe(
        headers=["Sentence", "Perplexity", "Entropy Mean", "Entropy Std", "Length", "Punctuation"],
        wrap=True,
    )
    # One click drives all three outputs from classify_text's 3-tuple.
    classify_btn.click(classify_text, text_input, [ai_score, highlighted, table])

if __name__ == "__main__":
    demo.launch()