import html
import re

import gradio as gr
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# ----------------------------------------------
# LOAD FAST MODEL (DistilGPT2)
# ----------------------------------------------
MODEL_NAME = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(device).eval()


# ----------------------------------------------
# SENTENCE SPLITTER
# ----------------------------------------------
def sentence_split(text):
    """Split *text* into sentences on ., !, ? boundaries.

    Newlines are treated as sentence breaks. Returns a list of
    non-empty, stripped sentence strings.
    """
    text = text.replace("\n", ". ")
    parts = re.split(r'(?<=[.!?])\s+', text)
    return [p.strip() for p in parts if p.strip()]


# ----------------------------------------------
# PERPLEXITY
# ----------------------------------------------
def perplexity(sentence):
    """Return the causal-LM perplexity of *sentence* under DistilGPT2.

    Computed as exp(mean cross-entropy loss) with the input used as
    its own labels.
    """
    enc = tokenizer(sentence, return_tensors="pt").to(device)
    with torch.no_grad():
        out = model(**enc, labels=enc["input_ids"])
    return float(torch.exp(out.loss))


# ----------------------------------------------
# TOKEN-LEVEL ENTROPY
# ----------------------------------------------
def token_entropy(sentence):
    """Return (mean, std) of the model's next-token predictive entropy.

    Entropy is measured at each position over the predicted
    distribution for the *next* token. Uses population std to match
    np.std semantics. Returns (0.0, 0.0) for inputs of fewer than
    two tokens, where no next-token prediction exists (the original
    code produced NaN via np.mean([]) in that case).
    """
    enc = tokenizer(sentence, return_tensors="pt").to(device)
    input_ids = enc["input_ids"][0]
    if input_ids.numel() < 2:
        return 0.0, 0.0
    with torch.no_grad():
        # labels are not needed — only the logits are used here
        logits = model(enc["input_ids"]).logits[0]
    # Positions 0..n-2 predict tokens 1..n-1; vectorized over all of them.
    probs = torch.softmax(logits[:-1], dim=-1)
    entropies = -(probs * torch.log(probs + 1e-10)).sum(dim=-1)
    return float(entropies.mean()), float(entropies.std(unbiased=False))


# ----------------------------------------------
# TURNITIN-STYLE SCORING PIPELINE
# ----------------------------------------------
def analyze_sentence(sentence):
    """Collect per-sentence statistics used by the ensemble score.

    Returns a dict with the raw sentence, its perplexity, mean/std
    token entropy, word count, and punctuation count.
    """
    perp = perplexity(sentence)
    mean_ent, std_ent = token_entropy(sentence)
    length = len(sentence.split())
    punct = sum(sentence.count(p) for p in ".,;:!?")
    return {
        "sentence": sentence,
        "perplexity": perp,
        "entropy_mean": mean_ent,
        "entropy_std": std_ent,
        "length": length,
        "punctuation": punct,
    }


# ----------------------------------------------
# MAIN TURNITIN STYLE DETECTOR
# ----------------------------------------------
def classify_text(text):
    """Score *text* for AI-likeness and return (label, html, dataframe).

    Low variance across sentences (perplexity, entropy, length,
    punctuation) is treated as AI-like; the four normalized scores are
    combined into a weighted ensemble percentage.
    """
    sentences = sentence_split(text)
    if not sentences:
        # Empty input previously produced NaN scores on an empty frame.
        return "⚖️ Estimated AI Probability (Turnitin-style): 0.0%", "", pd.DataFrame()

    stats = [analyze_sentence(s) for s in sentences]
    df = pd.DataFrame(stats)

    # ---------- TURNITIN STYLE METRICS ----------
    # ddof=0 (population std) so a single sentence yields 0, not NaN.
    perplexity_mean = df["perplexity"].mean()
    perplexity_std = df["perplexity"].std(ddof=0)
    entropy_std = df["entropy_std"].mean()
    length_std = df["length"].std(ddof=0)
    punct_std = df["punctuation"].std(ddof=0)

    # ---------- NORMALIZED SCORES ----------
    # Low variance = AI-like; exp(-x) maps [0, inf) variance to (0, 1].
    burstiness_score = np.exp(-perplexity_std)
    entropy_smoothness = np.exp(-entropy_std)
    length_uniformity = np.exp(-length_std / (df["length"].mean() + 1e-5))
    punct_uniformity = np.exp(-punct_std / (df["punctuation"].mean() + 1e-5))

    # ---------- ENSEMBLE SCORE (Turnitin-like) ----------
    ai_score = (
        0.35 * burstiness_score
        + 0.25 * entropy_smoothness
        + 0.20 * length_uniformity
        + 0.20 * punct_uniformity
    )
    ai_percent = float(ai_score * 100)

    # ---------- PER-SENTENCE LABELS ----------
    # NOTE(review): the original HTML markup here was corrupted in the
    # source (f-strings broken across lines with tags stripped); the
    # spans below reconstruct the intended highlight-AI-sentences
    # behavior. Sentences are HTML-escaped before rendering since the
    # input is untrusted user text.
    highlighted = []
    for _, row in df.iterrows():
        is_ai = (
            row["perplexity"] < perplexity_mean * 0.75
            and row["entropy_std"] < entropy_std * 0.8
        )
        safe = html.escape(row["sentence"])
        if is_ai:
            highlighted.append(
                f"<span style='background-color:#ffd6d6'>{safe}</span>"
            )
        else:
            highlighted.append(f"<span>{safe}</span>")
    html_out = "\n".join(highlighted)

    # Display readable columns
    df_display = df[
        ["sentence", "perplexity", "entropy_mean", "entropy_std", "length", "punctuation"]
    ]

    return (
        f"⚖️ Estimated AI Probability (Turnitin-style): {ai_percent:.1f}%",
        html_out,
        df_display,
    )


# ----------------------------------------------
# GRADIO UI
# ----------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("## 🧠 Writenix — Turnitin-Style AI Detector")
    text_input = gr.Textbox(
        label="Enter text", lines=10, placeholder="Paste your essay..."
    )
    classify_btn = gr.Button("🚀 Analyze")
    ai_score = gr.Label(label="Turnitin-Style AI Likelihood")
    highlighted = gr.HTML()
    table = gr.Dataframe(
        headers=[
            "Sentence",
            "Perplexity",
            "Entropy Mean",
            "Entropy Std",
            "Length",
            "Punctuation",
        ],
        wrap=True,
    )
    classify_btn.click(classify_text, text_input, [ai_score, highlighted, table])

if __name__ == "__main__":
    demo.launch()