# Writenix — Turnitin-style AI detector (Hugging Face Space)
import html
import re

import gradio as gr
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
# ----------------------------------------------
# LOAD FAST MODEL (DistilGPT2)
# ----------------------------------------------
# DistilGPT2 is small enough for CPU inference, which keeps the Space fast.
MODEL_NAME = "distilgpt2"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Prefer GPU when available; otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# eval() disables dropout so scoring is deterministic.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(device).eval()
| # ---------------------------------------------- | |
| # SENTENCE SPLITTER | |
| # ---------------------------------------------- | |
| def sentence_split(text): | |
| text = text.replace("\n", ". ") | |
| s = re.split(r'(?<=[.!?])\s+', text) | |
| return [x.strip() for x in s if x.strip()] | |
# ----------------------------------------------
# PERPLEXITY
# ----------------------------------------------
def perplexity(sentence):
    """Return the model's perplexity on *sentence*.

    Perplexity is exp(mean next-token negative log-likelihood) under the
    module-level causal LM; lower values mean the text is more predictable.
    """
    encoded = tokenizer(sentence, return_tensors="pt").to(device)
    with torch.no_grad():
        # Passing input_ids as labels makes the model return the LM loss.
        result = model(**encoded, labels=encoded["input_ids"])
    return float(torch.exp(result.loss))
# ----------------------------------------------
# TOKEN-LEVEL ENTROPY
# ----------------------------------------------
def token_entropy(sentence):
    """Return (mean, std) of per-position predictive entropy for *sentence*.

    For each token i >= 1, the entropy of the model's next-token
    distribution given the prefix (i.e. the softmax of the logits at
    position i-1) is computed.

    Fix: for inputs that encode to fewer than two tokens there are no
    predictions, and the original ``np.mean([])`` returned NaN (with a
    RuntimeWarning) that then poisoned the ensemble score; such inputs now
    return (0.0, 0.0).
    """
    enc = tokenizer(sentence, return_tensors="pt").to(device)
    input_ids = enc["input_ids"][0]
    with torch.no_grad():
        outputs = model(enc["input_ids"], labels=enc["input_ids"])
    logits = outputs.logits[0]
    entropies = []
    for i in range(1, len(input_ids)):
        probs = torch.softmax(logits[i - 1], dim=-1)
        # 1e-10 guards log(0) for numerically-zero probabilities.
        entropy = -torch.sum(probs * torch.log(probs + 1e-10))
        entropies.append(float(entropy))
    if not entropies:
        # Single-token sentence: no next-token prediction exists.
        return 0.0, 0.0
    return np.mean(entropies), np.std(entropies)
# ----------------------------------------------
# TURNITIN-STYLE SCORING PIPELINE
# ----------------------------------------------
def analyze_sentence(sentence):
    """Compute the per-sentence feature dict used by the detector.

    Features: model perplexity, mean/std token entropy, word count, and a
    count of common punctuation characters.
    """
    perp = perplexity(sentence)
    mean_ent, std_ent = token_entropy(sentence)
    word_count = len(sentence.split())
    punct_count = sum(sentence.count(ch) for ch in ".,;:!?")
    return {
        "sentence": sentence,
        "perplexity": perp,
        "entropy_mean": mean_ent,
        "entropy_std": std_ent,
        "length": word_count,
        "punctuation": punct_count,
    }
# ----------------------------------------------
# MAIN TURNITIN STYLE DETECTOR
# ----------------------------------------------
def classify_text(text):
    """Score *text* for AI-likeness.

    Returns a 3-tuple consumed by the Gradio UI:
      1. headline string with the ensemble AI-probability estimate,
      2. per-sentence HTML (suspect sentences in red, others green),
      3. the per-sentence statistics as a DataFrame.

    Fixes vs. the original:
      * empty/whitespace-only input no longer raises KeyError on an empty
        DataFrame — it returns a 0.0% result;
      * single-sentence input no longer yields NaN (pandas ``.std()`` of
        one row is NaN, which propagated through ``np.exp``);
      * user text is HTML-escaped before being interpolated into markup,
        closing an HTML-injection hole;
      * the unused ``entropy_mean`` aggregate was removed.
    """
    sentences = sentence_split(text)
    if not sentences:
        return "⚖️ Estimated AI Probability (Turnitin-style): 0.0%", "", pd.DataFrame()

    df = pd.DataFrame([analyze_sentence(s) for s in sentences])

    # ---------- TURNITIN STYLE METRICS ----------
    perplexity_mean = df["perplexity"].mean()
    # .std() over a single row is NaN; treat "no variance information"
    # as zero variance so the score stays finite.
    perplexity_std = float(np.nan_to_num(df["perplexity"].std()))
    entropy_std = df["entropy_std"].mean()
    length_std = float(np.nan_to_num(df["length"].std()))
    punct_std = float(np.nan_to_num(df["punctuation"].std()))

    # ---------- NORMALIZED SCORES ----------
    # Low variance = AI-like.
    burstiness_score = np.exp(-perplexity_std)
    entropy_smoothness = np.exp(-entropy_std)
    length_uniformity = np.exp(-length_std / (df["length"].mean() + 1e-5))
    punct_uniformity = np.exp(-punct_std / (df["punctuation"].mean() + 1e-5))

    # ---------- ENSEMBLE SCORE (Turnitin-like) ----------
    ai_score = (
        0.35 * burstiness_score +
        0.25 * entropy_smoothness +
        0.20 * length_uniformity +
        0.20 * punct_uniformity
    )
    ai_percent = float(ai_score * 100)

    # ---------- PER-SENTENCE LABELS ----------
    highlighted = []
    for _, row in df.iterrows():
        is_ai = (row["perplexity"] < perplexity_mean * 0.75
                 and row["entropy_std"] < entropy_std * 0.8)
        color = "red" if is_ai else "green"
        # Escape user text so it cannot inject markup/script into the page.
        safe_sentence = html.escape(row["sentence"])
        highlighted.append(
            f"<p style='color:{color};font-weight:bold'>{safe_sentence}</p>"
        )
    html_out = "\n".join(highlighted)

    # Display readable columns.
    df_display = df[["sentence", "perplexity", "entropy_mean", "entropy_std", "length", "punctuation"]]
    return f"⚖️ Estimated AI Probability (Turnitin-style): {ai_percent:.1f}%", html_out, df_display
# ----------------------------------------------
# GRADIO UI
# ----------------------------------------------
# Fix: the UI strings contained mojibake ("π§ ", "π", "βοΈ") from a bad
# encoding round-trip; restored to the intended emoji.
with gr.Blocks() as demo:
    gr.Markdown("## 🧠 Writenix — Turnitin-Style AI Detector")
    text_input = gr.Textbox(label="Enter text", lines=10, placeholder="Paste your essay...")
    classify_btn = gr.Button("🔍 Analyze")
    ai_score = gr.Label(label="Turnitin-Style AI Likelihood")
    highlighted = gr.HTML()
    table = gr.Dataframe(
        headers=["Sentence", "Perplexity", "Entropy Mean", "Entropy Std", "Length", "Punctuation"],
        wrap=True,
    )
    # One click drives all three outputs from classify_text's 3-tuple.
    classify_btn.click(classify_text, text_input, [ai_score, highlighted, table])

if __name__ == "__main__":
    demo.launch()