Spaces:
Running
Running
File size: 4,831 Bytes
958e345 2b59ac0 ceeca7d 41a5821 49d2f3f ceeca7d 41a5821 5a39ff3 9267b26 49d2f3f 96c50c6 ceeca7d 41a5821 2b59ac0 41a5821 f2f742a 41a5821 9267b26 41a5821 2b59ac0 41a5821 2b59ac0 41a5821 5a39ff3 2b59ac0 41a5821 49d2f3f f2f742a 41a5821 96c50c6 41a5821 49d2f3f 41a5821 2b59ac0 49d2f3f 41a5821 49d2f3f 9267b26 41a5821 1feb8eb 41a5821 49d2f3f 41a5821 49d2f3f 41a5821 49d2f3f 41a5821 49d2f3f 41a5821 49d2f3f 41a5821 49d2f3f 2b59ac0 49d2f3f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 |
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import numpy as np
import pandas as pd
import re
import gradio as gr
# ----------------------------------------------
# LOAD FAST MODEL (DistilGPT2)
# ----------------------------------------------
# DistilGPT2 is the scoring LM for all perplexity/entropy measurements
# below; chosen for speed (see header) rather than detection accuracy.
MODEL_NAME = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Use the GPU when present; every encoding below is moved to this same device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# .eval() puts the model in inference mode (disables dropout) so repeated
# scoring of the same sentence is deterministic.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(device).eval()
# ----------------------------------------------
# SENTENCE SPLITTER
# ----------------------------------------------
def sentence_split(text):
    """Break *text* into sentences.

    Newlines are first normalized into sentence breaks (". "), then the
    text is split after terminal punctuation (., !, ?). Fragments are
    whitespace-trimmed and empty ones dropped.
    """
    normalized = text.replace("\n", ". ")
    fragments = re.split(r'(?<=[.!?])\s+', normalized)
    sentences = []
    for fragment in fragments:
        trimmed = fragment.strip()
        if trimmed:
            sentences.append(trimmed)
    return sentences
# ----------------------------------------------
# PERPLEXITY
# ----------------------------------------------
def perplexity(sentence):
    """Return the causal-LM perplexity of *sentence*.

    The sentence is scored against itself (labels == input ids); the
    perplexity is exp of the resulting cross-entropy loss.
    """
    encoded = tokenizer(sentence, return_tensors="pt").to(device)
    with torch.no_grad():
        result = model(**encoded, labels=encoded["input_ids"])
    return float(torch.exp(result.loss))
# ----------------------------------------------
# TOKEN-LEVEL ENTROPY
# ----------------------------------------------
def token_entropy(sentence):
    """Return (mean, std) of next-token predictive entropy for *sentence*.

    For every position i >= 1 the entropy of the model's predicted
    distribution at position i-1 (the distribution over the token that
    actually appears at position i) is computed. Entropies are computed
    in one vectorized tensor operation instead of a per-token Python
    loop over the full vocabulary.

    Sentences that encode to fewer than two tokens have no predictive
    positions; (0.0, 0.0) is returned for them (the original returned
    NaN via np.mean([]) with a RuntimeWarning).
    """
    enc = tokenizer(sentence, return_tensors="pt").to(device)
    input_ids = enc["input_ids"][0]
    if input_ids.numel() < 2:
        # A single token yields no next-token distribution to score.
        return 0.0, 0.0
    with torch.no_grad():
        # Labels are not needed: only the logits are used here.
        outputs = model(enc["input_ids"])
    # Drop the prediction made after the final token; position i-1
    # predicts token i, i.e. rows 0..len-2 are the relevant ones.
    logits = outputs.logits[0][:-1]
    probs = torch.softmax(logits, dim=-1)
    # Epsilon matches the original implementation's numerical guard.
    entropies = -(probs * torch.log(probs + 1e-10)).sum(dim=-1)
    values = entropies.detach().cpu().numpy()
    # np.std is population std (ddof=0), same as the original loop-based code.
    return float(np.mean(values)), float(np.std(values))
# ----------------------------------------------
# TURNITIN-STYLE SCORING PIPELINE
# ----------------------------------------------
def analyze_sentence(sentence):
    """Collect the per-sentence statistics used by the detector.

    Returns a dict holding the raw sentence plus its perplexity,
    token-entropy mean/std, word count, and punctuation count.
    """
    perp = perplexity(sentence)
    ent_mean, ent_std = token_entropy(sentence)
    word_count = len(sentence.split())
    punct_count = sum(sentence.count(ch) for ch in ".,;:!?")
    record = {
        "sentence": sentence,
        "perplexity": perp,
        "entropy_mean": ent_mean,
        "entropy_std": ent_std,
        "length": word_count,
        "punctuation": punct_count,
    }
    return record
# ----------------------------------------------
# MAIN TURNITIN STYLE DETECTOR
# ----------------------------------------------
def classify_text(text):
    """Score *text* for AI-likeness and build the three UI outputs.

    Returns:
        - summary string with the estimated AI percentage,
        - HTML with per-sentence red (AI-like) / green highlighting,
        - DataFrame of per-sentence statistics.

    Fixes vs. the naive pipeline: empty input no longer crashes on an
    empty DataFrame, and single-sentence input no longer yields NaN
    scores (pandas .std() uses ddof=1 and is NaN for one row).
    """
    display_cols = ["sentence", "perplexity", "entropy_mean",
                    "entropy_std", "length", "punctuation"]
    sentences = sentence_split(text)
    if not sentences:
        # Nothing analyzable: report 0% instead of raising KeyError.
        return ("βοΈ Estimated AI Probability (Turnitin-style): 0.0%",
                "", pd.DataFrame(columns=display_cols))

    df = pd.DataFrame([analyze_sentence(s) for s in sentences])

    def _std(series):
        # pandas .std() returns NaN for a single row (ddof=1); zero
        # variance is the correct degenerate value for this pipeline.
        val = series.std()
        return 0.0 if pd.isna(val) else float(val)

    # ---------- TURNITIN STYLE METRICS ----------
    perplexity_mean = df["perplexity"].mean()
    perplexity_std = _std(df["perplexity"])
    # Document-level mean of the per-sentence entropy stds.
    mean_entropy_std = df["entropy_std"].mean()
    length_std = _std(df["length"])
    punct_std = _std(df["punctuation"])

    # ---------- NORMALIZED SCORES ----------
    # Low variance = AI-like, so each score is exp(-spread) in (0, 1].
    burstiness_score = np.exp(-perplexity_std)
    entropy_smoothness = np.exp(-mean_entropy_std)
    length_uniformity = np.exp(-length_std / (df["length"].mean() + 1e-5))
    punct_uniformity = np.exp(-punct_std / (df["punctuation"].mean() + 1e-5))

    # ---------- ENSEMBLE SCORE (Turnitin-like) ----------
    ai_score = (
        0.35 * burstiness_score +
        0.25 * entropy_smoothness +
        0.20 * length_uniformity +
        0.20 * punct_uniformity
    )
    ai_percent = float(ai_score * 100)

    # ---------- PER-SENTENCE LABELS ----------
    # A sentence is flagged AI-like when it is both unusually predictable
    # (low perplexity) and unusually smooth (low entropy spread).
    highlighted = []
    for _, row in df.iterrows():
        is_ai = (row["perplexity"] < perplexity_mean * 0.75
                 and row["entropy_std"] < mean_entropy_std * 0.8)
        color = "red" if is_ai else "green"
        highlighted.append(
            f"<p style='color:{color};font-weight:bold'>{row['sentence']}</p>")
    html = "\n".join(highlighted)

    # Display readable columns
    df_display = df[display_cols]
    return (f"βοΈ Estimated AI Probability (Turnitin-style): {ai_percent:.1f}%",
            html, df_display)
# ----------------------------------------------
# GRADIO UI
# ----------------------------------------------
# Single-column Gradio layout: one textbox in, three result widgets out.
with gr.Blocks() as demo:
    gr.Markdown("## π§ Writenix β Turnitin-Style AI Detector")
    essay_input = gr.Textbox(label="Enter text", lines=10, placeholder="Paste your essay...")
    analyze_button = gr.Button("π Analyze")
    score_label = gr.Label(label="Turnitin-Style AI Likelihood")
    sentence_html = gr.HTML()
    stats_table = gr.Dataframe(
        headers=["Sentence", "Perplexity", "Entropy Mean", "Entropy Std",
                 "Length", "Punctuation"],
        wrap=True,
    )
    # Wire the button to the scoring pipeline; outputs map 1:1 to the
    # tuple returned by classify_text.
    analyze_button.click(
        classify_text,
        inputs=essay_input,
        outputs=[score_label, sentence_html, stats_table],
    )

if __name__ == "__main__":
    demo.launch()
|