File size: 4,831 Bytes
958e345
2b59ac0
 
ceeca7d
41a5821
49d2f3f
ceeca7d
41a5821
 
 
5a39ff3
9267b26
49d2f3f
 
96c50c6
ceeca7d
 
41a5821
2b59ac0
41a5821
f2f742a
 
41a5821
 
9267b26
 
41a5821
 
 
2b59ac0
41a5821
2b59ac0
41a5821
5a39ff3
2b59ac0
 
41a5821
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49d2f3f
 
f2f742a
41a5821
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96c50c6
41a5821
 
 
 
49d2f3f
41a5821
 
2b59ac0
49d2f3f
41a5821
 
 
 
49d2f3f
9267b26
41a5821
 
1feb8eb
41a5821
49d2f3f
41a5821
49d2f3f
41a5821
49d2f3f
41a5821
49d2f3f
41a5821
49d2f3f
41a5821
49d2f3f
2b59ac0
49d2f3f
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import numpy as np
import pandas as pd
import re
import gradio as gr

# ----------------------------------------------
# LOAD FAST MODEL (DistilGPT2)
# ----------------------------------------------
# A small causal LM is used because every sentence below costs one full
# forward pass; DistilGPT2 keeps per-request latency manageable on CPU.
MODEL_NAME = "distilgpt2"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Prefer GPU when present; .eval() disables dropout so scoring is deterministic.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(device).eval()


# ----------------------------------------------
# SENTENCE SPLITTER
# ----------------------------------------------
def sentence_split(text):
    """Split raw text into a list of non-empty, stripped sentences.

    Newlines are first converted into sentence breaks, then the text is
    split after each terminal punctuation mark (., !, ?).
    """
    flattened = text.replace("\n", ". ")
    candidates = re.split(r'(?<=[.!?])\s+', flattened)
    sentences = []
    for candidate in candidates:
        trimmed = candidate.strip()
        if trimmed:
            sentences.append(trimmed)
    return sentences


# ----------------------------------------------
# PERPLEXITY
# ----------------------------------------------
def perplexity(sentence):
    """Return the model's perplexity for *sentence*.

    Perplexity is exp(mean cross-entropy loss) of the global causal LM
    scoring the sentence against itself (input ids double as labels).
    """
    encoded = tokenizer(sentence, return_tensors="pt").to(device)
    labels = encoded["input_ids"]
    with torch.no_grad():
        result = model(**encoded, labels=labels)
    return float(torch.exp(result.loss))


# ----------------------------------------------
# TOKEN-LEVEL ENTROPY
# ----------------------------------------------
def token_entropy(sentence):
    """Return (mean, std) of the next-token predictive entropy over *sentence*.

    For each position i >= 1 the Shannon entropy of the model's predicted
    distribution over token i (conditioned on tokens < i) is computed.

    Fixes over the previous version:
    - A sentence that encodes to fewer than two tokens has no next-token
      prediction; previously np.mean([]) produced NaN with a
      RuntimeWarning. Such inputs now return (0.0, 0.0).
    - `labels=` is no longer passed to the model: only the logits are
      used here, so computing the loss was wasted work.
    - Entropies are computed in one vectorized pass instead of a Python
      loop over per-position softmaxes.
    """
    enc = tokenizer(sentence, return_tensors="pt").to(device)
    input_ids = enc["input_ids"][0]

    # No next-token distribution exists for a 0- or 1-token encoding.
    if len(input_ids) < 2:
        return 0.0, 0.0

    with torch.no_grad():
        logits = model(enc["input_ids"]).logits[0]

    # logits[:-1] are the distributions predicting tokens 1..n-1,
    # matching the original loop over logits[i-1] for i in 1..n-1.
    probs = torch.softmax(logits[:-1], dim=-1)
    entropies = -(probs * torch.log(probs + 1e-10)).sum(dim=-1)

    # numpy std defaults to ddof=0, matching the original np.std call.
    ent = entropies.detach().cpu().numpy()
    return float(ent.mean()), float(ent.std())


# ----------------------------------------------
# TURNITIN-STYLE SCORING PIPELINE
# ----------------------------------------------
def analyze_sentence(sentence):
    """Collect the per-sentence statistics consumed by the detector.

    Returns a dict with the sentence text, its model perplexity, the
    mean/std of token-level entropy, the word count, and the count of
    punctuation characters (.,;:!?).
    """
    mean_ent, std_ent = token_entropy(sentence)
    word_count = len(sentence.split())
    punct_count = sum(sentence.count(ch) for ch in ".,;:!?")

    return {
        "sentence": sentence,
        "perplexity": perplexity(sentence),
        "entropy_mean": mean_ent,
        "entropy_std": std_ent,
        "length": word_count,
        "punctuation": punct_count,
    }


# ----------------------------------------------
# MAIN TURNITIN STYLE DETECTOR
# ----------------------------------------------
def classify_text(text):
    """Score *text* for AI-likeness and build the UI outputs.

    Returns a (score string, highlighted HTML, per-sentence DataFrame)
    triple for Gradio.

    Fixes over the previous version:
    - Empty/whitespace-only input no longer crashes on the empty
      DataFrame's missing columns; it returns a 0.0% score with empty
      outputs.
    - Single-sentence input: pandas .std() (ddof=1) is NaN for one row,
      which previously poisoned the final score into "nan%". NaN stds
      are now treated as zero variance.
    - Removed the unused `entropy_mean` aggregate.
    """
    sentences = sentence_split(text)

    # Guard: nothing to score.
    if not sentences:
        empty = pd.DataFrame(columns=[
            "sentence", "perplexity", "entropy_mean",
            "entropy_std", "length", "punctuation",
        ])
        return "βš–οΈ Estimated AI Probability (Turnitin-style): 0.0%", "", empty

    df = pd.DataFrame([analyze_sentence(s) for s in sentences])

    def _safe_std(series):
        # One row -> NaN under ddof=1; report zero variance instead.
        value = series.std()
        return 0.0 if pd.isna(value) else float(value)

    # ---------- TURNITIN STYLE METRICS ----------
    perplexity_mean = df["perplexity"].mean()
    perplexity_std = _safe_std(df["perplexity"])

    entropy_std = df["entropy_std"].mean()

    length_std = _safe_std(df["length"])
    punct_std = _safe_std(df["punctuation"])

    # ---------- NORMALIZED SCORES ----------
    # Low variance across sentences reads as AI-like, so each score is
    # exp(-spread): 1.0 at zero spread, decaying toward 0 as spread grows.
    burstiness_score = np.exp(-perplexity_std)
    entropy_smoothness = np.exp(-entropy_std)
    length_uniformity = np.exp(-length_std / (df["length"].mean() + 1e-5))
    punct_uniformity = np.exp(-punct_std / (df["punctuation"].mean() + 1e-5))

    # ---------- ENSEMBLE SCORE (Turnitin-like) ----------
    ai_score = (
        0.35 * burstiness_score +
        0.25 * entropy_smoothness +
        0.20 * length_uniformity +
        0.20 * punct_uniformity
    )
    ai_percent = float(ai_score * 100)

    # ---------- PER-SENTENCE LABELS ----------
    highlighted = []
    for _, row in df.iterrows():
        # A sentence is flagged when both its perplexity and its entropy
        # spread sit well below the document averages.
        is_ai = (
            row["perplexity"] < perplexity_mean * 0.75
            and row["entropy_std"] < entropy_std * 0.8
        )
        color = "red" if is_ai else "green"
        highlighted.append(
            f"<p style='color:{color};font-weight:bold'>{row['sentence']}</p>"
        )

    html = "\n".join(highlighted)

    # Display readable columns
    df_display = df[["sentence", "perplexity", "entropy_mean",
                     "entropy_std", "length", "punctuation"]]

    return f"βš–οΈ Estimated AI Probability (Turnitin-style): {ai_percent:.1f}%", html, df_display



# ----------------------------------------------
# GRADIO UI
# ----------------------------------------------
# Build the Gradio interface: one text box in, three outputs out
# (overall score label, per-sentence colored HTML, stats table).
with gr.Blocks() as demo:
    gr.Markdown("## 🧠 Writenix β€” Turnitin-Style AI Detector")

    # Input area for the text to analyze.
    text_input = gr.Textbox(label="Enter text", lines=10, placeholder="Paste your essay...")

    classify_btn = gr.Button("πŸš€ Analyze")

    # Output widgets, in the same order classify_text returns its values.
    ai_score = gr.Label(label="Turnitin-Style AI Likelihood")
    highlighted = gr.HTML()
    table = gr.Dataframe(headers=["Sentence", "Perplexity", "Entropy Mean", "Entropy Std", "Length", "Punctuation"], wrap=True)

    # Wire the button: classify_text(text) -> (score str, html, dataframe).
    classify_btn.click(classify_text, text_input, [ai_score, highlighted, table])

if __name__ == "__main__":
    demo.launch()