VictorM-Coder committed
Commit 0d83dcd · verified · 1 Parent(s): 41a5821

Update app.py

Files changed (1)
  1. app.py +207 -135
app.py CHANGED
@@ -1,150 +1,222 @@
  import torch
- from transformers import AutoTokenizer, AutoModelForCausalLM
- import numpy as np
- import pandas as pd
  import re
  import gradio as gr

- # ----------------------------------------------
- # LOAD FAST MODEL (DistilGPT2)
- # ----------------------------------------------
- MODEL_NAME = "distilgpt2"
-
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
- model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(device).eval()
-
-
- # ----------------------------------------------
- # SENTENCE SPLITTER
- # ----------------------------------------------
- def sentence_split(text):
-     text = text.replace("\n", ". ")
-     s = re.split(r'(?<=[.!?])\s+', text)
-     return [x.strip() for x in s if x.strip()]
-
-
- # ----------------------------------------------
- # PERPLEXITY
- # ----------------------------------------------
- def perplexity(sentence):
-     enc = tokenizer(sentence, return_tensors="pt").to(device)
-     with torch.no_grad():
-         out = model(**enc, labels=enc["input_ids"])
-     return float(torch.exp(out.loss))
-
-
- # ----------------------------------------------
- # TOKEN-LEVEL ENTROPY
- # ----------------------------------------------
- def token_entropy(sentence):
-     enc = tokenizer(sentence, return_tensors="pt").to(device)
-     input_ids = enc["input_ids"][0]
-
-     with torch.no_grad():
-         outputs = model(enc["input_ids"], labels=enc["input_ids"])
-         logits = outputs.logits[0]
-
-     entropies = []
-     for i in range(1, len(input_ids)):
-         probs = torch.softmax(logits[i-1], dim=-1)
-         entropy = -torch.sum(probs * torch.log(probs + 1e-10))
-         entropies.append(float(entropy))
-
-     return np.mean(entropies), np.std(entropies)
-
-
- # ----------------------------------------------
- # TURNITIN-STYLE SCORING PIPELINE
- # ----------------------------------------------
- def analyze_sentence(sentence):
-     perp = perplexity(sentence)
-     mean_ent, std_ent = token_entropy(sentence)
-     length = len(sentence.split())
-     punct = sum([sentence.count(p) for p in ".,;:!?"])
-
-     return {
-         "sentence": sentence,
-         "perplexity": perp,
-         "entropy_mean": mean_ent,
-         "entropy_std": std_ent,
-         "length": length,
-         "punctuation": punct
-     }
-
-
- # ----------------------------------------------
- # MAIN TURNITIN STYLE DETECTOR
- # ----------------------------------------------
- def classify_text(text):
-
-     sentences = sentence_split(text)
-     stats = [analyze_sentence(s) for s in sentences]
-
-     df = pd.DataFrame(stats)
-
-     # ---------- TURNITIN STYLE METRICS ----------
-     perplexity_mean = df["perplexity"].mean()
-     perplexity_std = df["perplexity"].std()
-
-     entropy_mean = df["entropy_mean"].mean()
-     entropy_std = df["entropy_std"].mean()
-
-     length_std = df["length"].std()
-     punct_std = df["punctuation"].std()
-
-     # ---------- NORMALIZED SCORES ----------
-     # Low variance = AI-like
-     burstiness_score = np.exp(-perplexity_std)
-
-     entropy_smoothness = np.exp(-entropy_std)
-
-     length_uniformity = np.exp(-length_std / (df["length"].mean() + 1e-5))
-     punct_uniformity = np.exp(-punct_std / (df["punctuation"].mean() + 1e-5))
-
-     # ---------- ENSEMBLE SCORE (Turnitin-like) ----------
-     ai_score = (
-         0.35 * burstiness_score +
-         0.25 * entropy_smoothness +
-         0.20 * length_uniformity +
-         0.20 * punct_uniformity
-     )
-
-     ai_percent = float(ai_score * 100)
-
-     # ---------- PER-SENTENCE LABELS ----------
-     highlighted = []
-     for i, row in df.iterrows():
-         is_ai = row["perplexity"] < perplexity_mean * 0.75 and row["entropy_std"] < entropy_std * 0.8
-         if is_ai:
-             highlighted.append(f"<p style='color:red;font-weight:bold'>{row['sentence']}</p>")
          else:
-             highlighted.append(f"<p style='color:green;font-weight:bold'>{row['sentence']}</p>")
-
-     html = "\n".join(highlighted)
-
-     # Display readable columns
-     df_display = df[["sentence", "perplexity", "entropy_mean", "entropy_std", "length", "punctuation"]]
-
-     return f"⚖️ Estimated AI Probability (Turnitin-style): {ai_percent:.1f}%", html, df_display

- # ----------------------------------------------
  # GRADIO UI
- # ----------------------------------------------
  with gr.Blocks() as demo:
-     gr.Markdown("## 🧠 Writenix — Turnitin-Style AI Detector")
-
-     text_input = gr.Textbox(label="Enter text", lines=10, placeholder="Paste your essay...")
-
-     classify_btn = gr.Button("🚀 Analyze")
-
-     ai_score = gr.Label(label="Turnitin-Style AI Likelihood")
-     highlighted = gr.HTML()
-     table = gr.Dataframe(headers=["Sentence", "Perplexity", "Entropy Mean", "Entropy Std", "Length", "Punctuation"], wrap=True)
-
-     classify_btn.click(classify_text, text_input, [ai_score, highlighted, table])

  if __name__ == "__main__":
      demo.launch()
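For reference, the removed pipeline's core signal is perplexity: exp of the language model's mean token cross-entropy, with low, uniform values treated as AI-like. A minimal standalone sketch of that metric (illustrative only, not part of the commit; the sample sentence is made up):

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

tok = AutoTokenizer.from_pretrained("distilgpt2")
lm = AutoModelForCausalLM.from_pretrained("distilgpt2").eval()

def sentence_perplexity(sentence: str) -> float:
    enc = tok(sentence, return_tensors="pt")
    with torch.no_grad():
        # Passing labels=input_ids makes the model return mean cross-entropy as .loss
        out = lm(**enc, labels=enc["input_ids"])
    return float(torch.exp(out.loss))

print(sentence_perplexity("The results are summarized in the table."))  # lower = more predictable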
 
  import torch
+ import torch.nn.functional as F
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
  import re
+ import math
+ import pandas as pd
  import gradio as gr

+ # -----------------------------
+ # MODEL
+ # -----------------------------
+ MODEL_NAME = "openai-community/roberta-base-openai-detector"
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ dtype = torch.bfloat16 if (device.type == "cuda" and torch.cuda.is_bf16_supported()) else torch.float32
+ model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, torch_dtype=dtype).to(device).eval()
+
+ # -----------------------------
+ # SENTENCE SPLITTER (robust, no externals)
+ # -----------------------------
+ # Common abbreviations that must not terminate a sentence.
+ _ABBR_RE = re.compile(
+     r"\b(?:e\.g|i\.e|mr|mrs|ms|dr|prof|vs|etc|fig|al|jr|sr|st|no|vol|pp|mt|inc|ltd|co|u\.s|u\.k|a\.m|p\.m)\.$",
+     re.IGNORECASE,
+ )
+ _QUOTE = r"[\"“”‘’']?"
+ # Split on ., ?, ! when followed by whitespace + a capital/quote/paren, or at the
+ # end of text, while avoiding decimals. Abbreviations are filtered inside
+ # sentence_split() below, because re's lookbehind cannot hold a variable-width
+ # pattern such as the alternation above.
+ _SENT_PAT = re.compile(rf"(?<!\d)[.?!](?=\s+{_QUOTE}[A-Z(]|$)")
+
+ def sentence_split(text: str):
+     # Normalize hard breaks to spaces (Turnitin-like continuous flow)
+     t = re.sub(r"\s*\n+\s*", " ", text.strip())
+     if not t:
+         return []
+     # Temporarily protect ellipses to avoid over-splitting
+     t = t.replace("...", "…")
+     pieces = []
+     start = 0
+     for m in _SENT_PAT.finditer(t):
+         end = m.end()
+         # Skip boundaries that land right after a common abbreviation
+         if t[end - 1] == "." and _ABBR_RE.search(t[:end]):
+             continue
+         chunk = t[start:end].strip()
+         if chunk:
+             pieces.append(chunk)
+         start = end
+     # tail
+     tail = t[start:].strip()
+     if tail:
+         pieces.append(tail)
+     # Restore ellipses
+     return [s.replace("…", "...") for s in pieces]
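A quick sanity check of the splitter (illustrative snippet, not part of the commit; the expected output follows from the pattern above):

print(sentence_split('Dr. Smith arrived at 9 a.m. sharp. He was tired... but ready! "Go," he said.'))
# ['Dr. Smith arrived at 9 a.m. sharp.', 'He was tired... but ready!', '"Go," he said.']

"Dr." and "a.m." are kept as abbreviations, the ellipsis is protected from over-splitting, and a terminator followed by a quote or capital ends the sentence.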
+
+ # -----------------------------
+ # UTILITIES
+ # -----------------------------
+ def batched(iterable, n=64):
+     # Yield (chunk, start_index) pairs of at most n items each
+     for i in range(0, len(iterable), n):
+         yield iterable[i:i + n], i
+
+ def contig_spans(labels):
+     """Return (num_spans, longest_span_len) for consecutive 'AI' labels."""
+     longest = 0
+     count = 0
+     run = 0
+     for lab in labels:
+         if lab == "AI":
+             run += 1
+             longest = max(longest, run)
          else:
+             if run > 0:
+                 count += 1
+             run = 0
+     if run > 0:
+         count += 1
+     return count, longest
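For instance, a label sequence with two separate AI runs, the longer one spanning three sentences:

print(contig_spans(["AI", "AI", "AI", "Human", "AI", "AI", "Human"]))
# (2, 3)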
+
+ def verdict_from_stats(flag_pct, longest_span, avg_ai_prob):
+     """
+     Turnitin-ish qualitative summary.
+     - Emphasize consecutive AI-like sentences (spans) and overall prevalence.
+     """
+     if flag_pct >= 85 and longest_span >= 6 and avg_ai_prob >= 0.80:
+         return "⚠️ Highly likely AI-generated (long consecutive spans and high prevalence)."
+     if flag_pct >= 60 and longest_span >= 4:
+         return "⚠️ Strong AI signals (multiple/long spans)."
+     if flag_pct >= 30 or longest_span >= 3:
+         return "△ Some AI indicators (partial/short spans)."
+     return "✓ No clear AI indication (by this detector)."
+
+ # -----------------------------
+ # CORE CLASSIFIER
+ # -----------------------------
+ def classify_sentences(text, ai_threshold=0.70, batch_size=64, max_len=512):
+     sents = sentence_split(text)
+     if not sents:
+         return [], [], 0.0, 0.0, (0, 0)
+
+     all_probs = []
+     all_labels = []
+
+     for chunk, base in batched(sents, n=batch_size):
+         inputs = tokenizer(
+             chunk,
+             return_tensors="pt",
+             padding=True,
+             truncation=True,
+             max_length=max_len
+         ).to(device)
+         with torch.no_grad():
+             logits = model(**inputs).logits
+             probs = F.softmax(logits, dim=-1)  # [:, 0] = Human, [:, 1] = AI
+
+         ai_probs = probs[:, 1].detach().cpu().tolist()
+         all_probs.extend(ai_probs)
+
+     for p in all_probs:
+         all_labels.append("AI" if p >= ai_threshold else "Human")
+
+     avg_ai_prob = float(sum(all_probs) / len(all_probs))
+     flagged_pct = 100.0 * sum(1 for l in all_labels if l == "AI") / len(all_labels)
+     spans = contig_spans(all_labels)
+
+     rows = []
+     for i, (s, p, lab) in enumerate(zip(sents, all_probs, all_labels), start=1):
+         rows.append({
+             "Sentence #": i,
+             "Sentence": s,
+             "AI Probability": round(p, 4),
+             "Label": lab
+         })
+
+     return sents, rows, avg_ai_prob, flagged_pct, spans
+
+ # -----------------------------
+ # HTML HIGHLIGHT (Turnitin-ish)
+ # -----------------------------
+ def color_for_prob(p):
+     # 0-0.3 green, 0.3-0.7 yellow, 0.7-1.0 red
+     if p < 0.30:
+         return "#11823b"
+     if p < 0.70:
+         return "#b8860b"
+     return "#b80d0d"
+
+ def build_highlight_html(rows):
+     blocks = []
+     for r in rows:
+         p = r["AI Probability"]
+         col = color_for_prob(p)
+         pct = f"{p*100:.1f}%"
+         text = re.sub(r"\s+", " ", r["Sentence"]).strip()
+         blocks.append(
+             f"<span style='background:rgba(0,0,0,0.02); "
+             f"padding:4px 6px; border-radius:6px; display:block; margin:6px 0;'>"
+             f"<strong style='color:{col}'>[{pct} {r['Label']}]</strong> {text}</span>"
+         )
+     return "\n".join(blocks)
+
+ # -----------------------------
+ # PUBLIC API FOR GRADIO
+ # -----------------------------
+ def generate_report(text, threshold):
+     if not text or not text.strip():
+         return "⚠️ Please enter some text.", None, None, None
+
+     sents, rows, avg_ai_prob, flagged_pct, (span_count, longest_span) = classify_sentences(
+         text, ai_threshold=threshold
+     )

+     verdict = verdict_from_stats(flagged_pct, longest_span, avg_ai_prob)
+     overall = (
+         f"⚖️ Turnitin-style Summary\n"
+         f"- Overall AI probability (avg per sentence): {avg_ai_prob*100:.1f}%\n"
+         f"- Sentences flagged as AI ≥ {int(threshold*100)}%: {flagged_pct:.1f}%\n"
+         f"- Consecutive AI spans: {span_count} (longest: {longest_span})\n"
+         f"- Verdict: {verdict}\n"
+         f"\nⓘ This is an approximation using an open detector; "
+         f"actual Turnitin results may differ."
+     )

+     html = build_highlight_html(rows)
+     df = pd.DataFrame(rows, columns=["Sentence #", "Sentence", "AI Probability", "Label"])
+     return overall, html, df, f"{flagged_pct:.1f}%"

+ # -----------------------------
  # GRADIO UI
+ # -----------------------------
  with gr.Blocks() as demo:
+     gr.Markdown("## 🧭 Writenix AI Detector — Turnitin-style (Sentence-Level)")
+
+     with gr.Row():
+         text_input = gr.Textbox(
+             label="Paste your content",
+             lines=16,
+             placeholder="Drop your essay/article here…"
+         )
+
+     with gr.Row():
+         threshold = gr.Slider(
+             0.50, 0.95, value=0.70, step=0.01,
+             label="AI Flag Threshold (probability ≥ threshold ⇒ AI)"
+         )
+         detect_btn = gr.Button("🔎 Analyze")
+
+     with gr.Row():
+         ai_summary = gr.Textbox(label="Report Summary", lines=8)
+         flagged_pct = gr.Label(label="% Sentences Flagged (AI)")
+
+     highlighted = gr.HTML(label="Per-Sentence Highlights")
+     table = gr.Dataframe(headers=["Sentence #", "Sentence", "AI Probability", "Label"], wrap=True)
+
+     detect_btn.click(
+         fn=generate_report,
+         inputs=[text_input, threshold],
+         outputs=[ai_summary, highlighted, table, flagged_pct]
+     )

  if __name__ == "__main__":
      demo.launch()
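A UI-free smoke test of the new pipeline (illustrative snippet, not part of the commit; it assumes this file is importable as a module named app, and importing it downloads the detector weights):

from app import generate_report

summary, html_out, df, flagged = generate_report(
    "Short human note. It rambles a little, as people do.",
    threshold=0.70,
)
print(summary)   # multi-line Turnitin-style summary
print(flagged)   # percentage of flagged sentences as a string, e.g. "0.0%"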