VictorM-Coder committed on
Commit
6cabfbd
·
verified ·
1 Parent(s): 72d2f9a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -162
app.py CHANGED
@@ -2,7 +2,6 @@ import torch
2
  import torch.nn.functional as F
3
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
4
  import re
5
- import math
6
  import pandas as pd
7
  import gradio as gr
8
 
@@ -16,7 +15,7 @@ dtype = torch.bfloat16 if (device.type=="cuda" and torch.cuda.is_bf16_supported(
16
  model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, dtype=dtype).to(device).eval()
17
 
18
  # -----------------------------
19
- # SENTENCE SPLITTER (no lookbehinds)
20
  # Protect → split → restore
21
  # -----------------------------
22
  ABBR = [
@@ -30,19 +29,10 @@ def _protect(text: str) -> str:
30
  t = text.strip()
31
  if not t:
32
  return ""
33
-
34
- # Normalize newlines to spaces (Turnitin-like continuous flow)
35
- t = re.sub(r"\s*\n+\s*", " ", t)
36
-
37
- # Protect ellipses
38
- t = t.replace("...", "⟨ELLIPSIS⟩")
39
-
40
- # Protect decimals like 3.14
41
- t = re.sub(r"(?<=\d)\.(?=\d)", "⟨DECIMAL⟩", t)
42
-
43
- # Protect known abbreviations' final dot
44
- t = ABBR_REGEX.sub(r"\1⟨ABBRDOT⟩", t)
45
-
46
  return t
47
 
48
  def _restore(text: str) -> str:
@@ -55,179 +45,72 @@ def sentence_split(text: str):
55
  t = _protect(text)
56
  if not t:
57
  return []
58
-
59
- # Split on ., ?, ! followed by whitespace and then a plausible sentence starter
60
- # (quote or capital or opening paren) OR end of string.
61
  parts = re.split(r"([.?!])\s+(?=(?:[\"“”‘’']?\s*[A-Z(])|$)", t)
62
-
63
- # Rebuild sentences: regex split keeps the delimiter in alternating groups
64
- sentences = []
65
- buf = ""
66
  for i, chunk in enumerate(parts):
67
  if i % 2 == 0:
68
  buf += chunk
69
  else:
70
- # chunk is the delimiter [.?!]
71
  buf += chunk
72
- sentences.append(buf.strip())
73
- buf = ""
74
  if buf.strip():
75
  sentences.append(buf.strip())
76
-
77
- # Clean/restore
78
- sentences = [_restore(s).strip() for s in sentences if s.strip()]
79
- return sentences
80
 
81
  # -----------------------------
82
- # UTILITIES
83
  # -----------------------------
84
- def batched(iterable, n=64):
85
- for i in range(0, len(iterable), n):
86
- yield iterable[i:i+n], i
87
-
88
- def contig_spans(labels):
89
- longest = 0
90
- count = 0
91
- run = 0
92
- for lab in labels:
93
- if lab == "AI":
94
- run += 1
95
- longest = max(longest, run)
96
- else:
97
- if run > 0:
98
- count += 1
99
- run = 0
100
- if run > 0:
101
- count += 1
102
- return count, longest
103
-
104
- def verdict_from_stats(flag_pct, longest_span, avg_ai_prob):
105
- if flag_pct >= 85 and longest_span >= 6 and avg_ai_prob >= 0.80:
106
- return "⚠️ Highly likely AI-generated (long consecutive spans and high prevalence)."
107
- if flag_pct >= 60 and longest_span >= 4:
108
- return "⚠️ Strong AI signals (multiple/long spans)."
109
- if flag_pct >= 30 or longest_span >= 3:
110
- return "△ Some AI indicators (partial/short spans)."
111
- return "✓ No clear AI indication (by this detector)."
112
-
113
- # -----------------------------
114
- # CORE CLASSIFIER
115
- # -----------------------------
116
- def classify_sentences(text, ai_threshold=0.70, batch_size=64, max_len=512):
117
  sents = sentence_split(text)
118
  if not sents:
119
- return [], [], 0.0, 0.0, (0, 0)
120
-
121
- all_probs = []
122
- for chunk, _ in batched(sents, n=batch_size):
123
- inputs = tokenizer(
124
- chunk,
125
- return_tensors="pt",
126
- padding=True,
127
- truncation=True,
128
- max_length=max_len
129
- ).to(device)
130
- with torch.no_grad():
131
- logits = model(**inputs).logits
132
- probs = F.softmax(logits, dim=-1) # [:,0]=Human, [:,1]=AI
133
- all_probs.extend(probs[:, 1].detach().cpu().tolist())
134
 
135
- labels = ["AI" if p >= ai_threshold else "Human" for p in all_probs]
 
 
136
 
137
- avg_ai_prob = float(sum(all_probs) / len(all_probs))
138
- flagged_pct = 100.0 * sum(1 for l in labels if l == "AI") / len(labels)
139
- spans = contig_spans(labels)
140
 
141
  rows = []
142
- for i, (s, p, lab) in enumerate(zip(sents, all_probs, labels), start=1):
143
- rows.append({
144
- "Sentence #": i,
145
- "Sentence": s,
146
- "AI Probability": round(p, 4),
147
- "Label": lab
148
- })
149
-
150
- return sents, rows, avg_ai_prob, flagged_pct, spans
151
-
152
- # -----------------------------
153
- # HTML HIGHLIGHT
154
- # -----------------------------
155
- def color_for_prob(p):
156
- if p < 0.30: return "#11823b" # green
157
- if p < 0.70: return "#b8860b" # amber
158
- return "#b80d0d" # red
159
-
160
- def build_highlight_html(rows):
161
- blocks = []
162
- for r in rows:
163
- p = r["AI Probability"]
164
- col = color_for_prob(p)
165
- pct = f"{p*100:.1f}%"
166
- text = re.sub(r"\s+", " ", r["Sentence"]).strip()
167
- blocks.append(
168
- f"<span style='background:rgba(0,0,0,0.02); "
169
- f"padding:4px 6px; border-radius:6px; display:block; margin:6px 0;'>"
170
- f"<strong style='color:{col}'>[{pct} {r['Label']}]</strong> {text}</span>"
171
  )
172
- return "\n".join(blocks)
173
-
174
- # -----------------------------
175
- # PUBLIC API FOR GRADIO
176
- # -----------------------------
177
- def generate_report(text, threshold):
178
- if not text or not text.strip():
179
- return "⚠️ Please enter some text.", None, None, None
180
-
181
- sents, rows, avg_ai_prob, flagged_pct, (span_count, longest_span) = classify_sentences(
182
- text, ai_threshold=threshold
183
- )
184
 
185
- verdict = verdict_from_stats(flagged_pct, longest_span, avg_ai_prob)
186
- overall = (
187
- f"⚖️ Turnitin-style Summary\n"
188
- f"- Overall AI probability (avg per sentence): {avg_ai_prob*100:.1f}%\n"
189
- f"- Sentences flagged as AI ≥ {int(threshold*100)}%: {flagged_pct:.1f}%\n"
190
- f"- Consecutive AI spans: {span_count} (longest: {longest_span})\n"
191
- f"- Verdict: {verdict}\n"
192
- f"\nⓘ This is an approximation using an open detector; actual Turnitin results may differ."
193
- )
194
-
195
- html = build_highlight_html(rows)
196
- df = pd.DataFrame(rows, columns=["Sentence #", "Sentence", "AI Probability", "Label"])
197
- return overall, html, df, f"{flagged_pct:.1f}%"
198
 
199
  # -----------------------------
200
- # GRADIO UI
201
  # -----------------------------
202
  with gr.Blocks() as demo:
203
- gr.Markdown("## 🧭 Writenix AI Detector — Turnitin-style (Sentence-Level)")
204
-
205
- with gr.Row():
206
- text_input = gr.Textbox(
207
- label="Paste your content",
208
- lines=16,
209
- placeholder="Drop your essay/article here…"
210
- )
211
-
212
- with gr.Row():
213
- threshold = gr.Slider(
214
- 0.50, 0.95, value=0.70, step=0.01,
215
- label="AI Flag Threshold (probability ≥ threshold ⇒ AI)"
216
- )
217
- detect_btn = gr.Button("🔎 Analyze")
218
 
219
- with gr.Row():
220
- ai_summary = gr.Textbox(label="Report Summary", lines=8)
221
- flagged_pct = gr.Label(label="% Sentences Flagged (AI)")
222
 
223
- highlighted = gr.HTML(label="Per-Sentence Highlights")
224
- table = gr.Dataframe(headers=["Sentence #", "Sentence", "AI Probability", "Label"], wrap=True)
 
225
 
226
- detect_btn.click(
227
- fn=generate_report,
228
- inputs=[text_input, threshold],
229
- outputs=[ai_summary, highlighted, table, flagged_pct]
230
- )
231
 
232
  if __name__ == "__main__":
233
  demo.launch()
 
2
  import torch.nn.functional as F
3
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
4
  import re
 
5
  import pandas as pd
6
  import gradio as gr
7
 
 
15
  model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, dtype=dtype).to(device).eval()
16
 
17
  # -----------------------------
18
+ # SENTENCE SPLITTER (simple, robust; no lookbehinds in the split regex)
19
  # Protect → split → restore
20
  # -----------------------------
21
  ABBR = [
 
29
  t = text.strip()
30
  if not t:
31
  return ""
32
+ t = re.sub(r"\s*\n+\s*", " ", t) # normalize newlines
33
+ t = t.replace("...", "⟨ELLIPSIS⟩") # ellipses
34
+ t = re.sub(r"(?<=\d)\.(?=\d)", "⟨DECIMAL⟩", t) # decimals like 3.14
35
+ t = ABBR_REGEX.sub(r"\1⟨ABBRDOT⟩", t) # abbreviations' dot
 
 
 
 
 
 
 
 
 
36
  return t
37
 
38
  def _restore(text: str) -> str:
 
45
  t = _protect(text)
46
  if not t:
47
  return []
48
+ # split on [.?!] followed by whitespace and likely sentence start or end
 
 
49
  parts = re.split(r"([.?!])\s+(?=(?:[\"“”‘’']?\s*[A-Z(])|$)", t)
50
+ sentences, buf = [], ""
 
 
 
51
  for i, chunk in enumerate(parts):
52
  if i % 2 == 0:
53
  buf += chunk
54
  else:
 
55
  buf += chunk
56
+ sentences.append(buf.strip()); buf = ""
 
57
  if buf.strip():
58
  sentences.append(buf.strip())
59
+ return [_restore(s).strip() for s in sentences if s.strip()]
 
 
 
60
 
61
  # -----------------------------
62
+ # CLASSIFY SENTENCE-BY-SENTENCE
63
  # -----------------------------
64
def classify_sentence_by_sentence(text, threshold=0.70, max_len=512, batch_size=64):
    """Score each sentence of *text* with the detector model and build a report.

    Args:
        text: Raw user text; it is split into sentences with ``sentence_split``.
        threshold: A sentence is labelled "AI" when its AI probability is
            greater than or equal to this value, otherwise "Human".
        max_len: Maximum token length per sentence; longer sentences are
            truncated by the tokenizer.
        batch_size: Number of sentences sent through the model per forward
            pass, so long documents are not pushed through in one huge batch.

    Returns:
        A ``(status, html, dataframe)`` tuple. ``status`` is a short message,
        ``html`` holds one coloured ``<div>`` per sentence, and ``dataframe``
        has the columns ``#``, ``Sentence``, ``AI_Prob`` and ``Label``.
        When there is nothing to analyse, ``html`` and ``dataframe`` are None.
    """
    # Local import: keeps the block self-contained without touching the
    # module-level import list.
    from html import escape

    # Guard against None / whitespace-only input before it reaches the splitter.
    if not text or not text.strip():
        return "⚠️ Please paste some text.", None, None

    sents = sentence_split(text)
    if not sents:
        return "⚠️ Please paste some text.", None, None

    batch_size = max(1, int(batch_size))
    ai_probs = []
    with torch.no_grad():
        for start in range(0, len(sents), batch_size):
            inputs = tokenizer(
                sents[start:start + batch_size],
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=max_len,
            ).to(device)
            logits = model(**inputs).logits
            # Softmax in float32: the model may be loaded in bfloat16, which is
            # too coarse for probabilities that are later rounded to 4 places.
            probs = F.softmax(logits.float(), dim=-1)
            # Column 1 is taken as the AI class, as in the original code
            # ([:,0]=Human, [:,1]=AI) -- NOTE(review): confirm against the
            # model's label mapping.
            ai_probs.extend(probs[:, 1].cpu().tolist())

    rows = []
    highlights = []
    for i, (s, ai_p) in enumerate(zip(sents, ai_probs), start=1):
        label = "AI" if ai_p >= threshold else "Human"
        pct = f"{ai_p*100:.1f}%"
        if ai_p < 0.30:
            color = "#11823b"  # green
        elif ai_p < 0.70:
            color = "#b8860b"  # amber
        else:
            color = "#b80d0d"  # red

        # Collapse whitespace outside the f-string (a backslash inside an
        # f-string expression is a SyntaxError before Python 3.12), then
        # escape so user text cannot inject markup into the HTML output.
        shown = escape(re.sub(r"\s+", " ", s))
        highlights.append(
            f"<div style='margin:6px 0; padding:6px 8px; border-radius:6px; background:rgba(0,0,0,0.03)'>"
            f"<strong style='color:{color}'>[{pct} {label}]</strong> "
            f"{shown}</div>"
        )
        rows.append([i, s, round(ai_p, 4), label])

    html_out = "\n".join(highlights)
    df = pd.DataFrame(rows, columns=["#", "Sentence", "AI_Prob", "Label"])
    return "Done ✅ (sentence-by-sentence only)", html_out, df
 
 
 
 
 
 
 
 
 
 
97
 
98
  # -----------------------------
99
+ # GRADIO UI (minimal)
100
  # -----------------------------
101
# Minimal Gradio front end: a text box, a threshold slider and one button
# wired to classify_sentence_by_sentence.
with gr.Blocks() as demo:
    gr.Markdown("### 🧠 Sentence-by-Sentence AI Check")

    # Inputs passed to the callback, in order: text, threshold.
    text_input = gr.Textbox(
        label="Paste text",
        lines=14,
        placeholder="Your content…",
    )
    threshold = gr.Slider(
        minimum=0.50,
        maximum=0.95,
        value=0.70,
        step=0.01,
        label="AI threshold",
    )
    btn = gr.Button("Analyze")

    # Outputs filled from the callback's three return values, in order.
    status = gr.Label(label="Status")
    highlights = gr.HTML(label="Per-Sentence Highlights")
    table = gr.Dataframe(
        headers=["#", "Sentence", "AI_Prob", "Label"],
        wrap=True,
    )

    btn.click(
        fn=classify_sentence_by_sentence,
        inputs=[text_input, threshold],
        outputs=[status, highlights, table],
    )
 
 
 
114
 
115
  if __name__ == "__main__":
116
  demo.launch()