VictorM-Coder committed on
Commit
668274d
·
verified ·
1 Parent(s): 96ab1a6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +67 -104
app.py CHANGED
@@ -14,178 +14,141 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
14
  dtype = torch.bfloat16 if (device.type=="cuda" and torch.cuda.is_bf16_supported()) else torch.float32
15
  model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, dtype=dtype).to(device).eval()
16
 
 
 
 
17
  THRESHOLD = 0.80
18
 
19
  # -----------------------------
20
- # ABBREVIATION PROTECTION
21
  # -----------------------------
22
- ABBR = [
23
- "e.g", "i.e", "mr", "mrs", "ms", "dr", "prof", "vs", "etc", "fig", "al",
24
- "jr", "sr", "st", "no", "vol", "pp", "mt", "inc", "ltd", "co",
25
- "u.s", "u.k", "a.m", "p.m"
26
  ]
27
- ABBR_REGEX = re.compile(r"\b(" + "|".join(map(re.escape, ABBR)) + r")\.", re.IGNORECASE)
28
 
29
  def _protect(text):
30
- text = text.replace("...", "⟨ELLIPSIS⟩")
31
- text = re.sub(r"(?<=\d)\.(?=\d)", "⟨DECIMAL⟩", text)
32
- text = ABBR_REGEX.sub(r"\1⟨ABBRDOT⟩", text)
33
- return text
34
 
35
  def _restore(text):
36
- return (
37
- text.replace("⟨ABBRDOT⟩", ".")
38
- .replace("⟨DECIMAL⟩", ".")
39
- .replace("⟨ELLIPSIS⟩", "...")
40
- )
41
 
42
- # -----------------------------
43
- # PERFECT PARAGRAPH-PRESERVING SPLITTER
44
- # -----------------------------
45
- def split_preserving_structure(text):
46
- """
47
- Splits text into:
48
- - EXACT newline blocks (\n, \n\n, etc.)
49
- - Sentences inside non-newline blocks
50
- """
51
- blocks = re.split(r"(\n+)", text) # keep newline separators
52
- final_blocks = []
53
 
54
- for block in blocks:
55
- if block.startswith("\n"):
56
- final_blocks.append(block) # preserve EXACT paragraph spacing
57
- else:
58
- protected = _protect(block)
59
- parts = re.split(r"([.?!])(\s+)", protected)
60
 
61
- for i in range(0, len(parts), 3):
62
- sentence = parts[i]
63
- punct = parts[i+1] if i+1 < len(parts) else ""
64
- space = parts[i+2] if i+2 < len(parts) else ""
65
 
66
- whole = sentence + punct
67
- if whole.strip():
68
- final_blocks.append(_restore(whole))
69
 
70
- if space:
71
- final_blocks.append(space)
72
 
73
- return final_blocks
 
74
 
75
 
76
- def extract_sentences_only(blocks):
77
- """Return only sentence blocks (no whitespace/newlines)."""
78
- return [
79
- b for b in blocks
80
- if b.strip() != "" and not b.startswith("\n") and not b.isspace()
81
- ]
82
-
83
  # -----------------------------
84
- # GROUPING
85
  # -----------------------------
86
  def group_sentences(sents, size=3):
87
- return [" ".join(sents[i:i + size]) for i in range(0, len(sents), size)]
 
88
 
89
  # -----------------------------
90
- # ANALYSIS LOGIC
91
  # -----------------------------
92
  def analyze(text, max_len=512):
93
 
94
- # Structured block split
95
- blocks = split_preserving_structure(text)
96
- pure_sentences = extract_sentences_only(blocks)
97
 
98
  if not pure_sentences:
99
  return "—", "—", "<em>Paste text to analyze.</em>", None
100
 
101
- # Group into 3-sentence windows (Turnitin style)
102
  grouped = group_sentences(pure_sentences, 3)
103
  clean_grouped = [re.sub(r"\s+", " ", g).strip() for g in grouped]
104
 
105
- # Run model
106
- inputs = tokenizer(clean_grouped, return_tensors="pt",
107
- padding=True, truncation=True,
108
- max_length=max_len).to(device)
109
 
110
  with torch.no_grad():
111
  logits = model(**inputs).logits
112
  chunk_probs = F.softmax(logits, dim=-1)[:, 1].cpu().tolist()
113
 
114
- # Expand group scores back to individual sentences
115
- ai_scores = []
116
  for idx, prob in enumerate(chunk_probs):
117
  start = idx * 3
118
  end = min(start + 3, len(pure_sentences))
119
  for _ in range(start, end):
120
- ai_scores.append(prob)
121
 
122
  # -----------------------------
123
- # RECONSTRUCT ORIGINAL TEXT W/ HIGHLIGHTING
124
  # -----------------------------
125
  highlighted = ""
126
- current_sentence = 0
127
 
128
  for block in blocks:
129
-
130
- # newline block → keep EXACT
131
- if block.startswith("\n"):
132
- highlighted += block
133
- continue
134
-
135
- # whitespace block → keep
136
  if block.isspace():
137
- highlighted += block
138
- continue
139
-
140
- # real sentence → highlight
141
- ai_p = ai_scores[current_sentence]
142
- current_sentence += 1
143
-
144
- pct = f"{ai_p * 100:.1f}%"
145
-
146
- if ai_p < 0.30:
147
- color = "#11823b"
148
- elif ai_p < 0.70:
149
- color = "#b8860b"
150
  else:
151
- color = "#b80d0d"
 
 
152
 
153
- highlighted += (
154
- f"<span style='background:rgba(0,0,0,0.03); padding:3px 4px; "
155
- f"border-radius:4px;'><strong style='color:{color}'>[{pct}]</strong> "
156
- f"{block.strip()}</span>"
157
- )
158
 
159
- # maintain spacing after sentence
160
- highlighted += " "
 
 
 
 
161
 
162
- # -----------------------------
163
- # OVERALL SCORE
164
- # -----------------------------
165
- overall = sum(ai_scores) / len(ai_scores)
166
- overall_pct = f"{overall * 100:.1f}%"
167
  overall_label = "🤖 Likely AI Written" if overall >= THRESHOLD else "🧒 Likely Human Written"
168
 
169
  # Table output
170
  df = pd.DataFrame(
171
- [[i + 1, s, ai_scores[i]] for i, s in enumerate(pure_sentences)],
172
  columns=["#", "Sentence", "AI_Prob"]
173
  )
174
 
175
  return overall_label, overall_pct, highlighted, df
176
 
 
177
  # -----------------------------
178
  # UI
179
  # -----------------------------
180
  with gr.Blocks() as demo:
181
- gr.Markdown("### 🕵️ AI Sentence-Level Detector — Exact Structure Highlighting")
182
 
183
- text_input = gr.Textbox(label="Paste text", lines=14, placeholder="Your text…")
184
  btn = gr.Button("Analyze")
185
 
186
- verdict = gr.Label(label="Verdict (Overall)")
187
- score = gr.Label(label="AI Score")
188
- highlights = gr.HTML(label="Highlighted Text (Exact Structure)")
189
  table = gr.Dataframe(headers=["#", "Sentence", "AI_Prob"], wrap=True)
190
 
191
  btn.click(analyze, inputs=[text_input], outputs=[verdict, score, highlights, table])
 
# Use bfloat16 only on a CUDA device that reports support for it; otherwise
# load the weights in float32.
dtype = torch.bfloat16 if (device.type == "cuda" and torch.cuda.is_bf16_supported()) else torch.float32
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, dtype=dtype).to(device).eval()

# -----------------------------
# THRESHOLD
# -----------------------------
# Mean AI probability at or above which analyze() reports "Likely AI Written".
THRESHOLD = 0.80

# -----------------------------
# SENTENCE SPLITTING
# -----------------------------
# Abbreviations whose trailing period must not be treated as a sentence end.
ABBR = [
    "e.g", "i.e", "mr", "mrs", "ms", "dr", "prof", "vs", "etc", "fig", "al",
    "jr", "sr", "st", "no", "vol", "pp", "mt", "inc", "ltd", "co", "u.s", "u.k",
    "a.m", "p.m",
]
ABBR_REGEX = re.compile(r"\b(" + "|".join(map(re.escape, ABBR)) + r")\.", flags=re.IGNORECASE)

# A sentence terminator followed by whitespace. Both parts are captured so that
# re.split() returns them and the original text can be rebuilt exactly.
_SENTENCE_BOUNDARY = re.compile(r"([.?!])(\s+)")


def _protect(text):
    """Replace periods that do not end a sentence with placeholder tokens.

    Handles, in order: three-dot ellipses, decimal points between digits, and
    the period following any abbreviation listed in ABBR.
    """
    t = text.replace("...", "⟨ELLIPSIS⟩")
    t = re.sub(r"(?<=\d)\.(?=\d)", "⟨DECIMAL⟩", t)
    t = ABBR_REGEX.sub(r"\1⟨ABBRDOT⟩", t)
    return t


def _restore(text):
    """Undo _protect(): turn every placeholder token back into its literal form."""
    return text.replace("⟨ABBRDOT⟩", ".").replace("⟨DECIMAL⟩", ".").replace("⟨ELLIPSIS⟩", "...")


def split_sentences_preserving(text):
    """Split text into sentence blocks and the whitespace runs between them.

    A boundary is a '.', '?' or '!' followed by whitespace; the punctuation
    stays attached to the sentence and the whitespace run (spaces and/or
    newlines) becomes its own block. Empty pieces are never emitted, so for
    input that contains no placeholder tokens ``"".join(result) == text``.

    Args:
        text: The raw input string. ``None`` or ``""`` yields an empty list.

    Returns:
        A list of non-empty strings: sentences interleaved with the exact
        whitespace that separated them.
    """
    if not text:
        return []

    # With two capture groups, re.split() yields
    # [sentence, punct, space, sentence, punct, space, ..., tail].
    parts = _SENTENCE_BOUNDARY.split(_protect(text))

    blocks = []
    for i in range(0, len(parts), 3):
        punct = parts[i + 1] if i + 1 < len(parts) else ""
        space = parts[i + 2] if i + 2 < len(parts) else ""

        sentence = parts[i] + punct
        # The tail after the last boundary is "" when the text ends in
        # whitespace; skipping it avoids a phantom empty sentence.
        if sentence:
            blocks.append(_restore(sentence))
        if space:
            blocks.append(space)  # preserve exact spacing (spaces and newlines)

    return blocks


def extract_pure_sentences(sent_block):
    """Return only the blocks that contain visible text.

    Empty strings are rejected as well as whitespace-only ones, because
    ``"".isspace()`` is False and would otherwise pass as a sentence.
    """
    return [s for s in sent_block if s and not s.isspace()]


# -----------------------------
# GROUP SENTENCES
# -----------------------------
def group_sentences(sents, size=3):
    """Join consecutive sentences into space-separated chunks.

    Args:
        sents: Sequence of sentence strings.
        size: Maximum number of sentences per chunk; must be at least 1.

    Returns:
        A list of chunks. The final chunk holds the remainder and may contain
        fewer than ``size`` sentences. An empty input yields an empty list.

    Raises:
        ValueError: If ``size`` is smaller than 1.
    """
    # A non-positive size would either fail inside range() with an unrelated
    # message (0) or silently drop every sentence (negative).
    if size < 1:
        raise ValueError("size must be a positive integer")
    return [" ".join(sents[i:i + size]) for i in range(0, len(sents), size)]


# -----------------------------
# MAIN ANALYSIS
# -----------------------------
def _score_color(ai_p):
    """Return the hex color used for a sentence's percentage label."""
    if ai_p < 0.30:
        return "#11823b"
    if ai_p < 0.70:
        return "#b8860b"
    return "#b80d0d"


def analyze(text, max_len=512, group_size=3):
    """Score text in sentence groups and rebuild it with per-sentence highlights.

    The text is split into sentence and whitespace blocks, the sentences are
    grouped ``group_size`` at a time, each group is classified by the model,
    and the group's probability is assigned to every sentence in it.

    Args:
        text: Raw input text; ``None`` is treated as empty.
        max_len: Maximum token length passed to the tokenizer (longer groups
            are truncated).
        group_size: Number of sentences classified together.

    Returns:
        A 4-tuple ``(overall_label, overall_pct, highlighted_html, df)``.
        When there is nothing to analyze the tuple is
        ``("—", "—", "<em>Paste text to analyze.</em>", None)``.
    """
    # Function-scope import: the file's import block is outside this section.
    import html

    # 1. Split while preserving structure
    blocks = split_sentences_preserving(text or "")
    # Keep only blocks with visible content so that empty strings are never
    # scored; the reconstruction loop below uses the same criterion, which
    # keeps scores and blocks aligned.
    pure_sentences = [s for s in extract_pure_sentences(blocks) if s.strip()]

    if not pure_sentences:
        return "—", "—", "<em>Paste text to analyze.</em>", None

    # 2. Group for model
    grouped = group_sentences(pure_sentences, group_size)
    clean_grouped = [re.sub(r"\s+", " ", g).strip() for g in grouped]

    # 3. Run model
    inputs = tokenizer(clean_grouped, return_tensors="pt", padding=True,
                       truncation=True, max_length=max_len).to(device)

    with torch.no_grad():
        logits = model(**inputs).logits
    # The model may run in bfloat16; compute the softmax in float32 so the
    # reported probabilities are not rounded to bfloat16 precision.
    # Column 1 of the softmax output is used as the AI probability.
    chunk_probs = F.softmax(logits.float(), dim=-1)[:, 1].cpu().tolist()

    # 4. Expand chunk scores to per-sentence
    sentence_ai = []
    for idx, prob in enumerate(chunk_probs):
        in_group = min(group_size, len(pure_sentences) - idx * group_size)
        sentence_ai.extend([prob] * in_group)

    # -----------------------------
    # FINAL OUTPUT RECONSTRUCTION
    # -----------------------------
    pieces = []
    scores = iter(sentence_ai)

    for block in blocks:
        core = block.strip()
        if not core:
            pieces.append(block)  # preserve exact spacing
            continue

        # this block is a real sentence
        ai_p = next(scores)
        pct = f"{ai_p*100:.1f}%"
        color = _score_color(ai_p)

        # Whitespace attached to the sentence stays outside the span so the
        # original layout is kept.
        lead = block[:len(block) - len(block.lstrip())]
        trail = block[len(block.rstrip()):]

        # Escape the user's text: it is inserted into HTML and could otherwise
        # break the markup or inject tags.
        pieces.append(
            f"{lead}<span style='background-color:rgba(0,0,0,0.03); padding:3px 4px; "
            f"border-radius:4px;'><strong style='color:{color}'>[{pct}]</strong> "
            f"{html.escape(core)}</span>{trail}"
        )

    # pre-wrap makes the browser honor the preserved spaces and newlines,
    # which plain HTML would collapse.
    highlighted = "<div style='white-space:pre-wrap;'>" + "".join(pieces) + "</div>"

    # Overall score
    overall = sum(sentence_ai) / len(sentence_ai)
    overall_pct = f"{overall*100:.1f}%"
    overall_label = "🤖 Likely AI Written" if overall >= THRESHOLD else "🧒 Likely Human Written"

    # Table output
    df = pd.DataFrame(
        [[i, s, p] for i, (s, p) in enumerate(zip(pure_sentences, sentence_ai), start=1)],
        columns=["#", "Sentence", "AI_Prob"]
    )

    return overall_label, overall_pct, highlighted, df


# -----------------------------
# UI
# -----------------------------
# Single-page layout: a text box and button on top, then the overall verdict,
# overall score, highlighted text and per-sentence table filled by analyze().
with gr.Blocks() as demo:
    gr.Markdown("### 🕵️ AI Sentence-Level Detector — Original Format Highlighting")

    text_input = gr.Textbox(label="Paste text", lines=14)
    btn = gr.Button("Analyze")

    verdict = gr.Label(label="Overall Verdict")
    score = gr.Label(label="Overall AI Score")
    highlights = gr.HTML(label="Highlighted Text (Original Format)")
    table = gr.Dataframe(headers=["#", "Sentence", "AI_Prob"], wrap=True)

    # The output order must match the 4-tuple returned by analyze().
    btn.click(analyze, inputs=[text_input], outputs=[verdict, score, highlights, table])