VictorM-Coder committed on
Commit
96ab1a6
·
verified ·
1 Parent(s): 26af59c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +104 -67
app.py CHANGED
@@ -14,141 +14,178 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
14
  dtype = torch.bfloat16 if (device.type=="cuda" and torch.cuda.is_bf16_supported()) else torch.float32
15
  model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, dtype=dtype).to(device).eval()
16
 
17
- # -----------------------------
18
- # THRESHOLD
19
- # -----------------------------
20
  THRESHOLD = 0.80
21
 
22
  # -----------------------------
23
- # SENTENCE SPLITTING
24
  # -----------------------------
25
- ABBR = ["e.g", "i.e", "mr", "mrs", "ms", "dr", "prof", "vs", "etc", "fig", "al",
26
- "jr", "sr", "st", "no", "vol", "pp", "mt", "inc", "ltd", "co", "u.s", "u.k",
27
- "a.m", "p.m"
 
28
  ]
29
- ABBR_REGEX = re.compile(r"\b(" + "|".join(map(re.escape, ABBR)) + r")\.", flags=re.IGNORECASE)
30
 
31
  def _protect(text):
32
- t = text.replace("...", "⟨ELLIPSIS⟩")
33
- t = re.sub(r"(?<=\d)\.(?=\d)", "⟨DECIMAL⟩", t)
34
- t = ABBR_REGEX.sub(r"\1⟨ABBRDOT⟩", t)
35
- return t
36
 
37
  def _restore(text):
38
- return text.replace("⟨ABBRDOT⟩", ".").replace("⟨DECIMAL⟩", ".").replace("⟨ELLIPSIS⟩", "...")
 
 
 
 
39
 
40
- def split_sentences_preserving(text):
41
- protected = _protect(text)
42
- parts = re.split(r"([.?!])(\s+)", protected)
 
 
 
 
 
 
 
 
43
 
44
- sentences = []
45
- current = ""
 
 
 
 
46
 
47
- for i in range(0, len(parts), 3):
48
- part = parts[i]
49
- punct = parts[i+1] if i+1 < len(parts) else ""
50
- space = parts[i+2] if i+2 < len(parts) else ""
51
 
52
- current = part + punct
53
- sentences.append(_restore(current))
54
- sentences.append(space) # preserve exact spacing (spaces and newlines)
55
 
56
- return sentences # alternating [sentence, whitespace, sentence, whitespace...]
 
57
 
58
- def extract_pure_sentences(sent_block):
59
- return [s for s in sent_block if not s.isspace()]
60
 
61
 
 
 
 
 
 
 
 
62
  # -----------------------------
63
- # GROUP SENTENCES
64
  # -----------------------------
65
  def group_sentences(sents, size=3):
66
- return [" ".join(sents[i:i+size]) for i in range(0, len(sents), size)]
67
-
68
 
69
  # -----------------------------
70
- # MAIN ANALYSIS
71
  # -----------------------------
72
  def analyze(text, max_len=512):
73
 
74
- # 1. Split while preserving structure
75
- blocks = split_sentences_preserving(text)
76
- pure_sentences = extract_pure_sentences(blocks)
77
 
78
  if not pure_sentences:
79
  return "—", "—", "<em>Paste text to analyze.</em>", None
80
 
81
- # 2. Group for model
82
  grouped = group_sentences(pure_sentences, 3)
83
  clean_grouped = [re.sub(r"\s+", " ", g).strip() for g in grouped]
84
 
85
- # 3. Run model
86
- inputs = tokenizer(clean_grouped, return_tensors="pt", padding=True,
87
- truncation=True, max_length=max_len).to(device)
 
88
 
89
  with torch.no_grad():
90
  logits = model(**inputs).logits
91
  chunk_probs = F.softmax(logits, dim=-1)[:, 1].cpu().tolist()
92
 
93
- # 4. Expand chunk scores to per-sentence
94
- sentence_ai = []
95
  for idx, prob in enumerate(chunk_probs):
96
  start = idx * 3
97
  end = min(start + 3, len(pure_sentences))
98
  for _ in range(start, end):
99
- sentence_ai.append(prob)
100
 
101
  # -----------------------------
102
- # FINAL OUTPUT RECONSTRUCTION
103
  # -----------------------------
104
  highlighted = ""
105
- sent_index = 0
106
 
107
  for block in blocks:
 
 
 
 
 
 
 
108
  if block.isspace():
109
- highlighted += block # preserve exact spacing
110
- else:
111
- # this block is a real sentence
112
- ai_p = sentence_ai[sent_index]
113
- sent_index += 1
114
 
115
- pct = f"{ai_p*100:.1f}%"
 
 
116
 
117
- if ai_p < 0.30:
118
- color = "#11823b"
119
- elif ai_p < 0.70:
120
- color = "#b8860b"
121
- else:
122
- color = "#b80d0d"
 
 
123
 
124
- highlighted += f"<span style='background-color:rgba(0,0,0,0.03); padding:3px 4px; border-radius:4px;'><strong style='color:{color}'>[{pct}]</strong> {block.strip()}</span> "
 
 
 
 
125
 
126
- # Overall score
127
- overall = sum(sentence_ai) / len(sentence_ai)
128
- overall_pct = f"{overall*100:.1f}%"
 
 
 
 
 
129
  overall_label = "🤖 Likely AI Written" if overall >= THRESHOLD else "🧒 Likely Human Written"
130
 
131
  # Table output
132
  df = pd.DataFrame(
133
- [[i+1, s, sentence_ai[i]] for i, s in enumerate(pure_sentences)],
134
  columns=["#", "Sentence", "AI_Prob"]
135
  )
136
 
137
  return overall_label, overall_pct, highlighted, df
138
 
139
-
140
  # -----------------------------
141
  # UI
142
  # -----------------------------
143
  with gr.Blocks() as demo:
144
- gr.Markdown("### 🕵️ AI Sentence-Level Detector — Original Format Highlighting")
145
 
146
- text_input = gr.Textbox(label="Paste text", lines=14)
147
  btn = gr.Button("Analyze")
148
 
149
- verdict = gr.Label(label="Overall Verdict")
150
- score = gr.Label(label="Overall AI Score")
151
- highlights = gr.HTML(label="Highlighted Text (Original Format)")
152
  table = gr.Dataframe(headers=["#", "Sentence", "AI_Prob"], wrap=True)
153
 
154
  btn.click(analyze, inputs=[text_input], outputs=[verdict, score, highlights, table])
 
# Use bf16 on CUDA only when the GPU supports it; everything else falls back to fp32.
dtype = torch.bfloat16 if (device.type=="cuda" and torch.cuda.is_bf16_supported()) else torch.float32
# Load the classifier once at module import and freeze it in eval mode (no dropout,
# no grad bookkeeping needed — inference only).
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, dtype=dtype).to(device).eval()

# Overall AI-probability cutoff: at or above this, the verdict flips to "Likely AI Written".
THRESHOLD = 0.80
18
 
19
  # -----------------------------
20
+ # ABBREVIATION PROTECTION
21
  # -----------------------------
22
+ ABBR = [
23
+ "e.g", "i.e", "mr", "mrs", "ms", "dr", "prof", "vs", "etc", "fig", "al",
24
+ "jr", "sr", "st", "no", "vol", "pp", "mt", "inc", "ltd", "co",
25
+ "u.s", "u.k", "a.m", "p.m"
26
  ]
27
+ ABBR_REGEX = re.compile(r"\b(" + "|".join(map(re.escape, ABBR)) + r")\.", re.IGNORECASE)
28
 
29
  def _protect(text):
30
+ text = text.replace("...", "⟨ELLIPSIS⟩")
31
+ text = re.sub(r"(?<=\d)\.(?=\d)", "⟨DECIMAL⟩", text)
32
+ text = ABBR_REGEX.sub(r"\1⟨ABBRDOT⟩", text)
33
+ return text
34
 
35
  def _restore(text):
36
+ return (
37
+ text.replace("⟨ABBRDOT⟩", ".")
38
+ .replace("⟨DECIMAL⟩", ".")
39
+ .replace("⟨ELLIPSIS⟩", "...")
40
+ )
41
 
42
+ # -----------------------------
43
+ # PERFECT PARAGRAPH-PRESERVING SPLITTER
44
+ # -----------------------------
45
+ def split_preserving_structure(text):
46
+ """
47
+ Splits text into:
48
+ - EXACT newline blocks (\n, \n\n, etc.)
49
+ - Sentences inside non-newline blocks
50
+ """
51
+ blocks = re.split(r"(\n+)", text) # keep newline separators
52
+ final_blocks = []
53
 
54
+ for block in blocks:
55
+ if block.startswith("\n"):
56
+ final_blocks.append(block) # preserve EXACT paragraph spacing
57
+ else:
58
+ protected = _protect(block)
59
+ parts = re.split(r"([.?!])(\s+)", protected)
60
 
61
+ for i in range(0, len(parts), 3):
62
+ sentence = parts[i]
63
+ punct = parts[i+1] if i+1 < len(parts) else ""
64
+ space = parts[i+2] if i+2 < len(parts) else ""
65
 
66
+ whole = sentence + punct
67
+ if whole.strip():
68
+ final_blocks.append(_restore(whole))
69
 
70
+ if space:
71
+ final_blocks.append(space)
72
 
73
+ return final_blocks
 
74
 
75
 
76
def extract_sentences_only(blocks):
    """Filter a structured block list down to real sentences, dropping the
    whitespace and newline blocks that only preserve layout."""
    sentences = []
    for candidate in blocks:
        if candidate.startswith("\n") or candidate.isspace():
            continue
        if not candidate.strip():
            continue
        sentences.append(candidate)
    return sentences
82
+
83
  # -----------------------------
84
+ # GROUPING
85
  # -----------------------------
86
  def group_sentences(sents, size=3):
87
+ return [" ".join(sents[i:i + size]) for i in range(0, len(sents), size)]
 
88
 
89
  # -----------------------------
90
+ # ANALYSIS LOGIC
91
  # -----------------------------
92
  def analyze(text, max_len=512):
93
 
94
+ # Structured block split
95
+ blocks = split_preserving_structure(text)
96
+ pure_sentences = extract_sentences_only(blocks)
97
 
98
  if not pure_sentences:
99
  return "—", "—", "<em>Paste text to analyze.</em>", None
100
 
101
+ # Group into 3-sentence windows (Turnitin style)
102
  grouped = group_sentences(pure_sentences, 3)
103
  clean_grouped = [re.sub(r"\s+", " ", g).strip() for g in grouped]
104
 
105
+ # Run model
106
+ inputs = tokenizer(clean_grouped, return_tensors="pt",
107
+ padding=True, truncation=True,
108
+ max_length=max_len).to(device)
109
 
110
  with torch.no_grad():
111
  logits = model(**inputs).logits
112
  chunk_probs = F.softmax(logits, dim=-1)[:, 1].cpu().tolist()
113
 
114
+ # Expand group scores back to individual sentences
115
+ ai_scores = []
116
  for idx, prob in enumerate(chunk_probs):
117
  start = idx * 3
118
  end = min(start + 3, len(pure_sentences))
119
  for _ in range(start, end):
120
+ ai_scores.append(prob)
121
 
122
  # -----------------------------
123
+ # RECONSTRUCT ORIGINAL TEXT W/ HIGHLIGHTING
124
  # -----------------------------
125
  highlighted = ""
126
+ current_sentence = 0
127
 
128
  for block in blocks:
129
+
130
+ # newline block → keep EXACT
131
+ if block.startswith("\n"):
132
+ highlighted += block
133
+ continue
134
+
135
+ # whitespace block → keep
136
  if block.isspace():
137
+ highlighted += block
138
+ continue
 
 
 
139
 
140
+ # real sentence → highlight
141
+ ai_p = ai_scores[current_sentence]
142
+ current_sentence += 1
143
 
144
+ pct = f"{ai_p * 100:.1f}%"
145
+
146
+ if ai_p < 0.30:
147
+ color = "#11823b"
148
+ elif ai_p < 0.70:
149
+ color = "#b8860b"
150
+ else:
151
+ color = "#b80d0d"
152
 
153
+ highlighted += (
154
+ f"<span style='background:rgba(0,0,0,0.03); padding:3px 4px; "
155
+ f"border-radius:4px;'><strong style='color:{color}'>[{pct}]</strong> "
156
+ f"{block.strip()}</span>"
157
+ )
158
 
159
+ # maintain spacing after sentence
160
+ highlighted += " "
161
+
162
+ # -----------------------------
163
+ # OVERALL SCORE
164
+ # -----------------------------
165
+ overall = sum(ai_scores) / len(ai_scores)
166
+ overall_pct = f"{overall * 100:.1f}%"
167
  overall_label = "🤖 Likely AI Written" if overall >= THRESHOLD else "🧒 Likely Human Written"
168
 
169
  # Table output
170
  df = pd.DataFrame(
171
+ [[i + 1, s, ai_scores[i]] for i, s in enumerate(pure_sentences)],
172
  columns=["#", "Sentence", "AI_Prob"]
173
  )
174
 
175
  return overall_label, overall_pct, highlighted, df
176
 
 
177
  # -----------------------------
178
  # UI
179
  # -----------------------------
180
  with gr.Blocks() as demo:
181
+ gr.Markdown("### 🕵️ AI Sentence-Level Detector — Exact Structure Highlighting")
182
 
183
+ text_input = gr.Textbox(label="Paste text", lines=14, placeholder="Your text…")
184
  btn = gr.Button("Analyze")
185
 
186
+ verdict = gr.Label(label="Verdict (Overall)")
187
+ score = gr.Label(label="AI Score")
188
+ highlights = gr.HTML(label="Highlighted Text (Exact Structure)")
189
  table = gr.Dataframe(headers=["#", "Sentence", "AI_Prob"], wrap=True)
190
 
191
  btn.click(analyze, inputs=[text_input], outputs=[verdict, score, highlights, table])