VictorM-Coder committed on
Commit
26af59c
·
verified ·
1 Parent(s): 21a21f1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +79 -132
app.py CHANGED
@@ -15,193 +15,140 @@ dtype = torch.bfloat16 if (device.type=="cuda" and torch.cuda.is_bf16_supported(
15
  model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, dtype=dtype).to(device).eval()
16
 
17
  # -----------------------------
18
- # AI DECISION THRESHOLD (80%)
19
  # -----------------------------
20
  THRESHOLD = 0.80
21
 
22
  # -----------------------------
23
- # SENTENCE SPLITTING UTILITIES
24
  # -----------------------------
25
- ABBR = [
26
- "e.g", "i.e", "mr", "mrs", "ms", "dr", "prof", "vs", "etc", "fig", "al",
27
- "jr", "sr", "st", "no", "vol", "pp", "mt", "inc", "ltd", "co",
28
- "u.s", "u.k", "a.m", "p.m"
29
  ]
30
-
31
- ABBR_REGEX = re.compile(
32
- r"\b(" + "|".join(map(re.escape, ABBR)) + r")\.",
33
- flags=re.IGNORECASE
34
- )
35
 
36
  def _protect(text):
37
- t = text.strip()
38
- if not t:
39
- return ""
40
- t = re.sub(r"\s*\n+\s*", " ", t)
41
- t = t.replace("...", "⟨ELLIPSIS⟩")
42
  t = re.sub(r"(?<=\d)\.(?=\d)", "⟨DECIMAL⟩", t)
43
  t = ABBR_REGEX.sub(r"\1⟨ABBRDOT⟩", t)
44
  return t
45
 
46
  def _restore(text):
47
- return (
48
- text.replace("⟨ABBRDOT⟩", ".")
49
- .replace("⟨DECIMAL⟩", ".")
50
- .replace("⟨ELLIPSIS⟩", "...")
51
- )
52
-
53
- def sentence_split(text):
54
- t = _protect(text)
55
- if not t:
56
- return []
57
-
58
- parts = re.split(
59
- r"([.?!])\s+(?=(?:[\"“”‘’']?\s*[A-Z(])|$)", t
60
- )
61
-
62
- sentences, buf = [], ""
63
- for i, chunk in enumerate(parts):
64
- if i % 2 == 0:
65
- buf += chunk
66
- else:
67
- buf += chunk
68
- sentences.append(buf.strip())
69
- buf = ""
70
-
71
- if buf.strip():
72
- sentences.append(buf.strip())
73
-
74
- return [_restore(s).strip() for s in sentences if s.strip()]
75
-
76
 
77
- # -----------------------------
78
- # PARAGRAPH UTILITIES
79
- # -----------------------------
80
- def split_paragraphs(text):
81
- paragraphs = [p.strip() for p in text.split("\n") if p.strip()]
82
- return paragraphs
83
-
84
- def map_sentences_to_paragraphs(paragraphs):
85
- all_sentences = []
86
- mapping = []
87
 
88
- for p_idx, para in enumerate(paragraphs):
89
- sents = sentence_split(para)
90
- for s_idx, s in enumerate(sents):
91
- all_sentences.append(s)
92
- mapping.append((p_idx, s_idx))
93
 
94
- return all_sentences, mapping
 
 
 
95
 
96
- def combine_paragraph_scores(paragraphs, mapping, sentence_probs):
97
- bucket = [[] for _ in paragraphs]
 
98
 
99
- for (p_idx, _), prob in zip(mapping, sentence_probs):
100
- bucket[p_idx].append(prob)
101
 
102
- final_scores = [
103
- (sum(scores) / len(scores)) if scores else 0
104
- for scores in bucket
105
- ]
106
-
107
- return final_scores
108
 
109
 
110
  # -----------------------------
111
- # GROUP SENTENCES (TURNITIN STYLE)
112
  # -----------------------------
113
  def group_sentences(sents, size=3):
114
- return [" ".join(sents[i:i + size]) for i in range(0, len(sents), size)]
115
 
116
 
117
  # -----------------------------
118
- # CORE ANALYSIS
119
  # -----------------------------
120
  def analyze(text, max_len=512):
121
- paragraphs = split_paragraphs(text)
122
- if not paragraphs:
123
- return "—", "—", "<em>Paste some text to analyze.</em>", None
124
 
125
- # map paragraphs sentences
126
- sents, mapping = map_sentences_to_paragraphs(paragraphs)
 
 
 
 
127
 
128
- # group sentences in 3s
129
- grouped = group_sentences(sents, 3)
130
  clean_grouped = [re.sub(r"\s+", " ", g).strip() for g in grouped]
131
 
132
- # tokenize chunks
133
- inputs = tokenizer(
134
- clean_grouped,
135
- return_tensors="pt",
136
- padding=True,
137
- truncation=True,
138
- max_length=max_len
139
- ).to(device)
140
 
141
  with torch.no_grad():
142
  logits = model(**inputs).logits
143
  chunk_probs = F.softmax(logits, dim=-1)[:, 1].cpu().tolist()
144
 
145
- # expand chunk probability to each sentence
146
- ai_probs = []
147
  for idx, prob in enumerate(chunk_probs):
148
  start = idx * 3
149
- end = min(start + 3, len(sents))
150
  for _ in range(start, end):
151
- ai_probs.append(prob)
152
 
153
- # final paragraph-level scores
154
- paragraph_ai = combine_paragraph_scores(paragraphs, mapping, ai_probs)
 
 
 
155
 
156
- # overall score
157
- overall = sum(ai_probs) / len(ai_probs)
158
- overall_pct = f"{overall * 100:.1f}%"
159
- overall_label = (
160
- "🤖 Likely AI Written" if overall >= THRESHOLD else "🧒 Likely Human Written"
161
- )
162
-
163
- # paragraph-based HTML output
164
- final_html = ""
165
- for idx, (para, ai) in enumerate(zip(paragraphs, paragraph_ai), start=1):
166
- pct = f"{ai * 100:.1f}%"
167
- label = "AI" if ai >= THRESHOLD else "Human"
168
-
169
- # color
170
- if ai < 0.30:
171
- color = "#11823b"
172
- elif ai < 0.70:
173
- color = "#b8860b"
174
  else:
175
- color = "#b80d0d"
176
-
177
- final_html += f"""
178
- <div style='margin:12px 0; padding:12px; border-radius:8px; background:#fafafa'>
179
- <strong style='color:{color}'>[Paragraph {idx}: {pct} {label}]</strong>
180
- <div style='margin-top:8px; white-space:pre-wrap'>{para}</div>
181
- </div>
182
- """
183
-
184
- # sentence table (still available if needed)
185
- rows = []
186
- for i, s in enumerate(sents, start=1):
187
- rows.append([i, s, round(ai_probs[i-1], 4)])
188
- df = pd.DataFrame(rows, columns=["#", "Sentence", "AI_Prob"])
 
 
 
 
 
 
 
 
 
 
 
189
 
190
- return overall_label, overall_pct, final_html, df
191
 
192
 
193
  # -----------------------------
194
- # GRADIO UI
195
  # -----------------------------
196
  with gr.Blocks() as demo:
197
- gr.Markdown("### 🕵️ AI Written Text Detector — Fakespot Model (Turnitin-style Paragraph Mode)")
198
 
199
- text_input = gr.Textbox(label="Paste text", lines=14, placeholder="Your content…")
200
  btn = gr.Button("Analyze")
201
 
202
  verdict = gr.Label(label="Overall Verdict")
203
  score = gr.Label(label="Overall AI Score")
204
- highlights = gr.HTML(label="Paragraph Highlights (Original Format)")
205
  table = gr.Dataframe(headers=["#", "Sentence", "AI_Prob"], wrap=True)
206
 
207
  btn.click(analyze, inputs=[text_input], outputs=[verdict, score, highlights, table])
 
15
  model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, dtype=dtype).to(device).eval()
16
 
17
  # -----------------------------
18
+ # THRESHOLD
19
  # -----------------------------
20
  THRESHOLD = 0.80
21
 
22
  # -----------------------------
23
+ # SENTENCE SPLITTING
24
  # -----------------------------
25
+ ABBR = ["e.g", "i.e", "mr", "mrs", "ms", "dr", "prof", "vs", "etc", "fig", "al",
26
+ "jr", "sr", "st", "no", "vol", "pp", "mt", "inc", "ltd", "co", "u.s", "u.k",
27
+ "a.m", "p.m"
 
28
  ]
29
+ ABBR_REGEX = re.compile(r"\b(" + "|".join(map(re.escape, ABBR)) + r")\.", flags=re.IGNORECASE)
 
 
 
 
30
 
31
  def _protect(text):
32
+ t = text.replace("...", "⟨ELLIPSIS⟩")
 
 
 
 
33
  t = re.sub(r"(?<=\d)\.(?=\d)", "⟨DECIMAL⟩", t)
34
  t = ABBR_REGEX.sub(r"\1⟨ABBRDOT⟩", t)
35
  return t
36
 
37
  def _restore(text):
38
+ return text.replace("⟨ABBRDOT⟩", ".").replace("⟨DECIMAL⟩", ".").replace("⟨ELLIPSIS⟩", "...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
+ def split_sentences_preserving(text):
41
+ protected = _protect(text)
42
+ parts = re.split(r"([.?!])(\s+)", protected)
 
 
 
 
 
 
 
43
 
44
+ sentences = []
45
+ current = ""
 
 
 
46
 
47
+ for i in range(0, len(parts), 3):
48
+ part = parts[i]
49
+ punct = parts[i+1] if i+1 < len(parts) else ""
50
+ space = parts[i+2] if i+2 < len(parts) else ""
51
 
52
+ current = part + punct
53
+ sentences.append(_restore(current))
54
+ sentences.append(space) # preserve exact spacing (spaces and newlines)
55
 
56
+ return sentences # alternating [sentence, whitespace, sentence, whitespace...]
 
57
 
58
+ def extract_pure_sentences(sent_block):
59
+ return [s for s in sent_block if not s.isspace()]
 
 
 
 
60
 
61
 
62
  # -----------------------------
63
+ # GROUP SENTENCES
64
  # -----------------------------
65
  def group_sentences(sents, size=3):
66
+ return [" ".join(sents[i:i+size]) for i in range(0, len(sents), size)]
67
 
68
 
69
  # -----------------------------
70
+ # MAIN ANALYSIS
71
  # -----------------------------
72
  def analyze(text, max_len=512):
 
 
 
73
 
74
+ # 1. Split while preserving structure
75
+ blocks = split_sentences_preserving(text)
76
+ pure_sentences = extract_pure_sentences(blocks)
77
+
78
+ if not pure_sentences:
79
+ return "—", "—", "<em>Paste text to analyze.</em>", None
80
 
81
+ # 2. Group for model
82
+ grouped = group_sentences(pure_sentences, 3)
83
  clean_grouped = [re.sub(r"\s+", " ", g).strip() for g in grouped]
84
 
85
+ # 3. Run model
86
+ inputs = tokenizer(clean_grouped, return_tensors="pt", padding=True,
87
+ truncation=True, max_length=max_len).to(device)
 
 
 
 
 
88
 
89
  with torch.no_grad():
90
  logits = model(**inputs).logits
91
  chunk_probs = F.softmax(logits, dim=-1)[:, 1].cpu().tolist()
92
 
93
+ # 4. Expand chunk scores to per-sentence
94
+ sentence_ai = []
95
  for idx, prob in enumerate(chunk_probs):
96
  start = idx * 3
97
+ end = min(start + 3, len(pure_sentences))
98
  for _ in range(start, end):
99
+ sentence_ai.append(prob)
100
 
101
+ # -----------------------------
102
+ # FINAL OUTPUT RECONSTRUCTION
103
+ # -----------------------------
104
+ highlighted = ""
105
+ sent_index = 0
106
 
107
+ for block in blocks:
108
+ if block.isspace():
109
+ highlighted += block # preserve exact spacing
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  else:
111
+ # this block is a real sentence
112
+ ai_p = sentence_ai[sent_index]
113
+ sent_index += 1
114
+
115
+ pct = f"{ai_p*100:.1f}%"
116
+
117
+ if ai_p < 0.30:
118
+ color = "#11823b"
119
+ elif ai_p < 0.70:
120
+ color = "#b8860b"
121
+ else:
122
+ color = "#b80d0d"
123
+
124
+ highlighted += f"<span style='background-color:rgba(0,0,0,0.03); padding:3px 4px; border-radius:4px;'><strong style='color:{color}'>[{pct}]</strong> {block.strip()}</span> "
125
+
126
+ # Overall score
127
+ overall = sum(sentence_ai) / len(sentence_ai)
128
+ overall_pct = f"{overall*100:.1f}%"
129
+ overall_label = "🤖 Likely AI Written" if overall >= THRESHOLD else "🧒 Likely Human Written"
130
+
131
+ # Table output
132
+ df = pd.DataFrame(
133
+ [[i+1, s, sentence_ai[i]] for i, s in enumerate(pure_sentences)],
134
+ columns=["#", "Sentence", "AI_Prob"]
135
+ )
136
 
137
+ return overall_label, overall_pct, highlighted, df
138
 
139
 
140
  # -----------------------------
141
+ # UI
142
  # -----------------------------
143
  with gr.Blocks() as demo:
144
+ gr.Markdown("### 🕵️ AI Sentence-Level Detector — Original Format Highlighting")
145
 
146
+ text_input = gr.Textbox(label="Paste text", lines=14)
147
  btn = gr.Button("Analyze")
148
 
149
  verdict = gr.Label(label="Overall Verdict")
150
  score = gr.Label(label="Overall AI Score")
151
+ highlights = gr.HTML(label="Highlighted Text (Original Format)")
152
  table = gr.Dataframe(headers=["#", "Sentence", "AI_Prob"], wrap=True)
153
 
154
  btn.click(analyze, inputs=[text_input], outputs=[verdict, score, highlights, table])