VictorM-Coder committed
Commit 21a21f1 · verified · 1 Parent(s): 814a384

Update app.py

Files changed (1)
  1. app.py +101 -56
app.py CHANGED
@@ -17,19 +17,23 @@ model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, dtype=dty
 # -----------------------------
 # AI DECISION THRESHOLD (80%)
 # -----------------------------
-THRESHOLD = 0.80  # AI from 80% and above
+THRESHOLD = 0.80
 
 # -----------------------------
 # SENTENCE SPLITTING UTILITIES
 # -----------------------------
 ABBR = [
     "e.g", "i.e", "mr", "mrs", "ms", "dr", "prof", "vs", "etc", "fig", "al",
-    "jr", "sr", "st", "no", "vol", "pp", "mt", "inc", "ltd", "co",
+    "jr", "sr", "st", "no", "vol", "pp", "mt", "inc", "ltd", "co",
     "u.s", "u.k", "a.m", "p.m"
 ]
-ABBR_REGEX = re.compile(r"\b(" + "|".join(map(re.escape, ABBR)) + r")\.", flags=re.IGNORECASE)
 
-def _protect(text: str) -> str:
+ABBR_REGEX = re.compile(
+    r"\b(" + "|".join(map(re.escape, ABBR)) + r")\.",
+    flags=re.IGNORECASE
+)
+
+def _protect(text):
     t = text.strip()
     if not t:
         return ""
@@ -39,17 +43,21 @@ def _protect(text: str) -> str:
     t = ABBR_REGEX.sub(r"\1⟨ABBRDOT⟩", t)
     return t
 
-def _restore(text: str) -> str:
-    return (text
-        .replace("⟨ABBRDOT⟩", ".")
+def _restore(text):
+    return (
+        text.replace("⟨ABBRDOT⟩", ".")
         .replace("⟨DECIMAL⟩", ".")
-        .replace("⟨ELLIPSIS⟩", "..."))
+        .replace("⟨ELLIPSIS⟩", "...")
+    )
 
-def sentence_split(text: str):
+def sentence_split(text):
     t = _protect(text)
     if not t:
         return []
-    parts = re.split(r"([.?!])\s+(?=(?:[\"“”‘’']?\s*[A-Z(])|$)", t)
+
+    parts = re.split(
+        r"([.?!])\s+(?=(?:[\"“”‘’']?\s*[A-Z(])|$)", t
+    )
 
     sentences, buf = [], ""
     for i, chunk in enumerate(parts):
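
Because the split pattern's only group is capturing, re.split keeps each terminator as a separate list element; the loop below (its body continues past this hunk) then stitches the terminators back onto their sentences. A minimal illustration with invented input:

import re

parts = re.split(r"([.?!])\s+(?=(?:[\"“”‘’']?\s*[A-Z(])|$)", "He arrived. She left! Done?")
print(parts)
# ['He arrived', '.', 'She left', '!', 'Done?']

The lookahead only allows a split where the next chunk opens with an optional quote, then a capital letter or "(", which keeps mid-sentence punctuation from triggering a break.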
@@ -65,39 +73,76 @@ def sentence_split(text):
 
     return [_restore(s).strip() for s in sentences if s.strip()]
 
+
+# -----------------------------
+# PARAGRAPH UTILITIES
+# -----------------------------
+def split_paragraphs(text):
+    paragraphs = [p.strip() for p in text.split("\n") if p.strip()]
+    return paragraphs
+
+def map_sentences_to_paragraphs(paragraphs):
+    all_sentences = []
+    mapping = []
+
+    for p_idx, para in enumerate(paragraphs):
+        sents = sentence_split(para)
+        for s_idx, s in enumerate(sents):
+            all_sentences.append(s)
+            mapping.append((p_idx, s_idx))
+
+    return all_sentences, mapping
+
+def combine_paragraph_scores(paragraphs, mapping, sentence_probs):
+    bucket = [[] for _ in paragraphs]
+
+    for (p_idx, _), prob in zip(mapping, sentence_probs):
+        bucket[p_idx].append(prob)
+
+    final_scores = [
+        (sum(scores) / len(scores)) if scores else 0
+        for scores in bucket
+    ]
+
+    return final_scores
+
+
 # -----------------------------
 # GROUP SENTENCES (TURNITIN STYLE)
 # -----------------------------
 def group_sentences(sents, size=3):
-    grouped = []
-    for i in range(0, len(sents), size):
-        grouped.append(" ".join(sents[i:i+size]))
-    return grouped
+    return [" ".join(sents[i:i + size]) for i in range(0, len(sents), size)]
+
 
 # -----------------------------
-# CORE ANALYSIS (3 SENTENCE WINDOWS)
+# CORE ANALYSIS
 # -----------------------------
 def analyze(text, max_len=512):
-    sents = sentence_split(text)
-    if not sents:
+    paragraphs = split_paragraphs(text)
+    if not paragraphs:
         return "—", "—", "<em>Paste some text to analyze.</em>", None
 
-    # GROUP sentences (3 at a time)
-    grouped = group_sentences(sents, size=3)
+    # map sentences to paragraphs
+    sents, mapping = map_sentences_to_paragraphs(paragraphs)
+
+    # group sentences in 3s
+    grouped = group_sentences(sents, 3)
     clean_grouped = [re.sub(r"\s+", " ", g).strip() for g in grouped]
 
-    # tokenize grouped chunks
+    # tokenize chunks
     inputs = tokenizer(
-        clean_grouped, return_tensors="pt",
-        padding=True, truncation=True, max_length=max_len
+        clean_grouped,
+        return_tensors="pt",
+        padding=True,
+        truncation=True,
+        max_length=max_len
     ).to(device)
 
-    # model inference
     with torch.no_grad():
         logits = model(**inputs).logits
-    chunk_probs = F.softmax(logits, dim=-1)[:, 1].detach().cpu().tolist()
+    chunk_probs = F.softmax(logits, dim=-1)[:, 1].cpu().tolist()
 
-    # EXPAND chunk-level probabilities to per-sentence (each chunk contributes to its 3 sentences)
+    # expand chunk probability to each sentence
    ai_probs = []
    for idx, prob in enumerate(chunk_probs):
        start = idx * 3
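
The paragraph pipeline added in this hunk is easiest to see on a toy example (values invented; assumes the three helpers defined above). Each chunk score is copied to its member sentences, and combine_paragraph_scores() then averages sentence scores back into their source paragraphs:

paragraphs = ["First point. Second point.", "Closing thought."]
sents, mapping = map_sentences_to_paragraphs(paragraphs)
# sents   -> ['First point.', 'Second point.', 'Closing thought.']
# mapping -> [(0, 0), (0, 1), (1, 0)]

sentence_probs = [0.9, 0.9, 0.9]  # one 3-sentence chunk scored 0.9, expanded
print(combine_paragraph_scores(paragraphs, mapping, sentence_probs))
# [0.9, 0.9]  (mean probability per paragraph)

Note that group_sentences() windows the flat sentence list, so a chunk and its score can straddle a paragraph boundary, as the single chunk does here.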
@@ -105,59 +150,59 @@ def analyze(text, max_len=512):
         for _ in range(start, end):
             ai_probs.append(prob)
 
-    # overall AI score
-    overall_ai = sum(ai_probs) / len(ai_probs)
-    overall_pct = f"{overall_ai * 100:.1f}%"
+    # final paragraph-level scores
+    paragraph_ai = combine_paragraph_scores(paragraphs, mapping, ai_probs)
 
+    # overall score
+    overall = sum(ai_probs) / len(ai_probs)
+    overall_pct = f"{overall * 100:.1f}%"
     overall_label = (
-        "🤖 Likely AI Written" if overall_ai >= THRESHOLD else "🧒 Likely Human Written"
+        "🤖 Likely AI Written" if overall >= THRESHOLD else "🧒 Likely Human Written"
     )
 
-    # HIGHLIGHTS + TABLE
-    rows, highlights = [], []
-
-    for i, orig in enumerate(sents, start=1):
-        ai_p = float(ai_probs[i-1])
-        pct = f"{ai_p * 100:.1f}%"
-
-        label = "AI" if ai_p >= THRESHOLD else "Human"
+    # paragraph-based HTML output
+    final_html = ""
+    for idx, (para, ai) in enumerate(zip(paragraphs, paragraph_ai), start=1):
+        pct = f"{ai * 100:.1f}%"
+        label = "AI" if ai >= THRESHOLD else "Human"
 
-        if ai_p < 0.30:
+        # color
+        if ai < 0.30:
             color = "#11823b"
-        elif ai_p < 0.70:
+        elif ai < 0.70:
             color = "#b8860b"
         else:
             color = "#b80d0d"
 
-        normalized = re.sub(r"\s+", " ", orig)
-
-        highlights.append(
-            "<div style='margin:6px 0; padding:6px 8px; border-radius:6px;"
-            "background:rgba(0,0,0,0.03)'>"
-            f"<strong style='color:{color}'>[{pct} {label}]</strong> "
-            f"{normalized}</div>"
-        )
+        final_html += f"""
+        <div style='margin:12px 0; padding:12px; border-radius:8px; background:#fafafa'>
+            <strong style='color:{color}'>[Paragraph {idx}: {pct} {label}]</strong>
+            <div style='margin-top:8px; white-space:pre-wrap'>{para}</div>
+        </div>
+        """
 
-        rows.append([i, orig, round(ai_p, 4), label])
-
-    df = pd.DataFrame(rows, columns=["#", "Sentence", "AI_Prob", "Label"])
-    html = "\n".join(highlights)
+    # sentence table (still available if needed)
+    rows = []
+    for i, s in enumerate(sents, start=1):
+        rows.append([i, s, round(ai_probs[i-1], 4)])
+    df = pd.DataFrame(rows, columns=["#", "Sentence", "AI_Prob"])
 
-    return overall_label, overall_pct, html, df
+    return overall_label, overall_pct, final_html, df
 
 # -----------------------------
 # GRADIO UI
 # -----------------------------
 with gr.Blocks() as demo:
-    gr.Markdown("### 🕵️ AI Written Text Detector — Fakespot Model (80% Threshold)")
+    gr.Markdown("### 🕵️ AI Written Text Detector — Fakespot Model (Turnitin-style Paragraph Mode)")
 
     text_input = gr.Textbox(label="Paste text", lines=14, placeholder="Your content…")
     btn = gr.Button("Analyze")
 
-    verdict = gr.Label(label="Verdict (Overall)")
-    score = gr.Label(label="AI Score (Average across sentences)")
-    highlights = gr.HTML(label="Per-Sentence Highlights")
-    table = gr.Dataframe(headers=["#", "Sentence", "AI_Prob", "Label"], wrap=True)
+    verdict = gr.Label(label="Overall Verdict")
+    score = gr.Label(label="Overall AI Score")
+    highlights = gr.HTML(label="Paragraph Highlights (Original Format)")
+    table = gr.Dataframe(headers=["#", "Sentence", "AI_Prob"], wrap=True)
 
     btn.click(analyze, inputs=[text_input], outputs=[verdict, score, highlights, table])
 
 
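For reference, a minimal way to exercise the updated analyze() outside the Gradio UI, assuming the module-level tokenizer/model/device setup earlier in app.py has already run (the printed values below are illustrative, not real output):

verdict, pct, html, df = analyze(
    "First paragraph here. It has two sentences.\nA second paragraph follows."
)
print(verdict, pct)  # e.g. 🧒 Likely Human Written 12.3%
print(df)            # per-sentence probabilities behind the paragraph view

The demo itself is presumably started by a demo.launch() call below the last lines shown in this diff.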