elalber2000 committed on
Commit
77bfa68
·
verified ·
1 Parent(s): cecc2d8

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +403 -0
app.py ADDED
@@ -0,0 +1,403 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import re
4
+ from collections import Counter
5
+ from typing import Any
6
+
7
+ import gradio as gr
8
+ import numpy as np
9
+ import requests
10
+
11
# Common English function words.  `_feature_dict` counts how many text tokens
# fall in this set to produce the `stopword_ratio` feature.
STOPWORDS = set(
    (
        "the and is in it of to a with that for on as are this but be at or "
        "by an if from about into over after under"
    ).split()
)
41
+
42
# Precompiled patterns used by the feature extractor and the visualisation.

# A whole <script> or <style> element including its body; DOTALL lets the
# lazy `.*?` run across newlines, IGNORECASE accepts upper-case tag names.
_RX_SCRIPT_STYLE = re.compile(
    r"<(?:script|style)[^>]*>.*?</(?:script|style)>",
    flags=re.DOTALL | re.IGNORECASE,
)
# Any single markup tag: "<" up to the next ">".
_RX_TAG = re.compile(r"<[^>]+>")
# Runs of sentence-ending punctuation.
_RX_SENTENCE_SPLIT = re.compile(r"[.!?]+")
# Two or more consecutive newlines separate paragraphs.
_RX_PARAGRAPH = re.compile(r"\n{2,}")
# Word tokens.
_RX_TOKENS = re.compile(r"\w+")
# Captures the word that follows "<" (optionally after whitespace).
_RX_TAG_NAME = re.compile(r"<\s*(\w+)", flags=re.IGNORECASE)
# Opening <iframe ...> tags.
_RX_IFRAME = re.compile(r"<\s*iframe\b", flags=re.IGNORECASE)
# Captures the quoted value of an href attribute.
_RX_LINK = re.compile(r'href=["\']([^"\']+)["\']', flags=re.IGNORECASE)
52
+
53
# Case-insensitive phrase templates, keyed by the feature name under which
# their match counts are reported.
EXPRS = {
    feature_name: re.compile(pattern, re.IGNORECASE)
    for feature_name, pattern in (
        (
            "i_x_that_is_not_y_but_z",
            r"\bI\s+\w+\s+that\s+is\s+not\s+\w+,\s*but\s+\w+",
        ),
        (
            "as_i_x_i_will_y",
            r"\bAs\s+I\s+\w+,\s*I\s+will\s+\w+",
        ),
    )
}
59
+
60
+
61
def _feature_dict(html: str) -> dict:
    """Extract the hand-crafted numeric features from one HTML document.

    The visible text is obtained by removing ``<script>``/``<style>`` elements
    and replacing every remaining tag with a space.  Lengths are measured with
    ``len()`` on ``str`` objects, i.e. in characters rather than encoded bytes.

    Args:
        html: Raw HTML source.

    Returns:
        A dict mapping feature name to its value.  Every ratio is ``0`` when
        its denominator would be zero.
    """

    def ratio(count, total):
        # Guarded division shared by the ratio features.
        return count / total if total else 0

    html_lower = html.lower()
    text = _RX_TAG.sub(" ", _RX_SCRIPT_STYLE.sub("", html))
    tokens = _RX_TOKENS.findall(text.lower())
    n_tokens = len(tokens)
    html_len = len(html)

    # Each paragraph contributes the number of pieces produced by splitting it
    # on runs of ". ! ?" (empty pieces included).
    pieces_per_paragraph = [
        len(_RX_SENTENCE_SPLIT.split(paragraph))
        for paragraph in _RX_PARAGRAPH.split(text)
        if paragraph.strip()
    ]

    pronoun_hits = re.findall(
        r"\b(?:I|me|you|he|she|it|we|they|him|her|us|them)\b", text, re.I
    )
    # Case-sensitive on the original-case text, unlike the pronoun search.
    gerund_hits = re.findall(r"\b\w+ing\b", text)

    # Avoid dividing by zero when the document contains no tags at all.
    tag_total = len(_RX_TAG_NAME.findall(html_lower)) or 1

    return {
        "stopword_ratio": ratio(
            sum(token in STOPWORDS for token in tokens), n_tokens
        ),
        "links_per_kb": ratio(len(_RX_LINK.findall(html)), html_len / 1024),
        "type_token_ratio": ratio(len(set(tokens)), n_tokens),
        "i_x_that_is_not_y_but_z": len(
            EXPRS["i_x_that_is_not_y_but_z"].findall(text)
        ),
        "prp_ratio": ratio(len(pronoun_hits), n_tokens),
        "sentences_per_paragraph": ratio(
            sum(pieces_per_paragraph), len(pieces_per_paragraph)
        ),
        "markup_to_text_ratio": ratio(html_len - len(text), html_len),
        "inline_css_ratio": html_lower.count("style=") / tag_total,
        "iframe_count": len(_RX_IFRAME.findall(html)),
        "as_i_x_i_will_y": len(EXPRS["as_i_x_i_will_y"].findall(text)),
        "vbg": len(gerund_hits),
        "straight_apostrophe": text.count("'"),
    }
105
+
106
+
107
def load_weights():
    """Load the model parameters from ``weights.json`` next to this file.

    The file is read and parsed on every call.

    Returns:
        A tuple ``(w_num, bias, u, mu, sigma)``.  ``w_num``, ``bias``, ``mu``
        and ``sigma`` are NumPy arrays built from the JSON entries ``"W_num"``,
        ``"bias"``, ``"mu"`` and ``"sigma"``; ``u`` is a dict mapping every key
        of the JSON ``"U"`` object to a NumPy array of its value.

    Raises:
        OSError: If ``weights.json`` cannot be opened.
        KeyError: If one of the expected entries is missing.
    """
    weights_path = os.path.join(os.path.dirname(__file__), "weights.json")
    with open(weights_path, encoding="utf-8") as f:
        weights = json.load(f)

    # Convert each array-valued entry exactly once.
    w_num, bias, mu, sigma = (
        np.array(weights[name]) for name in ("W_num", "bias", "mu", "sigma")
    )
    u = {key: np.array(vec) for key, vec in weights["U"].items()}
    return w_num, bias, u, mu, sigma
119
+
120
+
121
def interpretability_viz(html: str):
    """Classify *html* as slop / not slop and render an HTML explanation.

    The score is the sum of three terms: the mean embedding of known character
    n-grams found in the lower-cased source, the standardised numeric features
    from ``_feature_dict`` multiplied by ``w_num``, and ``bias``.  A softmax
    over the result gives the class probabilities; index 1 is the "slop"
    class.

    Args:
        html: Raw HTML source of the page to analyse.

    Returns:
        An HTML fragment containing the verdict button, a table with up to
        five of the most influential numeric features, and (when any were
        found) the most influential matched n-grams.
    """
    w_num, bias, u, mu, sigma = load_weights()
    emb_dim = next(iter(u.values())).shape[-1] if u else 2
    text_score, matched_subs = _ngram_text_score(html, u, emb_dim)

    feats = _feature_dict(html)
    num_vec = np.array(
        [feats.get(col, 0.0) for col in _NUM_COLUMNS], dtype=np.float32
    )
    # A zero scale would turn the logits into NaN/inf; treat it as 1 so the
    # feature is only centred.
    sigma_flat = sigma.reshape(-1)
    sigma_safe = np.where(sigma_flat == 0, 1.0, sigma_flat)
    num_std = (num_vec - mu.reshape(-1)) / sigma_safe

    logits = text_score + num_std @ w_num + bias
    # Shift by the max before exponentiating for a numerically stable softmax.
    exp_shift = np.exp(logits - np.max(logits))
    probs = exp_shift / np.sum(exp_shift)
    verdict = "slop" if probs[1] > probs[0] else "not slop"

    text_only = _RX_TAG.sub(" ", _RX_SCRIPT_STYLE.sub("", html))
    top_feats_table = _top_features_table(num_std, w_num, verdict, text_only)
    ngram_html = _matched_ngrams_html(matched_subs, u, verdict)
    return f"""
<div style='padding:18px; background:#fff; border-radius:16px; box-shadow:0 2px 8px #0001;'>
<div style='text-align:center;'>{_verdict_button(verdict)}</div>
{top_feats_table}
{ngram_html}
</div>
"""


# Tokeniser for n-gram matching: runs of word characters, or runs of
# punctuation.  Punctuation tokens never match an n-gram but still count in
# the per-token average.
_RX_WORD_OR_PUNCT = re.compile(r"\w+|[^\w\s]+")

# Substring lengths probed inside every token.
_NGRAM_LENGTHS = tuple(range(4, 11))

# Character n-grams that carry a learned embedding (a set, for O(1) lookup).
_ALLOWED_NGRAMS = frozenset(
    {
        "onee",
        "rdle",
        "reduction",
        "efits",
        "ssic",
        "citizens",
        "ideas",
        "unlike",
        "ueak",
        "aked",
        "bark",
        "loak",
        "udic",
        "myste",
        "eekl",
        "oten",
        "obal",
        "cerem",
        "eeds",
        "arli",
        "auty",
        "research",
        "bann",
        "governor",
        "ikel",
        "regis",
        "sparked",
        "generous",
        "ered",
        "etal",
        "efor",
        "ghes",
        "epit",
        "ility",
        "dynam",
        "vente",
        "oache",
        "nuin",
        "democratic",
        "payw",
        "cono",
        "passi",
    }
)

# Order of the numeric features; row i of `w_num` and entry i of `mu`/`sigma`
# are read for the i-th name.
_NUM_COLUMNS = (
    "as_i_x_i_will_y",
    "i_x_that_is_not_y_but_z",
    "iframe_count",
    "inline_css_ratio",
    "links_per_kb",
    "markup_to_text_ratio",
    "prp_ratio",
    "sentences_per_paragraph",
    "stopword_ratio",
    "straight_apostrophe",
    "type_token_ratio",
    "vbg",
)

# Human-readable (HTML) label for every numeric feature.
_FEATURE_LABELS = {
    "as_i_x_i_will_y": "Phrases: <b>'As I …, I will …'</b>",
    "i_x_that_is_not_y_but_z": "Phrases: <b>'I … that is not …, but …'</b>",
    "iframe_count": "Contains &lt;iframe&gt; elements",
    "inline_css_ratio": "Uses lots of inline CSS styling",
    "links_per_kb": "Has many hyperlinks",
    "markup_to_text_ratio": "High markup-to-text proportion",
    "prp_ratio": "Uses personal pronouns",
    "sentences_per_paragraph": "Multiple sentences per paragraph",
    "stopword_ratio": "High use of common words",
    "straight_apostrophe": "Contains straight apostrophes",
    "type_token_ratio": "Diverse vocabulary",
    "vbg": "Contains words ending in <b>-ing</b>",
}

# Colour ramp end points (RGB): neutral, pushes towards slop, pushes away.
_YELLOW = (227, 213, 123)
_RED = (196, 70, 67)
_GREEN = (92, 173, 95)
_NEUTRAL_STYLE = "background:#fffde7;color:#333;"


def _ngram_text_score(html, u, emb_dim):
    """Return (mean per-token n-gram embedding, sorted unique matched n-grams)."""
    # Only n-grams that actually have an embedding can be scored; this also
    # avoids a KeyError when the weights file lacks one of the allowed tokens.
    vocab = {tok for tok in _ALLOWED_NGRAMS if tok in u}
    zero = np.zeros(emb_dim, dtype=np.float32)
    matched = set()
    word_scores = []
    for word in _RX_WORD_OR_PUNCT.findall(html.lower()):
        embs = []
        for length in _NGRAM_LENGTHS:
            # The range is empty when the word is shorter than `length`.
            for start in range(len(word) - length + 1):
                sub = word[start : start + length]
                if sub in vocab:
                    embs.append(u[sub])
                    matched.add(sub)
        # Tokens without any known n-gram contribute a zero vector.
        word_scores.append(np.mean(embs, axis=0) if embs else zero)
    if word_scores:
        text_score = np.mean(np.stack(word_scores, axis=0), axis=0)
    else:
        text_score = zero
    return text_score, sorted(matched)


def _feat_color(strength, direction, max_strength):
    """Return an inline CSS style blending yellow towards red or green."""
    if not max_strength > 0:
        return _NEUTRAL_STYLE
    norm = strength / max_strength
    if not np.isfinite(norm):
        return _NEUTRAL_STYLE
    norm = min(norm, 1.0)
    target = _RED if direction else _GREEN
    # Integer channels keep the rgb() value valid for every CSS parser.
    r, g, b = (
        int(round(float(base + norm * (end - base))))
        for base, end in zip(_YELLOW, target)
    )
    return f"background:rgb({r},{g},{b});color:#111;"


def _phrase_examples(feature_name, text):
    """Return up to three matched phrases formatted as " ('a', 'b')", or ""."""
    pattern = EXPRS.get(feature_name)
    if pattern is None:
        return ""
    # The patterns only admit word characters, whitespace and commas, so the
    # hits cannot contain markup.
    hits = pattern.findall(text)[:3]
    if not hits:
        return ""
    return " ('" + "', '".join(hits) + "')"


def _top_features_table(num_std, w_num, verdict, text_only):
    """Render the table of the (at most five) strongest numeric features."""
    is_slop = verdict == "slop"
    ranked = []
    for i, col in enumerate(_NUM_COLUMNS):
        # Contribution of this feature to (slop logit - not-slop logit).
        cval = num_std[i] * (w_num[i, 1] - w_num[i, 0])
        strength = abs(cval)
        pushes_slop = bool(cval > 0)
        ranked.append(
            {
                "col": col,
                "abs_cval": strength,
                "direction": pushes_slop,
                # Positive when the feature supports the final verdict.
                "signed": strength if pushes_slop == is_slop else -strength,
            }
        )
    ranked.sort(key=lambda item: item["signed"], reverse=True)
    top = ranked[:5]

    tot_abs = sum(item["abs_cval"] for item in top) or 1.0
    max_abs = max(item["abs_cval"] for item in top)

    parts = [
        "<table style='border-collapse:collapse;width:100%;margin-bottom:12px;'>",
        "<tr><th style='padding:4px 8px;text-align:center;'>Top Features</th>"
        "<th style='padding:4px 8px;text-align:center;'>Value</th></tr>",
    ]
    for item in top:
        share = f"{abs(item['abs_cval'] / tot_abs):.2f}"
        if share == "0.00":
            # Negligible contribution: not worth a row.
            continue
        sign = "+" if item["signed"] > 0 else "-"
        color = _feat_color(item["abs_cval"], item["direction"], max_abs)
        label = _FEATURE_LABELS[item["col"]]
        extra = _phrase_examples(item["col"], text_only)
        parts.append(
            f"<tr style='{color}'>"
            f"<td style='padding:4px 8px;'>{label}{extra}</td>"
            f"<td style='padding:4px 8px;text-align:right;'>{sign}{share}</td>"
            f"</tr>"
        )
    # Close the table so that following elements are not re-parented by the
    # browser's table-parsing rules.
    parts.append("</table>")
    return "".join(parts)


def _verdict_button(verdict):
    """Return the coloured verdict button markup."""
    if verdict == "not slop":
        background, label = "#43a047", "NOT SLOP"
    else:
        background, label = "#e53935", "SLOP"
    return (
        f"<button style='background:{background};color:white;font-weight:800;"
        "font-size:1.2em;padding:16px 32px;border-radius:10px;border:none;"
        f"margin-bottom:14px;box-shadow:0 2px 8px #1111;'>{label}</button>"
    )


def _matched_ngrams_html(matched_subs, u, verdict):
    """Render the (at most five) strongest matched n-grams, or "" if none."""
    if not matched_subs:
        return ""
    is_slop = verdict == "slop"
    scored = []
    for sub in matched_subs:
        emb = u[sub]
        delta = float(emb[1] - emb[0])
        scored.append(
            {"sub": sub, "abs_score": abs(delta), "direction": delta > 0}
        )
    # Keep the five strongest, then order them by agreement with the verdict.
    scored.sort(key=lambda item: item["abs_score"], reverse=True)
    top = scored[:5]
    for item in top:
        item["signed"] = (
            item["abs_score"]
            if item["direction"] == is_slop
            else -item["abs_score"]
        )
    top.sort(key=lambda item: item["signed"], reverse=True)

    max_abs = max(item["abs_score"] for item in top) or 1.0
    spans = []
    for item in top:
        color = _feat_color(item["abs_score"], item["direction"], max_abs)
        sign = "+" if item["signed"] > 0 else "-"
        spans.append(
            f"<span style='{color} border-radius:4px; padding:2px 5px; "
            "margin:2px; display:inline-block; font-family:monospace;'>"
            f"{sign}{item['sub']}"
            "</span>"
        )
    return (
        "<div style='margin:8px 0;'>Matched n-grams:<br>"
        + "".join(spans)
        + "</div>"
    )
360
+
361
+
362
def process_input_viz(url_input, html_input):
    """Gradio callback: analyse a fetched URL or pasted HTML.

    When a URL is given it takes precedence over the pasted HTML: the page is
    downloaded (6 second timeout) and its body analysed.

    Args:
        url_input: URL typed by the user; may be ``None`` or blank.
        html_input: HTML pasted by the user; may be ``None`` or blank.

    Returns:
        The HTML produced by ``interpretability_viz``, or a red error
        ``<span>`` when nothing was provided or the download failed.
    """
    # Local import: the file's top-level import block is left untouched.
    from html import escape

    url = (url_input or "").strip()
    markup = (html_input or "").strip()

    if not url and not markup:
        return "<span style='color:red;'>Please provide a URL or HTML code.</span>"

    if url:
        try:
            resp = requests.get(url, timeout=6)
            # Do not classify a 4xx/5xx error page as if it were the content.
            resp.raise_for_status()
        except (requests.RequestException, ValueError) as e:
            # The message embeds the user-supplied URL, so escape it before
            # putting it into HTML.
            return (
                "<span style='color:red;'>Error fetching URL: "
                f"{escape(str(e))}</span>"
            )
        markup = resp.text

    return interpretability_viz(markup)
376
+
377
+
378
# Blurb passed as `description=` to the Gradio interface below.
desc = "".join(
    [
        "This is a demo for Stop-Slop, an AI model that detects slop ",
        "(low-quality, unoriginal, or spammy material—often AI-generated—that ",
        "adds noise rather than value) websites.\n",
        "\n\n\n",
        "To start, input a <b>valid URL (top box)</b> ",
        "<span style='color:#888;'>or</span> ",
        "some <b>HTML code (bottom box)</b>.",
    ]
)
386
+
387
# Gradio UI: two text inputs (a URL and raw HTML) handled by
# `process_input_viz`, whose return value is shown in an HTML component.
iface = gr.Interface(
    title="🚫🧟 Stop Slop",
    description=desc,
    fn=process_input_viz,
    inputs=[
        gr.Textbox(
            label="URL",
            lines=1,
            placeholder="https://nymag.com/intelligencer/article/ai-generated-content-internet-online-slop-spam.html",
        ),
        gr.Textbox(label="HTML", lines=10, placeholder="<html>...</html>"),
    ],
    outputs=gr.HTML(label="Result"),
)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()