# Spaces: Sleeping
| import json | |
| import os | |
| import re | |
| from collections import Counter | |
| from typing import Any | |
| import gradio as gr | |
| import numpy as np | |
| import requests | |
# Common English function words; `_feature_dict` reports the share of page
# tokens that fall in this set as ``stopword_ratio``.
STOPWORDS = set(
    "the and is in it of to a with that for on as are this but be at or by "
    "an if from about into over after under".split()
)
| _RX_SCRIPT_STYLE = re.compile( | |
| r"<(?:script|style)[^>]*>.*?</(?:script|style)>", re.S | re.I | |
| ) | |
| _RX_TAG = re.compile(r"<[^>]+>") | |
| _RX_SENTENCE_SPLIT = re.compile(r"[.!?]+") | |
| _RX_PARAGRAPH = re.compile(r"\n{2,}") | |
| _RX_TOKENS = re.compile(r"\w+") | |
| _RX_TAG_NAME = re.compile(r"<\s*(\w+)", re.I) | |
| _RX_IFRAME = re.compile(r"<\s*iframe\b", re.I) | |
| _RX_LINK = re.compile(r'href=["\']([^"\']+)["\']', re.I) | |
# Case-insensitive phrase templates; the number of matches of each one is
# reported by `_feature_dict` under the same key.
EXPRS = {
    name: re.compile(pattern, re.IGNORECASE)
    for name, pattern in (
        (
            "i_x_that_is_not_y_but_z",
            r"\bI\s+\w+\s+that\s+is\s+not\s+\w+,\s*but\s+\w+",
        ),
        (
            "as_i_x_i_will_y",
            r"\bAs\s+I\s+\w+,\s*I\s+will\s+\w+",
        ),
    )
}
def _feature_dict(html: str) -> dict:
    """Compute the numeric page features from raw HTML.

    The visible text is obtained by deleting ``<script>``/``<style>``
    elements and replacing every remaining tag with a space.  Ratios whose
    denominator would be zero (no tokens, empty input, no paragraphs) are
    reported as ``0``.

    Args:
        html: Raw page markup.

    Returns:
        A dict mapping feature name to its value (counts are ints, ratios
        are floats or ``0``).
    """
    html_lower = html.lower()
    visible_text = _RX_TAG.sub(" ", _RX_SCRIPT_STYLE.sub("", html))
    words = _RX_TOKENS.findall(visible_text.lower())
    word_total = len(words)

    def share_of_words(count):
        # Ratio relative to the token count; 0 when there are no tokens.
        return count / word_total if word_total else 0

    # Sizes are measured in characters of the decoded strings.
    raw_len = len(html)
    if raw_len:
        links_per_kb = len(_RX_LINK.findall(html)) / (raw_len / 1024)
        markup_to_text_ratio = (raw_len - len(visible_text)) / raw_len
    else:
        links_per_kb = 0
        markup_to_text_ratio = 0

    # "or 1" keeps the inline-CSS ratio defined for tag-free input.
    tag_total = len(_RX_TAG_NAME.findall(html_lower)) or 1

    # Each count is the number of pieces produced by splitting on sentence
    # terminators, so a trailing terminator contributes an empty piece.
    sentence_counts = [
        len(_RX_SENTENCE_SPLIT.split(para))
        for para in _RX_PARAGRAPH.split(visible_text)
        if para.strip()
    ]
    if sentence_counts:
        sentences_per_paragraph = sum(sentence_counts) / len(sentence_counts)
    else:
        sentences_per_paragraph = 0

    pronoun_hits = re.findall(
        r"\b(?:I|me|you|he|she|it|we|they|him|her|us|them)\b", visible_text, re.I
    )
    # Case-sensitive: only a lowercase "ing" suffix is counted.
    gerund_hits = re.findall(r"\b\w+ing\b", visible_text)

    return {
        "stopword_ratio": share_of_words(sum(w in STOPWORDS for w in words)),
        "links_per_kb": links_per_kb,
        "type_token_ratio": share_of_words(len(set(words))),
        "i_x_that_is_not_y_but_z": len(
            EXPRS["i_x_that_is_not_y_but_z"].findall(visible_text)
        ),
        "prp_ratio": share_of_words(len(pronoun_hits)),
        "sentences_per_paragraph": sentences_per_paragraph,
        "markup_to_text_ratio": markup_to_text_ratio,
        "inline_css_ratio": html_lower.count("style=") / tag_total,
        "iframe_count": len(_RX_IFRAME.findall(html)),
        "as_i_x_i_will_y": len(EXPRS["as_i_x_i_will_y"].findall(visible_text)),
        "vbg": len(gerund_hits),
        "straight_apostrophe": visible_text.count("'"),
    }
def load_weights(path=None):
    """Load the model parameters from a JSON file.

    The file must contain the keys ``W_num``, ``bias``, ``mu`` and ``sigma``
    (array-like values) and ``U`` (a mapping from substring to an array-like
    embedding).

    Args:
        path: Location of the JSON file.  Defaults to ``weights.json`` in
            the same directory as this module.

    Returns:
        A tuple ``(w_num, bias, u, mu, sigma)`` where ``u`` is a dict of
        NumPy arrays keyed by substring and the other items are NumPy arrays.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If a required key is missing from the file.
    """
    if path is None:
        path = os.path.join(os.path.dirname(__file__), "weights.json")
    with open(path, encoding="utf-8") as f:
        weights = json.load(f)

    w_num, bias, mu, sigma = (
        np.array(weights[name]) for name in ("W_num", "bias", "mu", "sigma")
    )
    u = {sub: np.array(vec) for sub, vec in weights["U"].items()}
    return w_num, bias, u, mu, sigma
# Tokens for the substring scan: runs of word characters, or runs of
# punctuation (non-word, non-space) characters.
_VIZ_TOKEN_RX = re.compile(r"\w+|[^\w\s]+")

# Substrings that are looked up in the ``U`` embedding table.  A frozenset
# makes the membership test in the sliding-window scan O(1).
_VIZ_SUBWORDS = frozenset(
    {
        "onee", "rdle", "reduction", "efits", "ssic", "citizens", "ideas",
        "unlike", "ueak", "aked", "bark", "loak", "udic", "myste", "eekl",
        "oten", "obal", "cerem", "eeds", "arli", "auty", "research", "bann",
        "governor", "ikel", "regis", "sparked", "generous", "ered", "etal",
        "efor", "ghes", "epit", "ility", "dynam", "vente", "oache", "nuin",
        "democratic", "payw", "cono", "passi",
    }
)

# Window widths slid over every token.
_VIZ_SUBWORD_LENGTHS = (4, 5, 6, 7, 8, 9, 10)

# Order in which numeric features are packed into the input vector.
# NOTE(review): presumably must match the row order of W_num / mu / sigma in
# weights.json — confirm against the training code.
_VIZ_NUM_COLUMNS = (
    "as_i_x_i_will_y",
    "i_x_that_is_not_y_but_z",
    "iframe_count",
    "inline_css_ratio",
    "links_per_kb",
    "markup_to_text_ratio",
    "prp_ratio",
    "sentences_per_paragraph",
    "stopword_ratio",
    "straight_apostrophe",
    "type_token_ratio",
    "vbg",
)

# Human-readable labels.  These are inserted into the result as HTML, so
# literal angle brackets must be written as entities.
_VIZ_FEATURE_LABELS = {
    "as_i_x_i_will_y": "Phrases: <b>'As I …, I will …'</b>",
    "i_x_that_is_not_y_but_z": "Phrases: <b>'I … that is not …, but …'</b>",
    "iframe_count": "Contains &lt;iframe&gt; elements",
    "inline_css_ratio": "Uses lots of inline CSS styling",
    "links_per_kb": "Has many hyperlinks",
    "markup_to_text_ratio": "High markup-to-text proportion",
    "prp_ratio": "Uses personal pronouns",
    "sentences_per_paragraph": "Multiple sentences per paragraph",
    "stopword_ratio": "High use of common words",
    "straight_apostrophe": "Contains straight apostrophes",
    "type_token_ratio": "Diverse vocabulary",
    "vbg": "Contains words ending in <b>-ing</b>",
}

# How many features / n-grams are shown.
_VIZ_TOP_K = 5


def _viz_text_score(tokens, u, emb_dim):
    """Average the substring embeddings per token, then across tokens.

    Returns ``(text_score, matched)`` where ``matched`` is the sorted list of
    distinct substrings that were found.  Tokens without a match contribute
    a zero vector.
    """
    zero = np.zeros(emb_dim, dtype=np.float32)
    matched = set()
    word_scores = []
    for word in tokens:
        embs = []
        for length in _VIZ_SUBWORD_LENGTHS:
            # The range is empty when the token is shorter than the window.
            for start in range(len(word) - length + 1):
                sub = word[start : start + length]
                # Substrings absent from the loaded weights are skipped
                # instead of raising KeyError.
                if sub in _VIZ_SUBWORDS and sub in u:
                    embs.append(u[sub])
                    matched.add(sub)
        word_scores.append(np.mean(embs, axis=0) if embs else zero)
    if not word_scores:
        return zero, []
    return np.mean(np.stack(word_scores, axis=0), axis=0), sorted(matched)


def _viz_rank_features(feats, num_std, w_num, verdict_is_slop):
    """Return the numeric features that most support the verdict."""
    ranked = []
    for i, col in enumerate(_VIZ_NUM_COLUMNS):
        # Contribution to (column 1 logit - column 0 logit); the caller
        # treats column 1 as the "slop" class.
        cval = num_std[i] * (w_num[i, 1] - w_num[i, 0])
        abs_cval = abs(cval)
        direction = cval > 0  # True = pushes towards slop
        ranked.append(
            {
                "col": col,
                "value": feats.get(col, 0),
                "abs_cval": abs_cval,
                "direction": direction,
                "cval": cval,
                # Positive when the feature agrees with the verdict.
                "signed": abs_cval if direction == verdict_is_slop else -abs_cval,
            }
        )
    ranked.sort(key=lambda f: f["signed"], reverse=True)
    return ranked[:_VIZ_TOP_K]


def _viz_color(strength, direction, max_strength):
    """Return inline CSS blending yellow towards red (slop) or green."""
    if max_strength <= 0:
        return "background:#fffde7;color:#333;"
    t = min(strength / max_strength, 1.0)
    base = (227, 213, 123)
    target = (196, 70, 67) if direction else (92, 173, 95)
    r, g, b = (lo + t * (hi - lo) for lo, hi in zip(base, target))
    # Whole-number components keep the declaration short and valid
    # everywhere.
    return f"background:rgb({r:.0f},{g:.0f},{b:.0f});color:#111;"


def _viz_phrase_examples(text_only):
    """Return up to three example matches per phrase pattern, pre-formatted."""
    examples = {}
    for key, rx in EXPRS.items():
        found = rx.findall(text_only)[:3]
        if found:
            # Matches consist of word characters, whitespace and commas
            # only, so they are safe to embed in HTML as-is.
            examples[key] = "('" + "', '".join(found) + "')"
    return examples


def _viz_feature_table(top_feats, phrase_examples):
    """Render the top-features table as a complete ``<table>`` element."""
    parts = [
        "<table style='border-collapse:collapse;width:100%;margin-bottom:12px;'>",
        "<tr><th style='padding:4px 8px;text-align:center;'>Top Features</th>"
        "<th style='padding:4px 8px;text-align:center;'>Value</th></tr>",
    ]
    total_abs = sum(f["abs_cval"] for f in top_feats) or 1.0
    max_abs = max((f["abs_cval"] for f in top_feats), default=0.0)
    for feat in top_feats:
        sign = "+" if feat["signed"] > 0 else "-"
        cell = f"{sign}{feat['abs_cval'] / total_abs:.2f}"
        if cell[1:] == "0.00":
            continue  # negligible contribution: not worth a row
        col = feat["col"]
        color = _viz_color(feat["abs_cval"], feat["direction"], max_abs)
        parts.append(
            f"<tr style='{color}'>"
            f"<td style='padding:4px 8px;'>"
            f"{_VIZ_FEATURE_LABELS[col]}{phrase_examples.get(col, '')}</td>"
            f"<td style='padding:4px 8px;text-align:right;'>{cell}</td>"
            f"</tr>"
        )
    # Without the closing tag the browser relocates any following <div>.
    parts.append("</table>")
    return "".join(parts)


def _viz_ngram_html(matched_subs, u, emb_dim, verdict_is_slop):
    """Render the strongest matched substrings as coloured chips."""
    if not matched_subs:
        return ""
    zero = np.zeros(emb_dim, dtype=np.float32)
    scored = []
    for sub in matched_subs:
        emb = u.get(sub, zero)
        delta = float(emb[1] - emb[0])
        scored.append(
            {
                "sub": sub,
                "score": delta,
                "abs_score": abs(delta),
                "direction": delta > 0,
            }
        )
    # Keep the strongest ones, then order them by agreement with the verdict.
    scored.sort(key=lambda s: s["abs_score"], reverse=True)
    scored = scored[:_VIZ_TOP_K]
    for item in scored:
        agrees = item["direction"] == verdict_is_slop
        item["signed"] = item["abs_score"] if agrees else -item["abs_score"]
    scored.sort(key=lambda s: s["signed"], reverse=True)
    max_abs = max(s["abs_score"] for s in scored) or 1.0

    parts = ["<div style='margin:8px 0;'>Matched n-grams:<br>"]
    for item in scored:
        color = _viz_color(item["abs_score"], item["direction"], max_abs)
        sign = "+" if item["signed"] > 0 else "-"
        parts.append(
            f"<span style='{color} border-radius:4px; padding:2px 5px; margin:2px; display:inline-block; font-family:monospace;'>"
            f"{sign}{item['sub']}"
            f"</span>"
        )
    parts.append("</div>")
    return "".join(parts)


def _viz_verdict_button(verdict):
    """Render the large green/red verdict badge."""
    if verdict == "not slop":
        background, label = "#43a047", "NOT SLOP"
    else:
        background, label = "#e53935", "SLOP"
    return (
        f"<button style='background:{background};color:white;font-weight:800;"
        "font-size:1.2em;padding:16px 32px;border-radius:10px;border:none;"
        "margin-bottom:14px;box-shadow:0 2px 8px #1111;'>"
        f"{label}</button>"
    )


def interpretability_viz(html: str):
    """Classify a page as slop / not slop and explain the decision as HTML.

    The logits are the sum of (a) the mean substring embedding of the
    lower-cased markup, (b) the standardised numeric features from
    `_feature_dict` multiplied by ``W_num`` and (c) the bias.  The class at
    index 1 is reported as "slop".

    Args:
        html: Raw page markup.

    Returns:
        An HTML fragment containing the verdict badge, a table of the
        numeric features that most support the verdict, and the strongest
        matched substrings (if any).
    """
    w_num, bias, u, mu, sigma = load_weights()
    # Fall back to two classes when no embeddings were loaded.
    emb_dim = next(iter(u.values())).shape[-1] if u else 2

    tokens = _VIZ_TOKEN_RX.findall(html.lower())
    text_score, matched_subs = _viz_text_score(tokens, u, emb_dim)

    feats = _feature_dict(html)
    num_vec = np.array(
        [feats.get(col, 0.0) for col in _VIZ_NUM_COLUMNS], dtype=np.float32
    )
    num_std = (num_vec - mu.reshape(-1)) / sigma.reshape(-1)
    logits = text_score + num_std @ w_num + bias

    # Softmax, shifted by the max for numerical stability.
    exp_shift = np.exp(logits - np.max(logits))
    probs = exp_shift / np.sum(exp_shift)
    verdict = "slop" if probs[1] > probs[0] else "not slop"
    verdict_is_slop = verdict == "slop"

    top_feats = _viz_rank_features(feats, num_std, w_num, verdict_is_slop)
    text_only = _RX_TAG.sub(" ", _RX_SCRIPT_STYLE.sub("", html))
    table_html = _viz_feature_table(top_feats, _viz_phrase_examples(text_only))
    ngram_html = _viz_ngram_html(matched_subs, u, emb_dim, verdict_is_slop)

    return (
        "\n<div style='padding:18px; background:#fff; border-radius:16px; box-shadow:0 2px 8px #0001;'>\n"
        f"<div style='text-align:center;'>{_viz_verdict_button(verdict)}</div>\n"
        f"{table_html}\n"
        f"{ngram_html}\n"
        "</div>\n"
    )
def process_input_viz(url_input, html_input):
    """Gradio callback: score a URL (preferred) or pasted HTML.

    Args:
        url_input: Text of the URL box; may be ``None`` or blank.
        html_input: Text of the HTML box; may be ``None`` or blank.  Ignored
            when a URL is supplied.

    Returns:
        The HTML fragment produced by `interpretability_viz`, or a red
        ``<span>`` with an error message when nothing was supplied or the
        URL could not be fetched.
    """
    # Imported locally so this function does not depend on a module-level
    # binding of the name ``html``.
    from html import escape

    url = (url_input or "").strip()
    markup = (html_input or "").strip()

    if url:
        try:
            # NOTE(security): this fetches an arbitrary user-supplied URL
            # from the server (SSRF exposure).  Consider restricting the
            # allowed hosts/schemes if the app can reach internal services.
            resp = requests.get(url, timeout=6)
            # Do not score an HTTP error page as if it were the article.
            resp.raise_for_status()
            markup = resp.text
        except Exception as e:  # UI boundary: always show a message
            # The exception text can echo the user-supplied URL, so it must
            # be escaped before being embedded in HTML.
            return (
                "<span style='color:red;'>Error fetching URL: "
                f"{escape(str(e))}</span>"
            )
    elif not markup:
        return "<span style='color:red;'>Please provide a URL or HTML code.</span>"

    return interpretability_viz(markup)
# Description shown under the title: an introduction and a usage hint,
# separated by four newlines.
_DESC_INTRO = (
    "This is a demo for Stop-Slop, an AI model that detects slop "
    "(low-quality, unoriginal, or spammy material—often AI-generated—that "
    "adds noise rather than value) websites."
)
_DESC_USAGE = (
    "To start, input a <b>valid URL (top box)</b> "
    "<span style='color:#888;'>or</span> "
    "some <b>HTML code (bottom box)</b>."
)
desc = ("\n" * 4).join([_DESC_INTRO, _DESC_USAGE])
# Input and output widgets, created in the order URL box, HTML box, result.
_url_box = gr.Textbox(
    label="URL",
    lines=1,
    placeholder="https://nymag.com/intelligencer/article/ai-generated-content-internet-online-slop-spam.html",
)
_html_box = gr.Textbox(label="HTML", lines=10, placeholder="<html>...</html>")
_result_view = gr.HTML(label="Result")

# The two inputs are passed positionally to `process_input_viz`
# as (url_input, html_input).
iface = gr.Interface(
    title="🚫🧟 Stop Slop",
    description=desc,
    fn=process_input_viz,
    inputs=[_url_box, _html_box],
    outputs=_result_view,
)

# Start the web UI only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()