Spaces:

elalber2000
/

stop-slop

Sleeping

App Files Files Community

elalber2000 committed on Jun 7, 2025

Commit

77bfa68

verified ·

1 Parent(s): cecc2d8

Upload app.py

Browse files

Files changed (1) hide show

app.py +403 -0

app.py ADDED Viewed

	@@ -0,0 +1,403 @@

+import json
+import os
+import re
+from collections import Counter
+from typing import Any
+import gradio as gr
+import numpy as np
+import requests
+STOPWORDS = {
+    "the",
+    "and",
+    "is",
+    "in",
+    "it",
+    "of",
+    "to",
+    "a",
+    "with",
+    "that",
+    "for",
+    "on",
+    "as",
+    "are",
+    "this",
+    "but",
+    "be",
+    "at",
+    "or",
+    "by",
+    "an",
+    "if",
+    "from",
+    "about",
+    "into",
+    "over",
+    "after",
+    "under",
+}
+_RX_SCRIPT_STYLE = re.compile(
+    r"<(?:script|style)[^>]*>.*?</(?:script|style)>", re.S | re.I
+)
+_RX_TAG = re.compile(r"<[^>]+>")
+_RX_SENTENCE_SPLIT = re.compile(r"[.!?]+")
+_RX_PARAGRAPH = re.compile(r"\n{2,}")
+_RX_TOKENS = re.compile(r"\w+")
+_RX_TAG_NAME = re.compile(r"<\s*(\w+)", re.I)
+_RX_IFRAME = re.compile(r"<\s*iframe\b", re.I)
+_RX_LINK = re.compile(r'href=["\']([^"\']+)["\']', re.I)
+EXPRS = {
+    "i_x_that_is_not_y_but_z": re.compile(
+        r"\bI\s+\w+\s+that\s+is\s+not\s+\w+,\s*but\s+\w+", re.I
+    ),
+    "as_i_x_i_will_y": re.compile(r"\bAs\s+I\s+\w+,\s*I\s+will\s+\w+", re.I),
+}
+def _feature_dict(html: str) -> dict:
+    cleaned = _RX_SCRIPT_STYLE.sub("", html)
+    text = _RX_TAG.sub(" ", cleaned)
+    tokens = _RX_TOKENS.findall(text.lower())
+    paragraphs = [p for p in _RX_PARAGRAPH.split(text) if p.strip()]
+    total_bytes, text_bytes = len(html), len(text)
+    tags = _RX_TAG_NAME.findall(html.lower())
+    n_tags = len(tags) or 1
+    iframe_count = len(_RX_IFRAME.findall(html))
+    hrefs = _RX_LINK.findall(html)
+    total_links = len(hrefs)
+    links_per_kb = total_links / (total_bytes / 1024) if total_bytes else 0
+    sw_count = sum(1 for t in tokens if t in STOPWORDS)
+    stopword_ratio = sw_count / len(tokens) if tokens else 0
+    spp_list = [len(_RX_SENTENCE_SPLIT.split(p)) for p in paragraphs]
+    sentences_per_paragraph = sum(spp_list) / len(spp_list) if spp_list else 0
+    freq = Counter(tokens)
+    type_token_ratio = len(freq) / len(tokens) if tokens else 0
+    prp_count = len(
+        re.findall(r"\b(?:I|me|you|he|she|it|we|they|him|her|us|them)\b", text, re.I)
+    )
+    prp_ratio = prp_count / len(tokens) if tokens else 0
+    vbg_count = len(re.findall(r"\b\w+ing\b", text))
+    straight_apostrophe = text.count("'")
+    markup_to_text_ratio = (
+        (total_bytes - text_bytes) / total_bytes if total_bytes else 0
+    )
+    inline_css_ratio = html.lower().count("style=") / n_tags
+    ix_not = len(EXPRS["i_x_that_is_not_y_but_z"].findall(text))
+    as_i = len(EXPRS["as_i_x_i_will_y"].findall(text))
+    return {
+        "stopword_ratio": stopword_ratio,
+        "links_per_kb": links_per_kb,
+        "type_token_ratio": type_token_ratio,
+        "i_x_that_is_not_y_but_z": ix_not,
+        "prp_ratio": prp_ratio,
+        "sentences_per_paragraph": sentences_per_paragraph,
+        "markup_to_text_ratio": markup_to_text_ratio,
+        "inline_css_ratio": inline_css_ratio,
+        "iframe_count": iframe_count,
+        "as_i_x_i_will_y": as_i,
+        "vbg": vbg_count,
+        "straight_apostrophe": straight_apostrophe,
+    }
+def load_weights():
+    with open(
+        os.path.join(os.path.dirname(__file__), "weights.json"), encoding="utf-8"
+    ) as f:
+        weights = json.load(f)
+        weight_names = ["W_num", "bias", "U", "mu", "sigma"]
+        w_num, bias, u_lst, mu, sigma = (weights[elem] for elem in weight_names)
+        w_num, bias, mu, sigma = (
+            np.array(weights[w]) for w in weight_names if w != "U"
+        )
+        u = {k: np.array(v) for k, v in u_lst.items()}
+    return w_num, bias, u, mu, sigma
+def interpretability_viz(html: str):
+    re_tok = re.compile(r"\w+|[^\w\s]+")
+    allowed_lengths = {4, 5, 6, 7, 8, 9, 10}
+    allowed_tokens = [
+        "onee",
+        "rdle",
+        "reduction",
+        "efits",
+        "ssic",
+        "citizens",
+        "ideas",
+        "unlike",
+        "ueak",
+        "aked",
+        "bark",
+        "loak",
+        "udic",
+        "myste",
+        "eekl",
+        "oten",
+        "obal",
+        "cerem",
+        "eeds",
+        "arli",
+        "auty",
+        "research",
+        "bann",
+        "governor",
+        "ikel",
+        "regis",
+        "sparked",
+        "generous",
+        "ered",
+        "etal",
+        "efor",
+        "ghes",
+        "epit",
+        "ility",
+        "dynam",
+        "vente",
+        "oache",
+        "nuin",
+        "democratic",
+        "payw",
+        "cono",
+        "passi",
+    ]
+    num_columns = [
+        "as_i_x_i_will_y",
+        "i_x_that_is_not_y_but_z",
+        "iframe_count",
+        "inline_css_ratio",
+        "links_per_kb",
+        "markup_to_text_ratio",
+        "prp_ratio",
+        "sentences_per_paragraph",
+        "stopword_ratio",
+        "straight_apostrophe",
+        "type_token_ratio",
+        "vbg",
+    ]
+    w_num, bias, u, mu, sigma = load_weights()
+    tokens = re_tok.findall(html.lower())
+    matched_subs: list[str] = []
+    word_scores = []
+    emb_dim = next(iter(u.values())).shape[-1] if u else 2
+    for word in tokens:
+        embs = []
+        subs_for_word = []
+        for length in allowed_lengths:
+            if len(word) < length:
+                continue
+            for i in range(len(word) - length + 1):
+                sub = word[i : i + length]
+                if sub in allowed_tokens:
+                    embs.append(u[sub])
+                    subs_for_word.append(sub)
+        if subs_for_word:
+            matched_subs.extend(set(subs_for_word))
+            word_scores.append(np.mean(embs, axis=0))
+        else:
+            word_scores.append(np.zeros(emb_dim, dtype=np.float32))
+    text_score = (
+        np.mean(np.stack(word_scores, axis=0), axis=0)
+        if word_scores
+        else np.zeros(emb_dim, dtype=np.float32)
+    )
+    feats = _feature_dict(html)
+    num_vec = np.array([feats.get(col, 0.0) for col in num_columns], dtype=np.float32)
+    num_std = (num_vec - mu.reshape(-1)) / sigma.reshape(-1)
+    numeric_score = num_std @ w_num
+    logits = text_score + numeric_score + bias
+    exp_shift = np.exp(logits - np.max(logits))
+    probs = exp_shift / np.sum(exp_shift)
+    feature_info = []
+    for i, col in enumerate(num_columns):
+        delta = w_num[i, 1] - w_num[i, 0]
+        cval = num_std[i] * delta
+        abs_cval = abs(cval)
+        direction = cval > 0  # True = slop, False = not-slop
+        feature_info.append(
+            {
+                "col": col,
+                "value": feats.get(col, 0),
+                "abs_cval": abs_cval,
+                "direction": direction,
+                "cval": cval,
+            }
+        )
+    verdict = "slop" if probs[1] > probs[0] else "not slop"
+    for f in feature_info:
+        f["signed"] = (
+            f["abs_cval"] if f["direction"] == (verdict == "slop") else -f["abs_cval"]
+        )
+    feature_info.sort(key=lambda x: x["signed"], reverse=True)
+    feature_info = feature_info[:5]
+    feature_map = {
+        "as_i_x_i_will_y": "Phrases: <b>'As I …, I will …'</b>",
+        "i_x_that_is_not_y_but_z": "Phrases: <b>'I … that is not …, but …'</b>",
+        "iframe_count": "Contains &lt;iframe&gt; elements",
+        "inline_css_ratio": "Uses lots of inline CSS styling",
+        "links_per_kb": "Has many hyperlinks",
+        "markup_to_text_ratio": "High markup-to-text proportion",
+        "prp_ratio": "Uses personal pronouns",
+        "sentences_per_paragraph": "Multiple sentences per paragraph",
+        "stopword_ratio": "High use of common words",
+        "straight_apostrophe": "Contains straight apostrophes",
+        "type_token_ratio": "Diverse vocabulary",
+        "vbg": "Contains words ending in <b>-ing</b>",
+    }
+    cleaned = _RX_SCRIPT_STYLE.sub("", html)
+    text_only = _RX_TAG.sub(" ", cleaned)
+    pattern_matches = {
+        "as_i_x_i_will_y": "('"
+        + "', '".join(EXPRS["as_i_x_i_will_y"].findall(text_only)[:3])
+        + "')",
+        "i_x_that_is_not_y_but_z": "('"
+        + "', '".join(EXPRS["i_x_that_is_not_y_but_z"].findall(text_only)[:3])
+        + "')",
+    }
+    def feat_color(strength, direction, max_strength):
+        if max_strength <= 0:
+            return "background:#fffde7;color:#333;"
+        norm = min(strength / max_strength, 1.0)
+        yellow, red, green = (227, 213, 123), (196, 70, 67), (92, 173, 95)
+        if direction:
+            r, g, b = (y + (norm * (r - y)) for y, r in zip(yellow, red))
+        else:
+            r, g, b = (y + (norm * (g - y)) for y, g in zip(yellow, green))
+        return f"background:rgb({r},{g},{b});color:#111;"
+    top_feats_table = (
+        "<table style='border-collapse:collapse;width:100%;margin-bottom:12px;'>"
+    )
+    top_feats_table += "<tr><th style='padding:4px 8px;text-align:center;'>Top Features</th><th style='padding:4px 8px;text-align:center;'>Value</th></tr>"
+    tot_abs = sum(f["abs_cval"] for f in feature_info) or 1.0
+    for f in feature_info:
+        f["norm01"] = f["abs_cval"] / tot_abs
+    for feat in feature_info:
+        feat_col = feat["col"]
+        human = feature_map[feat_col]
+        extra = pattern_matches.get(feat_col, "") if "Phrases" in human else ""
+        color = feat_color(
+            feat["abs_cval"],
+            feat["direction"],
+            max(f["abs_cval"] for f in feature_info),
+        )
+        sign = "+" if feat["signed"] > 0 else "-"
+        cell = f"{sign}{abs(feat['norm01']):.2f}"
+        if cell[1:] != "0.00":
+            top_feats_table += (
+                f"<tr style='{color}'>"
+                f"<td style='padding:4px 8px;'>{human}{extra}</td>"
+                f"<td style='padding:4px 8px;text-align:right;'>{cell}</td>"
+                f"</tr>"
+            )
+    def verdict_button(verdict):
+        if verdict == "not slop":
+            return "<button style='background:#43a047;color:white;font-weight:800;font-size:1.2em;padding:16px 32px;border-radius:10px;border:none;margin-bottom:14px;box-shadow:0 2px 8px #1111;'>NOT SLOP</button>"
+        else:
+            return "<button style='background:#e53935;color:white;font-weight:800;font-size:1.2em;padding:16px 32px;border-radius:10px;border:none;margin-bottom:14px;box-shadow:0 2px 8px #1111;'>SLOP</button>"
+    ngram_html = ""
+    if matched_subs:
+        unique_subs = sorted(set(matched_subs))
+        subs_info: list[dict[str, Any]] = []
+        for s in unique_subs:
+            emb = u.get(s, np.zeros(emb_dim, dtype=np.float32))
+            delta_sub = float(emb[1] - emb[0])
+            abs_delta = abs(delta_sub)
+            direction_sub = delta_sub > 0
+            subs_info.append(
+                {
+                    "sub": s,
+                    "score": delta_sub,
+                    "abs_score": abs_delta,
+                    "direction": direction_sub,
+                }
+            )
+        subs_info.sort(key=lambda x: x["abs_score"], reverse=True)
+        subs_info = subs_info[:5]
+        for s_i in subs_info:
+            s_i["signed"] = (
+                s_i["abs_score"]
+                if s_i["direction"] == (verdict == "slop")
+                else -s_i["abs_score"]
+            )
+        subs_info.sort(key=lambda x: x["signed"], reverse=True)
+        max_abs_sub = max(s["abs_score"] for s in subs_info) or 1.0
+        ngram_html = "<div style='margin:8px 0;'>Matched n-grams:<br>"
+        for s_i in subs_info:
+            color = feat_color(s_i["abs_score"], s_i["direction"], max_abs_sub)
+            sign = "+" if s_i["signed"] > 0 else "-"
+            ngram_html += (
+                f"<span style='{color} border-radius:4px; padding:2px 5px; margin:2px; display:inline-block; font-family:monospace;'>"
+                f"{sign}{s_i['sub']}"
+                f"</span>"
+            )
+        ngram_html += "</div>"
+    overall = f"""
+    <div style='padding:18px; background:#fff; border-radius:16px; box-shadow:0 2px 8px #0001;'>
+      <div style='text-align:center;'>{verdict_button(verdict)}</div>
+      {top_feats_table}
+      {ngram_html}
+    </div>
+    """
+    return overall
+def process_input_viz(url_input, html_input):
+    user_input = (url_input or "").strip()
+    html = (html_input or "").strip()
+    if user_input:
+        try:
+            resp = requests.get(user_input, timeout=6)
+            html = resp.text
+        except Exception as e:
+            return f"<span style='color:red;'>Error fetching URL: {e}</span>"
+    elif html:
+        pass
+    else:
+        return "<span style='color:red;'>Please provide a URL or HTML code.</span>"
+    return interpretability_viz(html)
+desc = (
+    "This is a demo for Stop-Slop, an AI model that detects slop "
+    "(low-quality, unoriginal, or spammy material—often AI-generated—that "
+    "adds noise rather than value) websites.\n"
+    "\n\n\n"
+    "To start, input a <b>valid URL (top box)</b> <span style='color:#888;"
+    "'>or</span> some <b>HTML code (bottom box)</b>."
+)
+iface = gr.Interface(
+    fn=process_input_viz,
+    inputs=[
+        gr.Textbox(
+            lines=1,
+            label="URL",
+            placeholder="https://nymag.com/intelligencer/article/ai-generated-content-internet-online-slop-spam.html",
+        ),
+        gr.Textbox(lines=10, label="HTML", placeholder="<html>...</html>"),
+    ],
+    outputs=gr.HTML(label="Result"),
+    description=desc,
+    title="🚫🧟 Stop Slop",
+)
+if __name__ == "__main__":
+    iface.launch()