JBHF committed on
Commit
6d1c685
·
verified ·
1 Parent(s): 0da0f23

Initial commit

Browse files
Files changed (2) hide show
  1. app.py +750 -0
  2. requirements.txt +8 -0
app.py ADDED
@@ -0,0 +1,750 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ # Slop Detector
3
+ # Gradio app
4
+ # 24-02-2026
5
+ #
6
+ # EVERNOTE:
7
+ # https://share.evernote.com/note/0fb9b438-7842-4eff-a93f-ba0850e6ae83
8
+ #
9
+ # F:\DATA SCIENCE\MIJN DATA SCIENCE PROJECTS\FAKE NEWS DETECTOR - LOCAL LLM - SIRAJ RAVAL FEB 2026\SlopShield-main\SlopShield-PYTHON\GRADIO_APP
10
+
11
+ # app.py
12
+ # Gradio app for automated slop detection (Hugging Face Spaces ready).
13
+ #
14
+ # ✅ Features:
15
+ # - User can input a URL OR paste text
16
+ # - Extracts main content (trafilatura preferred, BeautifulSoup fallback)
17
+ # - Calls an OpenAI "mini" model (default: gpt-4o-mini) using Structured Outputs (JSON Schema)
18
+ # - Displays results neatly (score, subscores, contributions, interpretation, radar chart)
19
+ # - Allows downloading a Markdown (.md) report and a PDF (.pdf) report
20
+ #
21
+ # --- HF Spaces setup notes ---
22
+ # 1) Add an environment variable in your Space:
23
+ # OPENAI_API_KEY = "..."
24
+ # 2) Recommended requirements.txt:
25
+ # gradio
26
+ # openai
27
+ # requests
28
+ # trafilatura
29
+ # beautifulsoup4
30
+ # lxml
31
+ # matplotlib
32
+ # reportlab
33
+ #
34
+ # OpenAI docs referenced for Structured Outputs + model listing:
35
+ # - Structured Outputs: https://developers.openai.com/api/docs/guides/structured-outputs/
36
+ # - Models (incl. gpt-4o-mini): https://developers.openai.com/api/docs/models
37
+ # - gpt-4o-mini model page: https://developers.openai.com/api/docs/models/gpt-4o-mini
38
+ # - Responses API: https://platform.openai.com/docs/api-reference/responses
39
+
40
+ # pip install -r requirements.txt --user
41
+
42
+
43
+ import os
44
+ import re
45
+ import json
46
+ import math
47
+ import time
48
+ import textwrap
49
+ import urllib.parse
50
+ from dataclasses import dataclass
51
+ from typing import Optional, Dict, Any, Tuple, List
52
+
53
+ import requests
54
+ import gradio as gr
55
+
56
+ # Optional extraction libs
57
+ try:
58
+ import trafilatura
59
+ except Exception:
60
+ trafilatura = None
61
+
62
+ try:
63
+ from bs4 import BeautifulSoup
64
+ except Exception:
65
+ BeautifulSoup = None
66
+
67
+ import matplotlib.pyplot as plt
68
+
69
+ from reportlab.lib.pagesizes import letter
70
+ from reportlab.lib.styles import getSampleStyleSheet
71
+ from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Preformatted
72
+ from reportlab.lib.units import inch
73
+
74
+ from openai import OpenAI
75
+
76
+
77
+ # -----------------------------
78
+ # Config
79
+ # -----------------------------
80
# Runtime configuration, each value overridable through an environment variable.
DEFAULT_MODEL = os.environ.get("OPENAI_MODEL", "gpt-4o-mini")
# Upper bound on the number of characters of text placed in the prompt.
MAX_CHARS_SENT_TO_LLM = int(os.environ.get("MAX_CHARS_SENT_TO_LLM", "35000"))
# Timeout (seconds) passed to requests when fetching a page.
HTTP_TIMEOUT = int(os.environ.get("HTTP_TIMEOUT", "20"))

# Reports and radar charts are written next to this file, in "slop_output".
_APP_DIR = os.path.dirname(os.path.abspath(__file__))
_OUTPUT_DIR = os.path.join(_APP_DIR, "slop_output")
os.makedirs(_OUTPUT_DIR, exist_ok=True)
87
+
88
+ # Preset URLs the user can choose from (DEV_LOG examples)
89
# (label, url) pairs offered in the preset dropdown. The first entry has an
# empty URL, which the click handler treats as "use the custom URL box".
DEFAULT_URL_CHOICES = [
    ("Custom — enter your own URL below", ""),
    ("CNN Home", "https://www.cnn.com/"),
    ("CNN Politics", "https://www.cnn.com/politics"),
    ("CNN — US-Iran strike article", "https://edition.cnn.com/2026/02/19/politics/us-iran-strike-options-trump-military"),
    ("CNN — China AI Seedance", "https://www.cnn.com/2026/02/20/china/china-ai-seedance-intl-hnk-dst"),
    ("MattsWorld101 — SEO examples", "https://mattsworld101.com/examples-of-seo/"),
    ("Scitechtalk — Genealogy", "http://www.scitechtalk.org/UITGEBREIDE_GENEALOGIE_VAN%20_SERVAAS_BOURS/HTu1-10.html"),
    ("Scitechtalk — arXiv aggregator", "http://scitechtalk.org/ARXIV_AGGREGATOR/index.html"),
    ("arXiv — paper abs/2410.14255", "https://arxiv.org/abs/2410.14255"),
    ("Dumpert", "https://www.dumpert.nl/"),
    ("Medium — P vs NP of AI", "https://medium.com/data-and-beyond/the-p-vs-np-of-ai-why-reasoning-is-mathematically-impossible-for-a-decoder-ee440f1d27ce"),
    ("Medium — Creativity vector hallucination", "https://medium.com/data-and-beyond/i-extracted-a-creativity-vector-from-gpt-it-was-a-hallucination-95a033fb890a"),
    ("Medium — Topology of matrix multiplication", "https://medium.com/data-and-beyond/the-topology-of-matrix-multiplication-why-your-ai-is-just-folding-space-cf8e408f2c91"),
]

# User-Agent header sent by fetch_url with every page request.
UA = (
    "Mozilla/5.0 (X11; Linux x86_64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/120.0 Safari/537.36 SlopDetector/1.0"
)
110
+
111
+
112
+ # -----------------------------
113
+ # Helpers
114
+ # -----------------------------
115
def clamp01(x: float) -> float:
    """Clamp ``x`` to the closed interval [0.0, 1.0].

    Values that cannot be converted to ``float`` (for example ``None``) and
    NaN are mapped to 0.0 instead of raising or silently becoming 1.0.
    """
    try:
        value = float(x)
    except (TypeError, ValueError):
        return 0.0
    # min()/max() comparisons with NaN are always False, which would
    # otherwise turn NaN into 1.0.
    if math.isnan(value):
        return 0.0
    return max(0.0, min(1.0, value))
117
+
118
+
119
def safe_slug(s: str, max_len: int = 60) -> str:
    """Turn ``s`` (a URL or title) into a lowercase, dash-separated slug.

    Any ``http://`` / ``https://`` occurrence is removed, every run of
    characters outside ``a-z0-9`` becomes a single dash, and the result is
    cut to ``max_len`` characters. An empty result becomes ``"slop-report"``.
    """
    lowered = (s or "").strip().lower()
    without_scheme = re.sub(r"https?://", "", lowered)
    slug = re.sub(r"[^a-z0-9]+", "-", without_scheme).strip("-") or "slop-report"
    # Truncation may leave a dash at the cut point; drop it.
    return slug[:max_len].rstrip("-")
126
+
127
+
128
def now_ts() -> str:
    """Return the current local time as ``YYYYMMDD-HHMMSS`` (used in file names)."""
    return time.strftime("%Y%m%d-%H%M%S", time.localtime())
130
+
131
+
132
def infer_title_from_text(text: str) -> str:
    """Guess a title: the first line with at least 8 characters after trimming.

    The line is cut to 120 characters. Returns ``"Untitled"`` when no line
    qualifies (including empty or ``None`` input).
    """
    trimmed_lines = (raw.strip() for raw in (text or "").splitlines())
    first_long = next((ln for ln in trimmed_lines if len(ln) >= 8), None)
    return "Untitled" if first_long is None else first_long[:120]
139
+
140
+
141
def compute_interpretation(slop_score_0_100: float) -> str:
    """Map a 0–100 slop score to a Markdown line ``**band** — description``.

    Bands are inclusive upper bounds checked in ascending order; anything
    above 90 (or not comparable, such as NaN) falls into "Extreme Slop".
    """
    bands = (
        (5, "Extremely Low Slop",
         "Meaning-dense, highly specific, minimal repetition/templating."),
        (15, "Very Low Slop",
         "High information density; only mild stylistic templates."),
        (30, "Low Slop",
         "Mostly meaning-driven, with some rhetorical repetition or structure."),
        (45, "Mild–Moderate Slop",
         "Noticeable templating and/or generic framing; still contains substance."),
        (60, "Moderate Slop",
         "Substantial filler/templating; reduced specificity; repetition noticeable."),
        (75, "High Slop",
         "Strong low-value signals: repetition, template voice, low specificity."),
        (90, "Very High Slop",
         "Predominantly template/filler; weak grounding; attention/SEO patterns likely."),
    )
    for upper_bound, band, desc in bands:
        if slop_score_0_100 <= upper_bound:
            return f"**{band}** — {desc}"

    band = "Extreme Slop"
    desc = "Near-pure filler or spam-like content; minimal meaningful information."
    return f"**{band}** — {desc}"
169
+
170
+
171
def weighted_contributions(result: Dict[str, Any]) -> Dict[str, float]:
    """Break the slop formula into its weighted terms.

    Each subscore is read from ``result`` (missing keys count as 0.0) and
    clamped to [0, 1]. The returned dict holds the five weighted terms plus
    ``slop_normalized_sum`` (their sum) and ``slop_score_0_100_sum``
    (that sum times 100).
    """
    density = clamp01(result.get("info_density", 0.0))
    repeat = clamp01(result.get("redundancy", 0.0))
    templated = clamp01(result.get("template_markers", 0.0))
    incoherent = clamp01(result.get("incoherence", 0.0))
    monetized = clamp01(result.get("monetization", 0.0))

    # Weights 0.30 / 0.30 / 0.20 / 0.10 / 0.10; info_density counts inverted.
    contrib: Dict[str, float] = {}
    contrib["info_density_deficit"] = 0.30 * (1.0 - density)
    contrib["redundancy"] = 0.30 * repeat
    contrib["template_markers"] = 0.20 * templated
    contrib["incoherence"] = 0.10 * incoherent
    contrib["monetization"] = 0.10 * monetized

    # Matches the model's slop_score / 100 only if the model applied the formula.
    normalized_total = sum(contrib.values())
    contrib["slop_normalized_sum"] = normalized_total
    contrib["slop_score_0_100_sum"] = 100.0 * normalized_total
    return contrib
190
+
191
+
192
def make_radar_chart(subscores: Dict[str, float], out_path: str) -> str:
    """Render the five subscores as a radar (polar) chart and save it.

    Args:
        subscores: Mapping with the keys ``info_density``, ``redundancy``,
            ``template_markers``, ``incoherence`` and ``monetization``;
            missing keys count as 0.0 and every value is clamped to [0, 1].
        out_path: File path the image is written to (160 dpi).

    Returns:
        ``out_path``, unchanged.
    """
    # Build the figure with the object-oriented API instead of pyplot.
    # pyplot keeps a global "current figure" registry, so a failing savefig
    # would leave the figure registered, and plt.close() closes whichever
    # figure happens to be current. A standalone Figure is simply
    # garbage-collected.
    from matplotlib.figure import Figure

    labels = ["info_density", "redundancy", "template_markers", "incoherence", "monetization"]
    values = [clamp01(subscores.get(k, 0.0)) for k in labels]

    # One spoke per label; repeat the first point so the outline closes.
    angles = [n / float(len(labels)) * 2 * math.pi for n in range(len(labels))]
    closed_angles = angles + angles[:1]
    closed_values = values + values[:1]

    fig = Figure(figsize=(6, 6))
    ax = fig.add_subplot(111, polar=True)
    # Start at the top and proceed clockwise.
    ax.set_theta_offset(math.pi / 2)
    ax.set_theta_direction(-1)

    ax.set_xticks(angles)
    ax.set_xticklabels(labels)
    ax.set_rlabel_position(0)
    ax.set_yticks([0.25, 0.5, 0.75])
    ax.set_yticklabels(["0.25", "0.50", "0.75"], alpha=0.7)
    ax.set_ylim(0, 1)

    ax.plot(closed_angles, closed_values, linewidth=2)
    ax.fill(closed_angles, closed_values, alpha=0.15)

    ax.set_title("Subscores Radar (0–1)", y=1.08)
    fig.tight_layout()
    fig.savefig(out_path, dpi=160)
    return out_path
220
+
221
+
222
+ # -----------------------------
223
+ # Webpage extraction
224
+ # -----------------------------
225
def normalize_url(url: str) -> str:
    """Ensure ``url`` has an http(s) scheme, defaulting to ``https://``.

    Empty or ``None`` input returns ``""``. The scheme check is
    case-insensitive (``HTTP://host`` is left as is), and a scheme-relative
    URL (``//host/path``) gets ``https:`` prepended rather than a second
    pair of slashes.
    """
    url = (url or "").strip()
    if not url:
        return url
    if url.lower().startswith(("http://", "https://")):
        return url
    if url.startswith("//"):
        return "https:" + url
    return "https://" + url
233
+
234
+
235
def fetch_url(url: str) -> Tuple[str, str]:
    """Download ``url`` and return ``(final_url, html)``.

    The URL is normalized first, redirects are followed, and ``final_url``
    is the address of the final response.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            non-2xx status (via ``raise_for_status``).
    """
    # NOTE(review): the URL comes from the user and is fetched server-side;
    # nothing here blocks private or internal addresses. Confirm whether that
    # is acceptable for the deployment.
    url = normalize_url(url)
    resp = requests.get(
        url,
        headers={"User-Agent": UA},
        timeout=HTTP_TIMEOUT,
        allow_redirects=True,
    )
    resp.raise_for_status()

    # When a text/* response declares no charset, requests falls back to
    # ISO-8859-1, which garbles UTF-8 pages. Use the detected encoding instead.
    content_type = resp.headers.get("Content-Type", "")
    if "charset" not in content_type.lower():
        resp.encoding = resp.apparent_encoding or "utf-8"

    return resp.url, resp.text
244
+
245
+
246
def extract_main_text(url: str) -> Tuple[str, str, str]:
    """Fetch ``url`` and extract its readable text.

    Strategies are tried in order:
      1. ``trafilatura`` (if importable), accepted only when it yields more
         than 200 characters;
      2. BeautifulSoup ``get_text`` (if importable) after dropping
         script/style/noscript elements;
      3. a regex that strips tags from the raw HTML.

    Returns:
        ``(final_url, extracted_text, extraction_method)`` where the method is
        ``"trafilatura"``, ``"beautifulsoup_fallback"`` or ``"regex_fallback"``.

    Raises:
        requests.RequestException: propagated from ``fetch_url``.
    """
    # fetch_url normalizes the URL itself, so no separate normalization here.
    final_url, html = fetch_url(url)

    if trafilatura is not None:
        try:
            extracted = trafilatura.extract(
                html,
                include_comments=False,
                include_tables=False,
                include_formatting=False,
                url=final_url,
            )
        except Exception:
            # Deliberate best-effort: any trafilatura failure just means the
            # next strategy is tried.
            extracted = None
        if extracted and len(extracted.strip()) > 200:
            return final_url, extracted.strip(), "trafilatura"

    # Fallback: BeautifulSoup get_text
    if BeautifulSoup is not None:
        from bs4 import FeatureNotFound

        # Prefer lxml and fall back to the built-in parser when it is not
        # installed. The previous check ('"lxml" in globals()') could never be
        # true because this module does not import lxml by name.
        try:
            soup = BeautifulSoup(html, "lxml")
        except FeatureNotFound:
            soup = BeautifulSoup(html, "html.parser")

        for tag in soup(["script", "style", "noscript"]):
            tag.decompose()

        # Keep non-empty lines only; no blank lines remain afterwards, so no
        # further newline collapsing is needed.
        stripped_lines = (ln.strip() for ln in soup.get_text("\n").splitlines())
        cleaned = "\n".join(ln for ln in stripped_lines if ln)
        return final_url, cleaned, "beautifulsoup_fallback"

    # Last resort: raw html stripped of tags, whitespace collapsed
    stripped = re.sub(r"<[^>]+>", " ", html)
    stripped = re.sub(r"\s+", " ", stripped).strip()
    return final_url, stripped, "regex_fallback"
285
+
286
+
287
+ # -----------------------------
288
+ # OpenAI call (Structured Outputs JSON Schema)
289
+ # -----------------------------
290
# JSON Schema passed as ``response_format`` to the chat completion call.
# Every property is required and no additional properties are allowed.
_slop_properties: Dict[str, Any] = {
    field: {"type": "number"}
    for field in (
        "info_density",
        "redundancy",
        "template_markers",
        "incoherence",
        "monetization",
        "slop_score",
    )
}
_slop_properties["top_contributing_factors"] = {
    "type": "array",
    "items": {"type": "string"},
    "minItems": 1,
}
_slop_properties["confidence"] = {"type": "number"}

SLOP_SCHEMA = {
    "name": "slop_score_output",
    "schema": {
        "type": "object",
        "additionalProperties": False,
        "properties": _slop_properties,
        # Required list mirrors the property order above.
        "required": list(_slop_properties),
    },
    "strict": True,
}
322
+
323
+
324
def build_prompt(url: str, text: str) -> str:
    """Build the user prompt for the slop evaluation.

    ``url`` may be empty (pasted text); it is then shown as an empty WEBPAGE
    line and as ``(user-provided text)`` in the TEXT section. ``text`` is
    embedded verbatim between triple quotes. The model is asked for JSON only;
    the textual interpretation is produced by the app, not the model.
    """
    page_line = url if url else ""
    source_line = url if url else "(user-provided text)"
    prompt = f"""
You are given extracted main text from a webpage.

WEBPAGE:
{page_line}

TEXT:
Read the text from webpage:
{source_line}

MAIN_TEXT:
\"\"\"
{text}
\"\"\"

Goal:
Estimate Sloppiness (0–100).

Definition:
Sloppiness = degree to which text is low-information, generic, repetitive, templated, incoherent, or monetization-optimized rather than meaning-dense.

Constraints:
- Evaluate only intrinsic writing properties.
- Ignore topic, politics, and site type.
- Do not speculate beyond text evidence.

Step 1 — Produce normalized subscores (0–1):
- info_density: 1 = high specificity, 0 = generic.
- redundancy: 1 = heavy repetition.
- template_markers: 1 = strongly templated.
- incoherence: 1 = incoherent.
- monetization: 1 = heavy monetization cues.

Step 2 — Compute score:
slop_score = 100 * (
0.30 * (1 - info_density) +
0.30 * redundancy +
0.20 * template_markers +
0.10 * incoherence +
0.10 * monetization
)

Step 3 — Output ONLY valid JSON matching the provided schema.
"""
    return prompt.strip()
371
+
372
+
373
def call_openai_slop(api_key: str, model: str, url: str, text: str, temperature: float) -> Dict[str, Any]:
    """Ask an OpenAI chat model to score ``text`` and return the parsed result.

    The text is truncated to ``MAX_CHARS_SENT_TO_LLM`` characters, wrapped by
    ``build_prompt`` and sent with ``SLOP_SCHEMA`` as a JSON-schema response
    format. In the returned dict, the five subscores and ``confidence`` are
    clamped to [0, 1], ``slop_score`` is clamped to [0, 100], and
    ``top_contributing_factors`` is always a list.

    Raises:
        RuntimeError: if the API key is blank, the model refuses, the reply is
            empty, or the reply is not a JSON object.
    """
    api_key = (api_key or "").strip()
    if not api_key:
        raise RuntimeError("Please enter your OpenAI API key above before running analysis.")

    client = OpenAI(api_key=api_key)

    # Trim text for safety (context-size guard)
    trimmed = (text or "")[:MAX_CHARS_SENT_TO_LLM]
    prompt = build_prompt(url=url, text=trimmed)

    # Chat Completions API with Structured Outputs (JSON Schema)
    resp = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a careful evaluator. Follow the schema exactly."},
            {"role": "user", "content": prompt},
        ],
        temperature=temperature,
        response_format={"type": "json_schema", "json_schema": SLOP_SCHEMA},
    )

    message = resp.choices[0].message
    # A refusal arrives in a separate field with no content; report it as a
    # refusal rather than as "empty content".
    refusal = getattr(message, "refusal", None)
    if refusal:
        raise RuntimeError(f"Model refused to evaluate the text: {refusal}")

    raw = (message.content or "").strip()
    if not raw:
        raise RuntimeError("Model returned empty content.")

    try:
        data = json.loads(raw)
    except json.JSONDecodeError as e:
        raise RuntimeError(f"Model returned non-JSON or malformed JSON. Raw output:\n{raw}") from e
    # Valid JSON that is not an object (e.g. a list) would otherwise fail
    # below with an unrelated AttributeError.
    if not isinstance(data, dict):
        raise RuntimeError(f"Model returned JSON that is not an object. Raw output:\n{raw}")

    # Clamp and sanity-check
    for k in ["info_density", "redundancy", "template_markers", "incoherence", "monetization", "confidence"]:
        data[k] = clamp01(data.get(k, 0.0))
    # slop_score should be 0..100
    data["slop_score"] = float(data.get("slop_score", 0.0))
    data["slop_score"] = max(0.0, min(100.0, data["slop_score"]))

    # Ensure list exists
    if not isinstance(data.get("top_contributing_factors"), list):
        data["top_contributing_factors"] = []

    return data
416
+
417
+
418
+ # -----------------------------
419
+ # Report generation (MD + PDF)
420
+ # -----------------------------
421
def format_report_markdown(
    url: str,
    title: str,
    extraction_method: str,
    text_preview: str,
    result: Dict[str, Any],
) -> str:
    """Assemble the full Markdown report for one analysis.

    Sections, in order: header metadata, overall score with interpretation,
    subscore table, weighted contribution table, the model's top factors
    (at most 10), the raw result as JSON, and the extracted-text preview.
    ``result`` must contain ``slop_score`` and the five subscores.
    """
    breakdown = weighted_contributions(result)
    score = result["slop_score"]
    band_text = compute_interpretation(score)
    unit_score = score / 100.0
    source_label = url if url else "(user-provided text)"

    report: List[str] = [
        "# Slop Detection Report",
        "",
        f"- **Title (heuristic):** {title}",
        f"- **URL:** {source_label}",
        f"- **Extraction method:** {extraction_method}",
        f"- **Generated at:** {time.strftime('%Y-%m-%d %H:%M:%S')}",
        "",
        "## Overall Score",
        "",
        f"- **slop_score (0–100):** {score:.1f}",
        f"- **slop (0–1):** {unit_score:.3f}",
        f"- **confidence (0–1):** {result.get('confidence', 0.0):.2f}",
        "",
        "### Interpretation",
        "",
        band_text,
        "",
        "## Subscores (0–1)",
        "",
        "| Subscore | Value |",
        "|---|---:|",
    ]
    subscore_names = ("info_density", "redundancy", "template_markers", "incoherence", "monetization")
    report.extend(f"| {name} | {result[name]:.2f} |" for name in subscore_names)

    report += [
        "",
        "## Weighted Contribution Breakdown (normalized)",
        "",
        "| Term | Weight Contribution | Share |",
        "|---|---:|---:|",
    ]
    # Avoid dividing by zero when every term is 0.
    denominator = breakdown["slop_normalized_sum"] if breakdown["slop_normalized_sum"] > 0 else 1.0
    term_names = ("info_density_deficit", "redundancy", "template_markers", "incoherence", "monetization")
    for term in term_names:
        amount = breakdown[term]
        report.append(f"| {term} | {amount:.4f} | {amount / denominator:.1%} |")

    report += ["", "## Top Contributing Factors (model)", ""]
    report.extend(f"- {factor}" for factor in result.get("top_contributing_factors", [])[:10])

    report += [
        "",
        "## Raw JSON Output (model)",
        "",
        "```json",
        json.dumps(result, ensure_ascii=False, indent=2),
        "```",
        "",
        "## Text Preview (first ~1200 chars after extraction)",
        "",
        "```",
        text_preview,
        "```",
        "",
    ]
    return "\n".join(report)
489
+
490
+
491
def save_markdown(md_text: str, base_slug: str) -> str:
    """Write ``md_text`` as UTF-8 to a timestamped ``.md`` file in the output
    directory and return the file's path."""
    filename = f"slop_report_{base_slug}_{now_ts()}.md"
    target = os.path.join(_OUTPUT_DIR, filename)
    with open(target, "w", encoding="utf-8") as handle:
        handle.write(md_text)
    return target
496
+
497
+
498
def save_pdf(md_text: str, base_slug: str) -> str:
    """Render the Markdown report as a simple PDF and return the file's path.

    Only a small Markdown subset is interpreted: ``#``/``##``/``###``
    headings, ``- `` bullets, blank lines (vertical space), and fenced code
    blocks (rendered as preformatted text). Everything else, including table
    rows, becomes a plain paragraph. ``**`` bold markers are removed.
    """
    # Paragraph parses its text as XML-like markup, so text taken from web
    # pages or the model must be escaped or '<' / '&' will break rendering.
    from xml.sax.saxutils import escape

    path = os.path.join(_OUTPUT_DIR, f"slop_report_{base_slug}_{now_ts()}.pdf")
    doc = SimpleDocTemplate(path, pagesize=letter, rightMargin=54, leftMargin=54, topMargin=54, bottomMargin=54)
    styles = getSampleStyleSheet()
    story = []

    in_code = False
    code_buf: List[str] = []

    def flush_code() -> None:
        """Emit the buffered code lines as one preformatted block."""
        if code_buf:
            story.append(Preformatted("\n".join(code_buf), styles["Code"]))
            story.append(Spacer(1, 0.15 * inch))
            code_buf.clear()

    def paragraph(text: str, style_name: str, prefix: str = "") -> Paragraph:
        """Build a Paragraph from plain text: drop ** markers, escape markup."""
        return Paragraph(prefix + escape(text.replace("**", "")), styles[style_name])

    for ln in md_text.splitlines():
        # A fence line toggles code mode; closing a fence flushes the buffer.
        if ln.strip().startswith("```"):
            if in_code:
                in_code = False
                flush_code()
            else:
                in_code = True
                code_buf.clear()
            continue

        if in_code:
            code_buf.append(ln.rstrip("\n"))
            continue

        if ln.startswith("# "):
            story.append(paragraph(ln[2:].strip(), "Title"))
            story.append(Spacer(1, 0.15 * inch))
        elif ln.startswith("## "):
            story.append(paragraph(ln[3:].strip(), "Heading2"))
            story.append(Spacer(1, 0.10 * inch))
        elif ln.startswith("### "):
            story.append(paragraph(ln[4:].strip(), "Heading3"))
            story.append(Spacer(1, 0.08 * inch))
        elif ln.strip().startswith("- "):
            story.append(paragraph(ln.strip()[2:], "BodyText", prefix="• "))
        elif ln.strip() == "":
            story.append(Spacer(1, 0.08 * inch))
        else:
            story.append(paragraph(ln, "BodyText"))

    # An unterminated fence still gets its buffered lines written out.
    if in_code:
        flush_code()

    doc.build(story)
    return path
555
+
556
+
557
+ # -----------------------------
558
+ # Gradio pipeline
559
+ # -----------------------------
560
@dataclass
class AnalysisInputs:
    """Raw user inputs for one analysis run, as collected from the UI."""

    # OpenAI API key entered by the user.
    api_key: str
    # Page to fetch; may be empty when text is pasted.
    url: str
    # Text supplied directly; when non-empty it is analyzed instead of fetching the URL.
    pasted_text: str
    # OpenAI model name; a falsy value falls back to DEFAULT_MODEL in analyze().
    model: str
    # Sampling temperature forwarded to the chat completion call.
    temperature: float
567
+
568
+
569
def analyze(inputs: AnalysisInputs) -> Tuple[str, Dict[str, Any], str, str, str]:
    """Run the full pipeline: get text, score it, and write chart and reports.

    If pasted text is present it is analyzed as is (a URL, if also given, is
    only used as a label). Otherwise the URL is fetched and its main text
    extracted.

    Returns:
        ``(summary_md, result, radar_path, md_path, pdf_path)``: the Markdown
        summary for the UI, the model's result dict, and the paths of the
        radar chart image, the Markdown report and the PDF report.

    Raises:
        ValueError: if neither a URL nor text is given, or if no text could be
            extracted from the URL.
        RuntimeError: propagated from ``call_openai_slop``.
    """
    url = (inputs.url or "").strip()
    pasted_text = (inputs.pasted_text or "").strip()

    if not url and not pasted_text:
        raise ValueError("Please provide either a URL or paste text to analyze.")

    extraction_method = "user_text"
    final_url = normalize_url(url) if url else ""
    text = pasted_text

    if url and not pasted_text:
        final_url, text, extraction_method = extract_main_text(url)
        # Do not spend an API call on a page that yielded no text.
        if not (text or "").strip():
            raise ValueError(
                "No readable text could be extracted from the URL. "
                "Try another page or paste the text directly."
            )

    # Basic title heuristic
    title = infer_title_from_text(text)
    base_slug = safe_slug(final_url or title)

    # Make a preview
    preview = text[:1200].strip()
    if len(text) > 1200:
        preview += "\n\n…(truncated preview)…"

    # Call OpenAI (API key from user input)
    result = call_openai_slop(
        api_key=inputs.api_key or "",
        model=inputs.model or DEFAULT_MODEL,
        url=final_url,
        text=text,
        temperature=float(inputs.temperature),
    )

    # Build UI markdown summary
    interp = compute_interpretation(result["slop_score"])
    normalized = result["slop_score"] / 100.0
    contrib = weighted_contributions(result)
    total = contrib["slop_normalized_sum"]

    def _share(key: str) -> float:
        """Fraction of the weighted sum contributed by ``key`` (0 if the sum is 0)."""
        return contrib[key] / total if total else 0

    factors = result.get("top_contributing_factors")
    factors_md = "\n".join(f"- {x}" for x in factors[:8]) if factors else "- (none provided)"

    summary_md = f"""
## Results

**slop_score (0–100):** `{result["slop_score"]:.1f}`
**slop (0–1):** `{normalized:.3f}`
**confidence (0–1):** `{result.get("confidence", 0.0):.2f}`

### Interpretation
{interp}

### Subscores (0–1)
- info_density: `{result["info_density"]:.2f}`
- redundancy: `{result["redundancy"]:.2f}`
- template_markers: `{result["template_markers"]:.2f}`
- incoherence: `{result["incoherence"]:.2f}`
- monetization: `{result["monetization"]:.2f}`

### Dominant contributors (weighted shares)
- redundancy: `{_share("redundancy"):.1%}`
- template_markers: `{_share("template_markers"):.1%}`
- info_density_deficit: `{_share("info_density_deficit"):.1%}`
- incoherence: `{_share("incoherence"):.1%}`
- monetization: `{_share("monetization"):.1%}`

### Top contributing factors (model)
{factors_md}

### Extraction preview
<details>
<summary>Show extracted text preview</summary>

{preview}

</details>
""".strip()

    # Radar chart
    radar_path = os.path.join(_OUTPUT_DIR, f"radar_{base_slug}_{now_ts()}.png")
    subscore_keys = ("info_density", "redundancy", "template_markers", "incoherence", "monetization")
    make_radar_chart({key: result[key] for key in subscore_keys}, radar_path)

    # Reports
    report_md = format_report_markdown(
        url=final_url,
        title=title,
        extraction_method=extraction_method,
        text_preview=preview,
        result=result,
    )
    md_path = save_markdown(report_md, base_slug)
    pdf_path = save_pdf(report_md, base_slug)

    return summary_md, result, radar_path, md_path, pdf_path
667
+
668
+
669
+ # -----------------------------
670
+ # Gradio UI
671
+ # -----------------------------
672
def run_analysis(api_key: str, url: str, pasted_text: str, model: str, temperature: float):
    """Bundle the individual UI values into ``AnalysisInputs`` and run ``analyze``.

    Returns whatever ``analyze`` returns.
    """
    return analyze(
        AnalysisInputs(
            api_key=api_key,
            url=url,
            pasted_text=pasted_text,
            model=model,
            temperature=temperature,
        )
    )
675
+
676
+
677
# Gradio interface: inputs (API key, URL preset / custom URL, pasted text,
# model, temperature), an Analyze button, and the result widgets.
with gr.Blocks(title="Automated Slop Detection") as demo:
    gr.Markdown(
        "# Automated Slop Detection\n"
        "Analyze a webpage (URL) or pasted text and estimate **Sloppiness** with subscores.\n\n"
        "**Tip:** For best results, analyze a single article page (not a homepage/feed)."
    )

    api_key_in = gr.Textbox(
        label="OpenAI API Key (required)",
        type="password",
        placeholder="sk-...",
        info="Enter your OpenAI API key to run analysis. It is not stored.",
    )

    url_preset_in = gr.Dropdown(
        label="Choose a preset URL (or Custom to enter your own)",
        choices=[(label, url) for label, url in DEFAULT_URL_CHOICES],
        value="",
        allow_custom_value=False,
    )
    url_in = gr.Textbox(
        label="URL (optional — used when preset is Custom)",
        value="",
        placeholder="https://example.com/article",
        lines=1,
    )
    text_in = gr.Textbox(
        label="Paste text (optional)",
        placeholder="Paste extracted main text here (leave URL empty if using pasted text).",
        lines=10,
    )

    with gr.Row():
        model_in = gr.Textbox(label="OpenAI model", value=DEFAULT_MODEL)
        temp_in = gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, value=0.0, step=0.05, info="Set to 0 for stable, deterministic results.")

    analyze_btn = gr.Button("Analyze", variant="primary")

    gr.Markdown("---")

    out_md = gr.Markdown(label="Summary")
    out_json = gr.JSON(label="Model JSON output (schema)")
    out_plot = gr.Image(label="Subscores radar chart", type="filepath")

    with gr.Row():
        out_md_file = gr.File(label="Download Markdown report (.md)")
        out_pdf_file = gr.File(label="Download PDF report (.pdf)")

    def _on_click(api_key, url_preset, url_custom, text, model, temp):
        """Button handler: a non-empty preset URL wins over the custom URL box."""
        url = (url_preset or "").strip() or (url_custom or "").strip()
        try:
            return run_analysis(api_key, url, text, model, temp)
        except gr.Error:
            raise
        except Exception as exc:
            # UI boundary: re-raise as gr.Error so the message (missing key,
            # missing input, fetch failure, ...) is shown to the user.
            raise gr.Error(str(exc)) from exc

    analyze_btn.click(
        _on_click,
        inputs=[api_key_in, url_preset_in, url_in, text_in, model_in, temp_in],
        outputs=[out_md, out_json, out_plot, out_md_file, out_pdf_file],
    )

    gr.Markdown(
        "### Notes\n"
        "- **slop_score (0–100)** is the scaled score. Divide by 100 for normalized slop in **[0,1]**.\n"
        "- The app generates its own interpretation from slop_score bands to keep the model output strictly JSON.\n"
        "- OpenAI usage and billing: [platform.openai.com/usage](https://platform.openai.com/usage)\n"
    )

if __name__ == "__main__":
    demo.launch()
745
+
746
+
747
+ # python app.py
748
+
749
+ # =========================================================================================
750
+
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ openai
3
+ requests
4
+ trafilatura
5
+ beautifulsoup4
6
+ lxml
7
+ matplotlib
8
+ reportlab