dusan-presswhizz committed on
Commit
3840a00
·
verified ·
1 Parent(s): 2e33344

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +167 -357
app.py CHANGED
@@ -1,6 +1,6 @@
1
- import os, re, json, requests, urllib.parse, traceback
2
  import torch, torch.nn.functional as F
3
- from bs4 import BeautifulSoup, Tag
4
  from transformers import AutoTokenizer, AutoModel
5
  import tldextract
6
  import gradio as gr
@@ -12,9 +12,10 @@ MODEL = "michiyasunaga/LinkBERT-base"
12
  UA = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36"}
13
 
14
  # --- OpenAI settings ---
15
- OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") # Settings → Variables & secrets
16
- PREF_CHAIN = [os.getenv("OPENAI_MODEL", "gpt-5o"), "gpt-4o", "gpt-4o-mini"] # try in this order automatically
17
- OPENAI_CHAT_URL = "https://api.openai.com/v1/chat/completions"
 
18
 
19
  # =========================
20
  # Load LinkBERT
@@ -23,7 +24,7 @@ tok = AutoTokenizer.from_pretrained(MODEL)
23
  enc = AutoModel.from_pretrained(MODEL)
24
 
25
  # =========================
26
- # General helpers
27
  # =========================
28
  def looks_like_url(text: str) -> bool:
29
  if not text:
@@ -41,148 +42,19 @@ def normalize_url(url: str) -> str:
41
  return "https://" + url
42
  return url
43
 
44
- def to_plain_text(html_or_text: str) -> str:
45
- return BeautifulSoup(html_or_text, "html.parser").get_text(separator=" ", strip=True)
46
-
47
- # =========================
48
- # Main-content extraction (filters bios/sidebars/comments/etc.) — **hardened**
49
- # =========================
50
  def get_text_blocks(url):
51
- try:
52
- resp = requests.get(url, timeout=25, headers=UA)
53
- resp.raise_for_status()
54
- except Exception as e:
55
- raise RuntimeError(f"Failed to fetch Source URL ({url}): {e}")
56
-
57
  soup = BeautifulSoup(resp.text, "html.parser")
58
-
59
- # Remove global noise early
60
- for tag in soup(["script","style","noscript","svg","form","header","footer","nav","aside"]):
61
- try:
62
- tag.decompose()
63
- except Exception:
64
- pass
65
-
66
- # Prefer a main/article container
67
- candidates = []
68
- for sel in [
69
- "article",
70
- '[itemprop="articleBody"]',
71
- '[role="main"]',
72
- "main",
73
- ".entry-content",
74
- ".post-content",
75
- ".post__content",
76
- ".single-post",
77
- ".blog-post",
78
- ".content__body",
79
- # add these:
80
- ".article-content",
81
- ".post",
82
- ".content",
83
- ".entry",
84
- ".site-content",
85
- ".page-content",
86
- ]:
87
- try:
88
- found = soup.select_one(sel)
89
- except Exception:
90
- found = None
91
- if isinstance(found, Tag):
92
- txtlen = len(found.get_text(strip=True))
93
- if txtlen > 200:
94
- candidates.append(found)
95
-
96
- root = None
97
- if candidates:
98
- root = max(candidates, key=lambda n: len(n.get_text(strip=True)))
99
- else:
100
- root = soup.body if isinstance(soup.body, Tag) else soup
101
-
102
- if not isinstance(root, Tag):
103
- # last-ditch: use the whole doc as a string
104
- text = soup.get_text(" ", strip=True)
105
- return [text] if len(text) > 80 else []
106
-
107
- # Drop common noisy sections within root (robust to odd nodes)
108
- blacklist = [
109
- "author","about-author","post-author","authorbox","byline","bio","profile",
110
- "share","sharing","social","follow",
111
- "comment","comments","reply",
112
- "related","recommend",
113
- "newsletter","subscribe",
114
- "sidebar","widget",
115
- "tag-cloud","tags","breadcrumbs","pagination",
116
- "advert","ad-","promo","sponsored"
117
- ]
118
- for el in list(root.find_all(True)):
119
- if not isinstance(el, Tag):
120
- continue
121
- try:
122
- cls = " ".join(el.get("class") or []).lower()
123
- idv = (el.get("id") or "").lower()
124
- except Exception:
125
- cls, idv = "", ""
126
- if any(key in cls or key in idv for key in blacklist):
127
- try:
128
- el.decompose()
129
- except Exception:
130
- pass
131
-
132
- # Collect paragraphs/list items/headings that look like article content
133
  blocks = []
134
- for el in root.find_all(["p","li","h2","h3","h4","blockquote"]):
135
- if not isinstance(el, Tag):
136
- continue
137
- try:
138
- txt = " ".join(el.get_text(" ").split())
139
- except Exception:
140
- continue
141
- if not txt:
142
- continue
143
- if txt.lstrip().startswith("#"): # skip hashtaggy lines
144
- continue
145
- if len(txt) < 40: # too short to be useful
146
- continue
147
- # light bio filter: many first-person cues in a context that mentions "author"
148
- try:
149
- context_text = root.get_text(" ").lower()
150
- except Exception:
151
- context_text = ""
152
- first_person_hits = sum(w in txt.lower() for w in [" i ", " i'm ", " i’m ", " my ", " me ", " myself "])
153
- if first_person_hits >= 2 and "author" in context_text:
154
- continue
155
- blocks.append(txt)
156
-
157
- # Fallback: if we found nothing, do a lenient sweep over body paragraphs
158
- if not blocks:
159
- body = soup.body if soup and soup.body else soup
160
- if body:
161
- for el in body.find_all(["p","li","h2","h3","h4","blockquote"]):
162
- if not isinstance(el, Tag):
163
- continue
164
- txt = " ".join(el.get_text(" ").split())
165
- if len(txt) >= 40 and not txt.lstrip().startswith("#"):
166
- blocks.append(txt)
167
-
168
- # Last resort: try AMP version if still empty
169
- if not blocks:
170
- try:
171
- amp_url = (url.rstrip("/") + "/amp") if "/amp" not in url else url
172
- r2 = requests.get(amp_url, timeout=20, headers=UA)
173
- if r2.ok:
174
- s2 = BeautifulSoup(r2.text, "html.parser")
175
- for el in s2.find_all(["p","li","h2","h3","h4","blockquote"]):
176
- t = " ".join(el.get_text(" ").split())
177
- if len(t) >= 40:
178
- blocks.append(t)
179
- except Exception:
180
- pass
181
-
182
  return blocks
183
- # =========================
184
- # Embeddings
185
- # =========================
186
  def mean_pool(last_hidden_state, mask):
187
  x = last_hidden_state
188
  mask = mask.unsqueeze(-1)
@@ -194,176 +66,111 @@ def embed(texts):
194
  out = enc(**batch)
195
  return mean_pool(out.last_hidden_state, batch["attention_mask"])
196
 
197
- # =========================
198
- # Target page type classification (content / ecom / generic)
199
- # =========================
200
- def classify_target_type(url: str, title: str, desc: str) -> str:
201
- u = (url or "").lower()
202
- m = f"{title or ''} {desc or ''}".lower()
203
-
204
- content_hits = any(k in u for k in ["/blog", "/blogs", "/article", "/how-to", "/news"]) \
205
- or any(k in m for k in ["blog","article","how to","guide","tips","news"])
206
- if content_hits:
207
- return "content"
208
-
209
- ecom_hits = any(k in u for k in ["/product","/products","/collection","/collections","/category","/cart","/shop"]) \
210
- or any(k in m for k in ["price","add to cart","sku","in stock","buy now","free shipping"])
211
- if ecom_hits:
212
- return "ecom"
213
-
214
- return "generic"
215
-
216
- # =========================
217
- # Type-aware fallback injection (when GPT is OFF or fails)
218
- # =========================
219
- def inject_anchor_into_sentence(sentence, anchor_text, target_url, target_type="generic"):
220
  """
221
- Wrap anchor if present; else integrate mid-sentence with a type-aware neutral adjunct.
222
- Avoid em-dash and CTA clichés. Prefer add-after if the sentence is clearly about itself ("This guide…").
 
223
  """
224
  def norm(x): return re.sub(r'[^a-z0-9 ]','',x.lower())
225
  n_sent, n_anchor = norm(sentence), norm(anchor_text)
226
 
 
227
  if n_sent.startswith("this guide") or n_sent.startswith("our platform") or n_sent.startswith("base casino"):
228
  html = sentence
229
  add_after = f' Related resource: <a href="{target_url}">{anchor_text}</a>.'
230
  return html + add_after, False
231
 
 
232
  if n_anchor and n_anchor in n_sent:
233
  pattern = re.compile(re.escape(anchor_text), re.IGNORECASE)
234
  return pattern.sub(f'<a href="{target_url}">{anchor_text}</a>', sentence), True
235
 
 
236
  insert_html = f'<a href="{target_url}">{anchor_text}</a>'
237
 
238
- if target_type == "ecom":
239
- adjuncts = [
240
- f' with supplies available from {insert_html}',
241
- f' with equipment available from {insert_html}',
242
- f' from {insert_html}',
243
- ]
244
- elif target_type == "content":
245
- adjuncts = [
246
- f' with tips from {insert_html}',
247
- f' in an article on {insert_html}',
248
- f' with guidance from {insert_html}',
249
- ]
250
- else:
251
- adjuncts = [
252
- f' with additional context at {insert_html}',
253
- f' with resources at {insert_html}',
254
- f' at {insert_html}',
255
- ]
256
- clause = adjuncts[0]
257
-
258
- m = re.search(r'\b(games?|content|options?|features?|benefits?|floors?|surfaces?|beauty|makeup|lashes?)\b',
259
- sentence, flags=re.I)
260
  if m:
261
  idx = m.start()
262
- return (sentence[:idx] + clause + ' ' + sentence[idx:]).strip(), False
263
 
 
264
  m2 = re.search(r',\s*', sentence)
265
  if m2:
266
  idx = m2.end()
267
- return (sentence[:idx] + clause + ' ' + sentence[idx:]).strip(), False
268
 
 
269
  m3 = re.search(r'\bto\b', sentence, flags=re.I)
270
  if m3:
271
  idx = m3.start()
272
- return (sentence[:idx] + clause + ' ' + sentence[idx:]).strip(), False
273
 
 
274
  if sentence.endswith(('.', '!', '?')):
275
  base, punct = sentence[:-1], sentence[-1]
276
  else:
277
  base, punct = sentence, '.'
278
- rewritten = f'{base}{clause}{punct}'
279
  return rewritten, False
280
 
281
- # =========================
282
- # Selection (keywords + similarity + threshold) and metadata
283
- # =========================
284
- def _kw_set(s: str):
285
- s = re.sub(r'[^a-z0-9 ]+', ' ', (s or "").lower())
286
- toks = [t for t in s.split() if len(t) > 2 and t not in {"the","and","for","with","from","this","that","are","you","your"}]
287
- return set(toks[:8])
288
-
289
  def suggest_insertions(source_url, target_url, anchor_text, top_k=1):
290
  blocks = get_text_blocks(source_url)
291
  if not blocks:
292
- return [{"error":"No article text blocks found on the page."}]
293
 
294
- # ---- target metadata
295
- tgt_title, tgt_desc = "", ""
296
  try:
297
- tgt_html = requests.get(target_url, timeout=25, headers=UA).text
298
  soup_tgt = BeautifulSoup(tgt_html, "html.parser")
299
- if soup_tgt and getattr(soup_tgt, "title", None) and soup_tgt.title:
300
- tgt_title = (soup_tgt.title.get_text() or "").strip()
301
- md = soup_tgt.find("meta", attrs={"name": "description"}) if soup_tgt else None
302
- tgt_desc = ((md.get("content") or "").strip()) if md else ""
303
- except Exception as e:
304
- print(f"[WARN] Failed to fetch Target metadata: {e}")
305
-
306
- target_type = classify_target_type(target_url, tgt_title, tgt_desc)
307
-
308
- # soft keyword gate
309
- kw = _kw_set(anchor_text) | _kw_set(tgt_title)
310
- candidate_blocks = [b for b in blocks if (not kw or any(k in b.lower() for k in kw))]
311
- if not candidate_blocks:
312
- candidate_blocks = blocks
313
 
314
  ext = tldextract.extract(target_url)
315
  tgt_domain = ".".join([p for p in [ext.domain, ext.suffix] if p])
 
 
316
  query = f"{anchor_text} — relevant to: {tgt_title} ({tgt_domain})"
317
  q_emb = embed([query])[0]
318
 
319
- blk_embs = embed(candidate_blocks)
320
- sims = F.cosine_similarity(blk_embs, q_emb.repeat(len(candidate_blocks),1))
321
-
322
- # similarity threshold (avoid random bios)
323
- max_sim = float(torch.max(sims))
324
- min_accept = max(0.18, max_sim - 0.10)
325
- filtered = [(i, float(s)) for i, s in enumerate(sims) if float(s) >= min_accept]
326
-
327
- if not filtered:
328
- safe_paragraph = blocks[min(2, len(blocks)-1)]
329
- return [{
330
- "anchor_was_present": False,
331
- "best_sentence_original": safe_paragraph,
332
- "best_sentence_with_anchor": safe_paragraph + f' Related resource: <a href="{target_url}">{anchor_text}</a>.',
333
- "best_paragraph": safe_paragraph,
334
- "tgt_title": tgt_title,
335
- "tgt_desc": tgt_desc,
336
- "target_type": target_type
337
- }]
338
-
339
- filtered.sort(key=lambda x: x[1], reverse=True)
340
- top_idx = [i for (i, _) in filtered[:min(top_k, len(filtered))]]
341
 
342
  results = []
343
- for local_i in top_idx:
344
- blk = candidate_blocks[local_i]
345
  sents = re.split(r'(?<=[.!?])\s+', blk)
346
  s_embs = embed(sents)
347
  s_sims = F.cosine_similarity(s_embs, q_emb.repeat(len(sents),1))
348
  si = int(torch.argmax(s_sims))
349
  best_sent = sents[si]
350
- rewritten_sent, exact_found = inject_anchor_into_sentence(best_sent, anchor_text, target_url, target_type)
351
  results.append({
352
  "anchor_was_present": exact_found,
353
  "best_sentence_original": best_sent,
354
  "best_sentence_with_anchor": rewritten_sent,
355
  "best_paragraph": blk,
356
  "tgt_title": tgt_title,
357
- "tgt_desc": tgt_desc,
358
- "target_type": target_type
359
  })
360
  return results
361
 
362
- # =========================
363
- # Distortion / safety helpers
364
- # =========================
 
 
365
  def detect_primary_brand(paragraph: str) -> str:
366
- """Heuristic: catch brand phrases like 'Base Casino', 'Something Platform' etc."""
 
 
 
367
  p = paragraph.strip()
368
  m = re.search(r'\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+){0,2})\s+(Casino|Platform|Site|Service|App)\b', p)
369
  if m:
@@ -373,8 +180,9 @@ def detect_primary_brand(paragraph: str) -> str:
373
 
374
  def rewrite_would_distort_meaning(original_text: str, rewritten_html: str, anchor_text: str, paragraph_text: str = "") -> bool:
375
  """
376
- True if rewrite misattributes the subject or positions the anchor as the mechanism.
377
- Also if the anchor appears before the paragraph brand, too early overall, or introduces content-type nouns not in original.
 
378
  """
379
  plain_rewrite = BeautifulSoup(rewritten_html, "html.parser").get_text(" ").strip().lower()
380
  plain_orig = original_text.strip().lower()
@@ -385,13 +193,15 @@ def rewrite_would_distort_meaning(original_text: str, rewritten_html: str, ancho
385
  pos_a = plain_rewrite.find(a)
386
  pos_b = plain_rewrite.find(brand)
387
  if pos_b != -1 and pos_a != -1 and pos_a < pos_b:
388
- return True
389
 
 
390
  if a in plain_rewrite:
391
  pos = plain_rewrite.find(a)
392
  if pos != -1 and pos <= max(4, int(0.20 * len(plain_rewrite))):
393
  return True
394
 
 
395
  mechanism_patterns = [
396
  rf'\bthrough\s+{re.escape(a)}\b',
397
  rf'\bvia\s+{re.escape(a)}\b',
@@ -402,6 +212,7 @@ def rewrite_would_distort_meaning(original_text: str, rewritten_html: str, ancho
402
  if re.search(pat, plain_rewrite):
403
  return True
404
 
 
405
  bad_hosting = [
406
  rf'(this|the)\s+guide\s+(at|on|from)\s+{re.escape(a)}\b',
407
  rf'\b{re.escape(a)}\b\s+(explains|shows|details|covers)\b',
@@ -411,7 +222,8 @@ def rewrite_would_distort_meaning(original_text: str, rewritten_html: str, ancho
411
  if re.search(pat, plain_rewrite):
412
  return True
413
 
414
- content_nouns = ["guide","article","post","review","platform","site","resource"]
 
415
  if any(n in plain_rewrite for n in content_nouns) and not any(n in plain_orig for n in content_nouns):
416
  return True
417
 
@@ -422,7 +234,7 @@ def build_related_resource_line(target_url: str, anchor_text: str, plain_text=Fa
422
  return to_plain_text(html) if plain_text else html
423
 
424
  # =========================
425
- # GPT decision (inline vs add-after) with paragraph context + auto-fallback chain
426
  # =========================
427
  def _openai_chat(model_name: str, system: str, user_json: dict):
428
  headers = {"Authorization": f"Bearer {OPENAI_API_KEY}", "Content-Type": "application/json"}
@@ -437,22 +249,26 @@ def _openai_chat(model_name: str, system: str, user_json: dict):
437
  }
438
  r = requests.post(OPENAI_CHAT_URL, headers=headers, json=body, timeout=60)
439
  print(f"[GPT] Model={model_name} HTTP {r.status_code}")
440
- if r.status_code >= 400:
441
- raise RuntimeError(f"OpenAI error {r.status_code}: {r.text[:400]}")
442
  txt = r.json()["choices"][0]["message"]["content"]
443
  return json.loads(txt)
444
 
445
- def gpt_decide_and_rewrite(paragraph_text, chosen_sentence, anchor_text, target_url, tgt_title, tgt_desc, target_type):
 
 
 
 
 
 
 
 
446
  if not OPENAI_API_KEY:
447
  print("[GPT] No OPENAI_API_KEY found → using fallback inline.")
448
- return {"mode": "inline", "sentence_html": chosen_sentence, "used_model": "none"}
449
 
450
- if target_type == "ecom":
451
- preferred_adjuncts = ["from", "available from", "supplies from", "equipment from", "shop at"]
452
- elif target_type == "content":
453
- preferred_adjuncts = ["in", "on", "from", "tips from", "article on", "guide on", "explained on"]
454
- else:
455
- preferred_adjuncts = ["at", "from", "with context at", "resources at"]
456
 
457
  system = (
458
  "You are a professional content editor.\n"
@@ -465,16 +281,14 @@ def gpt_decide_and_rewrite(paragraph_text, chosen_sentence, anchor_text, target_
465
  "HARD RULES:\n"
466
  "1) If inline: include an <a href> with the EXACT anchor text; keep length close; no em-dash; avoid 'for details', "
467
  "'click here', 'learn more', 'visit', 'read more', 'via', 'through'. Do NOT present the anchor as the mechanism "
468
- "for the action (never 'through ANCHOR', 'via ANCHOR'). Prefer one of these adjuncts before the anchor when inlining: "
469
- f"{', '.join(preferred_adjuncts)}. Place the anchor within the first 70% of the sentence and after the paragraph’s brand/subject.\n"
 
470
  "2) If add_after: return a single short line like 'Related resource: <a href=\"URL\">ANCHOR</a>.' "
471
  "(12–14 words max, neutral tone).\n\n"
472
  "OUTPUT JSON ONLY with keys: mode ('inline'|'add_after'), sentence_html (if inline), add_after_html (if add_after)."
473
  )
474
 
475
- meta = f"{tgt_title} {tgt_desc}".lower()
476
- allowed_nouns = [w for w in ["guide","article","blog","review","platform","site","resource"] if w in meta]
477
-
478
  user = {
479
  "paragraph_text": paragraph_text,
480
  "chosen_sentence": chosen_sentence,
@@ -482,102 +296,98 @@ def gpt_decide_and_rewrite(paragraph_text, chosen_sentence, anchor_text, target_
482
  "target_url": target_url,
483
  "target_metadata": {"title": tgt_title, "description": tgt_desc},
484
  "allowed_nouns_from_metadata": allowed_nouns,
485
- "target_type": target_type,
486
  "constraints": {
487
- "avoid": ["for details","click here","learn more","visit","read more","via","through","—","--"," - "],
488
- "preferred_connectors": preferred_adjuncts,
 
 
 
489
  "place_anchor": "inside_first_70_percent"
490
  }
491
  }
492
 
493
- last_err = None
494
- for model_name in PREF_CHAIN:
 
 
495
  try:
496
- obj = _openai_chat(model_name, system, user)
497
- mode = obj.get("mode", "inline")
498
- if mode not in ("inline", "add_after"):
499
- mode = "inline"
500
- return {
501
- "mode": mode,
502
- "sentence_html": obj.get("sentence_html", ""),
503
- "add_after_html": obj.get("add_after_html", ""),
504
- "used_model": model_name
505
- }
506
- except Exception as e:
507
- print(f"[GPT] {model_name} failed: {e}")
508
- last_err = e
509
- continue
510
-
511
- print(f"[GPT] All models failed, using inline fallback. Last error: {last_err}")
512
- return {"mode": "inline", "sentence_html": chosen_sentence, "used_model": "fallback-inline"}
513
 
514
  # =========================
515
- # Gradio UI / Orchestration
516
  # =========================
517
  def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text):
518
- try:
519
- if not source_url or not target_url or not anchor_text:
520
- return "❌ Please provide Source URL, Target URL, and Anchor Text."
521
-
522
- diag = [] # diagnostics to show at the end
523
-
524
- warn = ""
525
- if looks_like_url(anchor_text) and not looks_like_url(target_url):
526
- anchor_text, target_url = target_url, anchor_text
527
- warn = "ℹ️ Detected swapped inputs. I used the URL as Target URL and the text as Anchor.\n\n"
528
-
529
- target_url = normalize_url(target_url)
530
-
531
- res_list = suggest_insertions(source_url, target_url, anchor_text, top_k=1)
532
- res = res_list[0]
533
- if "error" in res:
534
- return f"❌ {res['error']}"
535
-
536
- draft_html = res["best_sentence_with_anchor"]
537
- orig_sentence = res["best_sentence_original"]
538
- paragraph = res["best_paragraph"]
539
- tgt_title = res.get("tgt_title", "")
540
- tgt_desc = res.get("tgt_desc", "")
541
- target_type = res.get("target_type", "generic")
542
-
543
- if smart_rewrite:
544
- decision = gpt_decide_and_rewrite(paragraph, orig_sentence, anchor_text, target_url, tgt_title, tgt_desc, target_type)
545
- used_model = decision.get("used_model", "unknown")
546
- if used_model:
547
- diag.append(f"Model: {used_model}")
548
-
549
- mode = decision.get("mode", "inline")
550
-
551
- if mode == "inline":
552
- final_html = decision.get("sentence_html", "") or draft_html
553
- if rewrite_would_distort_meaning(orig_sentence, final_html, anchor_text, paragraph):
554
- add_after = build_related_resource_line(target_url, anchor_text, plain_text)
555
- body = warn + "Add this mini-line after the paragraph (to avoid changing its meaning):\n\n" + add_after
556
- else:
557
- final_output = to_plain_text(final_html) if plain_text else final_html
558
- body = warn + f"Change this sentence:\n\n{orig_sentence}\n\nWith this one:\n\n{final_output}"
559
- else: # add_after
560
- add_line = decision.get("add_after_html") or build_related_resource_line(target_url, anchor_text, False)
561
- add_line_out = to_plain_text(add_line) if plain_text else add_line
562
- body = warn + "Add this mini-line after the paragraph:\n\n" + add_line_out
 
563
 
 
 
 
 
 
564
  else:
565
- final_output = to_plain_text(draft_html) if plain_text else draft_html
566
- body = warn + f"Change this sentence:\n\n{orig_sentence}\n\nWith this one:\n\n{final_output}"
567
-
568
- if diag:
569
- body += "\n\n—\n" + " · ".join(diag)
570
- return body
571
 
572
- except Exception as e:
573
- tb = traceback.format_exc(limit=50) # show enough context
574
- return f"❌ Error: {e}\n\n{tb}"
575
-
576
- # =========================
577
- # Launch
578
- # =========================
579
  gpt_status = "ON" if OPENAI_API_KEY else "OFF"
580
- title_model = PREF_CHAIN[0] if OPENAI_API_KEY else "OFF"
581
 
582
  demo = gr.Interface(
583
  fn=run_tool,
@@ -586,11 +396,11 @@ demo = gr.Interface(
586
  gr.Textbox(label="Target URL"),
587
  gr.Textbox(label="Anchor Text"),
588
  gr.Checkbox(label="Smart rewrite (GPT)", value=True),
589
- gr.Checkbox(label="Plain text (no URL)", value=False),
590
  ],
591
- outputs=gr.Textbox(label="Result", lines=14),
592
  title=f"Link Insertion Helper • GPT: {gpt_status} • Model: {title_model}",
593
- description="Chooses safe inline rewrite vs neutral add-after using full paragraph context. Filters bios/comments/hashtags; uses a keyword + similarity gate; auto-falls back across models."
594
  )
595
 
596
  if __name__ == "__main__":
 
1
+ import os, re, json, requests, urllib.parse
2
  import torch, torch.nn.functional as F
3
+ from bs4 import BeautifulSoup
4
  from transformers import AutoTokenizer, AutoModel
5
  import tldextract
6
  import gradio as gr
 
12
  UA = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36"}
13
 
14
  # --- OpenAI settings ---
15
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") # add in HF Spaces: Settings → Variables & secrets
16
+ PREFERRED_OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-5o") # preferred model
17
+ FALLBACK_OPENAI_MODEL = "gpt-4o-mini" # automatic fallback
18
+ OPENAI_CHAT_URL = "https://api.openai.com/v1/chat/completions"
19
 
20
  # =========================
21
  # Load LinkBERT
 
24
  enc = AutoModel.from_pretrained(MODEL)
25
 
26
  # =========================
27
+ # Helpers
28
  # =========================
29
  def looks_like_url(text: str) -> bool:
30
  if not text:
 
42
  return "https://" + url
43
  return url
44
 
 
 
 
 
 
 
45
  def get_text_blocks(url):
46
+ resp = requests.get(url, timeout=20, headers=UA)
47
+ resp.raise_for_status()
 
 
 
 
48
  soup = BeautifulSoup(resp.text, "html.parser")
49
+ for tag in soup(["script","style","noscript","header","footer","nav","aside","form"]):
50
+ tag.decompose()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  blocks = []
52
+ for el in soup.find_all(["p","li","h2","h3","h4","blockquote"]):
53
+ txt = " ".join(el.get_text(" ").split())
54
+ if len(txt) > 60:
55
+ blocks.append(txt)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  return blocks
57
+
 
 
58
  def mean_pool(last_hidden_state, mask):
59
  x = last_hidden_state
60
  mask = mask.unsqueeze(-1)
 
66
  out = enc(**batch)
67
  return mean_pool(out.last_hidden_state, batch["attention_mask"])
68
 
69
+ # ---------- Fallback: integrate anchor mid-sentence (no em-dash, no clichés, neutral nouns)
70
def inject_anchor_into_sentence(sentence, anchor_text, target_url):
    """
    Return ``(html, anchor_was_present)`` for ``sentence`` with a link added.

    Strategy, in order:
      1. If the sentence opens with its own subject ("This guide", "Our
         platform", "Base Casino"), leave it untouched and append a
         'Related resource' line instead.
      2. If the anchor text already occurs as whole words, wrap its first
         occurrence in the link and report ``True``.
      3. Otherwise insert "at <a>anchor</a>" before a suitable noun, after the
         first comma, or before the first "to".
      4. As a last resort append a short neutral phrase before the final
         punctuation.

    No em-dash and no CTA cliches are introduced, and the target's content
    type is never asserted. The URL and anchor text are HTML-escaped before
    being placed in the markup.

    Args:
        sentence: Plain-text sentence to modify.
        anchor_text: Visible link text.
        target_url: Link destination.

    Returns:
        Tuple of the rewritten HTML string and a bool that is True only when
        the existing anchor words were wrapped (step 2).
    """
    # Local import so this block brings its own dependency into scope.
    from html import escape

    def norm(text):
        return re.sub(r'[^a-z0-9 ]', '', text.lower())

    def splice(left, middle, right):
        # Join on single spaces so an insertion never leaves doubled blanks.
        parts = (left.rstrip(), middle, right.lstrip())
        return " ".join(part for part in parts if part).strip()

    link_html = (
        f'<a href="{escape(target_url, quote=True)}">'
        f'{escape(anchor_text, quote=False)}</a>'
    )
    n_sent, n_anchor = norm(sentence), norm(anchor_text)

    # Sentence clearly has its own subject: prefer an add-after line.
    if n_sent.startswith(("this guide", "our platform", "base casino")):
        return f'{sentence} Related resource: {link_html}.', False

    # 1) Anchor words already present: wrap the first whole-word occurrence.
    if n_anchor and n_anchor in n_sent:
        pattern = re.compile(
            r'(?<!\w)' + re.escape(anchor_text) + r'(?!\w)', re.IGNORECASE
        )
        # A callable replacement is inserted literally, so backslashes in the
        # anchor or URL are never parsed as regex escapes or group references.
        linked, hits = pattern.subn(lambda _match: link_html, sentence, count=1)
        if hits:
            return linked, True
        # The normalized comparison matched but the raw text did not
        # (punctuation or spacing differs): fall through and insert instead
        # of claiming a link that was never added.

    insertion = f'at {link_html}'

    # 2) Insert near a suitable noun.
    noun = re.search(r'\b(games?|content|options?|features?|benefits?)\b', sentence, flags=re.I)
    if noun:
        cut = noun.start()
        return splice(sentence[:cut], insertion, sentence[cut:]), False

    # After the first comma.
    comma = re.search(r',\s*', sentence)
    if comma:
        cut = comma.end()
        return splice(sentence[:cut], insertion, sentence[cut:]), False

    # Before the first "to".
    to_word = re.search(r'\bto\b', sentence, flags=re.I)
    if to_word:
        cut = to_word.start()
        return splice(sentence[:cut], insertion, sentence[cut:]), False

    # Last resort: short neutral phrase ahead of the closing punctuation.
    if sentence.endswith(('.', '!', '?')):
        base, punct = sentence[:-1], sentence[-1]
    else:
        base, punct = sentence, '.'
    rewritten = f'{base} with additional context available at {link_html}{punct}'
    return rewritten.lstrip(), False
117
 
 
 
 
 
 
 
 
 
118
  def suggest_insertions(source_url, target_url, anchor_text, top_k=1):
119
  blocks = get_text_blocks(source_url)
120
  if not blocks:
121
+ return [{"error":"No text blocks found on the page."}]
122
 
123
+ # -------- target context (title + meta desc)
 
124
  try:
125
+ tgt_html = requests.get(target_url, timeout=20, headers=UA).text
126
  soup_tgt = BeautifulSoup(tgt_html, "html.parser")
127
+ tt = soup_tgt.title.get_text().strip() if soup_tgt.title else ""
128
+ md = soup_tgt.find("meta", attrs={"name": "description"})
129
+ tgt_desc = (md.get("content") or "").strip() if md else ""
130
+ tgt_title = tt
131
+ except Exception:
132
+ tgt_title, tgt_desc = "", ""
 
 
 
 
 
 
 
 
133
 
134
  ext = tldextract.extract(target_url)
135
  tgt_domain = ".".join([p for p in [ext.domain, ext.suffix] if p])
136
+
137
+ # NOTE: internal query string only (not shown to users)
138
  query = f"{anchor_text} — relevant to: {tgt_title} ({tgt_domain})"
139
  q_emb = embed([query])[0]
140
 
141
+ blk_embs = embed(blocks)
142
+ sims = F.cosine_similarity(blk_embs, q_emb.repeat(len(blocks),1))
143
+ top_idx = torch.topk(sims, k=min(top_k, len(blocks))).indices.tolist()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
 
145
  results = []
146
+ for idx in top_idx:
147
+ blk = blocks[idx] # full paragraph
148
  sents = re.split(r'(?<=[.!?])\s+', blk)
149
  s_embs = embed(sents)
150
  s_sims = F.cosine_similarity(s_embs, q_emb.repeat(len(sents),1))
151
  si = int(torch.argmax(s_sims))
152
  best_sent = sents[si]
153
+ rewritten_sent, exact_found = inject_anchor_into_sentence(best_sent, anchor_text, target_url)
154
  results.append({
155
  "anchor_was_present": exact_found,
156
  "best_sentence_original": best_sent,
157
  "best_sentence_with_anchor": rewritten_sent,
158
  "best_paragraph": blk,
159
  "tgt_title": tgt_title,
160
+ "tgt_desc": tgt_desc
 
161
  })
162
  return results
163
 
164
+ # ---------- Plain-text helper (preserve spacing between tags)
165
def to_plain_text(html_or_text):
    """Strip markup from ``html_or_text``, joining text nodes with single spaces."""
    parsed = BeautifulSoup(html_or_text, "html.parser")
    # A space separator keeps words from adjacent tags from running together.
    return parsed.get_text(separator=" ", strip=True)
167
+
168
+ # ---------- Distortion / safety helpers
169
  def detect_primary_brand(paragraph: str) -> str:
170
+ """
171
+ Heuristic: catch brand phrases like 'Base Casino', 'Acme Platform', 'Something App'.
172
+ Returns lowercased brand phrase or ''.
173
+ """
174
  p = paragraph.strip()
175
  m = re.search(r'\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+){0,2})\s+(Casino|Platform|Site|Service|App)\b', p)
176
  if m:
 
180
 
181
  def rewrite_would_distort_meaning(original_text: str, rewritten_html: str, anchor_text: str, paragraph_text: str = "") -> bool:
182
  """
183
+ True if the rewrite likely misattributes the subject or positions the anchor as the mechanism.
184
+ Also flags if the anchor appears before the paragraph's main brand or too early overall,
185
+ or if it introduces content-type nouns that weren't present in the original.
186
  """
187
  plain_rewrite = BeautifulSoup(rewritten_html, "html.parser").get_text(" ").strip().lower()
188
  plain_orig = original_text.strip().lower()
 
193
  pos_a = plain_rewrite.find(a)
194
  pos_b = plain_rewrite.find(brand)
195
  if pos_b != -1 and pos_a != -1 and pos_a < pos_b:
196
+ return True # anchor introduced before the paragraph’s brand
197
 
198
+ # Anchor appears very early -> often implies subject shift
199
  if a in plain_rewrite:
200
  pos = plain_rewrite.find(a)
201
  if pos != -1 and pos <= max(4, int(0.20 * len(plain_rewrite))):
202
  return True
203
 
204
+ # Anchor as the mechanism or double "at"
205
  mechanism_patterns = [
206
  rf'\bthrough\s+{re.escape(a)}\b',
207
  rf'\bvia\s+{re.escape(a)}\b',
 
212
  if re.search(pat, plain_rewrite):
213
  return True
214
 
215
+ # Re-attribute authorship/hosting to anchor
216
  bad_hosting = [
217
  rf'(this|the)\s+guide\s+(at|on|from)\s+{re.escape(a)}\b',
218
  rf'\b{re.escape(a)}\b\s+(explains|shows|details|covers)\b',
 
222
  if re.search(pat, plain_rewrite):
223
  return True
224
 
225
+ # Introducing content-type nouns when not present in original
226
+ content_nouns = ["guide", "article", "post", "review", "platform", "site", "resource"]
227
  if any(n in plain_rewrite for n in content_nouns) and not any(n in plain_orig for n in content_nouns):
228
  return True
229
 
 
234
  return to_plain_text(html) if plain_text else html
235
 
236
  # =========================
237
+ # GPT rewrite (editorial with paragraph context; can choose inline vs add-after)
238
  # =========================
239
  def _openai_chat(model_name: str, system: str, user_json: dict):
240
  headers = {"Authorization": f"Bearer {OPENAI_API_KEY}", "Content-Type": "application/json"}
 
249
  }
250
  r = requests.post(OPENAI_CHAT_URL, headers=headers, json=body, timeout=60)
251
  print(f"[GPT] Model={model_name} HTTP {r.status_code}")
252
+ r.raise_for_status()
 
253
  txt = r.json()["choices"][0]["message"]["content"]
254
  return json.loads(txt)
255
 
256
+ def gpt_decide_and_rewrite(paragraph_text, chosen_sentence, anchor_text, target_url, tgt_title, tgt_desc):
257
+ """
258
+ Sends FULL PARAGRAPH + CHOSEN SENTENCE + TARGET METADATA to GPT.
259
+ GPT must return:
260
+ - mode: "inline" or "add_after"
261
+ - sentence_html (required if mode=inline)
262
+ - add_after_html (required if mode=add_after)
263
+ Enforces: no em-dash, no CTA clichés, neutral attribution unless metadata allows.
264
+ """
265
  if not OPENAI_API_KEY:
266
  print("[GPT] No OPENAI_API_KEY found → using fallback inline.")
267
+ return {"mode": "inline", "sentence_html": chosen_sentence}
268
 
269
+ # Determine which content-type nouns are allowed based on metadata
270
+ meta = f"{tgt_title} {tgt_desc}".lower()
271
+ allowed_nouns = [w for w in ["guide","article","blog","review","platform","site","resource"] if w in meta]
 
 
 
272
 
273
  system = (
274
  "You are a professional content editor.\n"
 
281
  "HARD RULES:\n"
282
  "1) If inline: include an <a href> with the EXACT anchor text; keep length close; no em-dash; avoid 'for details', "
283
  "'click here', 'learn more', 'visit', 'read more', 'via', 'through'. Do NOT present the anchor as the mechanism "
284
+ "for the action (never 'through ANCHOR', 'via ANCHOR'). Prefer neutral adjuncts like 'also at', 'with context at', "
285
+ "'additional information at', or 'resources at' before the anchor. Place the anchor within the first 70% of the sentence "
286
+ "but after the paragraph’s brand/subject.\n"
287
  "2) If add_after: return a single short line like 'Related resource: <a href=\"URL\">ANCHOR</a>.' "
288
  "(12–14 words max, neutral tone).\n\n"
289
  "OUTPUT JSON ONLY with keys: mode ('inline'|'add_after'), sentence_html (if inline), add_after_html (if add_after)."
290
  )
291
 
 
 
 
292
  user = {
293
  "paragraph_text": paragraph_text,
294
  "chosen_sentence": chosen_sentence,
 
296
  "target_url": target_url,
297
  "target_metadata": {"title": tgt_title, "description": tgt_desc},
298
  "allowed_nouns_from_metadata": allowed_nouns,
 
299
  "constraints": {
300
+ "avoid": [
301
+ "for details", "click here", "learn more", "visit", "read more",
302
+ "via", "through", "—", "--", " - "
303
+ ],
304
+ "preferred_connectors": ["at", "on", "from", "in"],
305
  "place_anchor": "inside_first_70_percent"
306
  }
307
  }
308
 
309
+ try:
310
+ obj = _openai_chat(PREFERRED_OPENAI_MODEL, system, user)
311
+ except Exception as e:
312
+ print(f"[GPT] Preferred model failed: {e}. Falling back to {FALLBACK_OPENAI_MODEL}.")
313
  try:
314
+ obj = _openai_chat(FALLBACK_OPENAI_MODEL, system, user)
315
+ except Exception as e2:
316
+ print(f"[GPT] Fallback failed: {e2}. Using inline fallback.")
317
+ return {"mode": "inline", "sentence_html": chosen_sentence}
318
+
319
+ # Normalize output
320
+ mode = obj.get("mode", "inline")
321
+ if mode not in ("inline", "add_after"):
322
+ mode = "inline"
323
+ return {
324
+ "mode": mode,
325
+ "sentence_html": obj.get("sentence_html", ""),
326
+ "add_after_html": obj.get("add_after_html", "")
327
+ }
 
 
 
328
 
329
  # =========================
330
+ # Gradio UI
331
  # =========================
332
  def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text):
333
+ if not source_url or not target_url or not anchor_text:
334
+ return "❌ Please provide Source URL, Target URL, and Anchor Text."
335
+
336
+ # Auto-correct swapped inputs
337
+ warn = ""
338
+ if looks_like_url(anchor_text) and not looks_like_url(target_url):
339
+ anchor_text, target_url = target_url, anchor_text
340
+ warn = "ℹ️ Detected swapped inputs. I used the URL as Target URL and the text as Anchor.\n\n"
341
+
342
+ target_url = normalize_url(target_url)
343
+
344
+ res = suggest_insertions(source_url, target_url, anchor_text, top_k=1)[0]
345
+ if "error" in res:
346
+ return f"❌ {res['error']}"
347
+
348
+ draft_html = res["best_sentence_with_anchor"]
349
+ orig_sentence = res["best_sentence_original"]
350
+ paragraph = res["best_paragraph"]
351
+ tgt_title = res.get("tgt_title", "")
352
+ tgt_desc = res.get("tgt_desc", "")
353
+
354
+ # Optional conservative rule: force add-after for "This guide ..."
355
+ # if orig_sentence.strip().lower().startswith("this guide"):
356
+ # add_after = build_related_resource_line(target_url, anchor_text, plain_text)
357
+ # return warn + "Add this mini-line after the paragraph:\n\n" + add_after
358
+
359
+ if smart_rewrite:
360
+ # Ask GPT to decide: inline vs add-after (with full paragraph context)
361
+ decision = gpt_decide_and_rewrite(paragraph, orig_sentence, anchor_text, target_url, tgt_title, tgt_desc)
362
+ mode = decision.get("mode", "inline")
363
+
364
+ if mode == "inline":
365
+ final_html = decision.get("sentence_html", "") or draft_html
366
+ # Safety gate: reject if it would distort meaning
367
+ if rewrite_would_distort_meaning(orig_sentence, final_html, anchor_text, paragraph):
368
+ add_after = build_related_resource_line(target_url, anchor_text, plain_text)
369
+ return warn + "Add this mini-line after the paragraph (to avoid changing its meaning):\n\n" + add_after
370
+
371
+ final_output = to_plain_text(final_html) if plain_text else final_html
372
+ # We propose a replacement to ensure the exact integrated version is used
373
+ return warn + f"Change this sentence:\n\n{orig_sentence}\n\nWith this one:\n\n{final_output}"
374
+
375
+ else: # add_after
376
+ add_line = decision.get("add_after_html") or build_related_resource_line(target_url, anchor_text, False)
377
+ add_line_out = to_plain_text(add_line) if plain_text else add_line
378
+ return warn + "Add this mini-line after the paragraph:\n\n" + add_line_out
379
 
380
+ else:
381
+ # No GPT: use heuristic inline fallback already injected in draft_html
382
+ final_output = to_plain_text(draft_html) if plain_text else draft_html
383
+ if res.get("anchor_was_present", False):
384
+ return warn + f"✅ Add link here:\n\n{final_output}"
385
  else:
386
+ return warn + f"Change this sentence:\n\n{orig_sentence}\n\nWith this one:\n\n{final_output}"
 
 
 
 
 
387
 
388
+ # Show GPT status / model in the header
 
 
 
 
 
 
389
  gpt_status = "ON" if OPENAI_API_KEY else "OFF"
390
+ title_model = PREFERRED_OPENAI_MODEL if OPENAI_API_KEY else "OFF"
391
 
392
  demo = gr.Interface(
393
  fn=run_tool,
 
396
  gr.Textbox(label="Target URL"),
397
  gr.Textbox(label="Anchor Text"),
398
  gr.Checkbox(label="Smart rewrite (GPT)", value=True),
399
+ gr.Checkbox(label="Plain text (no URL)", value=False)
400
  ],
401
+ outputs=gr.Textbox(label="Result", lines=12),
402
  title=f"Link Insertion Helper • GPT: {gpt_status} • Model: {title_model}",
403
+ description="Chooses safe inline rewrite vs neutral add-after using full paragraph context. Toggle GPT and Plain text (no URL) as needed."
404
  )
405
 
406
  if __name__ == "__main__":