Spaces:

dusan-presswhizz
/

PressWhizz-Link-Insert-Suggestion-API

Sleeping

App Files Community

dusan-presswhizz committed on Aug 23, 2025

Commit

3840a00

verified ·

1 Parent(s): 2e33344

Update app.py

Browse files

Files changed (1) hide show

app.py +167 -357

app.py CHANGED Viewed

@@ -1,6 +1,6 @@
-import os, re, json, requests, urllib.parse, traceback
 import torch, torch.nn.functional as F
-from bs4 import BeautifulSoup, Tag
 from transformers import AutoTokenizer, AutoModel
 import tldextract
 import gradio as gr
@@ -12,9 +12,10 @@ MODEL = "michiyasunaga/LinkBERT-base"
 UA = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36"}
 # --- OpenAI settings ---
-OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")            # Settings → Variables & secrets
-PREF_CHAIN = [os.getenv("OPENAI_MODEL", "gpt-5o"), "gpt-4o", "gpt-4o-mini"]  # try in this order automatically
-OPENAI_CHAT_URL = "https://api.openai.com/v1/chat/completions"
 # =========================
 # Load LinkBERT
@@ -23,7 +24,7 @@ tok = AutoTokenizer.from_pretrained(MODEL)
 enc = AutoModel.from_pretrained(MODEL)
 # =========================
-# General helpers
 # =========================
 def looks_like_url(text: str) -> bool:
     if not text:
@@ -41,148 +42,19 @@ def normalize_url(url: str) -> str:
         return "https://" + url
     return url
-def to_plain_text(html_or_text: str) -> str:
-    return BeautifulSoup(html_or_text, "html.parser").get_text(separator=" ", strip=True)
-# =========================
-# Main-content extraction (filters bios/sidebars/comments/etc.) — **hardened**
-# =========================
 def get_text_blocks(url):
-    try:
-        resp = requests.get(url, timeout=25, headers=UA)
-        resp.raise_for_status()
-    except Exception as e:
-        raise RuntimeError(f"Failed to fetch Source URL ({url}): {e}")
     soup = BeautifulSoup(resp.text, "html.parser")
-    # Remove global noise early
-    for tag in soup(["script","style","noscript","svg","form","header","footer","nav","aside"]):
-        try:
-            tag.decompose()
-        except Exception:
-            pass
-    # Prefer a main/article container
-    candidates = []
-    for sel in [
-       "article",
-    '[itemprop="articleBody"]',
-    '[role="main"]',
-    "main",
-    ".entry-content",
-    ".post-content",
-    ".post__content",
-    ".single-post",
-    ".blog-post",
-    ".content__body",
-    # add these:
-    ".article-content",
-    ".post",
-    ".content",
-    ".entry",
-    ".site-content",
-    ".page-content",
-    ]:
-        try:
-            found = soup.select_one(sel)
-        except Exception:
-            found = None
-        if isinstance(found, Tag):
-            txtlen = len(found.get_text(strip=True))
-            if txtlen > 200:
-                candidates.append(found)
-    root = None
-    if candidates:
-        root = max(candidates, key=lambda n: len(n.get_text(strip=True)))
-    else:
-        root = soup.body if isinstance(soup.body, Tag) else soup
-    if not isinstance(root, Tag):
-        # last-ditch: use the whole doc as a string
-        text = soup.get_text(" ", strip=True)
-        return [text] if len(text) > 80 else []
-    # Drop common noisy sections within root (robust to odd nodes)
-    blacklist = [
-        "author","about-author","post-author","authorbox","byline","bio","profile",
-        "share","sharing","social","follow",
-        "comment","comments","reply",
-        "related","recommend",
-        "newsletter","subscribe",
-        "sidebar","widget",
-        "tag-cloud","tags","breadcrumbs","pagination",
-        "advert","ad-","promo","sponsored"
-    ]
-    for el in list(root.find_all(True)):
-        if not isinstance(el, Tag):
-            continue
-        try:
-            cls = " ".join(el.get("class") or []).lower()
-            idv = (el.get("id") or "").lower()
-        except Exception:
-            cls, idv = "", ""
-        if any(key in cls or key in idv for key in blacklist):
-            try:
-                el.decompose()
-            except Exception:
-                pass
-    # Collect paragraphs/list items/headings that look like article content
     blocks = []
-    for el in root.find_all(["p","li","h2","h3","h4","blockquote"]):
-        if not isinstance(el, Tag):
-            continue
-        try:
-            txt = " ".join(el.get_text(" ").split())
-        except Exception:
-            continue
-        if not txt:
-            continue
-        if txt.lstrip().startswith("#"):        # skip hashtaggy lines
-            continue
-        if len(txt) < 40:                       # too short to be useful
-            continue
-        # light bio filter: many first-person cues in a context that mentions "author"
-        try:
-            context_text = root.get_text(" ").lower()
-        except Exception:
-            context_text = ""
-        first_person_hits = sum(w in txt.lower() for w in [" i ", " i'm ", " i’m ", " my ", " me ", " myself "])
-        if first_person_hits >= 2 and "author" in context_text:
-            continue
-        blocks.append(txt)
-# Fallback: if we found nothing, do a lenient sweep over body paragraphs
-    if not blocks:
-        body = soup.body if soup and soup.body else soup
-        if body:
-            for el in body.find_all(["p","li","h2","h3","h4","blockquote"]):
-                if not isinstance(el, Tag):
-                    continue
-                txt = " ".join(el.get_text(" ").split())
-                if len(txt) >= 40 and not txt.lstrip().startswith("#"):
-                    blocks.append(txt)
-    # Last resort: try AMP version if still empty
-    if not blocks:
-        try:
-            amp_url = (url.rstrip("/") + "/amp") if "/amp" not in url else url
-            r2 = requests.get(amp_url, timeout=20, headers=UA)
-            if r2.ok:
-                s2 = BeautifulSoup(r2.text, "html.parser")
-                for el in s2.find_all(["p","li","h2","h3","h4","blockquote"]):
-                    t = " ".join(el.get_text(" ").split())
-                    if len(t) >= 40:
-                        blocks.append(t)
-        except Exception:
-            pass
     return blocks
-# =========================
-# Embeddings
-# =========================
 def mean_pool(last_hidden_state, mask):
     x = last_hidden_state
     mask = mask.unsqueeze(-1)
@@ -194,176 +66,111 @@ def embed(texts):
         out = enc(**batch)
     return mean_pool(out.last_hidden_state, batch["attention_mask"])
-# =========================
-# Target page type classification (content / ecom / generic)
-# =========================
-def classify_target_type(url: str, title: str, desc: str) -> str:
-    u = (url or "").lower()
-    m = f"{title or ''} {desc or ''}".lower()
-    content_hits = any(k in u for k in ["/blog", "/blogs", "/article", "/how-to", "/news"]) \
-                   or any(k in m for k in ["blog","article","how to","guide","tips","news"])
-    if content_hits:
-        return "content"
-    ecom_hits = any(k in u for k in ["/product","/products","/collection","/collections","/category","/cart","/shop"]) \
-                or any(k in m for k in ["price","add to cart","sku","in stock","buy now","free shipping"])
-    if ecom_hits:
-        return "ecom"
-    return "generic"
-# =========================
-# Type-aware fallback injection (when GPT is OFF or fails)
-# =========================
-def inject_anchor_into_sentence(sentence, anchor_text, target_url, target_type="generic"):
     """
-    Wrap anchor if present; else integrate mid-sentence with a type-aware neutral adjunct.
-    Avoid em-dash and CTA clichés. Prefer add-after if the sentence is clearly about itself ("This guide…").
     """
     def norm(x): return re.sub(r'[^a-z0-9 ]','',x.lower())
     n_sent, n_anchor = norm(sentence), norm(anchor_text)
     if n_sent.startswith("this guide") or n_sent.startswith("our platform") or n_sent.startswith("base casino"):
         html = sentence
         add_after = f' Related resource: <a href="{target_url}">{anchor_text}</a>.'
         return html + add_after, False
     if n_anchor and n_anchor in n_sent:
         pattern = re.compile(re.escape(anchor_text), re.IGNORECASE)
         return pattern.sub(f'<a href="{target_url}">{anchor_text}</a>', sentence), True
     insert_html = f'<a href="{target_url}">{anchor_text}</a>'
-    if target_type == "ecom":
-        adjuncts = [
-            f' with supplies available from {insert_html}',
-            f' with equipment available from {insert_html}',
-            f' from {insert_html}',
-        ]
-    elif target_type == "content":
-        adjuncts = [
-            f' with tips from {insert_html}',
-            f' in an article on {insert_html}',
-            f' with guidance from {insert_html}',
-        ]
-    else:
-        adjuncts = [
-            f' with additional context at {insert_html}',
-            f' with resources at {insert_html}',
-            f' at {insert_html}',
-        ]
-    clause = adjuncts[0]
-    m = re.search(r'\b(games?|content|options?|features?|benefits?|floors?|surfaces?|beauty|makeup|lashes?)\b',
-                  sentence, flags=re.I)
     if m:
         idx = m.start()
-        return (sentence[:idx] + clause + ' ' + sentence[idx:]).strip(), False
     m2 = re.search(r',\s*', sentence)
     if m2:
         idx = m2.end()
-        return (sentence[:idx] + clause + ' ' + sentence[idx:]).strip(), False
     m3 = re.search(r'\bto\b', sentence, flags=re.I)
     if m3:
         idx = m3.start()
-        return (sentence[:idx] + clause + ' ' + sentence[idx:]).strip(), False
     if sentence.endswith(('.', '!', '?')):
         base, punct = sentence[:-1], sentence[-1]
     else:
         base, punct = sentence, '.'
-    rewritten = f'{base}{clause}{punct}'
     return rewritten, False
-# =========================
-# Selection (keywords + similarity + threshold) and metadata
-# =========================
-def _kw_set(s: str):
-    s = re.sub(r'[^a-z0-9 ]+', ' ', (s or "").lower())
-    toks = [t for t in s.split() if len(t) > 2 and t not in {"the","and","for","with","from","this","that","are","you","your"}]
-    return set(toks[:8])
 def suggest_insertions(source_url, target_url, anchor_text, top_k=1):
     blocks = get_text_blocks(source_url)
     if not blocks:
-        return [{"error":"No article text blocks found on the page."}]
-    # ---- target metadata
-    tgt_title, tgt_desc = "", ""
     try:
-        tgt_html = requests.get(target_url, timeout=25, headers=UA).text
         soup_tgt = BeautifulSoup(tgt_html, "html.parser")
-        if soup_tgt and getattr(soup_tgt, "title", None) and soup_tgt.title:
-            tgt_title = (soup_tgt.title.get_text() or "").strip()
-        md = soup_tgt.find("meta", attrs={"name": "description"}) if soup_tgt else None
-        tgt_desc = ((md.get("content") or "").strip()) if md else ""
-    except Exception as e:
-        print(f"[WARN] Failed to fetch Target metadata: {e}")
-    target_type = classify_target_type(target_url, tgt_title, tgt_desc)
-    # soft keyword gate
-    kw = _kw_set(anchor_text) | _kw_set(tgt_title)
-    candidate_blocks = [b for b in blocks if (not kw or any(k in b.lower() for k in kw))]
-    if not candidate_blocks:
-        candidate_blocks = blocks
     ext = tldextract.extract(target_url)
     tgt_domain = ".".join([p for p in [ext.domain, ext.suffix] if p])
     query = f"{anchor_text} — relevant to: {tgt_title} ({tgt_domain})"
     q_emb = embed([query])[0]
-    blk_embs = embed(candidate_blocks)
-    sims = F.cosine_similarity(blk_embs, q_emb.repeat(len(candidate_blocks),1))
-    # similarity threshold (avoid random bios)
-    max_sim = float(torch.max(sims))
-    min_accept = max(0.18, max_sim - 0.10)
-    filtered = [(i, float(s)) for i, s in enumerate(sims) if float(s) >= min_accept]
-    if not filtered:
-        safe_paragraph = blocks[min(2, len(blocks)-1)]
-        return [{
-            "anchor_was_present": False,
-            "best_sentence_original": safe_paragraph,
-            "best_sentence_with_anchor": safe_paragraph + f' Related resource: <a href="{target_url}">{anchor_text}</a>.',
-            "best_paragraph": safe_paragraph,
-            "tgt_title": tgt_title,
-            "tgt_desc": tgt_desc,
-            "target_type": target_type
-        }]
-    filtered.sort(key=lambda x: x[1], reverse=True)
-    top_idx = [i for (i, _) in filtered[:min(top_k, len(filtered))]]
     results = []
-    for local_i in top_idx:
-        blk = candidate_blocks[local_i]
         sents = re.split(r'(?<=[.!?])\s+', blk)
         s_embs = embed(sents)
         s_sims = F.cosine_similarity(s_embs, q_emb.repeat(len(sents),1))
         si = int(torch.argmax(s_sims))
         best_sent = sents[si]
-        rewritten_sent, exact_found = inject_anchor_into_sentence(best_sent, anchor_text, target_url, target_type)
         results.append({
             "anchor_was_present": exact_found,
             "best_sentence_original": best_sent,
             "best_sentence_with_anchor": rewritten_sent,
             "best_paragraph": blk,
             "tgt_title": tgt_title,
-            "tgt_desc": tgt_desc,
-            "target_type": target_type
         })
     return results
-# =========================
-# Distortion / safety helpers
-# =========================
 def detect_primary_brand(paragraph: str) -> str:
-    """Heuristic: catch brand phrases like 'Base Casino', 'Something Platform' etc."""
     p = paragraph.strip()
     m = re.search(r'\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+){0,2})\s+(Casino|Platform|Site|Service|App)\b', p)
     if m:
@@ -373,8 +180,9 @@ def detect_primary_brand(paragraph: str) -> str:
 def rewrite_would_distort_meaning(original_text: str, rewritten_html: str, anchor_text: str, paragraph_text: str = "") -> bool:
     """
-    True if rewrite misattributes the subject or positions the anchor as the mechanism.
-    Also if the anchor appears before the paragraph brand, too early overall, or introduces content-type nouns not in original.
     """
     plain_rewrite = BeautifulSoup(rewritten_html, "html.parser").get_text(" ").strip().lower()
     plain_orig    = original_text.strip().lower()
@@ -385,13 +193,15 @@ def rewrite_would_distort_meaning(original_text: str, rewritten_html: str, ancho
         pos_a = plain_rewrite.find(a)
         pos_b = plain_rewrite.find(brand)
         if pos_b != -1 and pos_a != -1 and pos_a < pos_b:
-            return True
     if a in plain_rewrite:
         pos = plain_rewrite.find(a)
         if pos != -1 and pos <= max(4, int(0.20 * len(plain_rewrite))):
             return True
     mechanism_patterns = [
         rf'\bthrough\s+{re.escape(a)}\b',
         rf'\bvia\s+{re.escape(a)}\b',
@@ -402,6 +212,7 @@ def rewrite_would_distort_meaning(original_text: str, rewritten_html: str, ancho
         if re.search(pat, plain_rewrite):
             return True
     bad_hosting = [
         rf'(this|the)\s+guide\s+(at|on|from)\s+{re.escape(a)}\b',
         rf'\b{re.escape(a)}\b\s+(explains|shows|details|covers)\b',
@@ -411,7 +222,8 @@ def rewrite_would_distort_meaning(original_text: str, rewritten_html: str, ancho
         if re.search(pat, plain_rewrite):
             return True
-    content_nouns = ["guide","article","post","review","platform","site","resource"]
     if any(n in plain_rewrite for n in content_nouns) and not any(n in plain_orig for n in content_nouns):
         return True
@@ -422,7 +234,7 @@ def build_related_resource_line(target_url: str, anchor_text: str, plain_text=Fa
     return to_plain_text(html) if plain_text else html
 # =========================
-# GPT decision (inline vs add-after) with paragraph context + auto-fallback chain
 # =========================
 def _openai_chat(model_name: str, system: str, user_json: dict):
     headers = {"Authorization": f"Bearer {OPENAI_API_KEY}", "Content-Type": "application/json"}
@@ -437,22 +249,26 @@ def _openai_chat(model_name: str, system: str, user_json: dict):
     }
     r = requests.post(OPENAI_CHAT_URL, headers=headers, json=body, timeout=60)
     print(f"[GPT] Model={model_name} HTTP {r.status_code}")
-    if r.status_code >= 400:
-        raise RuntimeError(f"OpenAI error {r.status_code}: {r.text[:400]}")
     txt = r.json()["choices"][0]["message"]["content"]
     return json.loads(txt)
-def gpt_decide_and_rewrite(paragraph_text, chosen_sentence, anchor_text, target_url, tgt_title, tgt_desc, target_type):
     if not OPENAI_API_KEY:
         print("[GPT] No OPENAI_API_KEY found → using fallback inline.")
-        return {"mode": "inline", "sentence_html": chosen_sentence, "used_model": "none"}
-    if target_type == "ecom":
-        preferred_adjuncts = ["from", "available from", "supplies from", "equipment from", "shop at"]
-    elif target_type == "content":
-        preferred_adjuncts = ["in", "on", "from", "tips from", "article on", "guide on", "explained on"]
-    else:
-        preferred_adjuncts = ["at", "from", "with context at", "resources at"]
     system = (
         "You are a professional content editor.\n"
@@ -465,16 +281,14 @@ def gpt_decide_and_rewrite(paragraph_text, chosen_sentence, anchor_text, target_
         "HARD RULES:\n"
         "1) If inline: include an <a href> with the EXACT anchor text; keep length close; no em-dash; avoid 'for details', "
         "'click here', 'learn more', 'visit', 'read more', 'via', 'through'. Do NOT present the anchor as the mechanism "
-        "for the action (never 'through ANCHOR', 'via ANCHOR'). Prefer one of these adjuncts before the anchor when inlining: "
-        f"{', '.join(preferred_adjuncts)}. Place the anchor within the first 70% of the sentence and after the paragraph’s brand/subject.\n"
         "2) If add_after: return a single short line like 'Related resource: <a href=\"URL\">ANCHOR</a>.' "
         "(12–14 words max, neutral tone).\n\n"
         "OUTPUT JSON ONLY with keys: mode ('inline'|'add_after'), sentence_html (if inline), add_after_html (if add_after)."
     )
-    meta = f"{tgt_title} {tgt_desc}".lower()
-    allowed_nouns = [w for w in ["guide","article","blog","review","platform","site","resource"] if w in meta]
     user = {
         "paragraph_text": paragraph_text,
         "chosen_sentence": chosen_sentence,
@@ -482,102 +296,98 @@ def gpt_decide_and_rewrite(paragraph_text, chosen_sentence, anchor_text, target_
         "target_url": target_url,
         "target_metadata": {"title": tgt_title, "description": tgt_desc},
         "allowed_nouns_from_metadata": allowed_nouns,
-        "target_type": target_type,
         "constraints": {
-            "avoid": ["for details","click here","learn more","visit","read more","via","through","—","--"," - "],
-            "preferred_connectors": preferred_adjuncts,
             "place_anchor": "inside_first_70_percent"
         }
     }
-    last_err = None
-    for model_name in PREF_CHAIN:
         try:
-            obj = _openai_chat(model_name, system, user)
-            mode = obj.get("mode", "inline")
-            if mode not in ("inline", "add_after"):
-                mode = "inline"
-            return {
-                "mode": mode,
-                "sentence_html": obj.get("sentence_html", ""),
-                "add_after_html": obj.get("add_after_html", ""),
-                "used_model": model_name
-            }
-        except Exception as e:
-            print(f"[GPT] {model_name} failed: {e}")
-            last_err = e
-            continue
-    print(f"[GPT] All models failed, using inline fallback. Last error: {last_err}")
-    return {"mode": "inline", "sentence_html": chosen_sentence, "used_model": "fallback-inline"}
 # =========================
-# Gradio UI / Orchestration
 # =========================
 def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text):
-    try:
-        if not source_url or not target_url or not anchor_text:
-            return "❌ Please provide Source URL, Target URL, and Anchor Text."
-        diag = []  # diagnostics to show at the end
-        warn = ""
-        if looks_like_url(anchor_text) and not looks_like_url(target_url):
-            anchor_text, target_url = target_url, anchor_text
-            warn = "ℹ️ Detected swapped inputs. I used the URL as Target URL and the text as Anchor.\n\n"
-        target_url = normalize_url(target_url)
-        res_list = suggest_insertions(source_url, target_url, anchor_text, top_k=1)
-        res = res_list[0]
-        if "error" in res:
-            return f"❌ {res['error']}"
-        draft_html   = res["best_sentence_with_anchor"]
-        orig_sentence = res["best_sentence_original"]
-        paragraph     = res["best_paragraph"]
-        tgt_title     = res.get("tgt_title", "")
-        tgt_desc      = res.get("tgt_desc", "")
-        target_type   = res.get("target_type", "generic")
-        if smart_rewrite:
-            decision = gpt_decide_and_rewrite(paragraph, orig_sentence, anchor_text, target_url, tgt_title, tgt_desc, target_type)
-            used_model = decision.get("used_model", "unknown")
-            if used_model:
-                diag.append(f"Model: {used_model}")
-            mode = decision.get("mode", "inline")
-            if mode == "inline":
-                final_html = decision.get("sentence_html", "") or draft_html
-                if rewrite_would_distort_meaning(orig_sentence, final_html, anchor_text, paragraph):
-                    add_after = build_related_resource_line(target_url, anchor_text, plain_text)
-                    body = warn + "Add this mini-line after the paragraph (to avoid changing its meaning):\n\n" + add_after
-                else:
-                    final_output = to_plain_text(final_html) if plain_text else final_html
-                    body = warn + f"Change this sentence:\n\n{orig_sentence}\n\nWith this one:\n\n{final_output}"
-            else:  # add_after
-                add_line = decision.get("add_after_html") or build_related_resource_line(target_url, anchor_text, False)
-                add_line_out = to_plain_text(add_line) if plain_text else add_line
-                body = warn + "Add this mini-line after the paragraph:\n\n" + add_line_out
         else:
-            final_output = to_plain_text(draft_html) if plain_text else draft_html
-            body = warn + f"Change this sentence:\n\n{orig_sentence}\n\nWith this one:\n\n{final_output}"
-        if diag:
-            body += "\n\n—\n" + " · ".join(diag)
-        return body
-    except Exception as e:
-        tb = traceback.format_exc(limit=50)  # show enough context
-        return f"❌ Error: {e}\n\n{tb}"
-# =========================
-# Launch
-# =========================
 gpt_status = "ON" if OPENAI_API_KEY else "OFF"
-title_model = PREF_CHAIN[0] if OPENAI_API_KEY else "OFF"
 demo = gr.Interface(
     fn=run_tool,
@@ -586,11 +396,11 @@ demo = gr.Interface(
         gr.Textbox(label="Target URL"),
         gr.Textbox(label="Anchor Text"),
         gr.Checkbox(label="Smart rewrite (GPT)", value=True),
-        gr.Checkbox(label="Plain text (no URL)", value=False),
     ],
-    outputs=gr.Textbox(label="Result", lines=14),
     title=f"Link Insertion Helper • GPT: {gpt_status} • Model: {title_model}",
-    description="Chooses safe inline rewrite vs neutral add-after using full paragraph context. Filters bios/comments/hashtags; uses a keyword + similarity gate; auto-falls back across models."
 )
 if __name__ == "__main__":

+import os, re, json, requests, urllib.parse
 import torch, torch.nn.functional as F
+from bs4 import BeautifulSoup
 from transformers import AutoTokenizer, AutoModel
 import tldextract
 import gradio as gr
 UA = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36"}
 # --- OpenAI settings ---
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")  # add in HF Spaces: Settings → Variables & secrets
+PREFERRED_OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-5o")  # preferred model
+FALLBACK_OPENAI_MODEL  = "gpt-4o-mini"                        # automatic fallback
+OPENAI_CHAT_URL        = "https://api.openai.com/v1/chat/completions"
 # =========================
 # Load LinkBERT
 enc = AutoModel.from_pretrained(MODEL)
 # =========================
+# Helpers
 # =========================
 def looks_like_url(text: str) -> bool:
     if not text:
         return "https://" + url
     return url
 def get_text_blocks(url):
+    resp = requests.get(url, timeout=20, headers=UA)
+    resp.raise_for_status()
     soup = BeautifulSoup(resp.text, "html.parser")
+    for tag in soup(["script","style","noscript","header","footer","nav","aside","form"]):
+        tag.decompose()
     blocks = []
+    for el in soup.find_all(["p","li","h2","h3","h4","blockquote"]):
+        txt = " ".join(el.get_text(" ").split())
+        if len(txt) > 60:
+            blocks.append(txt)
     return blocks
 def mean_pool(last_hidden_state, mask):
     x = last_hidden_state
     mask = mask.unsqueeze(-1)
         out = enc(**batch)
     return mean_pool(out.last_hidden_state, batch["attention_mask"])
+# ---------- Fallback: integrate anchor mid-sentence (no em-dash, no clichés, neutral nouns)
+def inject_anchor_into_sentence(sentence, anchor_text, target_url):
     """
+    Wrap anchor if present; otherwise integrate mid-sentence with a neutral preposition.
+    No em-dash. Avoid CTA clichés. Do not assert target content type.
+    Prefer 'Related resource' add-after if sentence begins with 'This guide' etc.
     """
     def norm(x): return re.sub(r'[^a-z0-9 ]','',x.lower())
     n_sent, n_anchor = norm(sentence), norm(anchor_text)
+    # If sentence clearly has its own subject ("This guide", "Our platform", "Base Casino"), prefer add-after
     if n_sent.startswith("this guide") or n_sent.startswith("our platform") or n_sent.startswith("base casino"):
         html = sentence
         add_after = f' Related resource: <a href="{target_url}">{anchor_text}</a>.'
         return html + add_after, False
+    # 1) If anchor words already present, wrap them
     if n_anchor and n_anchor in n_sent:
         pattern = re.compile(re.escape(anchor_text), re.IGNORECASE)
         return pattern.sub(f'<a href="{target_url}">{anchor_text}</a>', sentence), True
+    # 2) Otherwise, insert "at/on/from <a>anchor</a>" near a suitable noun
     insert_html = f'<a href="{target_url}">{anchor_text}</a>'
+    m = re.search(r'\b(games?|content|options?|features?|benefits?)\b', sentence, flags=re.I)
     if m:
         idx = m.start()
+        return (sentence[:idx] + f' at {insert_html} ' + sentence[idx:]).strip(), False
+    # after first comma
     m2 = re.search(r',\s*', sentence)
     if m2:
         idx = m2.end()
+        return (sentence[:idx] + f' at {insert_html} ' + sentence[idx:]).strip(), False
+    # around "to"
     m3 = re.search(r'\bto\b', sentence, flags=re.I)
     if m3:
         idx = m3.start()
+        return (sentence[:idx] + f' at {insert_html} ' + sentence[idx:]).strip(), False
+    # last resort: short neutral phrase
     if sentence.endswith(('.', '!', '?')):
         base, punct = sentence[:-1], sentence[-1]
     else:
         base, punct = sentence, '.'
+    rewritten = f'{base} with additional context available at {insert_html}{punct}'
     return rewritten, False
 def suggest_insertions(source_url, target_url, anchor_text, top_k=1):
     blocks = get_text_blocks(source_url)
     if not blocks:
+        return [{"error":"No text blocks found on the page."}]
+    # -------- target context (title + meta desc)
     try:
+        tgt_html = requests.get(target_url, timeout=20, headers=UA).text
         soup_tgt = BeautifulSoup(tgt_html, "html.parser")
+        tt = soup_tgt.title.get_text().strip() if soup_tgt.title else ""
+        md = soup_tgt.find("meta", attrs={"name": "description"})
+        tgt_desc = (md.get("content") or "").strip() if md else ""
+        tgt_title = tt
+    except Exception:
+        tgt_title, tgt_desc = "", ""
     ext = tldextract.extract(target_url)
     tgt_domain = ".".join([p for p in [ext.domain, ext.suffix] if p])
+    # NOTE: internal query string only (not shown to users)
     query = f"{anchor_text} — relevant to: {tgt_title} ({tgt_domain})"
     q_emb = embed([query])[0]
+    blk_embs = embed(blocks)
+    sims = F.cosine_similarity(blk_embs, q_emb.repeat(len(blocks),1))
+    top_idx = torch.topk(sims, k=min(top_k, len(blocks))).indices.tolist()
     results = []
+    for idx in top_idx:
+        blk = blocks[idx]                   # full paragraph
         sents = re.split(r'(?<=[.!?])\s+', blk)
         s_embs = embed(sents)
         s_sims = F.cosine_similarity(s_embs, q_emb.repeat(len(sents),1))
         si = int(torch.argmax(s_sims))
         best_sent = sents[si]
+        rewritten_sent, exact_found = inject_anchor_into_sentence(best_sent, anchor_text, target_url)
         results.append({
             "anchor_was_present": exact_found,
             "best_sentence_original": best_sent,
             "best_sentence_with_anchor": rewritten_sent,
             "best_paragraph": blk,
             "tgt_title": tgt_title,
+            "tgt_desc": tgt_desc
         })
     return results
+# ---------- Plain-text helper (preserve spacing between tags)
+def to_plain_text(html_or_text):
+    return BeautifulSoup(html_or_text, "html.parser").get_text(separator=" ", strip=True)
+# ---------- Distortion / safety helpers
 def detect_primary_brand(paragraph: str) -> str:
+    """
+    Heuristic: catch brand phrases like 'Base Casino', 'Acme Platform', 'Something App'.
+    Returns lowercased brand phrase or ''.
+    """
     p = paragraph.strip()
     m = re.search(r'\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+){0,2})\s+(Casino|Platform|Site|Service|App)\b', p)
     if m:
 def rewrite_would_distort_meaning(original_text: str, rewritten_html: str, anchor_text: str, paragraph_text: str = "") -> bool:
     """
+    True if the rewrite likely misattributes the subject or positions the anchor as the mechanism.
+    Also flags if the anchor appears before the paragraph's main brand or too early overall,
+    or if it introduces content-type nouns that weren't present in the original.
     """
     plain_rewrite = BeautifulSoup(rewritten_html, "html.parser").get_text(" ").strip().lower()
     plain_orig    = original_text.strip().lower()
         pos_a = plain_rewrite.find(a)
         pos_b = plain_rewrite.find(brand)
         if pos_b != -1 and pos_a != -1 and pos_a < pos_b:
+            return True  # anchor introduced before the paragraph’s brand
+    # Anchor appears very early -> often implies subject shift
     if a in plain_rewrite:
         pos = plain_rewrite.find(a)
         if pos != -1 and pos <= max(4, int(0.20 * len(plain_rewrite))):
             return True
+    # Anchor as the mechanism or double "at"
     mechanism_patterns = [
         rf'\bthrough\s+{re.escape(a)}\b',
         rf'\bvia\s+{re.escape(a)}\b',
         if re.search(pat, plain_rewrite):
             return True
+    # Re-attribute authorship/hosting to anchor
     bad_hosting = [
         rf'(this|the)\s+guide\s+(at|on|from)\s+{re.escape(a)}\b',
         rf'\b{re.escape(a)}\b\s+(explains|shows|details|covers)\b',
         if re.search(pat, plain_rewrite):
             return True
+    # Introducing content-type nouns when not present in original
+    content_nouns = ["guide", "article", "post", "review", "platform", "site", "resource"]
     if any(n in plain_rewrite for n in content_nouns) and not any(n in plain_orig for n in content_nouns):
         return True
     return to_plain_text(html) if plain_text else html
 # =========================
+# GPT rewrite (editorial with paragraph context; can choose inline vs add-after)
 # =========================
 def _openai_chat(model_name: str, system: str, user_json: dict):
+    # Send one chat request to OPENAI_CHAT_URL for `model_name` and return the
+    # first choice's message content parsed as JSON.
+    # Errors propagate to the caller: r.raise_for_status() raises on an HTTP
+    # error status and json.loads() raises when the content is not valid JSON.
+    # NOTE(review): the lines that build `body` (presumably from `system` and
+    # `user_json`) are not visible in this diff view; only its closing brace is.
     headers = {"Authorization": f"Bearer {OPENAI_API_KEY}", "Content-Type": "application/json"}
     }
+    # timeout=60 bounds how long requests waits on the API server.
     r = requests.post(OPENAI_CHAT_URL, headers=headers, json=body, timeout=60)
     print(f"[GPT] Model={model_name} HTTP {r.status_code}")
+    r.raise_for_status()
     txt = r.json()["choices"][0]["message"]["content"]
     return json.loads(txt)
+def gpt_decide_and_rewrite(paragraph_text, chosen_sentence, anchor_text, target_url, tgt_title, tgt_desc):
+    """
+    Ask GPT to place the link inline or as a separate line after the paragraph.
+
+    Sends FULL PARAGRAPH + CHOSEN SENTENCE + ANCHOR TEXT + TARGET METADATA to GPT.
+    GPT must return:
+      - mode: "inline" or "add_after"
+      - sentence_html (required if mode=inline)
+      - add_after_html (required if mode=add_after)
+    The prompt asks for: no em-dash, no CTA clichés, neutral attribution unless metadata allows.
+
+    Returns a dict with keys mode, sentence_html and add_after_html; the two
+    HTML values are always strings (possibly empty).
+    When GPT is unavailable (no API key, both models fail, or the reply is not
+    a JSON object) the result is mode="inline" with an empty sentence_html.
+    The empty string lets a caller substitute its own already-linked draft
+    instead of receiving the unlinked original sentence back.
+    """
+    # Shared "GPT unavailable" result; built per call so callers may mutate it.
+    fallback = {"mode": "inline", "sentence_html": "", "add_after_html": ""}
     if not OPENAI_API_KEY:
         print("[GPT] No OPENAI_API_KEY found → using fallback inline.")
+        return fallback
+    # Determine which content-type nouns are allowed based on metadata
+    meta = f"{tgt_title} {tgt_desc}".lower()
+    allowed_nouns = [w for w in ["guide","article","blog","review","platform","site","resource"] if w in meta]
     system = (
         "You are a professional content editor.\n"
         "HARD RULES:\n"
         "1) If inline: include an <a href> with the EXACT anchor text; keep length close; no em-dash; avoid 'for details', "
         "'click here', 'learn more', 'visit', 'read more', 'via', 'through'. Do NOT present the anchor as the mechanism "
+        "for the action (never 'through ANCHOR', 'via ANCHOR'). Prefer neutral adjuncts like 'also at', 'with context at', "
+        "'additional information at', or 'resources at' before the anchor. Place the anchor within the first 70% of the sentence "
+        "but after the paragraph’s brand/subject.\n"
         "2) If add_after: return a single short line like 'Related resource: <a href=\"URL\">ANCHOR</a>.' "
         "(12–14 words max, neutral tone).\n\n"
         "OUTPUT JSON ONLY with keys: mode ('inline'|'add_after'), sentence_html (if inline), add_after_html (if add_after)."
     )
     user = {
         "paragraph_text": paragraph_text,
         "chosen_sentence": chosen_sentence,
+        # The prompt demands the EXACT anchor text, so the model must be given it.
+        "anchor_text": anchor_text,
         "target_url": target_url,
         "target_metadata": {"title": tgt_title, "description": tgt_desc},
         "allowed_nouns_from_metadata": allowed_nouns,
         "constraints": {
+            "avoid": [
+                "for details", "click here", "learn more", "visit", "read more",
+                "via", "through", "—", "--", " - "
+            ],
+            "preferred_connectors": ["at", "on", "from", "in"],
             "place_anchor": "inside_first_70_percent"
         }
     }
+    try:
+        obj = _openai_chat(PREFERRED_OPENAI_MODEL, system, user)
+    except Exception as e:
+        print(f"[GPT] Preferred model failed: {e}. Falling back to {FALLBACK_OPENAI_MODEL}.")
         try:
+            obj = _openai_chat(FALLBACK_OPENAI_MODEL, system, user)
+        except Exception as e2:
+            print(f"[GPT] Fallback failed: {e2}. Using inline fallback.")
+            return fallback
+    # json.loads may yield a list/str/number; only a JSON object is usable here.
+    if not isinstance(obj, dict):
+        print("[GPT] Response was not a JSON object. Using inline fallback.")
+        return fallback
+    # Normalize output
+    mode = obj.get("mode", "inline")
+    if mode not in ("inline", "add_after"):
+        mode = "inline"
+    # A JSON null (or any non-string) is treated the same as a missing key.
+    sentence_html = obj.get("sentence_html")
+    add_after_html = obj.get("add_after_html")
+    return {
+        "mode": mode,
+        "sentence_html": sentence_html if isinstance(sentence_html, str) else "",
+        "add_after_html": add_after_html if isinstance(add_after_html, str) else ""
+    }
 # =========================
+# Gradio UI
 # =========================
 def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text):
+    # Gradio callback: returns one user-facing message string describing where
+    # and how to insert a link to target_url with anchor_text in the source page.
+    # smart_rewrite=True asks GPT to choose between an inline rewrite and an
+    # add-after line; smart_rewrite=False uses the heuristic draft as-is.
+    # plain_text=True passes the suggested snippet through to_plain_text().
+    if not source_url or not target_url or not anchor_text:
+        return "❌ Please provide Source URL, Target URL, and Anchor Text."
+    # Auto-correct swapped inputs
+    warn = ""
+    if looks_like_url(anchor_text) and not looks_like_url(target_url):
+        anchor_text, target_url = target_url, anchor_text
+        warn = "ℹ️ Detected swapped inputs. I used the URL as Target URL and the text as Anchor.\n\n"
+    target_url = normalize_url(target_url)
+    # Only the top-ranked suggestion is used (top_k=1, first element).
+    res = suggest_insertions(source_url, target_url, anchor_text, top_k=1)[0]
+    if "error" in res:
+        return f"❌ {res['error']}"
+    draft_html    = res["best_sentence_with_anchor"]
+    orig_sentence = res["best_sentence_original"]
+    paragraph     = res["best_paragraph"]
+    tgt_title     = res.get("tgt_title", "")
+    tgt_desc      = res.get("tgt_desc", "")
+    # Optional conservative rule: force add-after for "This guide ..."
+    # if orig_sentence.strip().lower().startswith("this guide"):
+    #     add_after = build_related_resource_line(target_url, anchor_text, plain_text)
+    #     return warn + "Add this mini-line after the paragraph:\n\n" + add_after
+    if smart_rewrite:
+        # Ask GPT to decide: inline vs add-after (with full paragraph context)
+        decision = gpt_decide_and_rewrite(paragraph, orig_sentence, anchor_text, target_url, tgt_title, tgt_desc)
+        mode = decision.get("mode", "inline")
+        if mode == "inline":
+            # An empty GPT sentence falls back to the heuristic draft.
+            final_html = decision.get("sentence_html", "") or draft_html
+            # Safety gate: reject if it would distort meaning
+            if rewrite_would_distort_meaning(orig_sentence, final_html, anchor_text, paragraph):
+                add_after = build_related_resource_line(target_url, anchor_text, plain_text)
+                return warn + "Add this mini-line after the paragraph (to avoid changing its meaning):\n\n" + add_after
+            final_output = to_plain_text(final_html) if plain_text else final_html
+            # We propose a replacement to ensure the exact integrated version is used
+            return warn + f"Change this sentence:\n\n{orig_sentence}\n\nWith this one:\n\n{final_output}"
+        else:  # add_after
+            # The fallback line is requested with False here and converted on the
+            # next line, so GPT output and fallback share one plain-text path.
+            # NOTE(review): assumes the third argument is a plain-text flag — confirm.
+            add_line = decision.get("add_after_html") or build_related_resource_line(target_url, anchor_text, False)
+            add_line_out = to_plain_text(add_line) if plain_text else add_line
+            return warn + "Add this mini-line after the paragraph:\n\n" + add_line_out
+    else:
+        # No GPT: use heuristic inline fallback already injected in draft_html
+        final_output = to_plain_text(draft_html) if plain_text else draft_html
+        # The wording depends on the "anchor_was_present" flag of the suggestion.
+        if res.get("anchor_was_present", False):
+            return warn + f"✅ Add link here:\n\n{final_output}"
         else:
+            return warn + f"Change this sentence:\n\n{orig_sentence}\n\nWith this one:\n\n{final_output}"
+# Show GPT status / model in the header
+# Both values are derived from OPENAI_API_KEY once, at import time.
 gpt_status = "ON" if OPENAI_API_KEY else "OFF"
+title_model = PREFERRED_OPENAI_MODEL if OPENAI_API_KEY else "OFF"
+# Gradio form that calls run_tool; the result is shown in a 12-line textbox.
 demo = gr.Interface(
     fn=run_tool,
+    # NOTE(review): the line opening the inputs list is not visible in this diff view.
         gr.Textbox(label="Target URL"),
         gr.Textbox(label="Anchor Text"),
         gr.Checkbox(label="Smart rewrite (GPT)", value=True),
+        gr.Checkbox(label="Plain text (no URL)", value=False)
     ],
+    outputs=gr.Textbox(label="Result", lines=12),
     title=f"Link Insertion Helper • GPT: {gpt_status} • Model: {title_model}",
+    description="Chooses safe inline rewrite vs neutral add-after using full paragraph context. Toggle GPT and Plain text (no URL) as needed."
 )
 if __name__ == "__main__":