dusan-presswhizz committed on
Commit
8a8251c
·
verified ·
1 Parent(s): 7839cb3

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +126 -0
app.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re, requests, tldextract
2
+ from bs4 import BeautifulSoup
3
+ import torch, torch.nn.functional as F
4
+ from transformers import AutoTokenizer, AutoModel
5
+ import gradio as gr
6
+
7
# -----------------------------
# Model load (cached on Space)
# -----------------------------
# LinkBERT-base is used purely as a text encoder; embeddings are produced by
# mean-pooling its last hidden states (see mean_pool/embed below).
MODEL = "michiyasunaga/LinkBERT-base"
tok = AutoTokenizer.from_pretrained(MODEL)
enc = AutoModel.from_pretrained(MODEL)
# Desktop-browser User-Agent header so target sites are less likely to block
# the scraping requests made below.
UA = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36"}
14
+
15
# -----------------------------
# Utilities
# -----------------------------
def get_text_blocks(url):
    """Fetch *url* and return its substantial text blocks (> 60 chars each).

    Boilerplate containers (scripts, styles, header/footer/nav/aside/form)
    are stripped first; text is then collected from paragraph-like elements
    with whitespace normalized to single spaces.

    Raises requests.HTTPError on a non-2xx response and the usual
    requests exceptions on network failure.
    """
    resp = requests.get(url, timeout=20, headers=UA)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")
    # Remove non-content elements so their text does not pollute the blocks.
    for tag in soup(["script","style","noscript","header","footer","nav","aside","form"]):
        tag.decompose()
    blocks = []
    seen = set()
    for el in soup.find_all(["p","li","h2","h3","h4","blockquote"]):
        txt = " ".join(el.get_text(" ").split())
        # Skip short fragments and exact duplicates: find_all returns both a
        # parent and its children (e.g. <blockquote><p>…</p></blockquote>),
        # which would otherwise yield the same text twice and skew ranking.
        if len(txt) > 60 and txt not in seen:
            seen.add(txt)
            blocks.append(txt)
    return blocks
31
+
32
def mean_pool(last_hidden_state, mask):
    """Average token embeddings over the sequence axis, ignoring padding.

    last_hidden_state: (batch, seq, dim) float tensor.
    mask: (batch, seq) attention mask; positions with 0 are excluded.
    Returns a (batch, dim) tensor of masked means.
    """
    weights = mask.unsqueeze(-1)
    summed = (last_hidden_state * weights).sum(dim=1)
    counts = weights.sum(dim=1)
    return summed / counts
36
+
37
def embed(texts):
    """Encode a list of strings into mean-pooled LinkBERT embeddings.

    Returns a (len(texts), hidden_dim) tensor; no gradients are tracked.
    """
    batch_inputs = tok(texts, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        hidden = enc(**batch_inputs).last_hidden_state
    return mean_pool(hidden, batch_inputs["attention_mask"])
42
+
43
def inject_anchor_into_sentence(sentence, anchor_text, target_url):
    """Insert an HTML anchor for *anchor_text* into *sentence*.

    If the anchor text literally occurs in the sentence (case-insensitive),
    every occurrence is wrapped in <a href="target_url">…</a> and the second
    return value is True. Otherwise the sentence is extended with a
    "— see <a …> for details" clause and the second value is False.

    Bug fix: the original gated the exact branch on a *normalized*
    (punctuation-stripped) containment check but then substituted with the
    literal escaped anchor; when only the normalized forms matched
    (e.g. anchor "U.S. economy" vs sentence "US economy") the substitution
    silently did nothing, yet the function still reported True. The exact
    branch is now taken only when the literal pattern actually matches.
    """
    pattern = re.compile(re.escape(anchor_text), re.IGNORECASE)
    if anchor_text and pattern.search(sentence):
        return pattern.sub(f'<a href="{target_url}">{anchor_text}</a>', sentence), True
    # Fallback: append a reference clause, keeping the sentence's final
    # punctuation (defaulting to a period when there is none).
    if sentence.endswith(('.', '!', '?')):
        base, punct = sentence[:-1], sentence[-1]
    else:
        base, punct = sentence, '.'
    rewritten = f'{base} — see <a href="{target_url}">{anchor_text}</a> for details{punct}'
    return rewritten, False
55
+
56
def suggest_insertions(source_url, target_url, anchor_text, top_k=1):
    """Suggest where on *source_url* to insert a link to *target_url*.

    Ranks the source page's text blocks by cosine similarity to a query built
    from *anchor_text* plus the target page's title and registered domain,
    then, within each of the top_k blocks, picks the single most similar
    sentence and rewrites it to carry the anchor.

    Returns a list of dicts with keys: anchor_was_present,
    best_sentence_original, best_sentence_with_anchor — or a one-element
    list containing an "error" key when the source page yields no blocks.
    Network errors from fetching source_url propagate to the caller.
    """
    blocks = get_text_blocks(source_url)
    if not blocks:
        return [{"error":"No text blocks found on the page."}]
    # Best-effort fetch of the target page's <title>; any failure (network,
    # parse) just leaves the title empty rather than aborting.
    try:
        tgt_html = requests.get(target_url, timeout=20, headers=UA).text
        tt = BeautifulSoup(tgt_html, "html.parser").title
        tgt_title = tt.get_text().strip() if tt else ""
    except Exception:
        tgt_title = ""
    # Registered domain without subdomains, e.g. "example.co.uk".
    ext = tldextract.extract(target_url)
    tgt_domain = ".".join([p for p in [ext.domain, ext.suffix] if p])

    # The query couples the anchor with target-page context so similarity
    # reflects topical relevance, not just the anchor words.
    query = f"{anchor_text} — relevant to: {tgt_title} ({tgt_domain})"
    q_emb = embed([query])[0]

    blk_embs = embed(blocks)
    sims = F.cosine_similarity(blk_embs, q_emb.repeat(len(blocks),1))
    top_idx = torch.topk(sims, k=min(top_k, len(blocks))).indices.tolist()

    results = []
    for idx in top_idx:
        blk = blocks[idx]
        # Naive sentence split on terminal punctuation followed by whitespace.
        sents = re.split(r'(?<=[.!?])\s+', blk)
        s_embs = embed(sents)
        s_sims = F.cosine_similarity(s_embs, q_emb.repeat(len(sents),1))
        si = int(torch.argmax(s_sims))
        best_sent = sents[si]
        rewritten_sent, exact_found = inject_anchor_into_sentence(best_sent, anchor_text, target_url)
        results.append({
            "anchor_was_present": exact_found,
            "best_sentence_original": best_sent,
            "best_sentence_with_anchor": rewritten_sent
        })
    return results
91
+
92
# -----------------------------
# Gradio UI
# -----------------------------
def run_tool(source_url, target_url, anchor_text):
    """Validate the three inputs, run the pipeline, and format one instruction."""
    if not (source_url and target_url and anchor_text):
        return "❌ Please provide Source URL, Target URL, and Anchor Text."
    try:
        result = suggest_insertions(source_url, target_url, anchor_text, top_k=1)[0]
    except Exception as e:
        return f"❌ Error: {e}"

    anchor_present = result.get("anchor_was_present", False)
    if not anchor_present:
        # Anchor text wasn't found verbatim: show a before/after rewrite.
        return (
            "Change this sentence:\n\n"
            f"{result['best_sentence_original']}\n\n"
            "With this one:\n\n"
            f"{result['best_sentence_with_anchor']}"
        )
    return f"✅ Add link here:\n\n{result['best_sentence_with_anchor']}"
112
+
113
# Gradio interface: three text inputs mapped straight onto run_tool's
# parameters, one text output.
demo = gr.Interface(
    fn=run_tool,
    inputs=[
        gr.Textbox(label="Source URL"),
        gr.Textbox(label="Target URL"),
        gr.Textbox(label="Anchor Text")
    ],
    outputs=gr.Textbox(label="Result", lines=10),
    title="Link Insertion Helper",
    description="Paste a Source URL, Target URL, and Anchor Text. The tool returns one clear instruction."
)

# Launch only when executed as a script (Spaces runs this module directly).
if __name__ == "__main__":
    demo.launch()