dusan-presswhizz committed on
Commit
8a8251c
·
verified ·
1 Parent(s): 7839cb3

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +126 -0
app.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re, requests, tldextract
2
+ from bs4 import BeautifulSoup
3
+ import torch, torch.nn.functional as F
4
+ from transformers import AutoTokenizer, AutoModel
5
+ import gradio as gr
6
+
7
# -----------------------------
# Model load (cached on Space)
# -----------------------------
# LinkBERT-base is used purely as a text encoder; embeddings are produced by
# mean-pooling its last hidden states (see mean_pool/embed below).
MODEL = "michiyasunaga/LinkBERT-base"
tok = AutoTokenizer.from_pretrained(MODEL)
enc = AutoModel.from_pretrained(MODEL)
# Desktop-browser User-Agent header so target sites are less likely to block
# the scraping requests made below.
UA = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36"}
14
+
15
# -----------------------------
# Utilities
# -----------------------------
def get_text_blocks(url):
    """Fetch *url* and return its substantial text blocks (> 60 chars each).

    Boilerplate containers (scripts, styles, header/footer/nav/aside/form)
    are stripped first; text is then collected from paragraph-like elements
    with whitespace normalized to single spaces.

    Raises requests.HTTPError on a non-2xx response and the usual
    requests exceptions on network failure.
    """
    resp = requests.get(url, timeout=20, headers=UA)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")
    # Remove non-content elements so their text does not pollute the blocks.
    for tag in soup(["script","style","noscript","header","footer","nav","aside","form"]):
        tag.decompose()
    blocks = []
    seen = set()
    for el in soup.find_all(["p","li","h2","h3","h4","blockquote"]):
        txt = " ".join(el.get_text(" ").split())
        # Skip short fragments and exact duplicates: find_all returns both a
        # parent and its children (e.g. <blockquote><p>…</p></blockquote>),
        # which would otherwise yield the same text twice and skew ranking.
        if len(txt) > 60 and txt not in seen:
            seen.add(txt)
            blocks.append(txt)
    return blocks
31
+
32
def mean_pool(last_hidden_state, mask):
    """Average token embeddings over the sequence axis, ignoring padding.

    last_hidden_state: (batch, seq, dim) float tensor.
    mask: (batch, seq) attention mask; positions with 0 are excluded.
    Returns a (batch, dim) tensor of masked means.
    """
    weights = mask.unsqueeze(-1)
    summed = (last_hidden_state * weights).sum(dim=1)
    counts = weights.sum(dim=1)
    return summed / counts
36
+
37
def embed(texts):
    """Encode a list of strings into mean-pooled LinkBERT embeddings.

    Returns a (len(texts), hidden_dim) tensor; no gradients are tracked.
    """
    batch_inputs = tok(texts, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        hidden = enc(**batch_inputs).last_hidden_state
    return mean_pool(hidden, batch_inputs["attention_mask"])
42
+
43
def inject_anchor_into_sentence(sentence, anchor_text, target_url):
    """Insert an HTML anchor for *anchor_text* into *sentence*.

    If the anchor text literally occurs in the sentence (case-insensitive),
    every occurrence is wrapped in <a href="target_url">…</a> and the second
    return value is True. Otherwise the sentence is extended with a
    "— see <a …> for details" clause and the second value is False.

    Bug fix: the original gated the exact branch on a *normalized*
    (punctuation-stripped) containment check but then substituted with the
    literal escaped anchor; when only the normalized forms matched
    (e.g. anchor "U.S. economy" vs sentence "US economy") the substitution
    silently did nothing, yet the function still reported True. The exact
    branch is now taken only when the literal pattern actually matches.
    """
    pattern = re.compile(re.escape(anchor_text), re.IGNORECASE)
    if anchor_text and pattern.search(sentence):
        return pattern.sub(f'<a href="{target_url}">{anchor_text}</a>', sentence), True
    # Fallback: append a reference clause, keeping the sentence's final
    # punctuation (defaulting to a period when there is none).
    if sentence.endswith(('.', '!', '?')):
        base, punct = sentence[:-1], sentence[-1]
    else:
        base, punct = sentence, '.'
    rewritten = f'{base} — see <a href="{target_url}">{anchor_text}</a> for details{punct}'
    return rewritten, False
55
+
56
def suggest_insertions(source_url, target_url, anchor_text, top_k=1):
    """Suggest where on *source_url* to insert a link to *target_url*.

    Ranks the source page's text blocks by cosine similarity to a query built
    from *anchor_text* plus the target page's title and registered domain,
    then, within each of the top_k blocks, picks the single most similar
    sentence and rewrites it to carry the anchor.

    Returns a list of dicts with keys: anchor_was_present,
    best_sentence_original, best_sentence_with_anchor — or a one-element
    list containing an "error" key when the source page yields no blocks.
    Network errors from fetching source_url propagate to the caller.
    """
    blocks = get_text_blocks(source_url)
    if not blocks:
        return [{"error":"No text blocks found on the page."}]
    # Best-effort fetch of the target page's <title>; any failure (network,
    # parse) just leaves the title empty rather than aborting.
    try:
        tgt_html = requests.get(target_url, timeout=20, headers=UA).text
        tt = BeautifulSoup(tgt_html, "html.parser").title
        tgt_title = tt.get_text().strip() if tt else ""
    except Exception:
        tgt_title = ""
    # Registered domain without subdomains, e.g. "example.co.uk".
    ext = tldextract.extract(target_url)
    tgt_domain = ".".join([p for p in [ext.domain, ext.suffix] if p])

    # The query couples the anchor with target-page context so similarity
    # reflects topical relevance, not just the anchor words.
    query = f"{anchor_text} — relevant to: {tgt_title} ({tgt_domain})"
    q_emb = embed([query])[0]

    blk_embs = embed(blocks)
    sims = F.cosine_similarity(blk_embs, q_emb.repeat(len(blocks),1))
    top_idx = torch.topk(sims, k=min(top_k, len(blocks))).indices.tolist()

    results = []
    for idx in top_idx:
        blk = blocks[idx]
        # Naive sentence split on terminal punctuation followed by whitespace.
        sents = re.split(r'(?<=[.!?])\s+', blk)
        s_embs = embed(sents)
        s_sims = F.cosine_similarity(s_embs, q_emb.repeat(len(sents),1))
        si = int(torch.argmax(s_sims))
        best_sent = sents[si]
        rewritten_sent, exact_found = inject_anchor_into_sentence(best_sent, anchor_text, target_url)
        results.append({
            "anchor_was_present": exact_found,
            "best_sentence_original": best_sent,
            "best_sentence_with_anchor": rewritten_sent
        })
    return results
91
+
92
# -----------------------------
# Gradio UI
# -----------------------------
def run_tool(source_url, target_url, anchor_text):
    """Validate the three inputs, run the pipeline, and format one instruction."""
    if not (source_url and target_url and anchor_text):
        return "❌ Please provide Source URL, Target URL, and Anchor Text."
    try:
        result = suggest_insertions(source_url, target_url, anchor_text, top_k=1)[0]
    except Exception as e:
        return f"❌ Error: {e}"

    anchor_present = result.get("anchor_was_present", False)
    if not anchor_present:
        # Anchor text wasn't found verbatim: show a before/after rewrite.
        return (
            "Change this sentence:\n\n"
            f"{result['best_sentence_original']}\n\n"
            "With this one:\n\n"
            f"{result['best_sentence_with_anchor']}"
        )
    return f"✅ Add link here:\n\n{result['best_sentence_with_anchor']}"
112
+
113
# Gradio interface: three text inputs mapped straight onto run_tool's
# parameters, one text output.
demo = gr.Interface(
    fn=run_tool,
    inputs=[
        gr.Textbox(label="Source URL"),
        gr.Textbox(label="Target URL"),
        gr.Textbox(label="Anchor Text")
    ],
    outputs=gr.Textbox(label="Result", lines=10),
    title="Link Insertion Helper",
    description="Paste a Source URL, Target URL, and Anchor Text. The tool returns one clear instruction."
)

# Launch only when executed as a script (Spaces runs this module directly).
if __name__ == "__main__":
    demo.launch()