dusan-presswhizz committed on
Commit
de36d64
·
verified ·
1 Parent(s): 649addb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +72 -67
app.py CHANGED
@@ -12,7 +12,7 @@ MODEL = "michiyasunaga/LinkBERT-base"
12
  UA = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36"}
13
 
14
  # --- OpenAI settings ---
15
- OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") # HF Spaces: Settings → Variables & secrets
16
  PREFERRED_OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-5o") # try this first
17
  FALLBACK_OPENAI_MODEL = "gpt-4o-mini" # fallback
18
  OPENAI_CHAT_URL = "https://api.openai.com/v1/chat/completions"
@@ -42,7 +42,10 @@ def normalize_url(url: str) -> str:
42
  return "https://" + url
43
  return url
44
 
45
- def get_text_blocks(url):
 
 
 
46
  resp = requests.get(url, timeout=20, headers=UA)
47
  resp.raise_for_status()
48
  soup = BeautifulSoup(resp.text, "html.parser")
@@ -67,19 +70,22 @@ def embed(texts):
67
  return mean_pool(out.last_hidden_state, batch["attention_mask"])
68
 
69
  def wrap_existing_anchor(sentence: str, anchor_text: str, target_url: str) -> str:
70
- """
71
- If the sentence already contains the anchor text, wrap the FIRST occurrence with <a>.
72
- Case-insensitive; keeps original casing in the sentence.
73
- """
74
  pattern = re.compile(re.escape(anchor_text), re.IGNORECASE)
75
  return pattern.sub(f'<a href="{target_url}">{anchor_text}</a>', sentence, count=1)
76
 
 
 
 
 
 
 
77
  def suggest_insertions(source_url, target_url, anchor_text, top_k=1):
78
  blocks = get_text_blocks(source_url)
79
  if not blocks:
80
  return [{"error":"No text blocks found on the page."}]
81
 
82
- # Target context helps similarity
83
  try:
84
  tgt_html = requests.get(target_url, timeout=20, headers=UA).text
85
  tt = BeautifulSoup(tgt_html, "html.parser").title
@@ -89,7 +95,6 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1):
89
 
90
  ext = tldextract.extract(target_url)
91
  tgt_domain = ".".join([p for p in [ext.domain, ext.suffix] if p])
92
-
93
  query = f"{anchor_text} — relevant to: {tgt_title} ({tgt_domain})"
94
  q_emb = embed([query])[0]
95
 
@@ -99,30 +104,28 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1):
99
 
100
  results = []
101
  for idx in top_idx:
102
- blk = blocks[idx]
103
- sents = re.split(r'(?<=[.!?])\s+', blk)
104
  s_embs = embed(sents)
105
  s_sims = F.cosine_similarity(s_embs, q_emb.repeat(len(sents),1))
106
  si = int(torch.argmax(s_sims))
107
  best_sent = sents[si]
108
 
109
- # Check if anchor text already appears in the chosen sentence
110
- if re.search(re.escape(anchor_text), best_sent, flags=re.IGNORECASE):
111
  wrapped = wrap_existing_anchor(best_sent, anchor_text, target_url)
112
  results.append({
113
  "anchor_was_present": True,
 
114
  "best_sentence_original": best_sent,
115
- "best_sentence_with_anchor": wrapped,
116
- "best_paragraph": blk
117
  })
118
  else:
119
- # No injection here just return the raw original + paragraph;
120
- # GPT will do the smart integration.
121
  results.append({
122
  "anchor_was_present": False,
 
123
  "best_sentence_original": best_sent,
124
- "best_sentence_with_anchor": None, # GPT will produce
125
- "best_paragraph": blk
126
  })
127
  return results
128
 
@@ -146,49 +149,65 @@ def _openai_chat(model_name: str, system: str, user_json: dict):
146
  txt = r.json()["choices"][0]["message"]["content"]
147
  return json.loads(txt)
148
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
  # =========================
150
- # GPT editor (single step; may expand to 1–3 sentences)
151
  # =========================
152
  def gpt_integrate_anchor(paragraph_text: str,
153
  original_sentence: str,
154
  anchor_text: str,
155
  target_url: str):
156
  """
157
- Replace the original sentence with 1–3 natural sentences that integrate the anchor text.
158
- Keep EXACT anchor text and SAME href. No em dashes; avoid clichés.
159
- Returns: {"replacement_html": "<html>"} (1–3 sentences)
160
  """
 
 
161
  if not OPENAI_API_KEY:
162
- # Fallback if no key: minimally append a neutral clause (not ideal)
163
  base = original_sentence.rstrip(".!? ")
164
- draft = f'{base} (see <a href="{target_url}">{anchor_text}</a>).'
165
- return {"replacement_html": draft}
166
 
167
  system = (
168
- "You are an elite human content editor.\n"
169
- "INPUT:\n"
170
  "• paragraph_text: the full paragraph from an article\n"
171
- "• original_sentence: the specific sentence inside that paragraph we will replace\n"
172
- "• anchor_text and target_url: must appear EXACTLY once as <a href=\"URL\">ANCHOR</a>\n\n"
173
  "TASK:\n"
174
- "Rewrite the original sentence so the result reads 100% natural. You may expand to 1–3 sentences if that\n"
175
- "improves clarity and flow. Integrate the anchor seamlessly (do NOT bolt it on awkwardly).\n\n"
176
- "STRICT RULES:\n"
177
- "• Keep the EXACT anchor_text inside the <a> tag; keep the SAME href.\n"
 
 
178
  "• No em dashes or '--'. Avoid clichés like 'for details', 'click here', 'learn more', 'visit', 'read more'.\n"
179
  "• Prefer simple connectors ('with', 'from', 'in', 'at') over 'via' or 'through'.\n"
180
- "• Tone: clear, professional blog prose. 1–3 sentences total.\n"
181
- "• Output JSON with key 'replacement_html' only."
 
182
  )
183
 
184
  user = {
185
  "paragraph_text": paragraph_text,
186
  "original_sentence": original_sentence,
187
- "anchor_text": anchor_text,
188
- "target_url": target_url
189
  }
190
 
191
- # Try preferred model, then fallback
192
  try:
193
  obj = _openai_chat(PREFERRED_OPENAI_MODEL, system, user)
194
  except Exception:
@@ -196,38 +215,22 @@ def gpt_integrate_anchor(paragraph_text: str,
196
  obj = _openai_chat(FALLBACK_OPENAI_MODEL, system, user)
197
  except Exception:
198
  base = original_sentence.rstrip(".!? ")
199
- draft = f'{base} (see <a href="{target_url}">{anchor_text}</a>).'
200
- return {"replacement_html": draft}
201
-
202
- out = obj.get("replacement_html", "").strip() or original_sentence
203
-
204
- # Safety: ensure anchor text + href exist in output
205
- soup = BeautifulSoup(out, "html.parser")
206
- a = soup.find("a")
207
- if not a:
208
- return {"replacement_html": original_sentence}
209
- if anchor_text.lower() not in a.get_text().lower():
210
- return {"replacement_html": original_sentence}
211
- if target_url.lower() not in (a.get("href") or "").lower():
212
- return {"replacement_html": original_sentence}
213
-
214
- # Keep result reasonably short (<= ~3 sentences)
215
- plain = soup.get_text(" ", strip=True)
216
- # (we don't hard block multiple sentences; model already instructed 1–3)
217
 
 
 
 
218
  return {"replacement_html": out}
219
 
220
- # ---------- Plain-text helper (preserve spacing between tags)
221
- def to_plain_text(html_or_text):
222
- return BeautifulSoup(html_or_text, "html.parser").get_text(separator=" ", strip=True)
223
-
224
  # =========================
225
- # Gradio UI
226
  # =========================
227
  def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text):
228
  if not source_url or not target_url or not anchor_text:
229
  return "❌ Please provide Source URL, Target URL, and Anchor Text."
230
 
 
231
  warn = ""
232
  if looks_like_url(anchor_text) and not looks_like_url(target_url):
233
  anchor_text, target_url = target_url, anchor_text
@@ -239,13 +242,13 @@ def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text):
239
  if "error" in res:
240
  return f"❌ {res['error']}"
241
 
242
- # Case A: anchor already present → just wrap it and show where to add
243
  if res.get("anchor_was_present", False):
244
  wrapped_html = res["best_sentence_with_anchor"]
245
  final_output = to_plain_text(wrapped_html) if plain_text else wrapped_html
246
  return warn + f"✅ Add link here:\n\n{final_output}"
247
 
248
- # Case B: anchor not present → ask GPT to integrate naturally (may expand 1–3 sentences)
249
  original_sentence = res["best_sentence_original"]
250
  paragraph = res.get("best_paragraph", original_sentence)
251
 
@@ -253,14 +256,16 @@ def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text):
253
  edit = gpt_integrate_anchor(paragraph, original_sentence, anchor_text, target_url)
254
  final_html = edit["replacement_html"]
255
  else:
256
- # deterministic fallback when GPT off
257
  base = original_sentence.rstrip(".!? ")
258
- final_html = f'{base} (see <a href="{target_url}">{anchor_text}</a>).'
259
 
260
  final_output = to_plain_text(final_html) if plain_text else final_html
261
  return warn + f"Change this sentence:\n\n{original_sentence}\n\nWith this one:\n\n{final_output}"
262
 
263
- # Show GPT status in the header
 
 
264
  gpt_status = "ON" if OPENAI_API_KEY else "OFF"
265
  title_model = PREFERRED_OPENAI_MODEL if OPENAI_API_KEY else "OFF"
266
 
@@ -275,7 +280,7 @@ demo = gr.Interface(
275
  ],
276
  outputs=gr.Textbox(label="Result", lines=12),
277
  title=f"Link Insertion Helper • GPT: {gpt_status} • Model: {title_model}",
278
- description="Suggests the best place to add your link and returns one clean instruction. If the anchor isn't present, GPT rewrites the sentence (1–3 sentences) to integrate it naturally."
279
  )
280
 
281
  if __name__ == "__main__":
 
12
  UA = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36"}
13
 
14
  # --- OpenAI settings ---
15
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") # Add in HF Spaces: Settings → Variables & secrets
16
  PREFERRED_OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-5o") # try this first
17
  FALLBACK_OPENAI_MODEL = "gpt-4o-mini" # fallback
18
  OPENAI_CHAT_URL = "https://api.openai.com/v1/chat/completions"
 
42
  return "https://" + url
43
  return url
44
 
45
def to_plain_text(html_or_text: str) -> str:
    """Return the visible text of *html_or_text* with all markup removed.

    The input is parsed with BeautifulSoup's built-in "html.parser"; the
    text pieces are stripped and joined with a single space so words on
    either side of a tag do not run together.
    """
    parsed = BeautifulSoup(html_or_text, "html.parser")
    return parsed.get_text(separator=" ", strip=True)
47
+
48
+ def get_text_blocks(url: str):
49
  resp = requests.get(url, timeout=20, headers=UA)
50
  resp.raise_for_status()
51
  soup = BeautifulSoup(resp.text, "html.parser")
 
70
  return mean_pool(out.last_hidden_state, batch["attention_mask"])
71
 
72
def wrap_existing_anchor(sentence: str, anchor_text: str, target_url: str) -> str:
    """Wrap the first case-insensitive match of *anchor_text* in an <a> tag.

    Args:
        sentence: Sentence that is expected to contain the anchor text.
        anchor_text: Text to link; matched literally, ignoring case.
        target_url: Value for the link's href attribute.

    Returns:
        The sentence with the first match replaced by
        ``<a href="target_url">anchor_text</a>``. The sentence is returned
        unchanged when *anchor_text* is empty or does not occur.
    """
    # An empty pattern matches at position 0 and would insert an empty link.
    if not anchor_text:
        return sentence

    from html import escape

    # Escape both values so quotes, ampersands or angle brackets in user
    # input cannot break out of the attribute or inject markup.
    link = f'<a href="{escape(target_url, quote=True)}">{escape(anchor_text, quote=False)}</a>'
    pattern = re.compile(re.escape(anchor_text), re.IGNORECASE)
    # A callable replacement is inserted verbatim; a string replacement would
    # have its backslashes interpreted as escapes / group references.
    return pattern.sub(lambda _match: link, sentence, count=1)
76
 
77
def anchor_in_text(text: str, anchor_text: str) -> bool:
    """Report whether *anchor_text* occurs in *text*, ignoring case.

    Args:
        text: Text to search; ``None`` or empty is treated as no text.
        anchor_text: Literal text to look for (regex metacharacters are
            escaped, so it is never interpreted as a pattern).

    Returns:
        True when a literal, case-insensitive match exists. An empty or
        ``None`` anchor is never considered present, because an empty
        pattern would otherwise match every string.
    """
    if not anchor_text:
        return False
    return re.search(re.escape(anchor_text), text or "", flags=re.IGNORECASE) is not None
79
+
80
+ # =========================
81
+ # LinkBERT selection
82
+ # =========================
83
  def suggest_insertions(source_url, target_url, anchor_text, top_k=1):
84
  blocks = get_text_blocks(source_url)
85
  if not blocks:
86
  return [{"error":"No text blocks found on the page."}]
87
 
88
+ # Target context to bias similarity
89
  try:
90
  tgt_html = requests.get(target_url, timeout=20, headers=UA).text
91
  tt = BeautifulSoup(tgt_html, "html.parser").title
 
95
 
96
  ext = tldextract.extract(target_url)
97
  tgt_domain = ".".join([p for p in [ext.domain, ext.suffix] if p])
 
98
  query = f"{anchor_text} — relevant to: {tgt_title} ({tgt_domain})"
99
  q_emb = embed([query])[0]
100
 
 
104
 
105
  results = []
106
  for idx in top_idx:
107
+ paragraph = blocks[idx]
108
+ sents = re.split(r'(?<=[.!?])\s+', paragraph)
109
  s_embs = embed(sents)
110
  s_sims = F.cosine_similarity(s_embs, q_emb.repeat(len(sents),1))
111
  si = int(torch.argmax(s_sims))
112
  best_sent = sents[si]
113
 
114
+ if anchor_in_text(best_sent, anchor_text):
 
115
  wrapped = wrap_existing_anchor(best_sent, anchor_text, target_url)
116
  results.append({
117
  "anchor_was_present": True,
118
+ "best_paragraph": paragraph,
119
  "best_sentence_original": best_sent,
120
+ "best_sentence_with_anchor": wrapped
 
121
  })
122
  else:
123
+ # No forced injection here. We will let GPT rebuild the sentence naturally.
 
124
  results.append({
125
  "anchor_was_present": False,
126
+ "best_paragraph": paragraph,
127
  "best_sentence_original": best_sent,
128
+ "best_sentence_with_anchor": None
 
129
  })
130
  return results
131
 
 
149
  txt = r.json()["choices"][0]["message"]["content"]
150
  return json.loads(txt)
151
 
152
+ def _mk_anchor_html(target_url: str, anchor_text: str) -> str:
153
+ return f'<a href="{target_url}">{anchor_text}</a>'
154
+
155
+ def _ensure_placeholder(text: str) -> str:
156
+ """
157
+ If GPT forgets the [[ANCHOR]] placeholder,
158
+ add a neutral second sentence with it so we can safely inject the link.
159
+ """
160
+ text = (text or "").strip()
161
+ if "[[ANCHOR]]" not in text:
162
+ if text and not text.endswith((".", "!", "?")):
163
+ text += "."
164
+ text += " See [[ANCHOR]]."
165
+ return text
166
+
167
  # =========================
168
+ # GPT editor: integrate anchor with placeholder
169
  # =========================
170
  def gpt_integrate_anchor(paragraph_text: str,
171
  original_sentence: str,
172
  anchor_text: str,
173
  target_url: str):
174
  """
175
+ Ask GPT to replace the original sentence with 1–3 natural sentences.
176
+ It must place EXACTLY ONE placeholder [[ANCHOR]] where the link should go.
177
+ Then we inject the exact <a href="...">anchor_text</a> ourselves.
178
  """
179
+ anchor_html = _mk_anchor_html(target_url, anchor_text)
180
+
181
  if not OPENAI_API_KEY:
182
+ # Fallback when no key
183
  base = original_sentence.rstrip(".!? ")
184
+ draft = f"{base}. See [[ANCHOR]]."
185
+ return {"replacement_html": draft.replace("[[ANCHOR]]", anchor_html)}
186
 
187
  system = (
188
+ "You are an elite human content editor.\n\n"
189
+ "INPUT FIELDS:\n"
190
  "• paragraph_text: the full paragraph from an article\n"
191
+ "• original_sentence: the specific sentence inside that paragraph we will replace\n\n"
 
192
  "TASK:\n"
193
+ "Rewrite the original sentence so the result reads 100% natural IN CONTEXT. "
194
+ "You may expand to 1–3 sentences if that improves clarity and flow.\n\n"
195
+ "CRITICAL INSTRUCTION:\n"
196
+ "• Insert the literal placeholder [[ANCHOR]] EXACTLY ONCE where the link should appear. "
197
+ "Do NOT write any HTML for the link—only [[ANCHOR]].\n\n"
198
+ "STYLE RULES:\n"
199
  "• No em dashes or '--'. Avoid clichés like 'for details', 'click here', 'learn more', 'visit', 'read more'.\n"
200
  "• Prefer simple connectors ('with', 'from', 'in', 'at') over 'via' or 'through'.\n"
201
+ "• Tone: clear, professional blog prose. Keep it tight (1–3 sentences total).\n\n"
202
+ "OUTPUT:\n"
203
+ "Return JSON with key 'replacement_html' only, where [[ANCHOR]] appears exactly once."
204
  )
205
 
206
  user = {
207
  "paragraph_text": paragraph_text,
208
  "original_sentence": original_sentence,
 
 
209
  }
210
 
 
211
  try:
212
  obj = _openai_chat(PREFERRED_OPENAI_MODEL, system, user)
213
  except Exception:
 
215
  obj = _openai_chat(FALLBACK_OPENAI_MODEL, system, user)
216
  except Exception:
217
  base = original_sentence.rstrip(".!? ")
218
+ draft = f"{base}. See [[ANCHOR]]."
219
+ return {"replacement_html": draft.replace("[[ANCHOR]]", anchor_html)}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220
 
221
+ out = obj.get("replacement_html", "").strip()
222
+ out = _ensure_placeholder(out)
223
+ out = out.replace("[[ANCHOR]]", anchor_html)
224
  return {"replacement_html": out}
225
 
 
 
 
 
226
  # =========================
227
+ # Gradio UI flow
228
  # =========================
229
  def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text):
230
  if not source_url or not target_url or not anchor_text:
231
  return "❌ Please provide Source URL, Target URL, and Anchor Text."
232
 
233
+ # Auto-correct swapped inputs
234
  warn = ""
235
  if looks_like_url(anchor_text) and not looks_like_url(target_url):
236
  anchor_text, target_url = target_url, anchor_text
 
242
  if "error" in res:
243
  return f"❌ {res['error']}"
244
 
245
+ # Case A: anchor already present in the chosen sentence → just wrap and return
246
  if res.get("anchor_was_present", False):
247
  wrapped_html = res["best_sentence_with_anchor"]
248
  final_output = to_plain_text(wrapped_html) if plain_text else wrapped_html
249
  return warn + f"✅ Add link here:\n\n{final_output}"
250
 
251
+ # Case B: anchor not present → GPT integrates naturally with paragraph context
252
  original_sentence = res["best_sentence_original"]
253
  paragraph = res.get("best_paragraph", original_sentence)
254
 
 
256
  edit = gpt_integrate_anchor(paragraph, original_sentence, anchor_text, target_url)
257
  final_html = edit["replacement_html"]
258
  else:
259
+ # deterministic fallback when GPT is off
260
  base = original_sentence.rstrip(".!? ")
261
+ final_html = f'{base}. See {_mk_anchor_html(target_url, anchor_text)}.'
262
 
263
  final_output = to_plain_text(final_html) if plain_text else final_html
264
  return warn + f"Change this sentence:\n\n{original_sentence}\n\nWith this one:\n\n{final_output}"
265
 
266
+ # =========================
267
+ # Gradio app
268
+ # =========================
269
  gpt_status = "ON" if OPENAI_API_KEY else "OFF"
270
  title_model = PREFERRED_OPENAI_MODEL if OPENAI_API_KEY else "OFF"
271
 
 
280
  ],
281
  outputs=gr.Textbox(label="Result", lines=12),
282
  title=f"Link Insertion Helper • GPT: {gpt_status} • Model: {title_model}",
283
+ description="Finds the best place to add your link. If the anchor isn't present, GPT rewrites the sentence (1–3 sentences) using a placeholder and the app injects the exact link."
284
  )
285
 
286
  if __name__ == "__main__":