Update app.py
Browse files
app.py
CHANGED
|
@@ -13,8 +13,9 @@ UA = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML,
|
|
| 13 |
|
| 14 |
# --- OpenAI settings ---
|
| 15 |
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") # add in HF Spaces: Settings → Variables & secrets
|
| 16 |
-
|
| 17 |
-
|
|
|
|
| 18 |
OPENAI_CHAT_URL = "https://api.openai.com/v1/chat/completions"
|
| 19 |
|
| 20 |
# =========================
|
|
@@ -66,53 +67,28 @@ def embed(texts):
|
|
| 66 |
out = enc(**batch)
|
| 67 |
return mean_pool(out.last_hidden_state, batch["attention_mask"])
|
| 68 |
|
| 69 |
-
# ---------- Fallback: integrate anchor mid-sentence (no em-dash, no clichés, neutral nouns)
|
| 70 |
def inject_anchor_into_sentence(sentence, anchor_text, target_url):
|
| 71 |
-
"""
|
| 72 |
-
Wrap anchor if present; otherwise integrate mid-sentence with a neutral preposition.
|
| 73 |
-
No em-dash. Avoid CTA clichés. Do not assert target content type.
|
| 74 |
-
Prefer 'Related resource' add-after if sentence begins with 'This guide' etc.
|
| 75 |
-
"""
|
| 76 |
def norm(x): return re.sub(r'[^a-z0-9 ]','',x.lower())
|
| 77 |
n_sent, n_anchor = norm(sentence), norm(anchor_text)
|
| 78 |
|
| 79 |
-
# If sentence clearly has its own subject ("This guide", "Our platform", "Base Casino"), prefer add-after
|
| 80 |
-
if n_sent.startswith("this guide") or n_sent.startswith("our platform") or n_sent.startswith("base casino"):
|
| 81 |
-
html = sentence
|
| 82 |
-
add_after = f' Related resource: <a href="{target_url}">{anchor_text}</a>.'
|
| 83 |
-
return html + add_after, False
|
| 84 |
-
|
| 85 |
-
# 1) If anchor words already present, wrap them
|
| 86 |
if n_anchor and n_anchor in n_sent:
|
| 87 |
pattern = re.compile(re.escape(anchor_text), re.IGNORECASE)
|
| 88 |
return pattern.sub(f'<a href="{target_url}">{anchor_text}</a>', sentence), True
|
| 89 |
|
| 90 |
-
#
|
| 91 |
-
insert_html = f'<a href="{target_url}">{anchor_text}</a>'
|
| 92 |
-
|
| 93 |
-
m = re.search(r'\b(games?|content|options?|features?|benefits?)\b', sentence, flags=re.I)
|
| 94 |
-
if m:
|
| 95 |
-
idx = m.start()
|
| 96 |
-
return (sentence[:idx] + f' at {insert_html} ' + sentence[idx:]).strip(), False
|
| 97 |
-
|
| 98 |
-
# after first comma
|
| 99 |
-
m2 = re.search(r',\s*', sentence)
|
| 100 |
-
if m2:
|
| 101 |
-
idx = m2.end()
|
| 102 |
-
return (sentence[:idx] + f' at {insert_html} ' + sentence[idx:]).strip(), False
|
| 103 |
-
|
| 104 |
-
# around "to"
|
| 105 |
-
m3 = re.search(r'\bto\b', sentence, flags=re.I)
|
| 106 |
-
if m3:
|
| 107 |
-
idx = m3.start()
|
| 108 |
-
return (sentence[:idx] + f' at {insert_html} ' + sentence[idx:]).strip(), False
|
| 109 |
-
|
| 110 |
-
# last resort: short neutral phrase
|
| 111 |
if sentence.endswith(('.', '!', '?')):
|
| 112 |
base, punct = sentence[:-1], sentence[-1]
|
| 113 |
else:
|
| 114 |
base, punct = sentence, '.'
|
| 115 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
return rewritten, False
|
| 117 |
|
| 118 |
def suggest_insertions(source_url, target_url, anchor_text, top_k=1):
|
|
@@ -120,21 +96,17 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1):
|
|
| 120 |
if not blocks:
|
| 121 |
return [{"error":"No text blocks found on the page."}]
|
| 122 |
|
| 123 |
-
#
|
| 124 |
try:
|
| 125 |
tgt_html = requests.get(target_url, timeout=20, headers=UA).text
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
md = soup_tgt.find("meta", attrs={"name": "description"})
|
| 129 |
-
tgt_desc = (md.get("content") or "").strip() if md else ""
|
| 130 |
-
tgt_title = tt
|
| 131 |
except Exception:
|
| 132 |
-
tgt_title
|
| 133 |
|
| 134 |
ext = tldextract.extract(target_url)
|
| 135 |
tgt_domain = ".".join([p for p in [ext.domain, ext.suffix] if p])
|
| 136 |
|
| 137 |
-
# NOTE: internal query string only (not shown to users)
|
| 138 |
query = f"{anchor_text} — relevant to: {tgt_title} ({tgt_domain})"
|
| 139 |
q_emb = embed([query])[0]
|
| 140 |
|
|
@@ -144,7 +116,7 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1):
|
|
| 144 |
|
| 145 |
results = []
|
| 146 |
for idx in top_idx:
|
| 147 |
-
blk = blocks[idx]
|
| 148 |
sents = re.split(r'(?<=[.!?])\s+', blk)
|
| 149 |
s_embs = embed(sents)
|
| 150 |
s_sims = F.cosine_similarity(s_embs, q_emb.repeat(len(sents),1))
|
|
@@ -154,87 +126,12 @@ def suggest_insertions(source_url, target_url, anchor_text, top_k=1):
|
|
| 154 |
results.append({
|
| 155 |
"anchor_was_present": exact_found,
|
| 156 |
"best_sentence_original": best_sent,
|
| 157 |
-
"best_sentence_with_anchor": rewritten_sent
|
| 158 |
-
"best_paragraph": blk,
|
| 159 |
-
"tgt_title": tgt_title,
|
| 160 |
-
"tgt_desc": tgt_desc
|
| 161 |
})
|
| 162 |
return results
|
| 163 |
|
| 164 |
-
# ---------- Plain-text helper (preserve spacing between tags)
|
| 165 |
-
def to_plain_text(html_or_text):
    """Return the visible text of *html_or_text* with all markup removed.

    The input is parsed with the ``html.parser`` backend; the text is then
    extracted with a single space as the separator between fragments and
    with ``strip=True``, so words on either side of a tag do not run together.
    """
    parsed = BeautifulSoup(html_or_text, "html.parser")
    return parsed.get_text(separator=" ", strip=True)
|
| 167 |
-
|
| 168 |
-
# ---------- Distortion / safety helpers
|
| 169 |
-
def detect_primary_brand(paragraph: str) -> str:
    """Guess the brand phrase a paragraph is about.

    Heuristic only. Two patterns are tried in order:

    1. One to three Capitalised words followed by ``Casino``, ``Platform``,
       ``Site``, ``Service`` or ``App`` (e.g. "Base Casino", "Acme Platform").
    2. Any run of two or three Capitalised words.

    Args:
        paragraph: Plain text of the paragraph. Empty or ``None`` yields ``""``.

    Returns:
        The matched phrase, lower-cased and with internal whitespace collapsed
        to single spaces, or ``""`` when nothing matches.
    """
    if not paragraph:
        return ""

    text = paragraph.strip()
    match = re.search(
        r'\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+){0,2})\s+(Casino|Platform|Site|Service|App)\b',
        text,
    )
    if match is None:
        match = re.search(r'\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,2})\b', text)
    if match is None:
        return ""

    # ``\s+`` in the patterns can span newlines or repeated spaces; collapse
    # them so callers can locate the brand in whitespace-normalised text.
    return " ".join(match.group(0).split()).lower()
|
| 180 |
-
|
| 181 |
-
def rewrite_would_distort_meaning(original_text: str, rewritten_html: str, anchor_text: str, paragraph_text: str = "") -> bool:
    """Return True when the rewritten sentence likely changes the original meaning.

    The rewrite is reduced to lower-cased plain text and checked, in order, for:

    1. The anchor appearing before the paragraph's detected brand.
    2. The anchor appearing very early (within the first 20% of the text, or
       the first four characters, whichever is larger).
    3. The anchor being used as the mechanism ("through ANCHOR", "via ANCHOR")
       or a doubled "at".
    4. Authorship/hosting being attributed to the anchor ("this guide at ANCHOR",
       "ANCHOR explains", "guide on ...").
    5. A content-type noun (guide, article, post, review, platform, site,
       resource) present in the rewrite but absent from the original.

    Args:
        original_text: The untouched source sentence.
        rewritten_html: The proposed replacement, possibly containing markup.
        anchor_text: The link text that was inserted.
        paragraph_text: The surrounding paragraph, used for brand detection.

    Returns:
        True if any check fires, otherwise False.
    """
    plain_rewrite = BeautifulSoup(rewritten_html, "html.parser").get_text(" ").strip().lower()
    plain_orig = original_text.strip().lower()
    a = anchor_text.strip().lower()
    pos_a = plain_rewrite.find(a)  # -1 when the anchor is absent from the rewrite

    # 1) Anchor introduced before the paragraph's brand.
    brand = detect_primary_brand(paragraph_text)
    if brand and pos_a != -1:
        pos_b = plain_rewrite.find(brand)
        if pos_b != -1 and pos_a < pos_b:
            return True

    # 2) Anchor appears very early -> often implies a subject shift.
    if pos_a != -1 and pos_a <= max(4, int(0.20 * len(plain_rewrite))):
        return True

    # 3) Anchor as the mechanism, or a doubled "at".
    escaped = re.escape(a)
    mechanism_patterns = [
        rf'\bthrough\s+{escaped}\b',
        rf'\bvia\s+{escaped}\b',
        rf'\bat\s+{escaped}\s+at\b',
        rf'\bon\s+{escaped}\s+at\b',
    ]
    # 4) Authorship/hosting re-attributed to the anchor.
    bad_hosting = [
        rf'(this|the)\s+guide\s+(at|on|from)\s+{escaped}\b',
        rf'\b{escaped}\b\s+(explains|shows|details|covers)\b',
        r'\b(guide|article|post|review)\s+(at|on|from)\s+',
    ]
    if any(re.search(pat, plain_rewrite) for pat in mechanism_patterns + bad_hosting):
        return True

    # 5) A content-type noun that the rewrite introduces. Checked per noun and
    #    on word boundaries, so an unrelated noun already in the original does
    #    not mask a newly added one, and "posted" is not taken for "post".
    content_nouns = ["guide", "article", "post", "review", "platform", "site", "resource"]
    for noun in content_nouns:
        noun_pat = rf'\b{noun}s?\b'
        if re.search(noun_pat, plain_rewrite) and not re.search(noun_pat, plain_orig):
            return True

    return False
|
| 231 |
-
|
| 232 |
-
def build_related_resource_line(target_url: str, anchor_text: str, plain_text=False) -> str:
    """Build the short "Related resource: <link>." line placed after a paragraph.

    Args:
        target_url: Destination of the link; HTML-escaped for use in ``href``.
        anchor_text: Visible link text; HTML-escaped.
        plain_text: When true, the markup is stripped via ``to_plain_text``.

    Returns:
        The line as HTML, or as plain text when *plain_text* is true.
    """
    from html import escape  # local import keeps the block self-contained

    # Escape both values so quotes, "&" or "<" cannot break out of the markup.
    href = escape(target_url, quote=True)
    label = escape(anchor_text, quote=False)
    line = f'Related resource: <a href="{href}">{label}</a>.'
    return to_plain_text(line) if plain_text else line
|
| 235 |
-
|
| 236 |
# =========================
|
| 237 |
-
# GPT rewrite (editorial
|
| 238 |
# =========================
|
| 239 |
def _openai_chat(model_name: str, system: str, user_json: dict):
|
| 240 |
headers = {"Authorization": f"Bearer {OPENAI_API_KEY}", "Content-Type": "application/json"}
|
|
@@ -253,78 +150,61 @@ def _openai_chat(model_name: str, system: str, user_json: dict):
|
|
| 253 |
txt = r.json()["choices"][0]["message"]["content"]
|
| 254 |
return json.loads(txt)
|
| 255 |
|
| 256 |
-
def
|
| 257 |
"""
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
- add_after_html (required if mode=add_after)
|
| 263 |
-
Enforces: no em-dash, no CTA clichés, neutral attribution unless metadata allows.
|
| 264 |
"""
|
| 265 |
if not OPENAI_API_KEY:
|
| 266 |
-
print("[GPT] No OPENAI_API_KEY found → using fallback
|
| 267 |
-
return {"
|
| 268 |
-
|
| 269 |
-
# Determine which content-type nouns are allowed based on metadata
|
| 270 |
-
meta = f"{tgt_title} {tgt_desc}".lower()
|
| 271 |
-
allowed_nouns = [w for w in ["guide","article","blog","review","platform","site","resource"] if w in meta]
|
| 272 |
|
| 273 |
system = (
|
| 274 |
-
"You are a
|
| 275 |
-
"
|
| 276 |
-
"
|
| 277 |
-
"
|
| 278 |
-
|
| 279 |
-
"
|
| 280 |
-
"
|
| 281 |
-
"
|
| 282 |
-
"1) If inline: include an <a href> with the EXACT anchor text; keep length close; no em-dash; avoid 'for details', "
|
| 283 |
-
"'click here', 'learn more', 'visit', 'read more', 'via', 'through'. Do NOT present the anchor as the mechanism "
|
| 284 |
-
"for the action (never 'through ANCHOR', 'via ANCHOR'). Prefer neutral adjuncts like 'also at', 'with context at', "
|
| 285 |
-
"'additional information at', or 'resources at' before the anchor. Place the anchor within the first 70% of the sentence "
|
| 286 |
-
"but after the paragraph’s brand/subject.\n"
|
| 287 |
-
"2) If add_after: return a single short line like 'Related resource: <a href=\"URL\">ANCHOR</a>.' "
|
| 288 |
-
"(12–14 words max, neutral tone).\n\n"
|
| 289 |
-
"OUTPUT JSON ONLY with keys: mode ('inline'|'add_after'), sentence_html (if inline), add_after_html (if add_after)."
|
| 290 |
)
|
| 291 |
-
|
| 292 |
user = {
|
| 293 |
-
"
|
| 294 |
-
"
|
| 295 |
"anchor_text": anchor_text,
|
| 296 |
"target_url": target_url,
|
| 297 |
-
"
|
| 298 |
-
"allowed_nouns_from_metadata": allowed_nouns,
|
| 299 |
"constraints": {
|
| 300 |
-
"
|
| 301 |
-
|
| 302 |
-
"via", "through", "—", "--", " - "
|
| 303 |
-
],
|
| 304 |
-
"preferred_connectors": ["at", "on", "from", "in"],
|
| 305 |
-
"place_anchor": "inside_first_70_percent"
|
| 306 |
}
|
| 307 |
}
|
| 308 |
|
|
|
|
| 309 |
try:
|
|
|
|
| 310 |
obj = _openai_chat(PREFERRED_OPENAI_MODEL, system, user)
|
| 311 |
except Exception as e:
|
| 312 |
print(f"[GPT] Preferred model failed: {e}. Falling back to {FALLBACK_OPENAI_MODEL}.")
|
| 313 |
try:
|
| 314 |
obj = _openai_chat(FALLBACK_OPENAI_MODEL, system, user)
|
| 315 |
except Exception as e2:
|
| 316 |
-
print(f"[GPT] Fallback failed: {e2}. Using
|
| 317 |
-
return {"
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
|
|
|
| 328 |
|
| 329 |
# =========================
|
| 330 |
# Gradio UI
|
|
@@ -345,47 +225,25 @@ def run_tool(source_url, target_url, anchor_text, smart_rewrite, plain_text):
|
|
| 345 |
if "error" in res:
|
| 346 |
return f"❌ {res['error']}"
|
| 347 |
|
| 348 |
-
draft_html
|
| 349 |
-
orig_sentence = res["best_sentence_original"]
|
| 350 |
-
paragraph = res["best_paragraph"]
|
| 351 |
-
tgt_title = res.get("tgt_title", "")
|
| 352 |
-
tgt_desc = res.get("tgt_desc", "")
|
| 353 |
-
|
| 354 |
-
# Optional conservative rule: force add-after for "This guide ..."
|
| 355 |
-
# if orig_sentence.strip().lower().startswith("this guide"):
|
| 356 |
-
# add_after = build_related_resource_line(target_url, anchor_text, plain_text)
|
| 357 |
-
# return warn + "Add this mini-line after the paragraph:\n\n" + add_after
|
| 358 |
|
|
|
|
| 359 |
if smart_rewrite:
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
if mode == "inline":
|
| 365 |
-
final_html = decision.get("sentence_html", "") or draft_html
|
| 366 |
-
# Safety gate: reject if it would distort meaning
|
| 367 |
-
if rewrite_would_distort_meaning(orig_sentence, final_html, anchor_text, paragraph):
|
| 368 |
-
add_after = build_related_resource_line(target_url, anchor_text, plain_text)
|
| 369 |
-
return warn + "Add this mini-line after the paragraph (to avoid changing its meaning):\n\n" + add_after
|
| 370 |
-
|
| 371 |
-
final_output = to_plain_text(final_html) if plain_text else final_html
|
| 372 |
-
# We propose a replacement to ensure the exact integrated version is used
|
| 373 |
-
return warn + f"Change this sentence:\n\n{orig_sentence}\n\nWith this one:\n\n{final_output}"
|
| 374 |
-
|
| 375 |
-
else: # add_after
|
| 376 |
-
add_line = decision.get("add_after_html") or build_related_resource_line(target_url, anchor_text, False)
|
| 377 |
-
add_line_out = to_plain_text(add_line) if plain_text else add_line
|
| 378 |
-
return warn + "Add this mini-line after the paragraph:\n\n" + add_line_out
|
| 379 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 380 |
else:
|
| 381 |
-
|
| 382 |
-
|
| 383 |
-
|
| 384 |
-
|
| 385 |
-
else:
|
| 386 |
-
return warn + f"Change this sentence:\n\n{orig_sentence}\n\nWith this one:\n\n{final_output}"
|
| 387 |
-
|
| 388 |
-
# Show GPT status / model in the header
|
| 389 |
gpt_status = "ON" if OPENAI_API_KEY else "OFF"
|
| 390 |
title_model = PREFERRED_OPENAI_MODEL if OPENAI_API_KEY else "OFF"
|
| 391 |
|
|
@@ -400,7 +258,7 @@ demo = gr.Interface(
|
|
| 400 |
],
|
| 401 |
outputs=gr.Textbox(label="Result", lines=12),
|
| 402 |
title=f"Link Insertion Helper • GPT: {gpt_status} • Model: {title_model}",
|
| 403 |
-
description="
|
| 404 |
)
|
| 405 |
|
| 406 |
if __name__ == "__main__":
|
|
|
|
| 13 |
|
| 14 |
# --- OpenAI settings ---
|
| 15 |
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") # add in HF Spaces: Settings → Variables & secrets
|
| 16 |
+
# Preferred model: tried first; if the call fails, FALLBACK_OPENAI_MODEL (a widely available, fast model) is used instead
|
| 17 |
+
PREFERRED_OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-5o") # change here if you like
|
| 18 |
+
FALLBACK_OPENAI_MODEL = "gpt-4o-mini" # automatic fallback
|
| 19 |
OPENAI_CHAT_URL = "https://api.openai.com/v1/chat/completions"
|
| 20 |
|
| 21 |
# =========================
|
|
|
|
| 67 |
out = enc(**batch)
|
| 68 |
return mean_pool(out.last_hidden_state, batch["attention_mask"])
|
| 69 |
|
|
|
|
| 70 |
def inject_anchor_into_sentence(sentence, anchor_text, target_url):
    """Add a link to *target_url* with *anchor_text* to *sentence*.

    If the anchor text already occurs literally in the sentence
    (case-insensitive), each occurrence is wrapped in an ``<a>`` tag.
    Otherwise a short clause, " with insights from <link>", is appended before
    the final punctuation mark (a full stop is added if there is none).
    No em-dash is used.

    Args:
        sentence: Plain-text sentence to modify.
        anchor_text: Visible link text.
        target_url: Link destination.

    Returns:
        ``(html, anchor_was_present)``. The flag is True only when an existing
        occurrence of the anchor text was actually wrapped.
    """
    from html import escape  # local import keeps the block self-contained

    def norm(x):
        return re.sub(r'[^a-z0-9 ]', '', x.lower())

    # Escape both values so quotes, "&" or "<" cannot break the markup.
    link = f'<a href="{escape(target_url, quote=True)}">{escape(anchor_text, quote=False)}</a>'

    n_sent, n_anchor = norm(sentence), norm(anchor_text)
    if n_anchor and n_anchor in n_sent:
        pattern = re.compile(re.escape(anchor_text), re.IGNORECASE)
        # A callable replacement is inserted verbatim; a string replacement
        # would treat backslashes in the URL or anchor as group references.
        wrapped, hits = pattern.subn(lambda _m: link, sentence)
        if hits:
            return wrapped, True
        # The normalised text matched but the literal text did not (e.g. the
        # sentence has "top-rated" and the anchor "toprated"): nothing was
        # wrapped, so fall through and append the link instead.

    if sentence.endswith(('.', '!', '?')):
        base, punct = sentence[:-1], sentence[-1]
    else:
        base, punct = sentence, '.'

    return f'{base} with insights from {link}{punct}', False
|
| 93 |
|
| 94 |
def suggest_insertions(source_url, target_url, anchor_text, top_k=1):
|
|
|
|
| 96 |
if not blocks:
|
| 97 |
return [{"error":"No text blocks found on the page."}]
|
| 98 |
|
| 99 |
+
# target context
|
| 100 |
try:
|
| 101 |
tgt_html = requests.get(target_url, timeout=20, headers=UA).text
|
| 102 |
+
tt = BeautifulSoup(tgt_html, "html.parser").title
|
| 103 |
+
tgt_title = tt.get_text().strip() if tt else ""
|
|
|
|
|
|
|
|
|
|
| 104 |
except Exception:
|
| 105 |
+
tgt_title = ""
|
| 106 |
|
| 107 |
ext = tldextract.extract(target_url)
|
| 108 |
tgt_domain = ".".join([p for p in [ext.domain, ext.suffix] if p])
|
| 109 |
|
|
|
|
| 110 |
query = f"{anchor_text} — relevant to: {tgt_title} ({tgt_domain})"
|
| 111 |
q_emb = embed([query])[0]
|
| 112 |
|
|
|
|
| 116 |
|
| 117 |
results = []
|
| 118 |
for idx in top_idx:
|
| 119 |
+
blk = blocks[idx]
|
| 120 |
sents = re.split(r'(?<=[.!?])\s+', blk)
|
| 121 |
s_embs = embed(sents)
|
| 122 |
s_sims = F.cosine_similarity(s_embs, q_emb.repeat(len(sents),1))
|
|
|
|
| 126 |
results.append({
|
| 127 |
"anchor_was_present": exact_found,
|
| 128 |
"best_sentence_original": best_sent,
|
| 129 |
+
"best_sentence_with_anchor": rewritten_sent
|
|
|
|
|
|
|
|
|
|
| 130 |
})
|
| 131 |
return results
|
| 132 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
# =========================
|
| 134 |
+
# GPT rewrite (editorial, no em-dash, no clichés)
|
| 135 |
# =========================
|
| 136 |
def _openai_chat(model_name: str, system: str, user_json: dict):
|
| 137 |
headers = {"Authorization": f"Bearer {OPENAI_API_KEY}", "Content-Type": "application/json"}
|
|
|
|
| 150 |
txt = r.json()["choices"][0]["message"]["content"]
|
| 151 |
return json.loads(txt)
|
| 152 |
|
| 153 |
+
def gpt_rewrite(sentence_html, anchor_text, target_url, style="neutral"):
    """Ask the OpenAI chat model to polish a sentence that carries the link.

    The preferred model is tried first and the fallback model second. The
    draft *sentence_html* is returned unchanged whenever no API key is set,
    both calls fail, the reply is not a JSON object with a non-empty string
    under ``sentence_html``, or the reply no longer contains an ``<a href>``
    element whose text includes the anchor text.

    Args:
        sentence_html: Draft sentence, already containing the link.
        anchor_text: Link text that must survive the rewrite.
        target_url: Link destination, passed to the model as context.
        style: Style hint forwarded to the model.

    Returns:
        ``{"sentence_html": <final html>}``.
    """
    unchanged = {"sentence_html": sentence_html}

    if not OPENAI_API_KEY:
        print("[GPT] No OPENAI_API_KEY found → using fallback.")
        return unchanged

    system = (
        "You are a skilled content editor. Improve fluency and integrate the given anchor naturally "
        "into ONE sentence of similar length. Use clear, publication-quality English. "
        "STRICT RULES: (1) Include an <a href> tag that uses the EXACT anchor text. "
        "(2) Do NOT use an em dash or any dash. "
        '(3) Avoid phrases like "for details", "click here", "learn more", "visit", "read more". '
        "Prefer integrating the anchor as part of the sentence (subject, object, or prepositional phrase), "
        "e.g., “with insights from <a ...>ANCHOR</a>”, “through <a ...>ANCHOR</a>”, or “via <a ...>ANCHOR</a>”. "
        "Return a compact JSON object with key sentence_html only. No extra keys, no markdown."
    )
    user = {
        "task": "rewrite_for_link_insertion",
        "sentence_html": sentence_html,
        "anchor_text": anchor_text,
        "target_url": target_url,
        "style": style,
        "constraints": {
            "max_extra_words": 20,
            "avoid": ["for details", "click here", "learn more", "visit", "read more", "—", "--", " - "]
        }
    }

    # Try the preferred model first, then the fallback if needed.
    try:
        print("[GPT] Calling OpenAI Chat Completions with preferred model…")
        obj = _openai_chat(PREFERRED_OPENAI_MODEL, system, user)
    except Exception as e:
        print(f"[GPT] Preferred model failed: {e}. Falling back to {FALLBACK_OPENAI_MODEL}.")
        try:
            obj = _openai_chat(FALLBACK_OPENAI_MODEL, system, user)
        except Exception as e2:
            print(f"[GPT] Fallback failed: {e2}. Using fallback sentence.")
            return unchanged

    # The reply is parsed JSON of unknown shape: accept only a dict carrying a
    # non-empty string, otherwise keep the draft.
    out = obj.get("sentence_html") if isinstance(obj, dict) else None
    if not isinstance(out, str) or not out.strip():
        return unchanged

    # Safety: the model must not drop the link. Require the anchor words to be
    # inside an <a href> element, not merely somewhere in the sentence.
    wanted = anchor_text.lower()
    links = BeautifulSoup(out, "html.parser").find_all("a", href=True)
    if not any(wanted in link.get_text().lower() for link in links):
        return unchanged

    return {"sentence_html": out}
|
| 204 |
+
|
| 205 |
+
# ---------- Plain-text helper (preserve spacing between tags)
|
| 206 |
+
def to_plain_text(html_or_text):
    """Return the visible text of *html_or_text* with all markup removed.

    The input is parsed with the ``html.parser`` backend; the text is then
    extracted with a single space as the separator between fragments and
    with ``strip=True``, so words on either side of a tag do not run together.
    """
    parsed = BeautifulSoup(html_or_text, "html.parser")
    return parsed.get_text(separator=" ", strip=True)
|
| 208 |
|
| 209 |
# =========================
|
| 210 |
# Gradio UI
|
|
|
|
| 225 |
if "error" in res:
|
| 226 |
return f"❌ {res['error']}"
|
| 227 |
|
| 228 |
+
draft_html = res["best_sentence_with_anchor"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 229 |
|
| 230 |
+
# Optionally pass through GPT for a cleaner sentence
|
| 231 |
if smart_rewrite:
|
| 232 |
+
g = gpt_rewrite(draft_html, anchor_text, target_url, style="neutral")
|
| 233 |
+
final_html = g["sentence_html"]
|
| 234 |
+
else:
|
| 235 |
+
final_html = draft_html
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 236 |
|
| 237 |
+
# Optionally convert to plain text (no <a>, no tags)
|
| 238 |
+
final_output = to_plain_text(final_html) if plain_text else final_html
|
| 239 |
+
|
| 240 |
+
if res.get("anchor_was_present", False):
|
| 241 |
+
return warn + f"✅ Add link here:\n\n{final_output}"
|
| 242 |
else:
|
| 243 |
+
original_sentence = res['best_sentence_original']
|
| 244 |
+
return warn + f"Change this sentence:\n\n{original_sentence}\n\nWith this one:\n\n{final_output}"
|
| 245 |
+
|
| 246 |
+
# Show GPT status in the header
|
|
|
|
|
|
|
|
|
|
|
|
|
| 247 |
gpt_status = "ON" if OPENAI_API_KEY else "OFF"
|
| 248 |
title_model = PREFERRED_OPENAI_MODEL if OPENAI_API_KEY else "OFF"
|
| 249 |
|
|
|
|
| 258 |
],
|
| 259 |
outputs=gr.Textbox(label="Result", lines=12),
|
| 260 |
title=f"Link Insertion Helper • GPT: {gpt_status} • Model: {title_model}",
|
| 261 |
+
description="Suggests the best place to add your link and returns one clean instruction. Toggle GPT and Plain text (no URL) as needed."
|
| 262 |
)
|
| 263 |
|
| 264 |
if __name__ == "__main__":
|